// Designed by Peter A Noble panoble2017@gmail.com
// August 29, 2020
// Purpose is to extract the probabilities from EHRs using single priors

#include<iostream>
#include<vector>
#include<map>
#include<string>
#include<fstream>
#include<sstream>
#include<cstring>
using namespace std;

// compile the program using the following command in Unix
// g++ make_bay_ICD_dataset.cpp -o make_bay_ICD_dataset
// ./make_bay_ICD_dataset nr_t.txt codes_t.txt out_t.txt

// run the compiled program with the following commands in Unix
// ./make_bay_ICD_dataset nr_report.txt codes.txt out.txt
// ./make_bay_ICD_dataset nr_report_1.txt codes.txt out_1.txt
// ./make_bay_ICD_dataset nr_report_2.txt codes.txt out_2.txt
// ./make_bay_ICD_dataset nr_report_3.txt codes.txt out_3.txt
// ./make_bay_ICD_dataset nr_report_4.txt codes.txt out_4.txt
// ./make_bay_ICD_dataset nr_report_5.txt codes.txt out_5.txt
// ./make_bay_ICD_dataset nr_report_6.txt codes.txt out_6.txt
//  note that this program takes days to run

// Dimensions of the fixed-capacity, zero-initialised global buffers below.
enum {
	kIcdMaxRows = 500000,  // number of rows available in icd / entries in num_cols
	kIcdMaxCols = 100,     // 100 columns max
	kIcdCodeLen = 20       // bytes reserved per cell
};

// icd[row][column] is a char buffer of kIcdCodeLen bytes.
char icd[kIcdMaxRows][kIcdMaxCols][kIcdCodeLen];
// One int per row of icd.
int num_cols[kIcdMaxRows];

// Running tallies gathered in one pass over the records file.
// Codes are referred to by a dense integer id (0 .. number of distinct codes - 1).
struct CodeTallies
{
	long records;                                  // lines read from the records file (blank lines included)
	std::vector<long> cell_total;                  // cell_total[id]: number of cells holding code id
	std::vector<std::map<int, long> > pair_total;  // pair_total[lo][hi], lo < hi: sum over records of
	                                               // (cells holding lo) * (cells holding hi)

	explicit CodeTallies(std::size_t distinct_codes)
		: records(0), cell_total(distinct_codes, 0), pair_total(distinct_codes) {}
};

// Reads the whitespace-separated codes from `path`, in file order, appending to `codes`.
// Returns false when the file cannot be opened.
static bool read_code_list(const char* path, std::vector<std::string>& codes)
{
	std::ifstream in(path);
	if (!in)
		{
		return false;
		}
	std::string code;
	// Loop on the extraction itself: looping on !eof() appends one bogus entry at end of file.
	while (in >> code)
		{
		codes.push_back(code);
		}
	return true;
}

// Reads the tab-delimited records file `path` one line (record) at a time and
// accumulates the tallies for every code that has an id in `id_of`.
// Empty fields are skipped, as strtok did; cells whose text is not a listed code
// are ignored because they can never match a code.
// Returns false when the file cannot be opened.
static bool tally_records(const char* path,
                          const std::map<std::string, int>& id_of,
                          CodeTallies& tallies)
{
	std::ifstream in(path);
	if (!in)
		{
		return false;
		}

	std::string line;
	// Loop on getline itself so the end of file does not add a phantom empty record.
	while (std::getline(in, line))
		{
		++tallies.records;

		// Tolerate Windows line endings so the last field can still match a code.
		if (!line.empty() && line[line.size() - 1] == '\r')
			{
			line.erase(line.size() - 1);
			}

		// code id -> number of cells in this record holding that code
		std::map<int, long> in_record;
		std::string::size_type start = 0;
		while (start < line.size())
			{
			std::string::size_type stop = line.find('\t', start);
			if (stop == std::string::npos)
				{
				stop = line.size();
				}
			if (stop > start)
				{
				std::map<std::string, int>::const_iterator hit =
					id_of.find(line.substr(start, stop - start));
				if (hit != id_of.end())
					{
					++in_record[hit->second];
					}
				}
			start = stop + 1;
			}

		// Fold this record into the global tallies.  in_record is ordered by id,
		// so walking `b` after `a` visits each unordered pair once with a < b.
		for (std::map<int, long>::const_iterator a = in_record.begin(); a != in_record.end(); ++a)
			{
			tallies.cell_total[a->first] += a->second;
			std::map<int, long>::const_iterator b = a;
			for (++b; b != in_record.end(); ++b)
				{
				tallies.pair_total[a->first][b->first] += a->second * b->second;
				}
			}
		}
	return true;
}

// Usage: make_bay_ICD_dataset <records.txt> <codes.txt> <out.txt>
//
//   argv[1]  tab-delimited matrix, one record per line, each field a code
//   argv[2]  whitespace-separated list of codes to analyse
//   argv[3]  output file
//
// For every ordered pair of different listed codes (A, B) that occur together in
// at least one record, computes
//     P(A)   = cells holding A / records
//     P(B)   = cells holding B / records
//     P(B|A) = co-occurrence count / cells holding A
//     P(A|B) = P(B|A) * P(A) / P(B)
// and writes "A <tab> B <tab> P(A|B) <tab> co-occurrence count" to the output file,
// plus a longer diagnostic line to stdout.  Pairs are emitted in code-list order
// (A outer, B inner).
//
// The tallies are gathered in a single pass over the records instead of rescanning
// every record for every pair of codes.  Codes, fields and rows are held in
// std::string / std::vector / std::map, so there is no fixed limit on code length,
// columns per record, number of records or number of codes; the file-level
// icd / num_cols buffers are not used here.
//
// Returns 0 on success, 1 on a usage or file error.
int main(int argc, char* argv[])
{
	if (argc < 4)
		{
		std::cerr << "usage: " << (argc > 0 ? argv[0] : "make_bay_ICD_dataset")
		          << " <records.txt> <codes.txt> <out.txt>\n";
		return 1;
		}

	std::ofstream out(argv[3]);
	if (!out)
		{
		std::cerr << "cannot open output file: " << argv[3] << "\n";
		return 1;
		}

	// read in the codes
	std::vector<std::string> codes;
	if (!read_code_list(argv[2], codes))
		{
		std::cerr << "cannot open codes file: " << argv[2] << "\n";
		return 1;
		}

	// Give each distinct code a dense id.  code_id keeps one entry per line of the
	// code list so that a code listed twice is still reported twice.
	std::map<std::string, int> id_of;
	std::vector<int> code_id(codes.size());
	for (std::size_t i = 0; i < codes.size(); ++i)
		{
		std::pair<std::map<std::string, int>::iterator, bool> slot =
			id_of.insert(std::make_pair(codes[i], static_cast<int>(id_of.size())));
		code_id[i] = slot.first->second;
		}

	// read in the matrix and gather the counts
	CodeTallies tallies(id_of.size());
	if (!tally_records(argv[1], id_of, tallies))
		{
		std::cerr << "cannot open records file: " << argv[1] << "\n";
		return 1;
		}

	// now for the stats
	for (std::size_t q = 0; q < codes.size(); ++q)
		{
		const int a = code_id[q];
		const long p_A = tallies.cell_total[a];

		for (std::size_t r = 0; r < codes.size(); ++r)
			{
			const int b = code_id[r];
			if (a == b)
				{
				continue;  // same code: not a pair
				}

			const int lo = (a < b) ? a : b;
			const int hi = (a < b) ? b : a;
			std::map<int, long>::const_iterator joint = tallies.pair_total[lo].find(hi);
			if (joint == tallies.pair_total[lo].end())
				{
				// The codes never share a record, so P(A|B) is 0 and nothing is reported.
				// This also covers codes that never occur at all, which would otherwise
				// produce 0/0 (NaN) below.
				continue;
				}

			const long p_B_A = joint->second;
			const long p_B = tallies.cell_total[b];

			// A stored pair implies records > 0, p_A > 0 and p_B > 0, so no division by zero.
			const double d_p_A = double(p_A) / tallies.records;
			const double d_p_B = double(p_B) / tallies.records;
			const double d_p_B_A = double(p_B_A) / p_A;
			const double p_A_B = (d_p_B_A * d_p_A) / d_p_B;

			std::cout << codes[q] << "\t" << codes[r] << "\t" << p_A << "\t";
			std::cout << p_B_A << "\t" << p_B << "\t";
			std::cout << d_p_A << "\t" << d_p_B << "\t" << d_p_B_A << "\t" << p_A_B << "\n";

			out << codes[q] << "\t" << codes[r] << "\t";
			out << p_A_B << "\t";
			out << p_B_A << "\n";
			}
		}

	out.flush();
	if (!out)
		{
		std::cerr << "error while writing output file: " << argv[3] << "\n";
		return 1;
		}

	return 0;
}
