// Built by Peter A Noble April 2020 Email: panoble2017@gmail.com
// Copyright 2020

#include <ctype.h>
#include <float.h>
#include <math.h>
#include <stdio.h>

#include <algorithm>
#include <cctype>
#include <complex>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// g++ vector_extraction2.cpp -o vector_extraction
// ./vector_extraction word_vectors.txt screened_patient_data.txt found_words.txt word_vectors_out.txt

// Purpose: for the words listed on each line of the patient data file, extract the
// matching word vectors from the pre-trained word-vector model.

using namespace std;

int main (int argc, char * const argv[]) {
	ifstream in(argv[1]); 	//word_vectors.txt	
	ifstream in2(argv[2]);	//t.txt
	ofstream out(argv[3]); 	//found_words.txt
	ofstream out2(argv[4]); //word_vectors_out.txt
//	ofstream outfile; 	

// declare variables
int num2=30;
int fill_in=0;
int num=300000; //columns
int standard=100;
int standard2=6000; //rows  5000 rows  300*20=6000
int rows=0;
int columns=0;
string temp;
int pos1;
char * pos2;
int i=0;
int s=0;
int flag=0;
int count=0;
// array declare
char id[10];
string patient;

char** word = new char*[num];
for (int s = 0; s < num; s++)
	{
	word[s] = new char[standard];
	strcpy(word[s],"test");
	}

double** double_array = new double*[num];  //rows=300000
	for (int s = 0; s < num; s++)
		{
		double_array[s] = new double[standard2]; //standard2=6000
		for (int t = 0; t < standard2; t++)
			{
			double_array[s][t]=0.0;
			}
		}

//cout << word[0] << double_array[0][0]<< double_array[0][299]<< "\n" <<flush; exit(1);

char**a_word = new char*[num2];
for (int s = 0; s < num2; s++)
	{
	a_word[s] = new char[standard];
	strcpy(a_word[s],"test");
	}


// Load big array
in >> rows;
in >> columns; 
//rows=10;

for (int s = 0; s < rows; s++)
	{
	in >> temp;
	pos1 = temp.find_first_of('_');
	//strcpy(word[0],temp.substr(0, pos1));
	//cout << temp.substr(0, pos1) << "\t" << word[0] << "\n"; //report id
	char p[temp.substr(0, pos1).length()]; 
  
    for (i = 0; i < sizeof(p); i++) { p[i] = tolower(temp[i]);} p[sizeof(p)]='\0'; strcpy(word[s],p);
    //cout << word[s] << "\n";
	
	for (int t = 0; t < columns; t++)
		{
 		in >> double_array[s][t]; 		
		}
	}
	cout << "Big array loaded...\n";
//	cout << word[296629] << "\t" << double_array[296629][0] << "\t" << double_array[296629][299]<< "\n" <<flush; exit(1);
		
columns=300;  // restricts the number of vectors to a max of 300
int h=0;
while(!in2.eof())
	{
	getline(in2,patient); h=0;
	int n=patient.length();
	char * char_array = new char [patient.length()+ 1];	
	strcpy (char_array, patient.c_str());
    pos2 = strtok (char_array,"\t");
	
  	while (pos2 != NULL)
   		{
   		a_word[h]=(pos2);
    	pos2 = strtok (NULL, "\t");
		h=h+1;
  		 }

	cout << a_word[0] << "\t"; 							// patient id
 	out << a_word[0] << "\t" << a_word[1] << "\t"; 	    // patient id and smoke_index
	out2 << a_word[0] << "\t" << a_word[1] << "\t"; 	// patient id and smoke_index 

	for (int s = 1; s < h; s++)
		{
		flag=0;
		for (int j = 0; j < rows; j++)
			{
			if ((strcmp(a_word[s],word[j])==0) && (flag!=1))
				{
				flag=1;
				cout << a_word[s] << "\t"; count=count+1;
				out << a_word[s] << "\t";
				for (int t = 0; t < columns; t++)
					{
					cout << double_array[j][t] << "\t";
					out2 << double_array[j][t] << "\t";
					}
	//			cout << count << "\n";
				}
			}
//	cout << "\n";
	}
//cout << count << "\n" << flush; exit(1);

// fill in missing words and vectors.
fill_in=(14-count)*(columns);
	
if (count<14) // 14 is the max number of words
	{
	for (int j = 0; j < (14-count-1); j++)
		{
		out << "\t";
		}
	for (int j = 0; j < (fill_in); j++)
		{
		out2 << "0.000000\t";
		}

	}

count=0;
cout << "\n" << flush; //found_words.txt
out << "\n" << flush; //found_words.txt
out2 << "\n"<< flush; //word_vectors_out.txt
}

/*
cout << word[s] << "\t"; 
out << word[s] << "\t"; 
	for (int t = 0; t < columns; t++)
		{
 		cout << double_array[s][t] << "\t"; 		
 		out << double_array[s][t] << "\t"; 		
		}
out << "\n";
cout << "\n";
*/		
return 0;
}
