// Built by Peter A Noble April 2020 Email: panoble2017@gmail.com
// Copyright 2020

#include <ctype.h>
#include <float.h>
#include <math.h>
#include <stdio.h>

#include <complex>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include <vector>
// g++ unique_word_count.cpp -o unique_word_count
// ./unique_word_count test.txt words_out.txt
// ./unique_word_count EHR.txt EHR_out.txt
// ./unique_word_count MobyDick.txt MobyDick_out.txt

// Purpose is to tokenize the words and count their frequency

using namespace std;

char* clean(char* str);

char* clean(char* str)
{
int flag=0;
int length=0;
length=strlen(str);
int i,j=0;
char str1[100];  //only restriction
	
for (i=0;i<(length);i++)
	{
	if ((str[i]!=';') 
	&& (str[i]!=':') 
	&& (str[i]!=',') 
	&& (str[i]!='.') 
	&& (str[i]!='?') 
	&& (str[i]!='*') 
	&& (str[i]!='!') 
//	&& (str[i]!='0')
//	&& (str[i]!='1')
//	&& (str[i]!='2')
//	&& (str[i]!='3')
//	&& (str[i]!='4')
//	&& (str[i]!='5')
//	&& (str[i]!='6')
//	&& (str[i]!='7')
//	&& (str[i]!='8')
//	&& (str[i]!='9')
	&& (str[i]!='(')
	&& (str[i]!=')')
	&& (str[i]!='[')
	&& (str[i]!=']')
	&& (str[i]!='#')
	&& (str[i]!='$')
	&& (str[i]!='&')
//	&& (str[i]!='"')
	&& (str[i]!='%')
//	&& (str[i]!='/')
	&& (str[i]!='_')
//	&& (str[i]!='-')
//	&& (str[i]!=' ')
	)
	{ 
	if (str[i]=='A') {str[i]='a';}
	if (str[i]=='B') {str[i]='b';}
	if (str[i]=='C') {str[i]='c';}
	if (str[i]=='D') {str[i]='d';}
	if (str[i]=='E') {str[i]='e';}
	if (str[i]=='F') {str[i]='f';}
	if (str[i]=='G') {str[i]='g';}
	if (str[i]=='H') {str[i]='h';}
	if (str[i]=='I') {str[i]='i';}
	if (str[i]=='J') {str[i]='j';}
	if (str[i]=='K') {str[i]='k';}
	if (str[i]=='L') {str[i]='l';}
	if (str[i]=='M') {str[i]='m';}
	if (str[i]=='N') {str[i]='n';}
	if (str[i]=='O') {str[i]='o';}
	if (str[i]=='P') {str[i]='p';}
	if (str[i]=='Q') {str[i]='q';}
	if (str[i]=='R') {str[i]='r';}
	if (str[i]=='S') {str[i]='s';}
	if (str[i]=='T') {str[i]='t';}
	if (str[i]=='U') {str[i]='u';}
	if (str[i]=='V') {str[i]='v';}
	if (str[i]=='W') {str[i]='w';}
	if (str[i]=='X') {str[i]='x';}
	if (str[i]=='Y') {str[i]='y';}
	if (str[i]=='Z') {str[i]='z';}
	if (str[i]=='/') {str[i]=' ';}
	if (str[i]=='"') {str[i]=' ';}
	
	str1[j]=str[i];
	j=j+1; 
	}
	}
	str1[j]='\0';
//	putchar (tolower(str1));
	strcpy(str,str1);
	length=strlen(str);	
	if (length==0) {strcpy(str,"blank");}
return str;
}


int main (int argc, char * const argv[]) {
	ifstream in(argv[1]); 	
	ofstream out(argv[2]); 	

int num=10000000;  // needs to be 1000000 or else Segmentation fault: 11
int standard=100;

char** token = new char*[num];
for (int s = 0; s < num; s++)
	{
	token[s] = new char[standard];
	strcpy(token[s],"test");
	}

int* int_array = new int[num];
for (int s = 0; s < num; s++)
	{
	int_array[s]=0;
	}

int count=0;
int count3=0;

int flag=0;
char word[100];
int check=0;

while(!in.eof())
	{
	in >> word; clean(word);  //puts words to lower case and rm junk
	//cout << word << "\n" << flush;
//	if (check==0) {strcpy(token[0],word); check=1;}  // for the first one...
	flag=0;
	for (int s = 0; s < count3; s++)
		{
		if (strcmp(token[s],word)==0) {flag=1; int_array[s]= int_array[s]+1;}
		}
	if (flag==0) {strcpy(token[count3],word); count3=count3+1;}  //token_1=word
	//count=count+1;  
	}
	//exit(1);
//cout << count << "\n" << flush; //exit(1);
//out << count << "\n" << flush; //exit(1);

	for (int s = 0; s < count3; s++)
		{
		cout << s+1 << "\t" << token[s] <<"\t" << int_array[s]+1 << "\n";
		out << token[s] <<"\t" << int_array[s]+1 << "\n";
		}
		
return 0;
}
