/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */

/*
 * build.c: Reads a file = list of indices and hashes frequently used
 *          words into a hash-table for fast retrieval of a word's no.
 *          Also maintains a string-table to get to the word from its no.
 */

#include "defs.h"

/*
 * Small tables are ok since computation is a 1-time job; plus we need the
 * extra space if we put it in a library for glimpseindex.
 */
extern hash_entry *get_small_hash(), *insert_small_hash();
hash_entry *dict_hash_table[SMALL_HASH_TABLE_SIZE];
hash_entry *freq_strings_table[MAX_WORD_LEN+2];
int freq_strings_num[MAX_WORD_LEN+2];
extern int MAX_WORDS;
extern int RESERVED_CHARS;

/*
 * Computes the dictionary for the compression routines: they need two things:
 * 1. A hash-table which maps words to word numbers.
 * 2. A string-table which maps word numbers to offsets where the word begins
 *    in the input file to this program (default DEF_INDEX_FILE).
 * They are output in two files, hash_file and string_file (default).
 *
 * Algorithm: first build the hash table of at most 65536 words. Then traverse
 * the table and output the hash/string files in the above format.
 */
int
compute_dictionary(THRESHOLD, FILEBLOCKSIZE, SPECIAL_WORDS, comp_dir)
    int THRESHOLD, FILEBLOCKSIZE, SPECIAL_WORDS;
    char comp_dir[];
{
    int c;
    unsigned char curline[MAX_NAME_LEN];
    int curchar = 0;
    unsigned char curword[MAX_NAME_LEN];
    int curfreq;
    int curoffset = 0;
    int curlen;
    int wordindex = 0;
    FILE *fp, *freqfp, *tempfp, *awkfp;
    int pid = getpid();
    int i;
    hash_entry *e, **t, *p;
    int dummy, offset, len;
    unsigned char s[MAX_LINE_LEN];
    unsigned char rands[MAX_LINE_LEN];
    char index_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], hash_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN];

    strcpy(hash_file, comp_dir);
    strcat(hash_file, "/");
    strcat(hash_file, DEF_HASH_FILE);
    strcpy(freq_file, comp_dir);
    strcat(freq_file, "/");
    strcat(freq_file, DEF_FREQ_FILE);
    strcpy(string_file, comp_dir);
    strcat(string_file, "/");
    strcat(string_file, DEF_STRING_FILE);
    strcpy(index_file, comp_dir);
    strcat(index_file, "/");
    strcat(index_file, DEF_INDEX_FILE);

    memset(dict_hash_table, '\0', sizeof(hash_entry *) * SMALL_HASH_TABLE_SIZE);
    memset(freq_strings_table, '\0', sizeof(hash_entry *) * (MAX_WORD_LEN+2));
    memset(freq_strings_num, '\0', sizeof(int) * (MAX_WORD_LEN+2));

    if (THRESHOLD < 0) THRESHOLD = DEF_THRESHOLD;
    if (SPECIAL_WORDS < 0) SPECIAL_WORDS = DEF_SPECIAL_WORDS;
    if (FILEBLOCKSIZE < 0) FILEBLOCKSIZE = DEF_BLOCKSIZE;
    if (THRESHOLD < MIN_WORD_LEN || THRESHOLD > MAX_THRESHOLD) {
        fprintf(stderr, "threshold must be in [%d, %d]\n", MIN_WORD_LEN, MAX_THRESHOLD);
        return -1;
    }
    if ((SPECIAL_WORDS < 0) || (SPECIAL_WORDS > 256-MAX_SPECIAL_CHARS)) {
        fprintf(stderr, "invalid special words %d: must be in [0, %d]\n", SPECIAL_WORDS, 256-MAX_SPECIAL_CHARS);
        return -1;
    }
    RESERVED_CHARS = SPECIAL_WORDS + MAX_SPECIAL_CHARS;
    MAX_WORDS = MAX_LSB*(256-RESERVED_CHARS);
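
    /*
     * Worked example of the limit above (the numbers are illustrative
     * assumptions; the real values of MAX_LSB, MAX_SPECIAL_CHARS and
     * DEF_SPECIAL_WORDS live in defs.h): with 8 special characters and
     * 8 special words, RESERVED_CHARS is 16, and with MAX_LSB = 256 the
     * dictionary can hold MAX_WORDS = 256 * (256 - 16) = 61440 words,
     * a little under the 65536 two-byte codes mentioned in the header
     * comment.
     */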

    if ((fp = fopen(index_file, "r")) == NULL) {
        fprintf(stderr, "cannot open for reading: %s\n", index_file);
        return -1;
    }
    sprintf(s, "/tmp/temp%d", pid);
    if ((tempfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        fclose(fp);
        return -1;
    }

    while ((c = getc(fp)) != EOF) {
        if (curchar >= MAX_NAME_LEN) {
            /* record too long: discard the rest of the line */
            curchar = 0;
            while (((c = getc(fp)) != '\n') && (c != EOF));
            if (c == EOF) break;
            else continue;
        }
        curline[curchar++] = (unsigned char)c;
        if (c == '\n') {    /* reached end of record */
            int i = 0;

            if (curline[0] == '%') {    /* initial lines */
                curchar = 0;
                continue;
            }
            curword[0] = '\0';
            /*
             * Each index record is taken to be a word followed by
             * blank-separated entries: copy the word into curword and
             * use the number of entries that follow as its frequency.
             */
            while ((i < curchar) && (curline[i] != ' ') && (curline[i] != '\t') && (curline[i] != '\n')) {
                curword[i] = curline[i];
                i++;
            }
            curword[i] = '\0';
            curlen = i;
            curfreq = 0;
            while (i < curchar) {
                /* skip blanks, then count and skip one entry */
                while ((i < curchar) && ((curline[i] == ' ') || (curline[i] == '\t') || (curline[i] == '\n'))) i++;
                if (i >= curchar) break;
                curfreq ++;
                while ((i < curchar) && (curline[i] != ' ') && (curline[i] != '\t') && (curline[i] != '\n')) i++;
            }
            /* a word earns a dictionary slot only if it is long and frequent enough */
            if ((curlen >= MIN_WORD_LEN) && (curlen*curfreq >= THRESHOLD)) {
                fprintf(tempfp, "%d %d %s\n", curlen*curfreq, curoffset, curword);
                wordindex ++;
            }
            curoffset += curchar;   /* offset cannot begin at 0: .index_list starts with %% and some junk */
            curchar = 0;
#if 0
            printf("word=%s freq=%d\n", curword, curfreq);
#endif  /*0*/
        }
    }
    fclose(fp);

    /*
     * Now choose the MAX_WORDS words with highest frequency.
     */
    fflush(tempfp);
    fclose(tempfp);
    if (wordindex <= SPECIAL_WORDS) {
        fprintf(stderr, "Warning: very small dictionary with only %d words!\n", wordindex);
    }
    sprintf(s, "sort -n -r /tmp/temp%d > /tmp/sort%d\n", pid, pid);
    system(s);
    sprintf(s, "rm /tmp/temp%d\n", pid);
    system(s);
    sprintf(s, "head -%d /tmp/sort%d > /tmp/temp%d\n", MAX_WORDS, pid, pid);
    system(s);

    /*
     * The first ultra-frequent 32 words are stored in a separate table sorted by
     * lengths and within that according to alphabetical order (= canonical order).
     */
    sprintf(s, "/tmp/temp%d", pid);
    if ((tempfp = fopen(s, "r")) == NULL) {
        fprintf(stderr, "cannot open for reading: %s\n", s);
        return -1;
    }
    for (i=0; i < SPECIAL_WORDS; i++) {
        /* temp file lines are "score offset word" as written above */
        if (fscanf(tempfp, "%d%d%s", &dummy, &offset, (char *)s) != 3) break;
        len = strlen((char *)s);
        if (len > MAX_WORD_LEN) continue;   /* keep freq_strings_table[] in bounds */
        e = (hash_entry *)malloc(sizeof(hash_entry));
        e->word = (char *)malloc(len + 2);
        e->next = NULL;
        strcpy(e->word, (char *)s); /* I'm not worried about the offsets now */
        t = &freq_strings_table[len];
        while (*t != NULL) {
            if (strcmp((char *)s, (*t)->word) < 0) {
                e->next = *t;
                break;
            }
            t = &(*t)->next;
        }
        *t = e; /* valid in both cases */
        freq_strings_num[len]++;
    }

    /*
     * Put all the other words in the hash/string tables.
     */
    for (; i < MAX_WORDS; i++) {
        if (fscanf(tempfp, "%d%d%s", &dummy, &offset, (char *)s) != 3) break;   /* fewer words survived */
        /*
         * The word numbers assigned here are provisional (the awk pass below
         * renumbers them); the argument order of insert_small_hash() is assumed.
         */
        insert_small_hash(dict_hash_table, s, i - SPECIAL_WORDS, offset);
    }
    fclose(tempfp);

    /*
     * Dump the ultra-frequent words into freq_file, shortest first and
     * alphabetical within a length (one word per line, format assumed),
     * freeing the table as we go.
     */
    if ((freqfp = fopen(freq_file, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", freq_file);
        return -1;
    }
    for (len = 0; len <= MAX_WORD_LEN+1; len++) {
        e = freq_strings_table[len];
        while (e != NULL) {
            fprintf(freqfp, "%s\n", e->word);
            p = e;
            e = e->next;
            free(p->word);
            free(p);
        }
    }
    fflush(freqfp);
    fclose(freqfp);
    if (!dump_small_hash(dict_hash_table, hash_file)) return -1;

    /*
     * Now sort the chosen words case-insensitively by name so that words
     * differing only in case fall into the same page of the hash/string tables.
     */

    /* Alter order of words in .hash_table */
    sprintf(s, "/tmp/sort%d.a", pid);
    if ((awkfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        return -1;
    }
    sprintf(s, "BEGIN {}\n{print $3 \" \" $2 \" \" $1}\nEND {}\n");
    fwrite(s, 1, strlen((char *)s), awkfp);
    fflush(awkfp);
    fclose(awkfp);
#if 0
    sprintf(s, "cat /tmp/sort%d.a\n", pid);
    system(s);
#endif  /*0*/
#if 0
    printf("stage1:");
    getchar();
#endif  /*0*/
    sprintf(s, "awk -f /tmp/sort%d.a < %s > /tmp/sort%d\n", pid, hash_file, pid);
    system(s);
    sprintf(s, "sort -d -f /tmp/sort%d > /tmp/temp%d\n", pid, pid);
    system(s);
    sprintf(s, "/tmp/sort%d.a", pid);
    if ((awkfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        return -1;
    }
    sprintf(s, "%s", "BEGIN {}\n{print $3 \" \" NR-1 \" \" $1}\nEND {}\n");
    fwrite(s, 1, strlen((char *)s), awkfp);
    fflush(awkfp);
    fclose(awkfp);
    sprintf(s, "awk -f /tmp/sort%d.a < /tmp/temp%d > %s\n", pid, pid, hash_file);   /* reorder and put in new word numbers */
    system(s);
#if 0
    printf("stage2:");
    getchar();
#endif  /*0*/

    /* Now extract the string-table, which is the set of 3rd fields ($3) of the hash-table */
    sprintf(s, "/tmp/sort%d.a", pid);
    if ((awkfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        return -1;
    }
    sprintf(s, "%s", "BEGIN {}\n{print $3}\nEND {}\n");
    fwrite(s, 1, strlen((char *)s), awkfp);
    fflush(awkfp);
    fclose(awkfp);
#if 0
    sprintf(s, "cat /tmp/sort%d.a\n", pid);
    system(s);
#endif  /*0*/
    sprintf(s, "awk -f /tmp/sort%d.a < %s > %s\n", pid, hash_file, string_file);
    system(s);
#if 0
    printf("stage3:");
    getchar();
#endif  /*0*/
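
    /*
     * Illustration of the three awk passes above (the sample lines are
     * hypothetical; only the field positions matter).  If the dumped
     * hash_file held lines of the form "f1 f2 word", e.g.
     *
     *     812 5 Zebra
     *     143 9 apple
     *     657 2 Apple
     *
     * pass 1 rewrites them as "word f2 f1", the case-folding sort puts
     * apple and Apple next to each other and before Zebra, pass 2 rewrites
     * each sorted line as "f1 NR-1 word" so the new word numbers 0, 1, 2
     * follow the sorted order, and pass 3 copies the word column ($3) into
     * string_file in that same order.
     */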

    /*
     * Now pad the hash-file and string files and store indices to words at
     * page boundary so that search() on compressed files can be made fast
     * -- it need not load the whole hash-table: just the page where the
     * word occurs.  The index files are very small (< 1K) so read is fast.
     * The padded files are in binary -- this is what tcompress/tuncompress
     * read-in.  This is done to save space.
     */
    pad_hash_file(hash_file, FILEBLOCKSIZE);
    pad_string_file(string_file, FILEBLOCKSIZE);
    return 0;
}
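
/*
 * A minimal sketch of how compute_dictionary() might be driven, for reference
 * only.  The wrapper below is not part of glimpse: the guard macro and the
 * ".glimpse" directory name are illustrative assumptions; glimpseindex
 * supplies its own directory and parameter values.
 */
#ifdef BUILD_DICTIONARY_EXAMPLE
int
main(argc, argv)
    int argc;
    char *argv[];
{
    char *dir = (argc > 1) ? argv[1] : ".glimpse";

    /* negative arguments select DEF_THRESHOLD, DEF_BLOCKSIZE and DEF_SPECIAL_WORDS above */
    if (compute_dictionary(-1, -1, -1, dir) == -1) {
        fprintf(stderr, "could not build the compression dictionary in %s\n", dir);
        return 1;
    }
    return 0;
}
#endif  /* BUILD_DICTIONARY_EXAMPLE */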