/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */

/*
 * build.c: Reads a file = list of indices and hashes frequently used
 *          words into a hash-table for fast retrieval of a word's no.
 *          Also maintains a string-table to get to the word from its no.
 */

#include "defs.h"

/*
 * Small tables are ok since computation is a 1-time job; plus we need the
 * extra space if we put it in a library for glimpseindex.
 */
extern hash_entry *get_small_hash(), *insert_small_hash();
hash_entry *dict_hash_table[SMALL_HASH_TABLE_SIZE];
hash_entry *freq_strings_table[MAX_WORD_LEN+2];
int freq_strings_num[MAX_WORD_LEN+2];
extern int MAX_WORDS;
extern int RESERVED_CHARS;

/*
 * Computes the dictionary for the compression routines: they need two things:
 * 1. A hash-table which maps words to word numbers.
 * 2. A string-table which maps word numbers to offsets where the word begins
 *    in the input file to this program (default DEF_INDEX_FILE).
 * They are output in two files, hash_file and string_file (default).
 *
 * Algorithm: first build the hash table of at most 65536 words. Then traverse
 * the table and output the hash/string files in the above format.
 */
int
compute_dictionary(THRESHOLD, FILEBLOCKSIZE, SPECIAL_WORDS, comp_dir)
    int THRESHOLD, FILEBLOCKSIZE, SPECIAL_WORDS;
    char comp_dir[];
{
    int c;
    unsigned char curline[MAX_NAME_LEN];
    int curchar = 0;
    unsigned char curword[MAX_NAME_LEN];
    int curfreq;
    int curoffset = 0;
    int curlen;
    int wordindex = 0;
    FILE *fp, *freqfp, *tempfp, *awkfp;
    int pid = getpid();
    int i;
    hash_entry *e, **t, *p;
    int dummy, offset, len;
    unsigned char s[MAX_LINE_LEN];
    unsigned char rands[MAX_LINE_LEN];
    char index_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], hash_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN];

    strcpy(hash_file, comp_dir);
    strcat(hash_file, "/");
    strcat(hash_file, DEF_HASH_FILE);
    strcpy(freq_file, comp_dir);
    strcat(freq_file, "/");
    strcat(freq_file, DEF_FREQ_FILE);
    strcpy(string_file, comp_dir);
    strcat(string_file, "/");
    strcat(string_file, DEF_STRING_FILE);
    strcpy(index_file, comp_dir);
    strcat(index_file, "/");
    strcat(index_file, DEF_INDEX_FILE);

    memset(dict_hash_table, '\0', sizeof(hash_entry *) * SMALL_HASH_TABLE_SIZE);
    memset(freq_strings_table, '\0', sizeof(hash_entry *) * (MAX_WORD_LEN+2));
    memset(freq_strings_num, '\0', sizeof(int) * (MAX_WORD_LEN+2));

    if (THRESHOLD < 0) THRESHOLD = DEF_THRESHOLD;
    if (SPECIAL_WORDS < 0) SPECIAL_WORDS = DEF_SPECIAL_WORDS;
    if (FILEBLOCKSIZE < 0) FILEBLOCKSIZE = DEF_BLOCKSIZE;
    if (THRESHOLD < MIN_WORD_LEN || THRESHOLD > MAX_THRESHOLD) {
        fprintf(stderr, "threshold must be in [%d, %d]\n", MIN_WORD_LEN, MAX_THRESHOLD);
        return -1;
    }
    if ((SPECIAL_WORDS < 0) || (SPECIAL_WORDS > 256-MAX_SPECIAL_CHARS)) {
        fprintf(stderr, "invalid special words %d: must be in [0, %d]\n", SPECIAL_WORDS, 256-MAX_SPECIAL_CHARS);
        return -1;
    }
    RESERVED_CHARS = SPECIAL_WORDS + MAX_SPECIAL_CHARS;
    MAX_WORDS = MAX_LSB*(256-RESERVED_CHARS);
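
    /*
     * Worked example of the limit above (the numbers are illustrative
     * assumptions; the real values of MAX_LSB, MAX_SPECIAL_CHARS and
     * DEF_SPECIAL_WORDS live in defs.h): with 8 special characters and
     * 8 special words, RESERVED_CHARS is 16, and with MAX_LSB = 256 the
     * dictionary can hold MAX_WORDS = 256 * (256 - 16) = 61440 words,
     * a little under the 65536 two-byte codes mentioned in the header
     * comment.
     */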

    if ((fp = fopen(index_file, "r")) == NULL) {
        fprintf(stderr, "cannot open for reading: %s\n", index_file);
        return -1;
    }
    sprintf(s, "/tmp/temp%d", pid);
    if ((tempfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        fclose(fp);
        return -1;
    }

    while ((c = getc(fp)) != EOF) {
        if (curchar >= MAX_NAME_LEN) {
            /* record too long: discard the rest of the line */
            curchar = 0;
            while (((c = getc(fp)) != '\n') && (c != EOF));
            if (c == EOF) break;
            else continue;
        }
        curline[curchar++] = (unsigned char)c;
        if (c == '\n') {    /* reached end of record */
            int i = 0;

            if (curline[0] == '%') {    /* initial lines */
                curchar = 0;
                continue;
            }
            curword[0] = '\0';
            /*
             * Each index record is taken to be a word followed by
             * blank-separated entries: copy the word into curword and
             * use the number of entries that follow as its frequency.
             */
            while ((i < curchar) && (curline[i] != ' ') && (curline[i] != '\t') && (curline[i] != '\n')) {
                curword[i] = curline[i];
                i++;
            }
            curword[i] = '\0';
            curlen = i;
            curfreq = 0;
            while (i < curchar) {
                /* skip blanks, then count and skip one entry */
                while ((i < curchar) && ((curline[i] == ' ') || (curline[i] == '\t') || (curline[i] == '\n'))) i++;
                if (i >= curchar) break;
                curfreq ++;
                while ((i < curchar) && (curline[i] != ' ') && (curline[i] != '\t') && (curline[i] != '\n')) i++;
            }
            /* a word earns a dictionary slot only if it is long and frequent enough */
            if ((curlen >= MIN_WORD_LEN) && (curlen*curfreq >= THRESHOLD)) {
                fprintf(tempfp, "%d %d %s\n", curlen*curfreq, curoffset, curword);
                wordindex ++;
            }
            curoffset += curchar;   /* offset cannot begin at 0: .index_list starts with %% and some junk */
            curchar = 0;
#if 0
            printf("word=%s freq=%d\n", curword, curfreq);
#endif  /*0*/
        }
    }
    fclose(fp);

    /*
     * Now choose the MAX_WORDS words with highest frequency.
     */
    fflush(tempfp);
    fclose(tempfp);
    if (wordindex <= SPECIAL_WORDS) {
        fprintf(stderr, "Warning: very small dictionary with only %d words!\n", wordindex);
    }
    sprintf(s, "sort -n -r /tmp/temp%d > /tmp/sort%d\n", pid, pid);
    system(s);
    sprintf(s, "rm /tmp/temp%d\n", pid);
    system(s);
    sprintf(s, "head -%d /tmp/sort%d > /tmp/temp%d\n", MAX_WORDS, pid, pid);
    system(s);

    /*
     * The first ultra-frequent 32 words are stored in a separate table sorted by
     * lengths and within that according to alphabetical order (= canonical order).
     */
    sprintf(s, "/tmp/temp%d", pid);
    if ((tempfp = fopen(s, "r")) == NULL) {
        fprintf(stderr, "cannot open for reading: %s\n", s);
        return -1;
    }
    for (i=0; i < SPECIAL_WORDS; i++) {
        /* temp file lines are "score offset word" as written above */
        if (fscanf(tempfp, "%d%d%s", &dummy, &offset, (char *)s) != 3) break;
        len = strlen((char *)s);
        if (len > MAX_WORD_LEN) continue;   /* keep freq_strings_table[] in bounds */
        e = (hash_entry *)malloc(sizeof(hash_entry));
        e->word = (char *)malloc(len + 2);
        e->next = NULL;
        strcpy(e->word, (char *)s); /* I'm not worried about the offsets now */
        t = &freq_strings_table[len];
        while (*t != NULL) {
            if (strcmp((char *)s, (*t)->word) < 0) {
                e->next = *t;
                break;
            }
            t = &(*t)->next;
        }
        *t = e; /* valid in both cases */
        freq_strings_num[len]++;
    }

    /*
     * Put all the other words in the hash/string tables.
     */
    for (; i < MAX_WORDS; i++) {
        if (fscanf(tempfp, "%d%d%s", &dummy, &offset, (char *)s) != 3) break;   /* fewer words survived */
        /*
         * The word numbers assigned here are provisional (the awk pass below
         * renumbers them); the argument order of insert_small_hash() is assumed.
         */
        insert_small_hash(dict_hash_table, s, i - SPECIAL_WORDS, offset);
    }
    fclose(tempfp);

    /*
     * Dump the ultra-frequent words into freq_file, shortest first and
     * alphabetical within a length (one word per line, format assumed),
     * freeing the table as we go.
     */
    if ((freqfp = fopen(freq_file, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", freq_file);
        return -1;
    }
    for (len = 0; len <= MAX_WORD_LEN+1; len++) {
        e = freq_strings_table[len];
        while (e != NULL) {
            fprintf(freqfp, "%s\n", e->word);
            p = e;
            e = e->next;
            free(p->word);
            free(p);
        }
    }
    fflush(freqfp);
    fclose(freqfp);
    if (!dump_small_hash(dict_hash_table, hash_file)) return -1;

    /*
     * Now sort the chosen words case-insensitively by name so that words
     * differing only in case fall into the same page of the hash/string tables.
     */

    /* Alter order of words in .hash_table */
    sprintf(s, "/tmp/sort%d.a", pid);
    if ((awkfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        return -1;
    }
    sprintf(s, "BEGIN {}\n{print $3 \" \" $2 \" \" $1}\nEND {}\n");
    fwrite(s, 1, strlen((char *)s), awkfp);
    fflush(awkfp);
    fclose(awkfp);
#if 0
    sprintf(s, "cat /tmp/sort%d.a\n", pid);
    system(s);
#endif  /*0*/
#if 0
    printf("stage1:");
    getchar();
#endif  /*0*/
    sprintf(s, "awk -f /tmp/sort%d.a < %s > /tmp/sort%d\n", pid, hash_file, pid);
    system(s);
    sprintf(s, "sort -d -f /tmp/sort%d > /tmp/temp%d\n", pid, pid);
    system(s);
    sprintf(s, "/tmp/sort%d.a", pid);
    if ((awkfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        return -1;
    }
    sprintf(s, "%s", "BEGIN {}\n{print $3 \" \" NR-1 \" \" $1}\nEND {}\n");
    fwrite(s, 1, strlen((char *)s), awkfp);
    fflush(awkfp);
    fclose(awkfp);
    sprintf(s, "awk -f /tmp/sort%d.a < /tmp/temp%d > %s\n", pid, pid, hash_file);   /* reorder and put in new word numbers */
    system(s);
#if 0
    printf("stage2:");
    getchar();
#endif  /*0*/

    /* Now extract the string-table, which is the set of 3rd fields ($3) of the hash-table */
    sprintf(s, "/tmp/sort%d.a", pid);
    if ((awkfp = fopen(s, "w")) == NULL) {
        fprintf(stderr, "cannot open for writing: %s\n", s);
        return -1;
    }
    sprintf(s, "%s", "BEGIN {}\n{print $3}\nEND {}\n");
    fwrite(s, 1, strlen((char *)s), awkfp);
    fflush(awkfp);
    fclose(awkfp);
#if 0
    sprintf(s, "cat /tmp/sort%d.a\n", pid);
    system(s);
#endif  /*0*/
    sprintf(s, "awk -f /tmp/sort%d.a < %s > %s\n", pid, hash_file, string_file);
    system(s);
#if 0
    printf("stage3:");
    getchar();
#endif  /*0*/
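
    /*
     * Illustration of the three awk passes above (the sample lines are
     * hypothetical; only the field positions matter).  If the dumped
     * hash_file held lines of the form "f1 f2 word", e.g.
     *
     *     812 5 Zebra
     *     143 9 apple
     *     657 2 Apple
     *
     * pass 1 rewrites them as "word f2 f1", the case-folding sort puts
     * apple and Apple next to each other and before Zebra, pass 2 rewrites
     * each sorted line as "f1 NR-1 word" so the new word numbers 0, 1, 2
     * follow the sorted order, and pass 3 copies the word column ($3) into
     * string_file in that same order.
     */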

    /*
     * Now pad the hash-file and string files and store indices to words at
     * page boundary so that search() on compressed files can be made fast
     * -- it need not load the whole hash-table: just the page where the
     * word occurs.  The index files are very small (< 1K) so read is fast.
     * The padded files are in binary -- this is what tcompress/tuncompress
     * read-in.  This is done to save space.
     */
    pad_hash_file(hash_file, FILEBLOCKSIZE);
    pad_string_file(string_file, FILEBLOCKSIZE);
    return 0;
}
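
/*
 * A minimal sketch of how compute_dictionary() might be driven, for reference
 * only.  The wrapper below is not part of glimpse: the guard macro and the
 * ".glimpse" directory name are illustrative assumptions; glimpseindex
 * supplies its own directory and parameter values.
 */
#ifdef BUILD_DICTIONARY_EXAMPLE
int
main(argc, argv)
    int argc;
    char *argv[];
{
    char *dir = (argc > 1) ? argv[1] : ".glimpse";

    /* negative arguments select DEF_THRESHOLD, DEF_BLOCKSIZE and DEF_SPECIAL_WORDS above */
    if (compute_dictionary(-1, -1, -1, dir) == -1) {
        fprintf(stderr, "could not build the compression dictionary in %s\n", dir);
        return 1;
    }
    return 0;
}
#endif  /* BUILD_DICTIONARY_EXAMPLE */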