/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */ /* ./glimpse/index/build_in.c */ /* -------------------------------------------------------------- build_index(): build an index list from a set of files. INPUT: a set of file names char **name_list[]; a partition table int p_table[]; OUTPUT: an index list; char *index_list; the index list is a char string as follows: each entry of the index list contains two parts: name and indices, where name is an ascii character string, and indices is a list of short integer. (unsigned char) We use newline as a 'record delimiter' (a 'record is logically a word associated with its indices), and WORD_END_MARK to separate a word from its list of indices (s.t. fscanf %s works). Since we restrict the max number of partitions to be 255. a byte is enough to represent the index value. Note that there cannot be a partition #ed '\n'. An example index list: (in logical view) this 12 19 \n is 9 17 12 18 19 \n an 7 12 \n example 16 \n -----------------------------------------------------------------------*/ #include "glimpse.h" #define debugt #define BINARY 1 /* #define SW_DEBUG the original sw output of index set */ /* This flag must always be defined: it is used only in build_in.c */ /* #define UDI_DEBUG the original outputs of each indexed file */ /* Some variables used throughout */ #if BG_DEBUG extern FILE *LOGFILE; /* file descriptor for LOG output */ #endif /*BG_DEBUG*/ extern FILE *STATFILE; /* file descriptor for statistical data about indexed files */ extern FILE *MESSAGEFILE; /* file descriptor for important messages meant for the user */ extern char INDEX_DIR[MAX_LINE_LEN]; extern char sync_path[MAX_LINE_LEN]; extern struct stat istbuf; extern struct stat excstbuf; extern struct stat incstbuf; extern int ICurrentFileOffset; extern int NextICurrentFileOffset; /* Some options used throughout */ extern int OneFilePerBlock; extern int IndexNumber; extern int CountWords; extern int StructuredIndex; extern int InterpretSpecial; extern int total_size; extern int MAXWORDSPERFILE; extern int NUMERICWORDPERCENT; extern int AddToIndex; extern int DeleteFromIndex; extern int FastIndex; extern int BuildDictionary; extern int BuildDictionaryExisting; extern int CompressAfterBuild; extern int IncludeHigherPriority; extern int FilenamesOnStdin; extern int UseFilters; extern int ByteLevelIndex; /* int IndexUnderscore; */ extern int IndexableFile; extern int MAX_INDEX_PERCENT; extern int MAX_PER_MB; extern int I_THRESHOLD; extern int usemalloc; extern int BigHashTable; extern int AddedMaxWordsMessage; extern int AddedMixedWordsMessage; extern int icount; /* count the number of my_malloc for indices structure */ extern int hash_icount; /* to see how much was added to the current hash table */ extern int save_icount; /* to see how much was added to the index by the current file */ extern int numeric_icount; /* to see how many numeric words were there in the current file */ extern int num_filter; extern int filter_len[MAX_FILTER]; extern CHAR *filter[MAX_FILTER]; extern CHAR *filter_command[MAX_FILTER]; extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE; extern int mask_int[32]; struct indices *deletedlist = NULL; char **name_list[MAXNUM_INDIRECT]; unsigned int *disable_list = NULL; int *size_list[MAXNUM_INDIRECT]; /* temporary area to store size of each file */ extern int p_table[MAX_PARTITION]; int p_size_list[MAX_PARTITION]; /* sum of the sizes of the files in each partition */ int part_num; /* number of partitions */ extern int memory_usage; /* borrowd from getword.c */ extern int PrintedLongWordWarning; extern int indexable_char[256]; extern char *getword(); extern int file_num; extern int old_file_num; extern int attr_num; extern int bp; /* buffer pointer */ extern unsigned char word[MAX_WORD_BUF]; extern int FirstTraverse1; extern struct indices *ip; extern int HashTableSize; struct token **hash_table; /*[MAX_64K_HASH];*/ build_index() { int i; if (AddToIndex || FastIndex) { FirstTraverse1 = OFF; } if ((total_size < LIMIT_64K_HASH*1024*1024) || !BigHashTable) { hash_table = (struct token **)my_malloc(sizeof(struct token *) * MAX_64K_HASH); HashTableSize = MAX_64K_HASH; } else { hash_table = (struct token **)my_malloc(sizeof(struct token *) * MAX_256K_HASH); HashTableSize = MAX_256K_HASH; } build_hash(); traverse1(); return; } /* ---------------------------------------------------------------------- traverse() function: traverse the hash list of indices = a hash list is a array of linked list, where every node in a linked list contains a word whose hash_value is the same. While traversing the hash list, traverse() output a stream of index list. It also frees the memory used in hash_table. ------------------------------------------------------------------------*/ traverse() { int numseencount = 0; int numelements; int numonline; int i, j, attribute; struct token *tp, *tp_old; struct indices *ip, *ip_old; FILE *f_out; char s[256]; char *word; int x = -1, y=0, diff, temp, even_words=1; /* 0 is an even number */ #ifdef SW_DEBUG printf("in traverse()\n"); #endif sprintf(s, "%s/%s", INDEX_DIR, I2); if ((f_out = fopen(s, "w")) == NULL) { fprintf(stderr, "Cannot open %s for writing\n", s); exit(3); } for(i=0; iword; while(*word != '\0') { /* copy the word to output */ putc(*word++, f_out); } /* Look for stop lists */ if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) { putc(ALL_INDEX_MARK, f_out); if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute); putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out); } putc(DONT_CONFUSE_SORT, f_out); goto next_token; } else if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? ((total_size>>20) * MAX_PER_MB) : DEF_ALL_INDEX) )) { putc(ALL_INDEX_MARK, f_out); if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute); putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out); } putc(DONT_CONFUSE_SORT, f_out); goto next_token; } putc(WORD_END_MARK, f_out); if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute); putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out); } numonline = 0; x = -1; y = 0; even_words = 1; ip = tp->ip; /* traverse the indices list */ ip_old = ip; numelements = 0; while(ip != NULL) { numelements ++; if (CountWords) { fprintf(f_out, "%d", ip->offset[0]); } else { if (ByteLevelIndex) { for (j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; if ((ip->offset[j] <= y) && (y > 0) && (x == ip->index[j])) { /* consecutive offsets not increasing in same file! */ fprintf(stderr, "ignoring (%d, %d) > (%d, %d)\n", x, y, ip->index[j], ip->offset[j]); continue; /* error! */ } if (numonline >= MAX_PER_LINE) { /* terminate current line since it is too late to put ALL_INDEX_MARK now ... Unfortunate since sort is screwedup */ putc('\n', f_out); #if 0 putc('\n', stdout); #endif /*0*/ word = tp->word; while(*word != '\0') { /* copy the word to output */ putc(*word++, f_out); } putc(WORD_END_MARK, f_out); if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute); putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out); } numonline = 0; x = -1; /* to force code below to output it as if it is a fresh file */ y = 0; /* must output first offset as is, rather than difference */ } if (x != ip->index[j]) { if (x != -1) { temp = encode8b(0); putc(temp, f_out); /* can never ordinarily happen since ICurrentFileOffset is always ++d => delimiter */ } if (file_num <= MaxNum8bPartition) { x = encode8b(ip->index[j]); putc(x&0x000000ff, f_out); } else if (file_num <= MaxNum16bPartition) { x = encode16b(ip->index[j]); putc((x&0x0000ff00)>>8, f_out); putc(x&0x000000ff, f_out); } else { x = encode24b(ip->index[j]); putc((x&0x00ff0000)>>16, f_out); putc((x&0x0000ff00)>>8, f_out); putc(x&0x000000ff, f_out); } x = ip->index[j]; /* for next round */ #if 0 printf("#######x=%d ", x); #endif /*0*/ y = 0; } diff = ip->offset[j] - y; y = ip->offset[j]; if (diff < MaxNum1BPartition) { temp = encode8b(diff); putc(temp, f_out); } else if (diff < MaxNum2BPartition) { temp = encode8b((diff/MaxNum8bPartition) | 0x40); putc(temp, f_out); temp = encode8b(diff % MaxNum8bPartition); putc(temp, f_out); } else if (diff < MaxNum3BPartition) { temp = encode8b((diff/MaxNum16bPartition) | 0x80); putc(temp, f_out); temp = encode16b(diff % MaxNum16bPartition); putc((temp & 0x0000ff00) >> 8, f_out); putc(temp & 0x000000ff, f_out); } else { temp = encode8b((diff/MaxNum24bPartition) | 0xc0); putc(temp, f_out); temp = encode24b(diff % MaxNum24bPartition); putc((temp & 0x00ff0000) >> 16, f_out); putc((temp & 0x0000ff00) >> 8, f_out); putc(temp & 0x000000ff, f_out); } numonline ++; } } /* ByteLevelIndex */ else if (OneFilePerBlock) { if (file_num <= MaxNum8bPartition) { for(j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; putc(encode8b(ip->index[j]), f_out); } } else if (file_num <= MaxNum12bPartition) { for(j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; x = encode12b(ip->index[j]); if (even_words) { putc(x & 0x000000ff, f_out); /* lsb */ y = (x & 0x00000f00)>>8; /* msb */ even_words = 0; } else { /* odd number of words so far */ y |= (x&0x00000f00)>>4; /* msb of x into msb of y */ putc(y, f_out); putc(x&0x000000ff, f_out); even_words = 1; } } } else if (file_num <= MaxNum16bPartition) { for(j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; x = encode16b(ip->index[j]); putc((x&0x0000ff00)>>8, f_out); putc(x&0x000000ff, f_out); } } else { for(j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; x = encode24b(ip->index[j]); putc((x&0x00ff0000)>>16, f_out); putc((x&0x0000ff00)>>8, f_out); putc(x&0x000000ff, f_out); } } } /* OneFilePerBlock */ else { /* normal partitions */ for(j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; putc(ip->index[j], f_out); } } } ip = ip->next_i; /* go to next indices */ indicesfree(ip_old, sizeof(struct indices)); ip_old = ip; } if (!ByteLevelIndex && OneFilePerBlock && !even_words && (file_num > MaxNum8bPartition) && (file_num <= MaxNum12bPartition)) putc(y, f_out); next_token: if (putc('\n', f_out) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(2); } tp = tp->next_t; /* go to next token */ #if 0 fprintf(stderr, "numelements=%d\n", numelements); #endif /*0*/ #if BG_DEBUG memory_usage -= (strlen(tp_old->word) + 1); #endif /*BG_DEBUG*/ wordfree(tp_old->word, 0); tokenfree(tp_old, sizeof(struct token)); tp_old = tp; numseencount ++; } } tokenallfree(); indicesallfree(); wordallfree(); #if BG_DEBUG fprintf(stderr, "out of traverse(): saved/freed %d tokens: new usage: %d\n", numseencount, memory_usage); #endif fflush(f_out); fclose(f_out); } traverse1() { FILE *i1, *i2, *i3; int ret; char s[256]; char s1[MAX_LINE_LEN]; extern int errno; static int maxsortlinelen = 0; int i; if (maxsortlinelen <= 0) { if (file_num < MaxNum8bPartition) maxsortlinelen = round((MaxNum8bPartition * sizeof(int) + MAX_NAME_SIZE), MAX_LINE_LEN) * MAX_LINE_LEN; else if (file_num < MaxNum12bPartition) maxsortlinelen = round((MaxNum12bPartition * sizeof(int) + MAX_NAME_SIZE), MAX_LINE_LEN) * MAX_LINE_LEN; else maxsortlinelen = MAX_SORTLINE_LEN; } traverse(); /* will produce .i2 and my_free allocated memory */ #if USESORT_Z_OPTION #if DONTUSESORT_T_OPTION || SFS_COMPAT sprintf(s, "exec %s -z %d %s/%s > %s/%s\n", SYSTEM_SORT, maxsortlinelen, INDEX_DIR, I2, INDEX_DIR, O2); #else sprintf(s, "exec %s -T %s -z %d %s/%s > %s/%s\n", SYSTEM_SORT, INDEX_DIR, maxsortlinelen, INDEX_DIR, I2, INDEX_DIR, O2); #endif #else #if DONTUSESORT_T_OPTION || SFS_COMPAT sprintf(s, "exec %s %s/%s > %s/%s\n", SYSTEM_SORT, INDEX_DIR, I2, INDEX_DIR, O2); #else sprintf(s, "exec %s -T %s %s/%s > %s/%s\n", SYSTEM_SORT, INDEX_DIR, INDEX_DIR, I2, INDEX_DIR, O2); #endif #endif #ifdef SW_DEBUG printf("%s", s); #endif if((ret=system(s)) != 0) { sprintf(s1, "system('%s') failed at:\n\t File=%s, Line=%d, Errno=%d", s, __FILE__, __LINE__, errno); perror(s1); fprintf(stderr, "Please try to run the program again\n(If there's no memory, increase the swap area / don't use -M and -B options)\n"); exit(1); } #ifdef SW_DEBUG printf("mv .o2 .i2\n"); fflush(stdout); #endif #if SFS_COMPAT sprintf(s, "%s/%s", INDEX_DIR, O2); sprintf(s1, "%s/%s", INDEX_DIR, I2); rename(s, s1); #else sprintf(s, "exec %s %s/%s %s/%s\n", SYSTEM_MV, INDEX_DIR, O2, INDEX_DIR, I2); system(s); #endif system(sync_path); /* sync() has a bug */ #if 0 printf("traversed\n"); sprintf(s, "exec %s -10 %s/%s\n", SYSTEM_HEAD, INDEX_DIR, I2); system(s); #endif /*0*/ /* * This flag is set from outside iff build-fast | build-addto option is set. */ if(FirstTraverse1) { /* Mention whether numbers are indexed */ if(IndexNumber) sprintf(s, "exec %s %%1234567890 > %s/%s\n", SYSTEM_ECHO, INDEX_DIR, INDEX_FILE); else sprintf(s, "exec %s %% > %s/%s\n", SYSTEM_ECHO, INDEX_DIR, INDEX_FILE); system(s); /* Put the magic number: 0 if not 1file/blk, numfiles otherwise */ if (OneFilePerBlock) { if (ByteLevelIndex) sprintf(s, "echo %%-%d >> %s/%s\n", file_num, INDEX_DIR, INDEX_FILE); else sprintf(s, "echo %%%d >> %s/%s\n", file_num, INDEX_DIR, INDEX_FILE); } else sprintf(s, "exec %s %%0 >> %s/%s\n", SYSTEM_ECHO, INDEX_DIR, INDEX_FILE); system(s); /* Put the magic number: 0 if not structured index, 1 if so */ if (StructuredIndex) sprintf(s, "exec %s %%%d >> %s/%s\n", SYSTEM_ECHO, attr_num, INDEX_DIR, INDEX_FILE); else sprintf(s, "exec %s %%0 >> %s/%s\n", SYSTEM_ECHO, INDEX_DIR, INDEX_FILE); system(s); #ifdef SW_DEBUG sprintf(s, "exec %s -l %s/.glimpse*\n", SYSTEM_LS, INDEX_DIR); system(s); #endif sprintf(s, "exec %s %s/%s >> %s/%s\n", SYSTEM_CAT, INDEX_DIR, I2, INDEX_DIR, INDEX_FILE); system(s); #if SFS_COMPAT sprintf(s, "%s/%s", INDEX_DIR, I2); unlink(s); #else sprintf(s, "exec %s %s/%s\n", SYSTEM_RM, INDEX_DIR, I2); system(s); #endif #ifdef SW_DEBUG sprintf(s, "exec %s -l %s/.glimpse*\n", SYSTEM_LS, INDEX_DIR); system(s); #endif #if 0 printf("catted\n"); sprintf(s, "exec %s -10 %s/%s\n", SYSTEM_HEAD, INDEX_DIR, INDEX_FILE); system(s); #endif /*0*/ FirstTraverse1 = 0; system(sync_path); /* sync() has a bug */ return; } /* else not first-traverse */ sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE); if((i1 = fopen(s, "r")) == 0) { /* new stuff */ fprintf(stderr, "can't open %s for reading\n", s); exit(2); } sprintf(s, "%s/%s", INDEX_DIR, I2); if((i2 = fopen(s, "r")) == 0) { /* old stuff */ fprintf(stderr, "can't open %s for reading\n", s); exit(2); } sprintf(s, "%s/%s", INDEX_DIR, I3); if((i3 = fopen(s, "w")) == 0) { /* result */ fprintf(stderr, "can't open %s for writing\n", s); exit(2); } /* Copy the 3 option fields (indexnumber, onefileperblock, structuredqueries) */ fgets(s, 256, i1); s[255] = '\0'; fputs(s, i3); fgets(s, 256, i1); s[255] = '\0'; fputs(s, i3); fgets(s, 256, i1); s[255] = '\0'; fputs(s, i3); merge_in(i2, i1, i3); /* merge_in(i1, i2, i3); */ #ifdef BG_DEBUG fprintf(stderr, "out of merge_in()\n"); #endif /*BG_DEBUG*/ fclose(i1); fclose(i2); fclose(i3); #ifdef SW_DEBUG printf("mv .i3 %s\n", INDEX_FILE); fflush(stdout); #endif #if SFS_COMPAT sprintf(s, "%s/%s", INDEX_DIR, I3); sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE); rename(s, s1); #else sprintf(s, "exec %s %s/%s %s/%s", SYSTEM_MV, INDEX_DIR, I3, INDEX_DIR, INDEX_FILE); system(s); #endif #ifdef SW_DEBUG printf("ls -l .i2 %s\n", INDEX_FILE); fflush(stdout); sprintf(s, "exec %s -l %s/.glimpse*", SYSTEM_LS, INDEX_DIR); printf("%d\n", system(s)); #endif #if 0 printf("merged\n"); sprintf(s, "exec %s -10 %s/%s\n", SYSTEM)_HEAD, INDEX_DIR, INDEX_FILE); system(s); #endif /*0*/ } /* -------------------------------------------------------------------- build_hash(): input: a set of filenames in name_list[], a partition table p_table[] output: a hash table hash_table[]. -----------------------------------------------------------------------*/ build_hash() { FILE *fd; /* opened file number */ int i, pn; /* pn: current partition */ int num_read; char word[256]; struct stat stbuf; int offset; int toread; unsigned char *buffer; /* running pointer for getword = place where reads begin */ unsigned char *bx; /* running pointer for read-loop, initially buffer */ unsigned char *buffer_end; /* place where getword should stop */ unsigned char *buffer_begin;/* constant pointer to beginning */ int residue; /* extra variable to store buffer_begin + BLOCK_SIZE - buffer_end */ int tried_once = 0; int attribute; int ret; char outname[MAX_LINE_LEN]; char *unlinkname = NULL; int pid = getpid(); if (StructuredIndex) region_initialize(); init_hash_table(); #ifdef debug printf("entering build_hash(), part_num=%d\n", part_num); #endif tried_once = 0; try_again_1: buffer_begin = buffer = (unsigned char *) my_malloc(sizeof(char)* BLOCK_SIZE + 10); /* always read in units of BLOCK_SIZE or less */ if(buffer == NULL) { fprintf(stderr, "not enough memory in build_hash\n"); if (tried_once) return; traverse1(); init_hash_table(); tried_once = 1; goto try_again_1; } bx = buffer; if (OneFilePerBlock) { for(i=0; i 0) { /* do not remove old .TZ file */ if (StructuredIndex && (-1 == region_create(outname))) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); continue; } if (((fd = fopen(outname, "r")) == NULL) ) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ unlink(outname); continue; } unlinkname = outname; goto index_file1; } /* Try to apply the filter */ sprintf(outname, "%s/.glimpse_apply.%d", INDEX_DIR, pid); if ((ret = apply_filter(LIST_GET(name_list, i), outname)) == 1) { /* Some pattern matched AND some filter was successful */ if (StructuredIndex && (-1 == region_create(outname))) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); continue; } if (((fd = fopen(outname, "r")) == NULL) ) { /* error: shouldn't have returned 1! */ fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ unlink(outname); continue; } unlinkname = outname; goto index_file1; } else if (ret == 2) { /* Some pattern matched but no filter was successful */ if (filetype(LIST_GET(name_list, i), 0)) { /* try to index input file if it satisfies filetype */ remove_filename(i, -1); unlink(outname); continue; } unlinkname = outname; } if (StructuredIndex && (-1 == region_create(LIST_GET(name_list, i)))) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); continue; } if (((fd = fopen(LIST_GET(name_list, i), "r")) == NULL) ) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ if (unlinkname != NULL) unlink(unlinkname); continue; } index_file1: #ifdef SW_DEBUG if (AddToIndex || FastIndex) printf("adding words of %s in %d\n", LIST_GET(name_list,i), i); printf("%s\n", LIST_GET(name_list, i)); #endif /* stat(LIST_GET(name_list, i), &stbuf); Chris Dalton */ fstat(fileno(fd), &stbuf); #ifdef SW_DEBUG printf("filesize: %d\n", stbuf.st_size); #endif #ifdef UDI_DEBUG printf("%s ", LIST_GET(name_list, i)); printf("size: %d ", stbuf.st_size); #endif /* buffer always points to a BLOCK_SIZE block of allocated memory */ buffer = buffer_begin; residue = 0; for (offset = 0; offset < stbuf.st_size; offset += BLOCK_SIZE) { offset -= residue; NextICurrentFileOffset = ICurrentFileOffset = offset; toread = offset + BLOCK_SIZE >= stbuf.st_size ? stbuf.st_size - offset : BLOCK_SIZE; fseek(fd, offset, 0); bx= buffer; num_read = 0; while ((toread > 0) && ((num_read = fread(bx, 1, toread, fd)) < toread)) { if (num_read <= 0) { buffer = bx; fprintf(stderr, "read error on file %s at offset %d\n", LIST_GET(name_list, i), offset); goto break_break1; /* C doesn't have break; break; */ } bx += num_read; toread -= num_read; } if (num_read >= toread) { bx += num_read; toread -= num_read; } buffer_end = bx; residue = 0; if (buffer_end == buffer_begin + BLOCK_SIZE) { while ((INDEXABLE(*(buffer_end-1))) && (buffer_end > buffer_begin + MAX_WORD_SIZE)) buffer_end --; residue = buffer_begin + BLOCK_SIZE - buffer_end; /* if (residue > 0) printf("residue = %d in %s at %d\n", residue, LIST_GET(name_list, i), offset); */ } bx = buffer; PrintedLongWordWarning = 0; while ((buffer=(unsigned char *) getword(LIST_GET(name_list, i), word, buffer, buffer_end, &attribute)) < buffer_end) { /* printf("%s\n", word); */ if(word[0] == '\0') continue; if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, i, attribute); } if (word[0] != '\0') { /* printf("%s\n", word); */ if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, i, attribute); } buffer = buffer_begin; } break_break1: fclose(fd); if (unlinkname != NULL) unlink(unlinkname); #ifdef UDI_DEBUG printf("add to index: %d\n",icount-save_icount); #endif if ((MAXWORDSPERFILE > 0) && (icount-save_icount > MAXWORDSPERFILE)) { fprintf(MESSAGEFILE, "%d words are contributed by %s\n", icount-save_icount, LIST_GET(name_list, i)); AddedMaxWordsMessage = ON; } if (IndexNumber && NUMERICWORDPERCENT && (numeric_icount * 100 > (icount - save_icount) * NUMERICWORDPERCENT) && (icount - save_icount > MIN_WORDS)) { fprintf(MESSAGEFILE, "NUMBERS occur in %d%% of %d words contributed by %s\n", (numeric_icount * 100)/(icount - save_icount), icount - save_icount, LIST_GET(name_list, i)); AddedMixedWordsMessage = ON; } numeric_icount=0; save_icount=icount; if (StructuredIndex) region_destroy(); } my_free(buffer_begin, BLOCK_SIZE); return; } for(pn=1; pn < part_num; pn++) /* partition # 0 is not accessed */ { if (pn == '\n') continue; /* There cannot be a partition # '\n' or 0: see partition.c */ for(i=p_table[pn]; i 0) { /* do not remove old .TZ file */ if (StructuredIndex && (-1 == region_create(outname))) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); continue; } if (((fd = fopen(outname, "r")) == NULL) ) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ unlink(outname); continue; } if (BuildDictionary && CompressAfterBuild) strcpy(LIST_GET(name_list, i), outname); /* name of clear file will be smaller, so enough space */ else unlinkname = outname; goto index_file2; } /* Try to apply the filter */ sprintf(outname, "%s/.glimpse_apply.%d", INDEX_DIR, pid); if ((ret = apply_filter(LIST_GET(name_list, i), outname)) == 1) { /* Some pattern matched AND some filter was successful */ if (StructuredIndex && (-1 == region_create(outname))) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); continue; } if (((fd = fopen(outname, "r")) == NULL) ) { /* error: shouldn't have returned 1! */ fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ unlink(outname); continue; } unlinkname = outname; goto index_file2; } else if (ret == 2) { /* Some pattern matched but no filter was successful */ if (filetype(LIST_GET(name_list, i), 0)) { /* try to index input file if it satisfies filetype */ remove_filename(i, -1); unlink(outname); continue; } unlinkname = outname; } if (StructuredIndex && (-1 == region_create(LIST_GET(name_list, i)))) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); continue; } if (((fd = fopen(LIST_GET(name_list, i), "r")) == NULL) ) { fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i)); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ if (unlinkname != NULL) unlink(unlinkname); continue; } index_file2: #ifdef SW_DEBUG if (AddToIndex || FastIndex) printf("adding words of %s in %d\n", LIST_GET(name_list, i), pn); printf("%s\n", LIST_GET(name_list, i)); #endif /* stat(LIST_GET(name_list, i), &stbuf); Chris Dalton */ fstat(fileno(fd), &stbuf); #ifdef SW_DEBUG printf("filesize: %d\n", stbuf.st_size); #endif #ifdef UDI_DEBUG printf("%s ", LIST_GET(name_list, i)); printf("size: %d ", stbuf.st_size); #endif /* buffer always points to a BLOCK_SIZE block of allocated memory */ buffer = buffer_begin; residue = 0; for (offset = 0; offset < stbuf.st_size; offset += BLOCK_SIZE) { offset -= residue; NextICurrentFileOffset = ICurrentFileOffset = offset; toread = offset + BLOCK_SIZE >= stbuf.st_size ? stbuf.st_size - offset : BLOCK_SIZE; fseek(fd, offset, 0); bx= buffer; num_read = 0; while ((toread > 0) && ((num_read = fread(bx, 1, toread, fd)) < toread)) { if (num_read <= 0) { buffer = bx; fprintf(stderr, "read error on file %s at offset %d\n", LIST_GET(name_list, i), offset); goto break_break2; /* C doesn't have break; break; */ } bx += num_read; toread -= num_read; } if (num_read >= toread) { bx += num_read; toread -= num_read; } buffer_end = bx; residue = 0; if (buffer_end == buffer_begin + BLOCK_SIZE) { while ((INDEXABLE(*(buffer_end-1))) && (buffer_end > buffer_begin + MAX_WORD_SIZE)) buffer_end --; residue = buffer_begin + BLOCK_SIZE - buffer_end; /* if (residue > 0) printf("residue = %d in %s at %d\n", residue, LIST_GET(name_list, i), offset); */ } bx = buffer; PrintedLongWordWarning = 0; while ((buffer=(unsigned char *) getword(LIST_GET(name_list, i), word, buffer, buffer_end, &attribute)) < buffer_end) { /* printf("%s\n", word); */ if(word[0] == '\0') continue; if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, pn, attribute); } if (word[0] != '\0') { /* printf("%s\n", word); */ if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, pn, attribute); } buffer = buffer_begin; } break_break2: fclose(fd); if (unlinkname != NULL) unlink(unlinkname); #ifdef UDI_DEBUG printf("add to index: %d\n",icount-save_icount); #endif if ((MAXWORDSPERFILE > 0) && (icount-save_icount > MAXWORDSPERFILE)) { fprintf(MESSAGEFILE, "%d words are contributed by %s\n", icount-save_icount, LIST_GET(name_list, i)); AddedMaxWordsMessage = ON; } if (IndexNumber && NUMERICWORDPERCENT && (numeric_icount * 100 > (icount - save_icount) * NUMERICWORDPERCENT) && (icount - save_icount > MIN_WORDS)) { fprintf(MESSAGEFILE, "NUMBERS occur in %d%% of %d words contributed by %s\n", (numeric_icount * 100)/(icount - save_icount), icount - save_icount, LIST_GET(name_list, i)); AddedMixedWordsMessage = ON; } numeric_icount=0; save_icount=icount; if (StructuredIndex) region_destroy(); } } my_free(buffer_begin, BLOCK_SIZE); } init_hash_table() { int i; for(i=0; iword) == 0) && (tp->attribute == attribute)) { insert_index(tp, pn); tried_once = 0; return; /* already in there */ } tp_bak = tp; tp = tp->next_t; } /* this is a new word, insert it */ if((tp = (struct token *) tokenalloc(sizeof(struct token))) == NULL) { tp_bak = NULL; if (tried_once == 1) { fprintf(stderr, "not enough memory in insert_h1 at icount=%d. skipping...\n", icount); tried_once = 0; return; /* ignore word altogether */ } traverse1(); init_hash_table(); tried_once = 1; /* memory allocation failed in malloc#1 */ return insert_h(word, pn, attribute); /* next call can't fail here since traverse() calls *allfree() */ } if((tp->word = (char *) wordalloc(sizeof(char) * (wordlen+1))) == NULL) { tp_bak = NULL; if (tried_once == 2) { fprintf(stderr, "not enough memory in insert_h2 at icount=%d. skipping...\n", icount); tokenfree(tp, sizeof(struct token)); tried_once = 0; return; /* ignore word altogether */ } tokenfree(tp, sizeof(struct token)); traverse1(); init_hash_table(); tried_once = 2; /* memory allocation failed in malloc#2 */ return insert_h(word, pn, attribute); /* next call can't fail here or above since traverse() calls *allfree() */ } strcpy(tp->word, word); tp->attribute = attribute; /* the index list has a first index */ if((iip = (struct indices *) indicesalloc(sizeof(struct indices))) == NULL) { tp_bak = NULL; if (tried_once == 3) { fprintf(stderr, "not enough memory in insert_h3 at icount=%d. skipping...\n", icount); wordfree(tp->word, wordlen + 1); tokenfree(tp, sizeof(struct token)); tried_once = 0; return; /* ignore word altogether */ } wordfree(tp->word, wordlen + 1); tokenfree(tp, sizeof(struct token)); traverse1(); init_hash_table(); tried_once = 3; /* memory allocation failed in malloc#3 */ return insert_h(word, pn, attribute); /* next call can't fail here or above or above-above since traverse() calls *allfree() */ } icount++; if (IndexNumber && NUMERICWORDPERCENT) { int i=0; while(word[i] != '\0') { if (!isalpha(word[i])) break; i++; } if (word[i] != '\0') numeric_icount ++; } #ifdef SW_DEBUG if((icount & 01777) == 0) printf("icount = %d\n", icount); #endif if (!CountWords) { iip->index[0] = pn; iip->offset[0] = ICurrentFileOffset; } for (j=1; jindex[j] = INDEX_ELEM_FREE; /* assign both head and tail */ iip->next_i = NULL; tp->ip = iip; tp->lastip = iip; if(tp_bak == NULL) hash_table[hash_value] = tp; else tp_bak->next_t = tp; tp->next_t = NULL; tp->totalcount = 1; tried_once = 0; /* now sure that there has been no memory allocation failure while inserting this word */ return; } /* ------------------------------------------------------------------- insert_index(): insert an index, i.e., pn, into an indices structure. The indices structure is a linked list where the 'first' one is always the active indices structure. When the active one is filled with 8 indices an indicies structure is created and becomes the active one. tp points to the token structure. so, tp->ip is always the active indices structure. THERE ARE NO STATE CHANGES UNLESS WE ARE SURE THAT MALLOCS WON'T FAIL: BG ------------------------------------------------------------------- */ insert_index(tp, pn) struct token *tp; /* insert a index into a indices structure */ int pn; { struct indices *iip, *temp; struct indices *ip = (ByteLevelIndex ? tp->lastip : tp->ip); static int tried_once = 0; int j; if (CountWords) { /* I am not interested in maintaining where a word occurs: only the number of times it occurs */ ip->offset[0] ++; return; } /* Check for stop-list */ if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) return; if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? ((total_size>>20) * MAX_PER_MB) : DEF_ALL_INDEX) )) return; if (ByteLevelIndex) { for (j=INDEX_SET_SIZE; j>0; j--) { if(ip->index[j-1] == INDEX_ELEM_FREE) continue; if ((ip->index[j-1] == pn) && (ip->offset[j-1] == ICurrentFileOffset)) return; /* in identical position */ else break; } } else { for (j=INDEX_SET_SIZE; j>0; j--) { if(ip->index[j-1] == INDEX_ELEM_FREE) continue; if (ip->index[j-1] == pn) return; /* current word is not the first appearance in partition pn */ else break; } } /* ip->index[j] is the place to insert new pn provided j < INDEX_SET_SIZE */ if(j < INDEX_SET_SIZE) { ip->offset[j] = ICurrentFileOffset; ip->index[j] = pn; return; } if((iip = (struct indices *) indicesalloc(sizeof(struct indices)))==NULL) { if (tried_once == 1) { fprintf(stderr, "not enough memory in insert_index at icount=%d. skipping...\n", icount); tried_once = 0; return; /* ignore index altogether */ } traverse1(); init_hash_table(); tried_once = 1; /* memory allocation failed in malloc#1 */ return insert_index(tp, pn); } icount++; if (ByteLevelIndex) { /* insert at the end */ tp->lastip->next_i = iip; iip->next_i = NULL; tp->lastip = iip; } else { iip->next_i = tp->ip; tp->ip = iip; } iip->offset[0] = ICurrentFileOffset; iip->index[0] = pn; for (j=1; jindex[j] = INDEX_ELEM_FREE; tp->totalcount ++; if ( (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) || (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? ((total_size>>20) * MAX_PER_MB) : DEF_ALL_INDEX) )) ) { for (iip=tp->ip; iip != NULL; temp = iip, iip = iip->next_i, indicesfree(temp, sizeof(struct indices))); tp->ip = NULL; /* never need to insert anything else here */ } /* printf("returning from insert_index()\n"); fflush(stderr); */ tried_once = 0; return; } /* Scan the indexed "word" from an index line: see io.c/merge_splits() */ scanword(word, buffer, buffer_end, attr) unsigned char *word, *buffer, *buffer_end; unsigned int *attr; { int i = MAX_WORD_SIZE; while ((i-- != 0) && (buffer <= buffer_end) && (*buffer != ALL_INDEX_MARK) && (*buffer != WORD_END_MARK) && (*buffer != '\n') && (*buffer != '\0')) *word ++ = *buffer ++; *word = '\0'; *attr = encode16b(0); if (StructuredIndex) { if ((*buffer == ALL_INDEX_MARK) || (*buffer == WORD_END_MARK)) { buffer ++; *attr = ((*buffer) << 8) | (*(buffer + 1)); } } } /* Globals used in merge, and also in glimpse's main.c */ extern unsigned int *src_index_set; extern unsigned int *dest_index_set; extern unsigned char *src_index_buf; extern unsigned char *dest_index_buf; extern unsigned char *merge_index_buf; /* merge index file f1 and f2, then put the result in index file f3 */ merge_in(f1, f2, f3) FILE *f1, *f2, *f3; { int src_mark, dest_mark; int src_num, dest_num; int src_end_pt, dest_end_pt; int cmp=0; /* the result of strcmp */ int bdx, bdx1, bdx2, merge_len, i, j; int TAIL1=0; char word1[MAX_WORD_SIZE+6]; /* used only for strcmp() */ char word2[MAX_WORD_SIZE+6]; /* used only for strcmp() */ unsigned int attr1, attr2; int x=0, y=0, even_words = 1; /* LOOK OUT FOR: [memset, fgets, endpt-forloop, scanword] 4-tuples: invariant */ #if debug printf("in merge_in()\n"); fflush(stdout); #endif memset(dest_index_buf, '\0', REAL_INDEX_BUF); if (fgets(dest_index_buf, REAL_INDEX_BUF, f2) == NULL) dest_index_buf[0] = '\0'; else dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt, &attr2); #ifdef debug printf("word2 = %s\n", word2); #endif memset(src_index_buf, '\0', REAL_INDEX_BUF); while(fgets(src_index_buf, REAL_INDEX_BUF, f1)) { src_index_buf[REAL_INDEX_BUF - 1] = '\0'; src_end_pt = strlen(src_index_buf); scanword(word1, src_index_buf, src_index_buf+src_end_pt, &attr1); #ifdef debug printf("word1 = %s\n", word1); #endif while (((cmp = strncmp(word1, word2, MAX_WORD_SIZE+4)) > 0) || (StructuredIndex && (cmp == 0) && (attr1 > attr2))) { fputs(dest_index_buf, f3); memset(dest_index_buf, '\0', dest_end_pt+2); if(fgets(dest_index_buf, REAL_INDEX_BUF, f2) == NULL) { dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; TAIL1 = ON; break; } dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt, &attr2); } if(TAIL1 == ON) break; if ((cmp == 0) && (attr1 == attr2)) { /* we need to join the index of word1 and word2 */ #ifdef debug printf("joining src_index_buf and dest_index_buf\n"); printf("src_index_buf = %s", src_index_buf); printf("dest_index_buf = %s", dest_index_buf); #endif if (!CountWords && !ByteLevelIndex) { /* have to look for common indices and exclude them */ int oldbdx1, oldbdx2; merge_index_buf[0] = '\0'; merge_len = 0; oldbdx1 = bdx1 = 0; /* src_index_buf[src_end_pt] is '\0', src_index_buf[src_end_pt-1] is '\n' */ while((bdx1 < src_end_pt) && (src_index_buf[bdx1] != WORD_END_MARK) && (src_index_buf[bdx1] != ALL_INDEX_MARK)) bdx1 ++; if ((bdx1 > oldbdx1) && (bdx1 < src_end_pt)) { /* src_index_buf[bdx1] is the word-end-mark */ src_mark = src_index_buf[bdx1]; src_index_buf[bdx1] = '\0'; /* terminate word */ strcpy(merge_index_buf, src_index_buf); /* save the word itself */ merge_len = strlen(src_index_buf); /* merge_index_buf[merge_len] is '\0', merge_index_buf[merge_len-1] is a part of the word */ bdx1 ++; /* skip word end marker */ if (StructuredIndex) bdx1 += 2; /* skip attribute field */ } even_words = 1; src_num = 0; if (OneFilePerBlock) memset((char *)src_index_set, '\0', sizeof(int)*REAL_PARTITION); else memset((char *)src_index_set, '\0', sizeof(int) * (MAX_PARTITION + 1)); while(bdx1 < src_end_pt - 1) { if (OneFilePerBlock) { x = 0; if (file_num <= MaxNum8bPartition) { x = decode8b(src_index_buf[bdx1]); bdx1 ++; } else if (file_num <= MaxNum12bPartition) { if (even_words) { x = ((src_index_buf[bdx1+1] & 0x0000000f) << 8) | src_index_buf[bdx1]; x = decode12b(x); bdx1 += 2; even_words = 0; } else { /* odd number of words so far */ x = ((src_index_buf[bdx1-1] & 0x000000f0) << 4) | src_index_buf[bdx1]; x = decode12b(x); bdx1 ++; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = (src_index_buf[bdx1] << 8) | src_index_buf[bdx1+1]; x = decode16b(x); bdx1 += 2; } else { x = (src_index_buf[bdx1] << 16) | (src_index_buf[bdx1+1] << 8) | src_index_buf[bdx1+2]; x = decode24b(x); bdx1 += 3; } src_index_set[block2index(x)] |= mask_int[x % (8*sizeof(int))]; src_num ++; } else src_index_set[src_num++] = src_index_buf[bdx1++]; } oldbdx2 = bdx2 = 0; /* dest_index_buf[dest_end_pt] is '\0', dest_index_buf[dest_end_pt-1] is '\n' */ while((bdx2 < dest_end_pt) && (dest_index_buf[bdx2] != WORD_END_MARK) && (dest_index_buf[bdx2] != ALL_INDEX_MARK)) bdx2 ++; if ((bdx2 > oldbdx2) && (bdx2 < dest_end_pt)) { /* dest_index_buf[bdx2] is the word-end-mark */ dest_mark = dest_index_buf[bdx2]; dest_index_buf[bdx2] = '\0'; /* terminate word */ if (merge_len == 0) { strcpy(merge_index_buf, dest_index_buf); /* save the word itself */ merge_len = strlen(merge_index_buf); /* merge_index_buf[merge_len] is '\0', merge_index_buf[merge_len-1] is a part of the word */ } bdx2 ++; /* skip word end marker */ if (StructuredIndex) bdx2 += 2; /* skip attribute field */ } even_words = 1; dest_num = 0; if (OneFilePerBlock) memset((char *)dest_index_set, '\0', sizeof(int)*REAL_PARTITION); else memset((char *)dest_index_set, '\0', sizeof(int) * (MAX_PARTITION + 1)); while(bdx2 < dest_end_pt - 1) { if (OneFilePerBlock) { x = 0; if (file_num <= MaxNum8bPartition) { x = decode8b(dest_index_buf[bdx2]); bdx2 ++; } else if (file_num <= MaxNum12bPartition) { if (even_words) { x = ((dest_index_buf[bdx2+1] & 0x0000000f) << 8) | dest_index_buf[bdx2]; x = decode12b(x); bdx2 += 2; even_words = 0; } else { /* odd number of words so far */ x = ((dest_index_buf[bdx2-1] & 0x000000f0) << 4) | dest_index_buf[bdx2]; x = decode12b(x); bdx2 ++; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = (dest_index_buf[bdx2] << 8) | dest_index_buf[bdx2+1]; x = decode16b(x); bdx2 += 2; } else { x = (dest_index_buf[bdx2] << 16) | (dest_index_buf[bdx2+1] << 8) | dest_index_buf[bdx2+2]; x = decode24b(x); bdx2 += 3; } dest_index_set[block2index(x)] |= mask_int[x % (8*sizeof(int))]; dest_num ++; } else dest_index_set[dest_num++] = dest_index_buf[bdx2++]; } even_words = 1; if (merge_len > 0) { if(OneFilePerBlock && ((src_mark == ALL_INDEX_MARK) || (dest_mark == ALL_INDEX_MARK) || ((file_num > MaxNum8bPartition) && (src_num + dest_num > file_num*MAX_INDEX_PERCENT / 100)) )) { merge_index_buf[merge_len++] = ALL_INDEX_MARK; if (StructuredIndex) { merge_index_buf[merge_len++] = (attr1 & 0xff00) >> 8; merge_index_buf[merge_len++] = (attr1 & 0xff); } if (file_num <= MaxNum8bPartition) merge_index_buf[merge_len ++] = encode8b(DONT_CONFUSE_SORT); else if (file_num <= MaxNum12bPartition) { merge_index_buf[merge_len ++] = (encode12b(DONT_CONFUSE_SORT) & 0x00000f00) >> 8; merge_index_buf[merge_len ++] = encode12b(DONT_CONFUSE_SORT) & 0x000000ff; } else if (file_num <= MaxNum16bPartition) { merge_index_buf[merge_len ++] = (encode16b(DONT_CONFUSE_SORT) & 0x0000ff00) >> 8; merge_index_buf[merge_len ++] = encode16b(DONT_CONFUSE_SORT) & 0x000000ff; } else { merge_index_buf[merge_len ++] = (encode24b(DONT_CONFUSE_SORT) & 0x00ff0000) >> 16; merge_index_buf[merge_len ++] = (encode24b(DONT_CONFUSE_SORT) & 0x0000ff00) >> 8; merge_index_buf[merge_len ++] = encode24b(DONT_CONFUSE_SORT) & 0x000000ff; } goto final_merge; } merge_index_buf[merge_len++] = WORD_END_MARK; if (StructuredIndex) { merge_index_buf[merge_len++] = (attr1 & 0xff00) >> 8; merge_index_buf[merge_len++] = (attr1 & 0xff); } if (OneFilePerBlock) { for (i=0; i>8; /* msb */ even_words = 0; } else { /* odd number of words so far */ y |= (x&0x00000f00)>>4; /* msb of x into msb of y */ merge_index_buf[merge_len ++] = y; merge_index_buf[merge_len ++] = x&0x000000ff; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = encode16b(x); merge_index_buf[merge_len ++] = (x&0x0000ff00)>>8; merge_index_buf[merge_len ++] = x&0x000000ff; } else { x = encode24b(x); merge_index_buf[merge_len ++] = (x&0x00ff0000)>>16; merge_index_buf[merge_len ++] = (x&0x0000ff00)>>8; merge_index_buf[merge_len ++] = x&0x000000ff; } } if (!even_words && (file_num > MaxNum8bPartition) && (file_num <= MaxNum12bPartition)) merge_index_buf[merge_len ++] = y; } else { /* normal indexing */ for (i=0; i=src_num) /* did not find match */ merge_index_buf[merge_len++] = dest_index_set[j]; /* Doesn't matter if dest_index_set is int-array (merge_index_buf being char array) since dest_index_set has only a char */ } } final_merge: merge_index_buf[merge_len++] = '\n'; merge_index_buf[merge_len] = '\0'; fputs(merge_index_buf, f3); /* fprintf(stderr, "%d+%d=%d ", src_end_pt, dest_end_pt, merge_len); */ } /* merge_len > 0 */ } else if (CountWords) { /* indices are frequencies, so just merge them: OneFilPerBlock is ignored */ strcpy(merge_index_buf, src_index_buf); bdx = strlen(merge_index_buf); /* merge_index_buf[bdx] is '\0', merge_index_buf[bdx-1] is '\n' */ if (bdx > 1) bdx--; /* now merge_index_buf[bdx] is '\n', merge_index_buf[bdx-1] is the last index */ bdx2 = 0; /* find the first index */ if (IndexNumber) while(isalnum(dest_index_buf[bdx2])) bdx2 ++; else while(isalpha(dest_index_buf[bdx2])) bdx2++; /* to skip over the word-end marker of dest_index_buf (which is a blank) */ if (bdx2 > 0) bdx2 ++; if (StructuredIndex) bdx2 += 2; /* this is a nop since CountWords and StructuredIndex don't make sense together */ if (bdx >= 1) { merge_index_buf[bdx++] = ' '; /* blank separated fscanf-able list of integers representing counts */ } /* append the indices of word1 to the buffer */ if (dest_index_buf[bdx2] > 0) { while(dest_index_buf[bdx2]>0) merge_index_buf[bdx++] = dest_index_buf[bdx2++]; /* '\n' gets copied */ merge_index_buf[bdx] = '\0'; } /* else, no need to copy anything */ fputs(merge_index_buf, f3); } else { /* indices are actual occurrences (ByteLevelIndex), so just cat them one after the other, src first since that is i2, the 1st one */ /* First put out the word */ bdx1 = 0; while ((bdx1= src_end_pt) || (src_index_buf[bdx1] == ALL_INDEX_MARK) || (src_end_pt + dest_end_pt >= MAX_SORTLINE_LEN)) { putc(ALL_INDEX_MARK, f3); if (StructuredIndex) { putc((attr1&0xff00) >> 8, f3); putc((attr1&0xff), f3); } putc(DONT_CONFUSE_SORT, f3); putc('\n', f3); } else { /* dest can be all index mark */ bdx2 = 0; while ((bdx2= dest_end_pt) || (dest_index_buf[bdx2] == ALL_INDEX_MARK)) { putc(ALL_INDEX_MARK, f3); if (StructuredIndex) { putc((attr1&0xff00) >> 8, f3); putc((attr1&0xff), f3); } putc(DONT_CONFUSE_SORT, f3); putc('\n', f3); } else { /* we have to put out both the lists */ putc(WORD_END_MARK, f3); bdx1 ++; /* skip over WORD_END_MARK */ if (StructuredIndex) { putc((attr1&0xff00) >> 8, f3); putc((attr1&0xff), f3); bdx1 += 2; } while ((bdx1 < src_end_pt) && (src_index_buf[bdx1] != '\n') && (src_index_buf[bdx1] != '\0')) putc(src_index_buf[bdx1++], f3); fputc(encode8b(0), f3); /* instead of the '\n' after end of src_index_buf */ bdx2 ++; /* skip over WORD_END_MARK */ if (StructuredIndex) bdx2 += 2; while ((bdx2 < dest_end_pt) && (dest_index_buf[bdx2] != '\n') && (dest_index_buf[bdx2] != '\0')) putc(dest_index_buf[bdx2++], f3); putc('\n', f3); } } } #if debug printf("merge_index_buf = %s", merge_index_buf); #endif /*debug*/ memset(dest_index_buf, '\0', dest_end_pt+2); if(fgets(dest_index_buf, REAL_INDEX_BUF, f2) == 0) { dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; TAIL1 = ON; break; } dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt, &attr2); } else { /* word1 < word2, so output src_index_buf */ fputs(src_index_buf, f3); } memset(src_index_buf, '\0', src_end_pt+2); } if(TAIL1) { if(cmp != 0) fputs(src_index_buf, f3); memset(src_index_buf, '\0', src_end_pt+2); while(fgets(src_index_buf, REAL_INDEX_BUF, f1)) { src_index_buf[REAL_INDEX_BUF - 1] = '\0'; src_end_pt = strlen(src_index_buf); fputs(src_index_buf, f3); memset(src_index_buf, '\0', src_end_pt+2); } } else { /* output the tail of f2 */ fputs(dest_index_buf, f3); memset(dest_index_buf, '\0', dest_end_pt+2); while(fgets(dest_index_buf, REAL_INDEX_BUF, f2)) { dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); fputs(dest_index_buf, f3); memset(dest_index_buf, '\0', dest_end_pt+2); } } return; } remove_filename(fileindex, new_partition) int fileindex, new_partition; { if ((fileindex < 0) || (fileindex >= MaxNum24bPartition)) return; #if BG_DEBUG fprintf(LOGFILE, "removing %s from index\n", LIST_GET(name_list, fileindex)); memory_usage -= (strlen(LIST_GET(name_list, fileindex)) + 2); #endif /*BG_DEBUG*/ my_free(LIST_GET(name_list, fileindex), 0); LIST_SUREGET(name_list, fileindex) = NULL; if ((disable_list != NULL) && (fileindex < old_file_num)) disable_list[block2index(fileindex)] |= mask_int[fileindex % (8*sizeof(int))]; } /* returns the set of deleted files in the struct indices format: note that by construction, the list is sorted according to fileindex */ struct indices* get_removed_indices() { int i, j; char *name; struct indices *head = NULL, **tail, *new; tail = &head; for (i=0; iindex[INDEX_SET_SIZE - 1] != INDEX_ELEM_FREE)) { new = (struct indices *)my_malloc(sizeof(struct indices)); memset(new, '\0', sizeof(struct indices)); for (j=0; jindex[j] = INDEX_ELEM_FREE; if (*tail != NULL) { (*tail)->next_i = new; tail = &(*tail)->next_i; } else head = new; } for (j=0; jindex[j] == INDEX_ELEM_FREE) break; /* j must be < INDEX_SET_SIZE */ (*tail)->index[j] = i; } } return head; } /* returns a -ve number if there is no newfileindex for this file (deleted from index), or the new index otherwise */ /* length_of_deletedlist = MaxNum24bPartition + 1 - get_new_index(deletedlist, MaxNum24bPartition + 1); */ get_new_index(deletedlist, oldfileindex) struct indices *deletedlist; int oldfileindex; { int j; int reduction = 0; struct indices *head = deletedlist; while (head!=NULL) { for (j=0; jindex[j] == INDEX_ELEM_FREE) return oldfileindex; /* crossed the limit */ else if (oldfileindex == head->index[j]) return -1; /* oldfileindex has been deleted now */ else if (oldfileindex > head->index[j]) oldfileindex --; else /* oldfileindex < head->index[j] */ return oldfileindex; /* no more have been deleted before oldfileindex */ } head = head->next_i; } return oldfileindex; /* crossed the limit */ } delete_removed_indices(deletedlist) struct indices *deletedlist; { int j; int reduction = 0; struct indices *head = deletedlist, *temp; while (head!=NULL) { temp = head; head = head->next_i; my_free(temp, sizeof(struct indices)); } return reduction; } /* The savings in index space are not worth it if you call this function w/o OneFilePerBlock: returns new_file_num (after purging index of deleted files) */ int purge_index() { int new_file_num; int i, firsti; /* firsti used to eliminate dead words from index */ char s[MAX_LINE_LEN]; char s1[MAX_LINE_LEN]; int j; unsigned char c; FILE *i_in; FILE *i_out; int offset, index; char indexnumberbuf[256]; int onefileperblock, structuredindex; int even_words, dest_even_words; int delim = encode8b(0); int x, y, new_x; int prevy, diff, oldj; if (!OneFilePerBlock) return file_num; #if 0 /* initialized in glimpse.c now */ if ((deletedlist = get_removed_indices()) == NULL) return file_num; printf("into purge_index()\n"); #endif new_file_num = file_num - ( MaxNum24bPartition + 1 - get_new_index(deletedlist, MaxNum24bPartition + 1) ); /* since encoding may change */ /* else, we have already done merge-splits and the result is in INDEX_FILE */ #if 0 printf("purging indexing from %d to %d files\n", file_num, new_file_num); #endif sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE); if((i_in = fopen(s, "r")) == NULL) { fprintf(stderr, "can't open %s for reading\n", s); exit(2); } sprintf(s, "%s/.glimpse_merge.%d", INDEX_DIR, getpid()); if ((i_out = fopen(s, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", s); exit(3); } /* modified the original in glimpse's main.c */ fgets(indexnumberbuf, 256, i_in); fputs(indexnumberbuf, i_out); fscanf(i_in, "%%%d\n", &onefileperblock); if (ByteLevelIndex) fprintf(i_out, "%%-%d\n", new_file_num); /* #of files might have changed due to -f/-a */ else fprintf(i_out, "%%%d\n", new_file_num); /* This was the stupidest thing of all! */ fscanf(i_in, "%%%d\n", &structuredindex); if (structuredindex <= 0) structuredindex = 0; fprintf(i_out, "%%%d\n", attr_num); /* attributes might have been added during last merge */ while (fgets(src_index_buf, REAL_INDEX_BUF, i_in)) { i = 0; j = 0; while ((j < REAL_INDEX_BUF) && (src_index_buf[j] != WORD_END_MARK) && (src_index_buf[j] != ALL_INDEX_MARK) && (src_index_buf[j] != '\0') && (src_index_buf[j] != '\n')) dest_index_buf[i++] = src_index_buf[j++]; if ((j >= REAL_INDEX_BUF) || (src_index_buf[j] == '\0') || (src_index_buf[j] == '\n')) continue; /* else it is WORD_END_MARK or ALL_INDEX_MARK */ c = src_index_buf[j]; dest_index_buf[i++] = src_index_buf[j++]; if (structuredindex) { /* convert all attributes to 2B to make merge_in()s easy in build_in.c */ if (structuredindex < MaxNum8bPartition - 1) { dest_index_buf[i++] = src_index_buf[j++]; } else { dest_index_buf[i++] = src_index_buf[j++]; dest_index_buf[i++] = src_index_buf[j++]; } } if (c == ALL_INDEX_MARK) { dest_index_buf[i++] = DONT_CONFUSE_SORT; dest_index_buf[i++] = '\n'; dest_index_buf[i] = '\0'; fputs(dest_index_buf, i_out); continue; } /* src_index_buf[j] points to the first byte of the block numbers, dest_index_buf[i] points to the first empty byte to fill up */ firsti = i; /* * Main while loop: Copied from glimpse/get_index.c/get_set() with minor modifications. */ even_words = 1; dest_even_words = 1; while((j= 0) { /* since encoding may change */ if (ByteLevelIndex) { if (new_file_num <= MaxNum8bPartition) { x = encode8b(new_x); dest_index_buf[i++] = (x&0x000000ff); } else if (new_file_num <= MaxNum16bPartition) { x = encode16b(new_x); dest_index_buf[i++] = ((x&0x0000ff00)>>8); dest_index_buf[i++] = (x&0x000000ff); } else { x = encode24b(new_x); dest_index_buf[i++] = ((x&0x00ff0000)>>16); dest_index_buf[i++] = ((x&0x0000ff00)>>8); dest_index_buf[i++] = (x&0x000000ff); } } /* ByteLevelIndex */ else if (OneFilePerBlock) { if (new_file_num <= MaxNum8bPartition) { dest_index_buf[i++] = (encode8b(new_x)); } else if (new_file_num <= MaxNum12bPartition) { x = encode12b(new_x); if (dest_even_words) { dest_index_buf[i++] = (x & 0x000000ff); /* lsb */ y = (x & 0x00000f00)>>8; /* msb */ dest_even_words = 0; } else { /* odd number of words so far */ y |= (x&0x00000f00)>>4; /* msb of x into msb of y */ dest_index_buf[i++] = (y); dest_index_buf[i++] = (x&0x000000ff); dest_even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = encode16b(new_x); dest_index_buf[i++] = ((x&0x0000ff00)>>8); dest_index_buf[i++] = (x&0x000000ff); } else { x = encode24b(new_x); dest_index_buf[i++] = ((x&0x00ff0000)>>16); dest_index_buf[i++] = ((x&0x0000ff00)>>8); dest_index_buf[i++] = (x&0x000000ff); } } /* OneFilePerBlock */ } /* else don't put anything */ prevy = 0; if (ByteLevelIndex) { while ((j= 0) { memcpy(&dest_index_buf[i], &src_index_buf[oldj], j - oldj); i += j - oldj; } if ((j= 0) { dest_index_buf[i++] = src_index_buf[j++]; } else j ++; break; } } /* else don't put anything */ } } if (!ByteLevelIndex && OneFilePerBlock && !dest_even_words && (new_file_num > MaxNum8bPartition) && (new_file_num <= MaxNum12bPartition)) dest_index_buf[i++] = (y); if (firsti < i) { dest_index_buf[i++] = '\n'; dest_index_buf[i] = '\0'; fputs(dest_index_buf, i_out); } else { /* word with no references, so ditch it: fill up junk for safety */ dest_index_buf[i] = '\n'; dest_index_buf[i+1] = '\n'; } } fclose(i_in); fflush(i_out); fclose(i_out); #if SFS_COMPAT sprintf(s, "%s/.glimpse_merge.%d", INDEX_DIR, getpid()); sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE); rename(s, s1); #else sprintf(s, "exec %s %s/.glimpse_merge.%d %s/%s", SYSTEM_MV, INDEX_DIR, getpid(), INDEX_DIR, INDEX_FILE); system(s); #endif return new_file_num; } /* This does not effect global variables other than disable_list: used only in incremental indexing */ initialize_disable_list(files) int files; { int _FILEMASK_SIZE = ((files + 1)/(8*sizeof(int)) + 4); int _REAL_PARTITION = (FILEMASK_SIZE + 4); if (disable_list == NULL) disable_list = (unsigned int *)my_malloc(sizeof(int)*_REAL_PARTITION); memset(disable_list, '\0', sizeof(int) * _REAL_PARTITION); /* nothing is disabled initially */ }