/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */ /* ./glimpse/index/io.c */ #include "glimpse.h" #include #include extern char INDEX_DIR[MAX_LINE_LEN]; extern int memory_usage; #include "utils.c" int REAL_INDEX_BUF = DEF_REAL_INDEX_BUF, MAX_ALL_INDEX = DEF_MAX_ALL_INDEX, FILEMASK_SIZE = DEF_FILEMASK_SIZE, REAL_PARTITION = DEF_REAL_PARTITION; /* -------------------------------------------------------------------- get_array_of_lines() input: an input filename, address of the table, maximum number of entries of the table, and a overflow handling flag. output: a set of strings in the table. when overflow is ON, the function returns after the table is filled. otherwise the function will exit if overflow occurs. In normal return, the function returns the number of entries read. ----------------------------------------------------------------------*/ get_array_of_lines(inputfile, table, max_entry, overflow_ok) char *inputfile; char **table[]; int max_entry; /* max number of entries in the table */ int overflow_ok; /* flag for handling overflow */ { int tx=0; /* index for table */ FILE *file_in; unsigned char buffer[MAX_NAME_BUF]; char *np; int line_length; int num_lines; if((file_in = fopen(inputfile, "r")) == NULL) { if (overflow_ok) return 0; fprintf(stderr, "can't open for reading: %s\n", inputfile); exit(3); } fgets(buffer, MAX_NAME_BUF, file_in); sscanf(buffer, "%d", &num_lines); if ((num_lines < 0) || (num_lines > MaxNum24bPartition)) { fclose(file_in); if (overflow_ok) return 0; fprintf(stderr, "Error in reading: %s\n", inputfile); exit(3); } while(fgets(buffer, MAX_NAME_BUF, file_in)) { line_length = strlen(buffer); buffer[line_length-1] = '\0'; /* discard the '\n' */ #if BG_DEBUG np = (char *) my_malloc(sizeof(char) * (line_length + 2)); #else /*BG_DEBUG*/ np = (char *) my_malloc(sizeof(char) * (line_length + 2)); #endif /*BG_DEBUG*/ if(np == NULL) { int i=0; fclose(file_in); for (i=0; i max_entry) { fclose(file_in); if(overflow_ok) { fclose(file_in); return(tx); } fprintf(stderr, "overflow in get_array_of_lines()\n"); exit(2); } } fclose(file_in); return(tx); /* return number of lines read */ } /* -------------------------------------------------------------------- get_table(): input: an input filename, address of the table, maximum number of entries of the table, and a overflow handling flag. output: a set of integers in the table. when overflow_ok is ON, the function returns after the table is filled. otherwise the function will exit if overflow occurs. In normal return, the function returns the number of entries read. ----------------------------------------------------------------------*/ int get_table(inputfile, table, max_entry, overflow_ok) char *inputfile; int table[]; int max_entry; int overflow_ok; { int val = 0; int c = 0; FILE *file_in; int tx=0; /* number of entries read */ if((file_in = fopen(inputfile, "r")) == NULL) { if (overflow_ok) return 0; fprintf(stderr, "can't open %s for reading\n", inputfile); exit(2); } while((c = getc(file_in)) != EOF) { val = c << 24; if ((c = getc(file_in)) == EOF) break; val |= c << 16; if ((c = getc(file_in)) == EOF) break; val |= c << 8; if ((c = getc(file_in)) == EOF) break; val |= c; table[tx++] = val; if(tx > max_entry) { if(!overflow_ok) { fprintf(stderr, "in get_table: table overflow\n"); exit(2); } break; } } fclose(file_in); return(tx); } get_index_type(s) char *s; { FILE *fp = fopen(s, "r"); char buf[32]; int num; if (fp == NULL) return 0; fscanf(fp, "%s\n%%%d\n", buf, &num); /* buf is "%%" or "%%1234567890", num is >= 0 */ /* printf("get_index_type(): %s %d\n", buf, num); */ fclose(fp); return num; } /* Read offset from srcbuf first so that you can use it with srcbuf=destbuf */ get_block_numbers(srcbuf, destbuf, partfp) unsigned char *srcbuf, *destbuf; FILE *partfp; { int offset, pat_size; static int printederror = 0; /* Does not do caching of blocks seen so far: done in OS hopefully */ offset = (srcbuf[0] << 24) | (srcbuf[1] << 16) | (srcbuf[2] << 8) | (srcbuf[3]); pat_size = decode32b(offset); if (-1 == fseek(partfp, pat_size, 0)) { if (!printederror) { fprintf(stderr, "Warning! Error in the format of the index!\n"); printederror = 1; } } destbuf[0] = '\n'; destbuf[1] = '\0'; destbuf[2] = '\0'; destbuf[3] = '\0'; if (fgets(destbuf, REAL_INDEX_BUF - MAX_WORD_BUF - 1, partfp) == NULL) { destbuf[0] = '\n'; destbuf[1] = '\0'; destbuf[2] = '\0'; destbuf[3] = '\0'; } } extern int num_filter; extern int filter_len[MAX_FILTER]; extern CHAR *filter[MAX_FILTER]; extern CHAR *filter_command[MAX_FILTER]; read_filters(index_dir, dofilter) char *index_dir; int dofilter; { int len; int patlen; int patpos; int commandpos; FILE *filterfile; char filterbuf[MAX_LINE_LEN]; char tempbuf[MAX_LINE_LEN]; char s[MAX_LINE_LEN]; num_filter = 0; memset(filter, '\0', sizeof(CHAR *) * MAX_FILTER); memset(filter_command, '\0', sizeof(CHAR *) * MAX_FILTER); memset(filter_len, '\0', sizeof(int) * MAX_FILTER); if (!dofilter) return; sprintf(s, "%s/%s", index_dir, FILTER_FILE); filterfile = fopen(s, "r"); if(filterfile == NULL) { /* fprintf(stderr, "can't open filter file %s\n", s); -- no need */ num_filter = 0; } else { while((num_filter < MAX_FILTER) && fgets(filterbuf, MAX_LINE_LEN, filterfile)) { if ((len = strlen(filterbuf)) < 1) continue; filterbuf[len-1] = '\0'; commandpos = 0; while ((commandpos < len) && ((filterbuf[commandpos] == ' ') || (filterbuf[commandpos] == '\t'))) commandpos ++; /* leading spaces */ if (commandpos >= len) continue; if (filterbuf[commandpos] == '\'') { commandpos ++; patpos = commandpos; patlen = 0; while (commandpos < len) { if (filterbuf[commandpos] == '\\') { commandpos += 2; patlen += 2; } else if (filterbuf[commandpos] != '\'') { commandpos ++; patlen ++; } else break; } if ((commandpos >= len) || (patlen <= 0)) continue; commandpos ++; } else { patpos = commandpos; patlen = 0; while ((commandpos < len) && (filterbuf[commandpos] != ' ') && (filterbuf[commandpos] != '\t')) { commandpos ++; patlen ++; } while ((commandpos < len) && ((filterbuf[commandpos] == ' ') || (filterbuf[commandpos] == '\t'))) commandpos ++; if (commandpos >= len) continue; } memcpy(tempbuf, &filterbuf[patpos], patlen); tempbuf[patlen] = '\0'; if ((filter_len[num_filter] = convert2agrepregexp(tempbuf, patlen)) == 0) continue; /* inplace conversion */ filter[num_filter] = (unsigned char *) strdup(tempbuf); filter_command[num_filter] = (unsigned char *)strdup(&filterbuf[commandpos]); num_filter ++; } fclose(filterfile); } } /* 1 if filter application was successful and the output (>1B) is in outname, 2 if some pattern matched but there is no output, 0 otherwise: sep 15-18 '94 */ /* memagrep is initialized in partition.c for calls from dir.c, and it is already done by the time we call this function from main.c */ apply_filter(inname, outname) char *inname, *outname; /* outname is in-out, inname is in */ { int i; char name[MAX_LINE_LEN]; int name_len = strlen(inname); char s[MAX_LINE_LEN]; FILE *dummyout; FILE *dummyin; char dummybuf[4]; char prevoutname[MAX_LINE_LEN]; char newoutname[MAX_LINE_LEN]; int ret = 0; int unlink_prevoutname = 0; if (num_filter <= 0) return 0; if ((dummyout = fopen("/dev/null", "w")) == NULL) return 0; /* ready for memgrep */ name[0] = '\n'; strcpy(name+1, inname); strcpy(prevoutname, inname); strcpy(newoutname, outname); /* Current properly filtered output is always in prevoutname */ for(i=0; i 0) { char *suffix; name[name_len + 1] = '\0'; /* if (strstr(name+1, filter[i]) != NULL) { Chris Dalton */ if ((suffix = strstr(name+1, filter[i])) != NULL) { if (ret == 0) ret = 2; /* yes, it matched: now apply the command and get the output */ /* printf("filtering %s\n", name); */ sprintf(s, "exec %s '%s' > %s", filter_command[i], prevoutname, newoutname); system(s); if (((dummyin = fopen(newoutname, "r")) == NULL) || (fread(dummybuf, 1, 1, dummyin) <= 0)) { if (dummyin != NULL) fclose(dummyin); unlink(newoutname); continue; } /* Filter was successful: output exists and has atleast 1 byte in it */ fclose(dummyin); if (unlink_prevoutname) unlink(prevoutname); strcpy(prevoutname, newoutname); sprintf(newoutname, "%s.o", prevoutname); ret = 1; unlink_prevoutname = 1; #if 1 /* if the matched text was a proper suffix of the name, */ /* remove the suffix just processed before examining the */ /* name again. Chris Dalton */ /* And I don't know what the equivalent thing is with */ /* memagrep_search: since it doesn't return a pointer to */ /* the place where the match occured. Burra Gopal */ if (strcmp(filter[i], suffix) == 0) { name_len -= strlen(suffix); *suffix= '\0'; } #endif /*1*/ if (strlen(newoutname) >= MAX_LINE_LEN - 1) break; } } else { /* must call memagrep */ name[name_len + 1] = '\n'; /* memagrep wants names to end with '\n': '\0' is not necessary */ /* printf("i=%d filterlen=%d filter=%s inlen=%d input=%s\n", i, -filter_len[i], filter[i], len_current_dir_buf, current_dir_buf); */ if (((filter_len[i] == -2) && (filter[i][0] == '.') && (filter[i][1] == '*')) || (memagrep_search(-filter_len[i], filter[i], name_len + 2, name, 0, dummyout) > 0)) { if (ret == 0) ret = 2; /* yes, it matched: now apply the command and get the output */ /* printf("filtering %s\n", name); */ sprintf(s, "exec %s '%s' > %s", filter_command[i], prevoutname, newoutname); system(s); if (((dummyin = fopen(newoutname, "r")) == NULL) || (fread(dummybuf, 1, 1, dummyin) <= 0)) { if (dummyin != NULL) fclose(dummyin); unlink(newoutname); continue; } /* Filter was successful: output exists and has atleast 1 byte in it */ fclose(dummyin); if (unlink_prevoutname) unlink(prevoutname); strcpy(prevoutname, newoutname); sprintf(newoutname, "%s.o", prevoutname); ret = 1; unlink_prevoutname = 1; if (strlen(newoutname) >= MAX_LINE_LEN - 1) break; } } } if (ret == 1) strcpy(outname, prevoutname); else { /* dummy filter that copies input to output: caller can use inname but this has easy interface */ sprintf(s, "exec %s %s > %s\n", SYSTEM_CAT, inname, outname); system(s); } fclose(dummyout); return ret; } /* Use a modified wais stoplist to do this with simple strcmp's in a for loop */ static_stop_list(word) char *word; { return 0; } /* This is the stuff that used to be present in the old build_in.c */ /* Some variables used throughout */ #if BG_DEBUG FILE *LOGFILE; /* file descriptor for LOG output */ #endif /*BG_DEBUG*/ FILE *STATFILE; /* file descriptor for statistical data about indexed files */ FILE *MESSAGEFILE; /* file descriptor for important messages meant for the user */ char INDEX_DIR[MAX_LINE_LEN]; char sync_path[MAX_LINE_LEN]; struct stat istbuf; struct stat excstbuf; struct stat incstbuf; int ICurrentFileOffset; int NextICurrentFileOffset; /* Some options used throughout */ int GenerateHash = OFF; int KeepFilenames = OFF; int OneFilePerBlock = OFF; int total_size = 0; int total_deleted = 0; int MAXWORDSPERFILE = 0; int NUMERICWORDPERCENT = 50; int AddToIndex = OFF; int DeleteFromIndex = OFF; int PurgeIndex = ON; int FastIndex = OFF; int BuildDictionary = OFF; int BuildDictionaryExisting = OFF; int CompressAfterBuild = OFF; int IncludeHigherPriority = OFF; int FilenamesOnStdin = OFF; int UseFilters = OFF; int ByteLevelIndex = OFF; /* int IndexUnderscore = OFF; */ int IndexableFile = OFF; int MAX_INDEX_PERCENT = DEF_MAX_INDEX_PERCENT; int MAX_PER_MB = DEF_MAX_PER_MB; int I_THRESHOLD = DEF_I_THRESHOLD; int BigHashTable = OFF; int IndexEverything = OFF; int HashTableSize = MAX_64K_HASH; int BuildTurbo = OFF; int AddedMaxWordsMessage = OFF; int AddedMixedWordsMessage = OFF; int icount=0; /* count the number of my_malloc for indices structure */ int hash_icount=0; /* to see how much was added to the current hash table */ int save_icount=0; /* to see how much was added to the index by the current file */ int numeric_icount=0; /* to see how many numeric words were there in the current file */ int num_filter=0; int filter_len[MAX_FILTER]; CHAR *filter[MAX_FILTER]; CHAR *filter_command[MAX_FILTER]; int mask_int[32] = MASK_INT; int p_table[MAX_PARTITION]; int memory_usage = 0; char * my_malloc(len) int len; { char *s; static int i=100; if ((s = malloc(len)) != NULL) memory_usage += len; else fprintf(stderr, "malloc failed after memory_usage = %x Bytes\n", memory_usage); /* Don't exit since might do traverse here: exit in glimpse though */ #if BG_DEBUG printf("m:%x ", memory_usage); i--; if (i==0) { printf("\n"); i = 100; } #endif /*BG_DEBUG*/ return s; } my_free(ptr, size) void *ptr; int size; { free(ptr); memory_usage -= size; #if BG_DEBUG printf("f:%x ", memory_usage); #endif /*BG_DEBUG*/ } int file_num = 0; int old_file_num = 0; /* upto what file number should disable list be accessed: < file_num if incremental indexing */ int new_file_num = -1; /* after purging index, how many files are left: for save_data_structures() */ int bp=0; /* buffer pointer */ unsigned char word[MAX_WORD_BUF]; int FirstTraverse1 = ON; struct indices *ip; /* Globals used in merge, and also in glimpse's get_index.c */ unsigned int *src_index_set = NULL; unsigned int *dest_index_set = NULL; unsigned char *src_index_buf = NULL; unsigned char *dest_index_buf = NULL; unsigned char *merge_index_buf = NULL; /* * Routines for zonal memory allocation for glimpseindex and very fast search in glimpse. */ int next_free_token = 0; struct token *free_token = NULL; /*[I_THRESHOLD/AVG_OCCURRENCES]; */ int next_free_indices = 0; struct indices *free_indices = NULL; /*[I_THRESHOLD]; */ int next_free_word = 0; char *free_word = NULL; /*[I_THRESHOLD/AVG_OCCURRENCES * AVG_WORD_LEN]; */ extern int usemalloc; /* * The beauty of this allocation scheme is that "free" does not need to be implemented! */ tokenallfree() { next_free_token = 0; } tokenfree(e, len) struct token *e; int len; { if (usemalloc) my_free(e, sizeof(struct token)); } struct token * tokenalloc(len) int len; { struct token *e; if (usemalloc) (e) = (struct token *)my_malloc(sizeof(struct token)); else { if (free_token == NULL) free_token = (struct token *)my_malloc(sizeof(struct token) * I_THRESHOLD / INDICES_PER_TOKEN); if (free_token == NULL) {fprintf(stderr, "malloc failure in tokenalloc()\n"); exit(3);} else (e) = ((next_free_token >= I_THRESHOLD / INDICES_PER_TOKEN) ? (NULL) : (&(free_token[next_free_token ++]))); } return e; } indicesallfree() { next_free_indices = 0; } indicesfree(e, len) struct indices *e; int len; { if (usemalloc) my_free(e, sizeof(struct indices)); } struct indices * indicesalloc(len) int len; { struct indices *e; if (usemalloc) (e) = (struct indices *)my_malloc(sizeof(struct indices)); else { if (free_indices == NULL) free_indices = (struct indices *)my_malloc(sizeof(struct indices) * I_THRESHOLD); if (free_indices == NULL) {fprintf(stderr, "malloc failure in indicesalloc()\n"); exit(3);} else (e) = ((next_free_indices >= I_THRESHOLD) ? (NULL) : (&(free_indices[next_free_indices ++]))); } return e; } /* For words in a token structure */ wordallfree() { next_free_word = 0; } wordfree(s, len) char *s; int len; { if (usemalloc) my_free(s, len); } char * wordalloc(len) int len; { char *s; if (usemalloc) (s) = (char *)my_malloc(len); else { if (free_word == NULL) free_word = (char *)my_malloc(AVG_WORD_LEN * I_THRESHOLD/INDICES_PER_TOKEN); if (free_word == NULL) {fprintf(stderr, "malloc failure in wordalloc()\n"); exit(3); } else (s) = ((next_free_word + len + 2 >= AVG_WORD_LEN * I_THRESHOLD/INDICES_PER_TOKEN) ? (NULL) : (&(free_word[next_free_word]))); next_free_word += (len); /* 2 for 1 char word with '\0' */ } return s; } struct mini *mini_array = NULL; int mini_array_len = 0; #if WORD_SORTED /* * Routines that operate on the index using the mini-index. * * The index is a list of words+delim+attr+offset+\n sorted * by the word (using strcmp). * * The mini-index keeps track of the offsets in the index * where every WORDS_PER_REGION-th word in the index occurs. * There is no direct way for glimpse to seek into the mini * file for the exact offset of this word since unlike hash * values words are of variable length. * * This is small enough to be kept in memory and searched * directly with full word case insensitive string compares * with binary search. For 256000 words in index there will be * 256000/128 = 2000 words in mini-index that will occupy * 2000*32 (avgword + off + delim/attr + sizeof(struct mini)), * which is less than 16 pages (can always be resident in mem). * * We just need to string search log_2(2000) + 128 words of * length 12B each in the worst case ===> VERY FAST. This is * not the best possible but space is the limit. If we hash the * whole index/regions in the index, we need TOO MUCH memory. */ /* * Binary search mini_array[beginindex..endindex); return 1 if success, 0 if failure. * Sets begin and end offsets for direct search; initially beginindex=0, endindex=mini_array_len */ int get_mini(word, len, beginoffset, endoffset, beginindex, endindex, minifp) unsigned char *word; int len; long *beginoffset, *endoffset; int beginindex, endindex; FILE *minifp; { int cmp, midindex; if ((mini_array == NULL) || (mini_array_len <= 0)) return 0; midindex = beginindex + (endindex - beginindex)/2; cmp = strcmp(word, mini_array[midindex].word); if (cmp < 0) { /* word DEFINITELY BEFORE midindex (but still at or after beginindex) */ if (beginindex >= midindex) { /* range of search is just ONE element in array */ *beginoffset = mini_array[midindex].offset; if (midindex + 1 < mini_array_len) { *endoffset = mini_array[midindex + 1].offset; } else *endoffset = -1; /* go till end of file */ return 1; } else return get_mini(word, len, beginoffset, endoffset, beginindex, midindex); } else { /* word DEFINITELY AT OR AFTER midindex (but still before endindex) */ if ((cmp == 0) || (endindex <= midindex + 1)) { /* range of search is just ONE element in array */ *beginoffset = mini_array[midindex].offset; if (midindex + 1 < mini_array_len) { *endoffset = mini_array[midindex + 1].offset; } else *endoffset = -1; /* go till end of file */ return 1; } else return get_mini(word, len, beginoffset, endoffset, midindex, endindex); } } /* Returns: #of words in mini_array if success or already read, -1 if failure */ int read_mini(indexfp, minifp) FILE *indexfp, *minifp; /* indexfp pointing right to first line of word+... */ { unsigned char s[MAX_LINE_LEN], word[MAX_NAME_LEN]; int wordnum = 0, wordlen; long offset; struct stat st; if ((mini_array != NULL) && (mini_array_len > 0)) return mini_array_len; if (minifp == NULL) return 0; if (fstat(fileno(minifp), &st) == -1) { fprintf(stderr, "Can't stat: %s\n", s); return -1; } rewind(minifp); fscanf(minifp, "%d\n", &mini_array_len); if ((mini_array_len <= 0) || (mini_array_len > (st.st_size / 4 /* \n, space, 1char offset, 1char word */))) { fprintf(stderr, "Error in format of: %s\n", s); return -1; } mini_array = (struct mini *)my_malloc(sizeof(struct mini) * mini_array_len); memset(mini_array, '\0', sizeof(struct mini) * mini_array_len); while ((wordnum < mini_array_len) && (fscanf(minifp, "%s %ld\n", word, &offset) != EOF)) { wordlen = strlen((char *)word); mini_array[wordnum].word = (char *)my_malloc(wordlen + 2); strcpy((char *)mini_array[wordnum].word, (char *)word); mini_array[wordnum].offset = offset; wordnum ++; } return mini_array_len; } dump_mini(indexfile) char *indexfile; { unsigned char s[MAX_LINE_LEN], word[MAX_NAME_LEN]; FILE *indexfp; FILE *minifp; int wordnum = 0, j, attr_num; long offset; /* offset if offset of beginning of word */ if ((indexfp = fopen(indexfile, "r")) == NULL) { fprintf(stderr, "Can't open for reading: %s\n", indexfile); return; } sprintf(s, "%s/%s.tmp", INDEX_DIR, MINI_FILE); if ((minifp = fopen(s, "w")) == NULL) { fprintf(stderr, "Can't open for writing: %s\n", s); fclose(indexfp); return; } fgets(s, 256, indexfp); /* indexnumbers */ fgets(s, 256, indexfp); /* onefileperblock */ fscanf(indexfp, "%%%d\n", &attr_num); /* structured index */ offset = ftell(indexfp); while (fgets(s, MAX_LINE_LEN, indexfp) != NULL) { if ((wordnum % WORDS_PER_REGION) == 0) { j = 0; while ((j < MAX_LINE_LEN) && (s[j] != WORD_END_MARK) && (s[j] != ALL_INDEX_MARK) && (s[j] != '\n')) j++; if ((j >= MAX_LINE_LEN) || (s[j] == '\n')) { wordnum ++; offset = ftell(indexfp); continue; } /* else it is WORD_END_MARK or ALL_INDEX_MARK */ s[j] = '\0'; strcpy((char *)word, (char *)s); if (fprintf(minifp, "%s %ld\n", word, offset) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); break; } mini_array_len ++; } wordnum ++; offset = ftell(indexfp); } fclose(indexfp); fflush(minifp); fclose(minifp); /* * Add amount of space needed for mini_array at the beginning */ sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE); if ((minifp = fopen(s, "w")) == NULL) { fprintf(stderr, "Can't open for writing: %s\n", s); goto end; } sprintf(s, "%s/%s.tmp", INDEX_DIR, MINI_FILE); if ((indexfp = fopen(s, "r")) == NULL) { fprintf(stderr, "Can't open for reading: %s\n", s); fclose(minifp); goto end; } fprintf(minifp, "%d\n", mini_array_len); while (fgets(s, MAX_LINE_LEN, indexfp) != NULL) { fputs(s, minifp); } fflush(minifp); fclose(minifp); end: sprintf(s, "%s/%s.tmp", INDEX_DIR, MINI_FILE); unlink(s); return; } #else /* WORD_SORTED */ int get_mini(word, len, beginoffset, endoffset, beginindex, endindex, minifp) unsigned char *word; int len; long *beginoffset, *endoffset; int beginindex, endindex; FILE *minifp; { int index; unsigned char array[sizeof(int)]; extern int glimpse_isserver; /* in agrep/agrep.c */ index = hash64k(word, len); if ((mini_array == NULL) || (mini_array_len <= 0) || !glimpse_isserver) { if (minifp == NULL) return 0; fseek(minifp, (long)(index*sizeof(int)), 0); if (fread((void *)array, sizeof(int), 1, minifp) != 1) return 0; *beginoffset = decode32b((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); if (fread((void *)array, sizeof(int), 1, minifp) != 1) *endoffset = -1; else *endoffset = decode32b((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); return 1; } *beginoffset = mini_array[index].offset; if (index + 1 < endindex) *endoffset = mini_array[index + 1].offset; else *endoffset = -1; return 1; } /* Returns: #of words in mini_array if success or already read, -1 if failure */ int read_mini(indexfp, minifp) FILE *indexfp, *minifp; /* indexfp pointing right to first line of word+... */ { unsigned char s[MAX_LINE_LEN], array[sizeof(int)]; int offset, hash_value; if ((mini_array != NULL) && (mini_array_len > 0)) return mini_array_len; if (minifp == NULL) return 0; rewind(minifp); mini_array_len = MINI_ARRAY_LEN; mini_array = (struct mini *)my_malloc(sizeof(struct mini) * mini_array_len); memset(mini_array, '\0', sizeof(struct mini) * mini_array_len); hash_value = 0; /* line# I am going to scan */ offset = 0; while ((hash_value < MINI_ARRAY_LEN) && (fread((void *)array, sizeof(int), 1, minifp) == 1)) { offset = (array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]; mini_array[hash_value++].offset = decode32b(offset); } for (; hash_value= linelen) || (s[j] == '\n') || (s[j] == '\0')) { continue; } /* else it is WORD_END_MARK or ALL_INDEX_MARK */ c = s[j]; s[j] = '\0'; hash_value = hash64k(s, j); s[j] = c; fprintf(newindexfp, "%d ", hash_value); if (fputs(s, newindexfp) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(3); } } fclose(indexfp); fflush(newindexfp); fclose(newindexfp); #if SFS_COMPAT unlink(indexfile); #else sprintf(s, "exec %s %s", SYSTEM_RM, indexfile); system(s); #endif #if DONTUSESORT_T_OPTION || SFS_COMPAT sprintf(s, "exec %s -n %s.tmp > %s\n", SYSTEM_SORT, indexfile, indexfile); #else sprintf(s, "exec %s -n -T %s %s.tmp > %s\n", SYSTEM_SORT, INDEX_DIR, indexfile, indexfile); #endif rc = system(s); if (rc >> 8) { fprintf (stderr, "'sort' command:\n"); fprintf (stderr, " %s\n", s); fprintf (stderr, "failed with exit status %d\n", rc>>8); exit(3); } #if SFS_COMPAT sprintf(s, "%s.tmp", indexfile); unlink(s); #else sprintf(s, "exec %s %s.tmp", SYSTEM_RM, indexfile); system(s); #endif system(sync_path); /* sync() has a BUG */ /* * Now dump the mini-file's offsets and create the stripped index file */ if ((indexfp = fopen(indexfile, "r")) == NULL) { fprintf(stderr, "Can't open for reading: %s\n", indexfile); exit(3); } sprintf(s, "%s.tmp", indexfile); if ((newindexfp = fopen(s, "w")) == NULL) { fprintf(stderr, "Can't open for writing: %s\n", s); fclose(indexfp); exit(3); } sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE); if ((minifp = fopen(s, "w")) == NULL) { fprintf(stderr, "Can't open for writing: %s\n", s); fclose(indexfp); fclose(newindexfp); exit(3); } fputs(indexnumber, newindexfp); fputs(onefileperblock, newindexfp); fprintf(newindexfp, "%%%d\n", attr_num); prev_hash_value = -1; hash_value = 0; offset = ftell(newindexfp); while (fgets(s, MAX_LINE_LEN, indexfp) != NULL) { linelen = strlen(s); t = s; while ((*t != ' ') && (t < s + linelen)) t++; if (t >= s + linelen) continue; *t = '\0'; sscanf(s, "%d", &hash_value); t ++; /* points to first character of the beginning of s */ fputs(t, newindexfp); if (hash_value != prev_hash_value) { for (j=prev_hash_value + 1; j<=hash_value; j++) { eoffset = encode32b((int)offset); putc((eoffset & 0xff000000) >> 24, minifp); putc((eoffset & 0xff0000) >> 16, minifp); putc((eoffset & 0xff00) >> 8, minifp); if (putc((eoffset & 0xff), minifp) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(3); } } prev_hash_value = hash_value; } offset = ftell(newindexfp); } for (hash_value = prev_hash_value + 1; hash_value> 24, minifp); putc((eoffset & 0xff0000) >> 16, minifp); putc((eoffset & 0xff00) >> 8, minifp); if (putc((eoffset & 0xff), minifp) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(3); } } fclose(indexfp); fflush(newindexfp); fclose(newindexfp); fflush(minifp); fclose(minifp); #if SFS_COMPAT unlink(indexfile); #else sprintf(s, "exec %s %s", SYSTEM_RM, indexfile); system(s); #endif #if SFS_COMPAT sprintf(s, "%s.tmp", indexfile); rename(s, indexfile); #else sprintf(s, "exec %s %s.tmp %s\n", SYSTEM_MV, indexfile, indexfile); system(s); #endif system(sync_path); /* sync() has a BUG */ } #endif /* WORD_SORTED */ /* Creates data structures that are related to the number of files present in * ".glimpse_filenames". These data structures are: * 1. index sets -- use my_malloc * 2. index bufs -- use my_malloc * Once this is done, this function can be called directly from glimpse/get_filenames() * and that can use all sets/bufs data structures directly. * This doesn't care how name_list() is created to be an array of arrays to be able to * add/delete dynamically from it: this uses malloc completely. * But: * disable_list (which is used only inside glimpse_index) must be malloced separately. * multi_dest_index_set (which is used only inside glimpse) must be malloced separately. */ initialize_data_structures(files) int files; { FILEMASK_SIZE = ((files + 1)/(8*sizeof(int)) + 4); REAL_PARTITION = (FILEMASK_SIZE + 4); if (REAL_PARTITION < MAX_PARTITION + 2) REAL_PARTITION = MAX_PARTITION + 2; REAL_INDEX_BUF = ((files + 1) + 2*MAX_WORD_BUF + 2); /* index line length with OneFilePerBlock (and/or ByteLevelIndex) */ if (REAL_INDEX_BUF < MAX_SORTLINE_LEN) REAL_INDEX_BUF = MAX_SORTLINE_LEN; MAX_ALL_INDEX = (REAL_INDEX_BUF / 2); if (src_index_set == NULL) src_index_set = (unsigned int *)my_malloc(sizeof(int)*REAL_PARTITION); memset(src_index_set, '\0', sizeof(int) * REAL_PARTITION); if (dest_index_set == NULL) dest_index_set = (unsigned int *)my_malloc(sizeof(int)*REAL_PARTITION); memset(dest_index_set, '\0', sizeof(int) * REAL_PARTITION); if (src_index_buf == NULL) src_index_buf = (unsigned char *)my_malloc(sizeof(char)*REAL_INDEX_BUF); memset(src_index_buf, '\0', sizeof(char)*REAL_INDEX_BUF); if (dest_index_buf == NULL) dest_index_buf = (unsigned char *)my_malloc(sizeof(char)*REAL_INDEX_BUF); memset(dest_index_buf, '\0', sizeof(char)*REAL_INDEX_BUF); if (merge_index_buf == NULL) merge_index_buf = (unsigned char *)my_malloc(sizeof(char)*REAL_INDEX_BUF); memset(merge_index_buf, '\0', sizeof(char)*REAL_INDEX_BUF); } destroy_data_structures() { if (src_index_set != NULL) free(src_index_set); src_index_set = NULL; if (dest_index_set != NULL) free(dest_index_set); dest_index_set = NULL; if (src_index_buf != NULL) free(src_index_buf); src_index_buf = NULL; if (dest_index_buf != NULL) free(dest_index_buf); dest_index_buf = NULL; if (merge_index_buf != NULL) free(merge_index_buf); merge_index_buf = NULL; } name_hashelement *name_hashtable[MAX_4K_HASH]; /* * Returns the index of the name if the it is found amongst the set * of files in name_array; -1 otherwise. */ int get_filename_index(name) char *name; { int index = hash4k(name, strlen(name)); /* int skips=0; */ name_hashelement *e; e = name_hashtable[index]; while((e != NULL) && (strncmp(name, e->name, e->name_len))) { /* skips ++; */ e = e->next; } /* fprintf(STATFILE, "skips = %d\n", skips); */ if (e == NULL) return -1; return e->index; } insert_filename(name, name_index) char *name; int name_index; { int len; int index; name_hashelement **pe; if (name == NULL) return; len = strlen(name); index = hash4k(name, len); pe = &name_hashtable[index]; while((*pe != NULL) && (strncmp((*pe)->name, name, (*pe)->name_len))) pe = &(*pe)->next; if ((*pe) != NULL) return; if ((*pe = (name_hashelement *)my_malloc(sizeof(name_hashelement))) == NULL) { fprintf(stderr, "malloc failure in insert_filename %s:%d\n", __FILE__, __LINE__); exit(2); } (*pe)->next = NULL; #if 0 if (((*pe)->name = (char *)my_malloc(len + 2)) == NULL) { fprintf(stderr, "malloc failure in insert_filename %s:%d\n", __FILE__, __LINE__); exit(2); } strcpy((*pe)->name, name); #else (*pe)->name = name; #endif (*pe)->name_len = strlen(name); (*pe)->index = name_index; } init_filename_hashtable() { int i; for (i=0; inext; #if 0 my_free(t->name); #endif my_free(t, sizeof(name_hashelement)); } *pe = NULL; } built_filename_hashtable = 0; }