/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */ /* ./glimpse/index/partition.c */ #include "glimpse.h" #include extern int DeleteFromIndex; extern int FastIndex; extern int FilenamesOnStdin; extern char INDEX_DIR[MAX_LINE_LEN]; extern char sync_path[MAX_LINE_LEN]; extern int file_num; /* the number of files */ extern int new_file_num; /* the new number of files after purging some from index */ extern char **name_list[MAXNUM_INDIRECT]; /* to store the file names */ extern int *size_list[MAXNUM_INDIRECT]; /* store size of each file */ extern int p_table[MAX_PARTITION]; /* partition table, the i-th partition begins at p_table[i] and ends at p_tables[i+1] */ extern int p_size_list[MAX_PARTITION]; /* sum of the sizes of the files in each partition */ extern int part_num; /* number of partitions, 1 initially since partition # 0 is not accessed */ extern int built_filename_hashtable; extern name_hashelement *name_hashtable[MAX_4K_HASH]; extern int total_size; /* total size of the directory */ extern int total_deleted; /* number of files being deleted */ int part_size=DEFAULT_PART_SIZE; /* partition size */ int new_partition; int files_per_partition; int files_in_partition; char patbuf[MAX_PAT]; extern unsigned char *src_index_buf; extern unsigned char *dest_index_buf; extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE; extern int memory_usage; extern struct indices *deletedlist; extern FILE *STATFILE; extern FILE *MESSAGEFILE; extern struct stat excstbuf; extern struct stat incstbuf; extern int GenerateHash; extern int KeepFilenames; extern int OneFilePerBlock; extern int ByteLevelIndex; extern int StructuredIndex; extern int attr_num; extern char INDEX_DIR[MAX_LINE_LEN]; extern int AddToIndex; extern int IndexableFile; extern int BuildTurbo; char *exin_argv[8]; int exin_argc; char current_dir_buf[2*MAX_LINE_LEN + 4]; /* must have space to store pattern after directory name */ unsigned char dummypat[MAX_PAT]; int 
dummylen; FILE *dummyout; partition(dir_num, dir_name) char **dir_name; int dir_num; { int num_pat=0; int num_inc=0; int len; int pat_len[MAX_EXCLUSIVE]; int inc_len[MAX_EXCLUSIVE]; CHAR *inc[MAX_INCLUSIVE]; /* store the patterns used to mask in files */ CHAR *pat[MAX_EXCLUSIVE]; /* store the patterns that are used to mask out those files that are not to be indexed */ int MinPartNum; /* minimum number of partitions */ int i=0, j; int subtotal=0; int pdx = 0; /* index pointer for p_table */ FILE *patfile; /* file descriptor for prohibit pattern file */ FILE *incfile; /* file descriptor for include pattern file */ char *current_dir; /* must have '\n' before directory name */ char s[MAX_LINE_LEN]; char working_dir[MAX_LINE_LEN]; struct stat sbuf; current_dir_buf[0] = '\n'; current_dir_buf[1] = '\0'; current_dir = ¤t_dir_buf[1]; /* if (IndexableFile) goto directlytofsize; */ if ((dummyout = fopen("/dev/null", "w")) == NULL) return -1; exin_argv[0] = "glimpseindex"; exin_argv[1] = "dummypat"; exin_argc = 2; if ((dummylen = memagrep_init(exin_argc, exin_argv, MAX_PAT, dummypat)) <= 0) return -1; /* exclude/include pattern search */ sprintf(s, "%s/%s", INDEX_DIR, PROHIBIT_LIST); patfile = fopen(s, "r"); if(patfile == NULL) { /* fprintf(stderr, "can't open exclude-pattern file\n"); -- no need! */ num_pat = 0; } else { while((num_pat < MAX_EXCLUSIVE) && fgets(patbuf, MAX_PAT, patfile)) { if ((len = strlen(patbuf)) < 1) continue; patbuf[len-1] = '\0'; if ((pat_len[num_pat] = convert2agrepregexp(patbuf, len-1)) == 0) continue; pat[num_pat++] = (unsigned char *) strdup(patbuf); } fclose(patfile); } #if 0 printf("num_pat %d\n", num_pat); for(i=0; i MaxNumPartition) { printed_warning = 1; if (AddToIndex) { fprintf(MESSAGEFILE, "Warning: partition-table overflow! Fresh indexing recommended.\n"); } else { fprintf(MESSAGEFILE, "Warning: partition-table overflow! 
Commencing fresh indexing...\n"); return partition(dir_num, dir_name); } } } if ((dir_num <= 1) && FilenamesOnStdin) while (fgets(current_dir, MAX_LINE_LEN, stdin) == current_dir) { current_dir[strlen(current_dir)-1] = '\0'; /* overwrite \n with \0 */ /* Get absolute path name of the directory or file being indexed */ if (-1 == stat(current_dir, &sbuf)) { fprintf(stderr, "permission denied or non-existent: %s\n", current_dir); continue; } if ((S_ISDIR(sbuf.st_mode)) && (current_dir[0] != '/')) { getcwd(working_dir, MAX_LINE_LEN - 1); if (-1 == chdir(current_dir)) { fprintf(stderr, "Cannot chdir to %s\n", current_dir); continue; } getcwd(current_dir, MAX_LINE_LEN - 1); chdir(working_dir); } if (!DeleteFromIndex) printf("Indexing \"%s\" ...\n", current_dir); fsize(current_dir, pat, pat_len, num_pat, inc, inc_len, num_inc, 1); /* the file names will be in name_list[] */ } else for(i=1; i= 0) && (new_file_num <= file_num)) file_num = new_file_num; /* only if purge_index() was called: -f/-a/-d only */ /* Dump attributes */ if (StructuredIndex && (attr_num > 0)) { int ret; sprintf(s, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE); if (-1 == (ret = attr_dump_names(s))) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } } /* Dump partition table; change index if necessary */ sprintf(s, "%s/%s", INDEX_DIR, P_TABLE); if((p_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open for writing: %s\n", s); exit(2); } if (!OneFilePerBlock) { #ifdef SW_DEBUG printf("part_num = %d, part_size = %d\n", part_num, part_size); #endif for(i=0; i<=part_num; i++) { /* Assumes sizeof(int) is 32bits, which is true even for ALPHA */ putc((p_table[i] & 0xff000000) >> 24, p_out); putc((p_table[i] & 0x00ff0000) >> 16, p_out); putc((p_table[i] & 0x0000ff00) >> 8, p_out); if (putc((p_table[i] & 0x000000ff), p_out) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(2); } if (i==part_num) break; if (p_table[i] == p_table[i+1]) { fprintf(STATFILE, "part_num = 
%d, files = none, part_size = 0\n",i); continue; } fprintf(STATFILE, "part_num = %d, files = %d .. %d, part_size = %d\n", i, p_table[i], p_table[i+1] - 1, p_size_list[i]); } if (StructuredIndex) { /* check if we can reduce default 2B attributeids to smaller ones */ sprintf(s, "%s/.glimpse_split.%d", INDEX_DIR, getpid()); if((i_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE); if((i_in = fopen(s, "r")) == NULL) { fprintf(stderr, "can't open %s for reading\n", s); exit(2); } /* modified the original in glimpse's main.c */ fgets(indexnumberbuf, 256, i_in); fputs(indexnumberbuf, i_out); fscanf(i_in, "%%%d\n", &onefileperblock); fprintf(i_out, "%%%d\n", onefileperblock); /* If #of files change, then they are added to a new partition, which is updated above */ fscanf(i_in, "%%%d\n", &structuredindex); if (structuredindex <= 0) structuredindex = 0; fprintf(i_out, "%%%d\n", attr_num); /* attributes might have been added during last merge */ while(fgets(src_index_buf, REAL_INDEX_BUF, i_in)) { j = 0; while ((j < REAL_INDEX_BUF) && (src_index_buf[j] != WORD_END_MARK) && (src_index_buf[j] != ALL_INDEX_MARK) && (src_index_buf[j] != '\0') && (src_index_buf[j] != '\n')) j++; if ((j >= REAL_INDEX_BUF) || (src_index_buf[j] == '\0') || (src_index_buf[j] == '\n')) continue; /* else it is WORD_END_MARK or ALL_INDEX_MARK */ c = src_index_buf[j+1]; src_index_buf[j+1] = '\0'; fputs(src_index_buf, i_out); src_index_buf[j+1] = c; index=decode16b((src_index_buf[j+1] << 8) | (src_index_buf[j+2])); if ((attr_num > 0) && (attr_num < MaxNum8bPartition - 1)) { putc(encode8b(index), i_out); } else if (attr_num > 0) { putc(src_index_buf[j+1], i_out); putc(src_index_buf[j+2], i_out); } j += 3; if (fputs(src_index_buf+j, i_out) == EOF) { /* Rest of the partitions information */ fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(2); } } fclose(i_in); fflush(i_out); fclose(i_out); #if 
SFS_COMPAT sprintf(s, "%s/.glimpse_split.%d", INDEX_DIR, getpid()); sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE); rename(s, s1); #else sprintf(s, "exec %s %s/.glimpse_split.%d %s/%s", SYSTEM_MV, INDEX_DIR, getpid(), INDEX_DIR, INDEX_FILE); system(s); #endif } } else { /* Don't care about individual file sizes in statistics since the user can look at it anyway by ls -l! */ sprintf(s, "%s/.glimpse_split.%d", INDEX_DIR, getpid()); if((i_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE); if((i_in = fopen(s, "r")) == NULL) { fprintf(stderr, "can't open %s for reading\n", s); exit(2); } /* modified the original in glimpse's main.c */ fgets(indexnumberbuf, 256, i_in); fputs(indexnumberbuf, i_out); fscanf(i_in, "%%%d\n", &onefileperblock); if (ByteLevelIndex) fprintf(i_out, "%%-%d\n", file_num); /* #of files might have changed due to -f/-a */ else fprintf(i_out, "%%%d\n", file_num); /* This was the stupidest thing of all! 
*/ fscanf(i_in, "%%%d\n", &structuredindex); if (structuredindex <= 0) structuredindex = 0; fprintf(i_out, "%%%d\n", attr_num); /* attributes might have been added during last merge */ part_size = 0; /* current offset in the p_table file */ while(fgets(src_index_buf, REAL_INDEX_BUF, i_in)) { j = 0; while ((j < REAL_INDEX_BUF) && (src_index_buf[j] != WORD_END_MARK) && (src_index_buf[j] != ALL_INDEX_MARK) && (src_index_buf[j] != '\n')) j++; if ((j >= REAL_INDEX_BUF) || (src_index_buf[j] == '\n')) continue; /* else it is WORD_END_MARK or ALL_INDEX_MARK */ c = src_index_buf[j+1]; src_index_buf[j+1] = '\0'; fputs(src_index_buf, i_out); src_index_buf[j+1] = c; c = src_index_buf[j]; if (StructuredIndex) { index = decode16b((src_index_buf[j+1] << 8) | (src_index_buf[j+2])); if ((attr_num > 0) && (attr_num < MaxNum8bPartition - 1)) { putc(encode8b(index), i_out); } else if (attr_num > 0) { putc(src_index_buf[j+1], i_out); putc(src_index_buf[j+2], i_out); } j += 2; } if (c == ALL_INDEX_MARK) { putc(DONT_CONFUSE_SORT, i_out); if (putc('\n', i_out) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(2); } continue; } offset = encode32b(part_size); putc((offset & 0xff000000) >> 24, i_out); /* force big-endian */ putc((offset & 0x00ff0000) >> 16, i_out); putc((offset & 0x0000ff00) >> 8, i_out); putc((offset & 0x000000ff), i_out); if (putc('\n', i_out) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(2); } j++; /* @first byte of the block numbers */ while((src_index_buf[j] != '\n') && (src_index_buf[j] != '\0')) { putc(src_index_buf[j++], p_out); part_size ++; } if (putc('\n', p_out) == EOF) { fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__); exit(2); } part_size ++; } fclose(i_in); fflush(i_out); fclose(i_out); #if SFS_COMPAT sprintf(s, "%s/.glimpse_split.%d", INDEX_DIR, getpid()); sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE); rename(s, s1); #else sprintf(s, "exec %s %s/.glimpse_split.%d 
%s/%s", SYSTEM_MV, INDEX_DIR, getpid(), INDEX_DIR, INDEX_FILE); system(s); #endif system(sync_path); /* sync() has a BUG */ sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE); if (BuildTurbo) dump_mini(s); } fflush(p_out); fclose(p_out); /* Dump file names */ if (KeepFilenames) { sprintf(s, "exec %s %s/%s %s/%s.prev", SYSTEM_CP, INDEX_DIR, NAME_LIST, INDEX_DIR, NAME_LIST); system(s); sprintf(s, "exec %s %s/%s %s/%s.prev", SYSTEM_CP, INDEX_DIR, NAME_LIST_INDEX, INDEX_DIR, NAME_LIST_INDEX); system(s); } sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST); if((f_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST_INDEX); if((i_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } fprintf(f_out, "%d\n", file_num); for(i=0,offset=ftell(f_out); i> 24, i_out); putc((offset&0xff0000) >> 16, i_out); putc((offset&0xff00) >> 8, i_out); putc((offset&0xff), i_out); fputs(LIST_GET(name_list, i), f_out); putc('\n', f_out); offset += strlen(LIST_GET(name_list, i)) + 1; } else { /* else empty line to indicate file that was removed = HOLE */ if (name_list_size == file_num) { putc((offset&0xff000000) >> 24, i_out); putc((offset&0xff0000) >> 16, i_out); putc((offset&0xff00) >> 8, i_out); putc((offset&0xff), i_out); putc('\n', f_out); offset += 1; } } /* else there are no holes since index was purged, so don't put anything */ } fflush(f_out); fclose(f_out); fflush(i_out); fclose(i_out); if (GenerateHash) { /* Dump file hash: don't want to keep filenames in hash-order like index since adding a file can shift many hash-values and change the whole index! 
*/ if (KeepFilenames) { sprintf(s, "exec %s %s/%s %s/%s.prev", SYSTEM_CP, INDEX_DIR, NAME_HASH, INDEX_DIR, NAME_HASH); system(s); sprintf(s, "exec %s %s/%s %s/%s.prev", SYSTEM_CP, INDEX_DIR, NAME_HASH_INDEX, INDEX_DIR, NAME_HASH_INDEX); system(s); } sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH); if((f_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH_INDEX); if((i_out = fopen(s, "w")) == NULL) { fprintf(stderr, "can't open %s for writing\n", s); exit(2); } if (!built_filename_hashtable) build_filename_hashtable(name_list, file_num); for (i=0,offset=ftell(f_out); i> 24, i_out); putc((offset&0xff0000) >> 16, i_out); putc((offset&0xff00) >> 8, i_out); putc((offset&0xff), i_out); e = name_hashtable[i]; while(e!=NULL) { if ((index = get_new_index(deletedlist, e->index)) < 0) { e = e->next; continue; } putc(((index)&0xff000000)>>24, f_out); putc(((index)&0xff0000)>>16, f_out); putc(((index)&0xff00)>>8, f_out); putc(((index)&0xff), f_out); offset += 4; fputs(e->name, f_out); fputc('\0', f_out); /* so that I can do direct strcmp */ offset += strlen(e->name) + 1; e = e->next; } } fflush(f_out); fclose(f_out); fflush(i_out); fclose(i_out); } #if 0 fflush(stdout); printf("AFTER SAVE_DATA_STRUCTURES:\n"); sprintf(s, "exec %s -lg .glimpse_*", SYSTEM_LS); system(s); sprintf(s, "exec %s .glimpse_index", SYSTEM_WC); system(s); getchar(); #endif /*0*/ return 0; }

/*
 * merge_splits: merges the index that was split by save_data_structures
 * back into a single index file.
 *
 * Reads INDEX_DIR/INDEX_FILE (words followed by an offset) together with
 * INDEX_DIR/P_TABLE, writes the merged result to
 * INDEX_DIR/.glimpse_merge.<pid>, and finally moves that file over
 * INDEX_DIR/INDEX_FILE.
 *
 * For every index line the word (up to and including its WORD_END_MARK or
 * ALL_INDEX_MARK) is copied through.  When the header says the index is
 * structured, the attribute id that follows is always written as 2 bytes
 * (8-bit ids are widened via decode8b/encode16b).  ALL_INDEX_MARK entries
 * get DONT_CONFUSE_SORT and a newline; all other entries have their offset
 * bytes handed to get_block_numbers(), whose output in dest_index_buf is
 * appended to the line (and cleared as it is consumed).
 *
 * Returns 0 on success.  Exits with status 3 when a file cannot be opened
 * or the index header cannot be read, and with status 2 when writing the
 * merged index fails.
 *
 * NOTE(review): the path/command strings are built with unbounded sprintf
 * into s[MAX_LINE_LEN]; a long INDEX_DIR could overflow it -- confirm the
 * limits enforced on INDEX_DIR by the caller.
 */
int
merge_splits()
{
	FILE	*i_in;
	FILE	*p_in;
	FILE	*i_out;
	char	s[MAX_LINE_LEN], s1[MAX_LINE_LEN];
	int	j, index;
	unsigned char	c;
	char	indexnumberbuf[256];
	/* default to 0 so a malformed header never yields indeterminate values */
	int	onefileperblock = 0, structuredindex = 0;

#if	0
	fflush(stdout);
	printf("BEFORE MERGE_SPLITS:\n");
	sprintf(s, "exec %s -lg .glimpse_*", SYSTEM_LS);
	system(s);
	sprintf(s, "exec %s .glimpse_index", SYSTEM_HEAD);
	system(s);
	getchar();
#endif	/*0*/

	sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
	if ((p_in = fopen(s, "r")) == NULL) {
		fprintf(stderr, "cannot open for reading: %s\n", s);
		exit(3);
	}
	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
	if ((i_in = fopen(s, "r")) == NULL) {
		fprintf(stderr, "cannot open for reading: %s\n", s);
		exit(3);
	}
	sprintf(s, "%s/.glimpse_merge.%d", INDEX_DIR, getpid());
	if ((i_out = fopen(s, "w")) == NULL) {
		fprintf(stderr, "cannot open for writing: %s\n", s);
		exit(3);
	}

	/* modified the original in glimpse's main.c */
	/* Copy the three header lines: index number, onefileperblock, structuredindex */
	if (fgets(indexnumberbuf, sizeof(indexnumberbuf), i_in) == NULL) {
		/* nothing to merge: do not emit an uninitialised buffer as the header */
		fprintf(stderr, "cannot read index header from: %s/%s\n", INDEX_DIR, INDEX_FILE);
		exit(3);
	}
	fputs(indexnumberbuf, i_out);
	if (fscanf(i_in, "%%%d\n", &onefileperblock) != 1) onefileperblock = 0;
	fprintf(i_out, "%%%d\n", onefileperblock);
	if (fscanf(i_in, "%%%d\n", &structuredindex) != 1) structuredindex = 0;
	if (structuredindex <= 0) structuredindex = 0;
	fprintf(i_out, "%%%d\n", structuredindex);

#if	!WORD_SORTED
	if (!DeleteFromIndex || FastIndex) {	/* a new index is going to be built in this case: must sort by word */
		fclose(i_in);
		sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE);
		if ((i_in = fopen(s, "r")) != NULL) {	/* minifile exists */
#if	DONTUSESORT_T_OPTION || SFS_COMPAT
			sprintf(s, "exec %s %s/%s > %s/%s.tmp", SYSTEM_SORT, INDEX_DIR, INDEX_FILE, INDEX_DIR, INDEX_FILE);
#else
			sprintf(s, "exec %s -T %s %s/%s > %s/%s.tmp", SYSTEM_SORT, INDEX_DIR, INDEX_DIR, INDEX_FILE, INDEX_DIR, INDEX_FILE);
#endif
			system(s);
#if	SFS_COMPAT
			sprintf(s, "%s/%s.tmp", INDEX_DIR, INDEX_FILE);
			sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE);
			rename(s, s1);
#else
			sprintf(s, "exec %s %s/%s.tmp %s/%s", SYSTEM_MV, INDEX_DIR, INDEX_FILE, INDEX_DIR, INDEX_FILE);
			system(s);
#endif
			system(sync_path);	/* sync() has a BUG */
			fclose(i_in);
		}
		sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
		if ((i_in = fopen(s, "r")) == NULL) {
			fprintf(stderr, "cannot open for reading: %s\n", s);
			exit(3);
		}
		/* skip the 1st 3 lines which might get jumbled up */
		fgets(s, MAX_LINE_LEN, i_in);
		fgets(s, MAX_LINE_LEN, i_in);
		fgets(s, MAX_LINE_LEN, i_in);
	}
#endif	/* !WORD_SORTED */

	while (fgets((char *)src_index_buf, REAL_INDEX_BUF, i_in)) {
		/* find the marker that terminates the word */
		j = 0;
		while ((j < REAL_INDEX_BUF) && (src_index_buf[j] != WORD_END_MARK) && (src_index_buf[j] != ALL_INDEX_MARK) && (src_index_buf[j] != '\0') && (src_index_buf[j] != '\n')) j++;
		if ((j >= REAL_INDEX_BUF) || (src_index_buf[j] == '\0') || (src_index_buf[j] == '\n')) continue;
		/* else it is WORD_END_MARK or ALL_INDEX_MARK */

		/* write the word including its marker: terminate temporarily, then restore */
		c = src_index_buf[j+1];
		src_index_buf[j+1] = '\0';
		fputs((char *)src_index_buf, i_out);
		src_index_buf[j+1] = c;
		c = src_index_buf[j];	/* remember which marker ended the word */

		if (structuredindex) {	/* convert all attributes to 2B to make merge_in()s easy in build_in.c */
			if (structuredindex < MaxNum8bPartition - 1) {
				index = encode16b(decode8b(src_index_buf[j+1]));
				putc((index & 0x0000ff00) >> 8, i_out);
				putc(index & 0x000000ff, i_out);
				j ++;
			}
			else {
				putc(src_index_buf[j+1], i_out);
				putc(src_index_buf[j+2], i_out);
				j += 2;
			}
		}

		if (c == ALL_INDEX_MARK) {
			putc(DONT_CONFUSE_SORT, i_out);
			putc('\n', i_out);
			continue;
		}

		/* src_index_buf[j+1] points to the first byte of the offset */
		/* presumably fetches the block-number list stored at that offset in p_in -- confirm in get_block_numbers */
		get_block_numbers(&src_index_buf[j+1], &dest_index_buf[0], p_in);
		j = 0;	/* first byte of the block numbers */
		while ((dest_index_buf[j] != '\n') && (dest_index_buf[j] != '\0')) {
			putc(dest_index_buf[j], i_out);
			dest_index_buf[j] = '\0';	/* clear the byte once it has been written */
			j++;
		}
		if (putc('\n', i_out) == EOF) {
			fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);
			exit(2);
		}
	}

	fclose(i_in);
	fclose(p_in);
	/* Buffered data may only fail when flushed: never replace the index with a truncated file */
	if (fflush(i_out) == EOF) {
		fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);
		exit(2);
	}
	if (fclose(i_out) == EOF) {
		fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);
		exit(2);
	}

#if	SFS_COMPAT
	sprintf(s, "%s/.glimpse_merge.%d", INDEX_DIR, getpid());
	sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE);
	rename(s, s1);
#else
	sprintf(s, "exec %s %s/.glimpse_merge.%d %s/%s", SYSTEM_MV, INDEX_DIR, getpid(), INDEX_DIR, INDEX_FILE);
	system(s);
#endif

#if	0
	fflush(stdout);
	printf("AFTER MERGE_SPLITS:\n");
	sprintf(s, "exec %s -lg .glimpse_*", SYSTEM_LS);
	system(s);
	sprintf(s, "exec %s .glimpse_index", SYSTEM_HEAD);
	system(s);
	getchar();
#endif	/*0*/

	return 0;
}