/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* ./glimpse/index/filetype.c */

/* --------------------------------------------------------------------------
   filetype() detects whether a given file is of a special type which we do
   not want to index.  If so it returns 1, otherwise it returns 0.

   Notes carried over from the original authors (the binary and uuencode
   tests themselves live outside this file, so these are not verified here):
     - a file is said to be binary if more than 10% of the characters in the
       sampled input are > 128;
     - a file is uuencoded if (maybe after a mail header) there is a "begin"
       followed by 3 digits, and no lower case character.

   Statistics computed in this file:
     1) separator density (hqx): characters per separator in the sample must
        not exceed WORD_THRESHOLD;
     2) index density (heavy_index): the number of different words versus
        the total number of words in the sample.
   -------------------------------------------------------------------------- */

#include "glimpse.h"

#define SAMPLE_SIZE	8192	/* number of bytes read from the start of the file */
#define WORD_THRESHOLD	18	/* ratio between the number of sampled characters
				   and the number of separators (blank, '\n' or
				   '/') above which the file is determined to be
				   hqx or other non-natural-language text */

#if	BG_DEBUG
extern FILE *LOGFILE;
#endif	/*BG_DEBUG*/

char	*member[MAX_4K_HASH];
int	member_tag[MAX_4K_HASH];	/* per hash bucket: file_id of the file that last touched it */
int	file_id;			/* compared against member_tag[] by heavy_index();
					   assigned outside this file */

extern char *getword();
extern char INDEX_DIR[MAX_LINE_LEN];

/* Forward declarations (old-style, to match the definitions below) so that
   filetype() does not rely on implicit function declarations. */
int heavy_index();
int hqx();

/*
 * Decide whether the file `name' should be excluded from the index.
 *
 * dosuffix == 0  => suffix checks are skipped; only the content checks on
 *                   the first SAMPLE_SIZE bytes are made (Default, dir.c
 *                   where we want to discard un-indexable files).
 * dosuffix == 1  => suffix checks first, then the content checks
 *                   (build_in.c after filtering).
 * other non-zero => suffix checks only; the file is never opened
 *                   (IndexEverything, dir.c where we don't want to read files).
 *
 * Returns 1 if the file must NOT be indexed, 0 if it may be indexed.
 * A file that cannot be opened, or from which nothing can be read, is
 * reported as not indexable.
 */
int
filetype(name, dosuffix)
char	*name;
int	dosuffix;
{
	unsigned char buffer[SAMPLE_SIZE+1];
	int	num_read;
	int	fd;
	int	name_len;
	int	suffix_len;

	if (dosuffix) {
		/* Names ending in COMP_SUFFIX are accepted right away.  The
		   length guard keeps the comparison from starting before the
		   beginning of `name' when the name is shorter than the suffix. */
		name_len = (int)strlen(name);
		suffix_len = (int)strlen(COMP_SUFFIX);
		if (name_len >= suffix_len &&
		    !strcmp(COMP_SUFFIX, &name[name_len - suffix_len]))
			return 0;

		if (test_special_suffix(name)) {
#if	BG_DEBUG
			fprintf(LOGFILE, "special suffix: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
			return 1;
		}

		/* suffix-only mode: do not look at the file contents */
		if (dosuffix != 1)
			return 0;
	}

	if ((fd = open(name, 0)) < 0) {	/* flags 0: read-only on POSIX systems */
		/* This is the only thing the user might want to know: suppress other warnings */
		fprintf(stderr, "permission denied or non-existent file: %s\n", name);
		return 1;
	}

	/* Only this one sample is ever examined, so the descriptor can be
	   released immediately instead of on every exit path below. */
	num_read = read(fd, buffer, SAMPLE_SIZE);
	close(fd);

	if (num_read <= 0) {
#if	BG_DEBUG
		fprintf(LOGFILE, "no data: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		return 1;
	}

	if (test_postscript(buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "postscript file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		return 1;
	}

	if (test_binary(buffer, num_read) == ON) {
#if	BG_DEBUG
		fprintf(LOGFILE, "binary file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		return 1;
	}

	/* now check for uuencoded file */
	if (test_uuencode(buffer, num_read) == ON) {
#if	BG_DEBUG
		fprintf(LOGFILE, "uuencoded file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		return 1;
	}

	if (heavy_index(name, (char *)buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "heavy index file: %s -- not indexing\n ", name);
#endif	/*BG_DEBUG*/
		return 1;
	}

	if (hqx(name, (char *)buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "too few real words: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		return 1;
	}

	return 0;
}

/* ----------------------------------------------------------------------
   Check for a heavy index file, looking only at the sampled block
   buffer[0 .. num_read-1].

   Every non-empty word returned by getword() is counted.  A word counts
   as "new" when its hash4k() bucket in member_tag[] does not yet carry
   the current file_id; the bucket is then tagged with file_id.  Distinct
   words that share a bucket are therefore counted only once.

   The file is a heavy index file (return 1) when at least 83% of the
   words are new and the sample holds at least 500 words; otherwise the
   function returns 0.

   NOTE(review): this relies on file_id having a value that no earlier
   file left in member_tag[]; file_id is assigned outside this file.
---------------------------------------------------------------------- */
int
heavy_index(name, buffer, num_read)
char	*name;
char	*buffer;
int	num_read;
{
	char	*buffer_end;
	int	hash_value;
	int	new_word_num = 0;
	int	word_num = 0;
	char	word[256];

	buffer_end = &buffer[num_read];
	while ((buffer = getword(name, word, buffer, buffer_end, NULL)) < buffer_end) {
		if (word[0] == '\0')
			continue;
		word_num++;
		hash_value = hash4k(word, strlen(word));
		if (member_tag[hash_value] != file_id) {
			new_word_num++;
			member_tag[hash_value] = file_id;
		}
	}

	/* integer form of: new_word_num / word_num >= 0.83 */
	if (new_word_num * 100 >= word_num * 83 && word_num >= 500)
		return 1;
#ifdef	debug
	printf("%s: new_word_num=%d, word_num=%d\n", name, new_word_num, word_num);
#endif
	return 0;
}

/* ----------------------------------------------------------------------
   Check for hqx encoded files or other files with long lines, for
   example postscript files, core files, and others, looking only at the
   sampled block buffer[0 .. num_read-1].

   Samples shorter than 2048 bytes are never rejected.  Otherwise the
   file is determined to be bad (return 1) if it holds no separator at
   all, or if the number of sampled characters per separator exceeds
   WORD_THRESHOLD.  Returns 0 for acceptable files.  `name' is not used.
---------------------------------------------------------------------- */
int
hqx(name, buffer, num_read)
char	*name;
char	*buffer;
int	num_read;
{
	int	i;
	char	c;
	int	sep = 0;

	if (num_read < 2048)
		return 0;

	for (i = 0; i < num_read; i++) {
		c = buffer[i];
		/* the '/' is for list of file names. */
		/* the \n is for lists of words, but should be excluded really
		   so that dictionaries are excluded */
		if (c == '\n' || c == ' ' || c == '/')
			sep++;
	}

	if (!sep)
		return 1;	/* also avoids dividing by zero below */
	if (num_read / sep > WORD_THRESHOLD)
		return 1;
	return 0;
}