/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */ /* ./glimpse/index/pirs.h */ #ifndef _GLIMPSE_H_ #define _GLIMPSE_H_ #include #include #include #include /*#include */ #include #include #include #include #undef log #include "agrep.h" #ifndef S_ISREG /* #define S_ISREG(mode) (0100000&(mode)) */ #define S_ISREG(mode) (((mode) & (_S_IFMT)) == (_S_IFREG)) #endif #ifndef S_ISDIR /* #define S_ISDIR(mode) (0040000&(mode)) */ #define S_ISDIR(mode) (((mode) & (_S_IFMT)) == (_S_IFDIR)) #endif #define IC_PORTRELEASE 20 /* time till used TCP port is released */ #ifndef ON #define ON 1 #endif #ifndef OFF #define OFF 0 #endif #ifndef CHAR #define CHAR unsigned char #endif #define MAX_INCLUSIVE 256 /* max number of inclusive patterns for files to be indexed even if filetype.c says otherwise. */ #define MAX_EXCLUSIVE 256 /* max number of exclusive patterns for not_to_be_indexed files */ #define MAX_FILTER 256 /* max number of filter patterns */ #define DEF_I_THRESHOLD 40000 /* 100000 originally, debugging 10000 */ #define AVG_OCCURRENCES 8 /* #of places a word occurs on average: sizeof(.glimpse_partitions)/`wc -l .glimpse_index`: divisible by INDEX_SET_SIZE */ #define MAX_LIST 0177777 #define DEFAULT_PART_SIZE (1 << 13) #define MAX_64K_HASH (64*1024) #define MAX_256K_HASH (256*1024) #define MAX_4K_HASH (4*1024) #define DISKBLOCKSIZE 8192 #define BLOCK_SIZE (1024*64) #define MAX_PARTITION 255 #define MaxNumPartition 250 /* it's not 255, since there is fragmentation*/ /* The idea behind our encoding is: dividend = divisor * quotient + remainder */ #define MaxNum4bPartition (16 - 2) /* since 10 and 0 can't be in LSB/MSB */ #define MaxNum8bPartition (256 - 2) #define MaxNum12bPartition (MaxNum4bPartition*MaxNum8bPartition) #define MaxNum16bPartition (MaxNum8bPartition*MaxNum8bPartition) #define MaxNum24bPartition (MaxNum8bPartition*MaxNum16bPartition) #define MaxNum32bPartition (MaxNum8bPartition*MaxNum24bPartition) /* These help in encoding byte-level indices: 1st byte's top 2 bits tell the #of bytes - 1 in offset-difference encoding; offset-diff 0 => new file follows */ #define MaxNum1BPartition (MaxNum8bPartition & 0x3f) /* 62: top byte is 0x00 | x % MaxNum8bPartition === x; just encode x */ #define MaxNum2BPartition (MaxNum1BPartition * MaxNum8bPartition) /* top byte = 0x40 | x / MaxNum8bPartition; rest is x % ~; encode both separately */ #define MaxNum3BPartition (MaxNum1BPartition * MaxNum16bPartition) /* top byte = 0x80 | x / MaxNum16bPartition; rest is x % ~; encode both separately */ #define MaxNum4BPartition (MaxNum1BPartition * MaxNum24bPartition) /* top byte = 0xc0 | x / MaxNum24bPartition; rest is x % ~; encode both separately */ #define MIN_WORDS 50 /* before we inform about numeric words */ #define MAX_SEARCH_PERCENT 20 /* warn user if searching > this % of blocks */ #define DEF_MAX_INDEX_PERCENT 80 /* if word in > 80%, say everywhere for one-file-per-block */ #define DONT_CONFUSE_SORT 1 #define WORD_END_MARK 2 #define ALL_INDEX_MARK 3 /* If this, then word is in > 60% of blocks */ #define ATTR_END_MARK 4 /* After list of attributes before file offset/block numbers */ #define AVG_WORD_LEN 12 /* average word length is 8-9 including '\0': have safety margin */ #define MAX_NAME_SIZE 256 #define MAX_NAME_LEN MAX_NAME_SIZE #define MaxNameLength MAX_NAME_SIZE #define MAX_LINE_SIZE 1024 #define MAX_LINE_LEN 1024 #define MAX_SORTLINE_LEN (MAX_LINE_LEN * 16) /* Can be ((MaxNum16bPartition*sizeof(int)+MAX_NAME_LEN)*MAX_INDEX_PERCENT/100) in the worst case */ #define MAX_NAME_BUF MAX_NAME_SIZE #define MAX_WORD_SIZE 64 /* w/o '\0'; was 24 in 2.1 */ #define MAX_WORD_LEN MAX_WORD_SIZE #define MAX_WORD_BUF 80 /* was 32 in 2.1 */ #define MAX_PAT 256 #define MAXNUM_INDIRECT MaxNum8bPartition #define MAX_INDEX_BUF (MAX_PARTITION + 1 + 2*MAX_WORD_BUF + 2) /* index line length without OneFilePerBlock */ #define DEF_REAL_INDEX_BUF (MaxNum16bPartition + 2*MAX_WORD_BUF + 2) /* index line length with OneFilePerBlock */ /* Must write fresh code to calculate these sets based by multiplying defaults below with round(file_num, MaxNum16bPartition) */ #define DEF_FILESET_SIZE MaxNum16bPartition /* used when OneFilePerBlock is ON */ #define DEF_FILEMASK_SIZE (DEF_FILESET_SIZE/(8*sizeof(int)) + 4) /* bit mask of files */ #define DEF_REAL_PARTITION (DEF_FILEMASK_SIZE + 4) /* must be > MAX_PARTITION + 1 */ /* block must be in 0..DEF_FILESET_SIZE-1, and integers should represent bit-masks */ #define block2index(i) (i/(8*sizeof(int))) #define block2mask(i) (1<<(i%(8*sizeof(int)))) /* not used */ #define round(x, y) (((x)+(y)-1)/(y)) #define FILES_PER_PARTITION(x) (16 + round(x, MAX_PARTITION)*4) /* 16 is minimum length of buffer: thereafter, allow noise upto 4 times average */ #define LIST_GET(list, elem) ((list[(elem)/MaxNum16bPartition] == 0) ? (0) : (list[(elem)/MaxNum16bPartition][(elem)%MaxNum16bPartition])) #define LIST_SUREGET(list, elem) (list[(elem)/MaxNum16bPartition][(elem)%MaxNum16bPartition]) #define LIST_ADD(list, elem, what, type) \ {\ int index = (elem + 1)/MaxNum16bPartition;\ if (list[index] == NULL) {\ list[index] = (type *)malloc(sizeof(type)*MaxNum16bPartition);\ memset(list[index], '\0', sizeof(type)*MaxNum16bPartition);\ }\ LIST_SUREGET(list, elem) = what;\ } #define DEFAULT_REGION_LIMIT 256 /* default limit for a record: for ByteLevelIndex: pattern is ignored since can't avoid false matches w/o search */ #define MAX_REGION_LIMIT 16384 /* max amount of space I am going allocate for a record bounded by a delimiter */ #define MAX_PER_LINE (MAX_SORTLINE_LEN / 2) /* #of words that can occur on one line before we split it up: not implemented at present */ #define DEF_MAX_PER_MB 500 /* Maximum number of times a word should occur in a megabyte before we say its everywhere */ #define DEF_ALL_INDEX 10000 /* Must be < DEF_MAX_ALL_INDEX */ #define DEF_MAX_ALL_INDEX (DEF_REAL_INDEX_BUF / 2) /* THIS * 2 must be < DEF_REAL_INDEX_BUF to prevent seg-faults! */ /* Default file names */ #define FILTER_FILE ".glimpse_filters" #define ATTRIBUTE_FILE ".glimpse_attributes" #define INDEX_FILE ".glimpse_index" #define MINI_FILE ".glimpse_turbo" #define P_TABLE ".glimpse_partitions" #define NAME_LIST ".glimpse_filenames" #define NAME_LIST_INDEX ".glimpse_filenames_index" #define NAME_HASH ".glimpse_filehash" #define NAME_HASH_INDEX ".glimpse_filehash_index" #define DEF_LOG_FILE ".glimpse_log" #define DEF_MESSAGE_FILE ".glimpse_messages" #define DEF_STAT_FILE ".glimpse_statistics" #define PROHIBIT_LIST ".glimpse_exclude" #define INCLUDE_LIST ".glimpse_include" #define DEBUG_FILE ".glimpse_debug" #define I2 ".glimpse_tmpi2" #define I3 ".glimpse_tmpi3" #define I1 ".glimpse_tmpi1" #define O1 ".glimpse_tmpo1" #define O2 ".glimpse_tmpo2" #define O3 ".glimpse_tmpo3" #define DEF_LOCK_FILE ".glimpse_lock" #define HARVEST_PREFIX "glimpse" /* so that Darren can filterout error messages a user should see from the stuff outputted by glimpse on an error */ #define MASK_INT \ { 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,\ 0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000,\ 0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,\ 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000\ } #define INDEXABLE(c) (indexable_char[c]) #if SFS_COMPAT #define IGNORED_SUFFIXES {"gz", "Z", "z", "zip", "o", "hqx", "tar", "glimpse_index", "glimpse_partitions", "glimpse_filenames", "glimpse_filenames.prev", "glimpse_statistics", "glimpse_messages", "glimpse_exclude", "glimpse_include", "glimpse_filters", "glimpse_attributes"} #define NUM_SUFFIXES 17 #else #define IGNORED_SUFFIXES {"gz", "Z", "z", "zip", "o", "hqx", "tar", "glimpse_index", "glimpse_partitions"} #define NUM_SUFFIXES 9 #endif /* Version and release year: same for glimpse and glimspeindex since glimpse HAS to interpret glimpseindex */ #define GLIMPSE_VERSION "3.0" #define GLIMPSE_DATE "1995" #define GLIMPSE_EMAIL "glimpse@cs.arizona.edu" /* Some extern functions used in structured queries */ extern int attr_name_to_id(), attr_load_names(), attr_dump_names(); extern char *attr_id_to_name(); /* Data structures for hash-tables in build_in.c */ struct token { /* each token stores a unique word and unique attribute */ struct token *next_t; /* keep it a pointer even with tokenalloc to keep build_in.c same */ char *word; struct indices *ip; /* points to the head of the list of indices */ struct indices *lastip; /* tail of this list = last elemet (for increasing order insertion) */ unsigned int attribute; unsigned int totalcount;/* no. of indices structures in a token */ }; #define INDEX_SET_SIZE 4 #define INDEX_ELEM_FREE (MaxNum24bPartition + 1) /* can never be equal to a partition value */ struct indices { struct indices *next_i; /* keep it a pointer even with indexalloc to keep build_in.c same */ /*unsigned*/ int index[INDEX_SET_SIZE]; /* changed from char, 31/3/94 */ /*unsigned*/ int offset[INDEX_SET_SIZE]; /* added 19/9/94 */ }; /* Added 20/9/94 for get_index.c in glimpse (make it more efficient in space later) */ struct offsets { struct offsets *next; int offset; /* NOT unsigned!!! */ short sign; /* if 0, then indeterminate (bothways), 1 then +ve, -1 then -ve */ short done; /* if 0, then this did not have an intersection now, else it has had it */ }; #define INDICES_PER_TOKEN (AVG_OCCURRENCES/INDEX_SET_SIZE) /* average no. of struct indices per struct token: purely empirical result :-) */ /* Memory allocators: in io.c */ extern char *my_malloc(); extern int my_free(); extern char *wordalloc(); extern int wordfree(); extern int allwordfree(); extern struct indices *indicesalloc(); extern int indicesfree(); extern int allindicesfree(); extern struct token *tokenalloc(); extern int tokenfree(); extern int alltokenfree(); #define LIMIT_64K_HASH 50 /* size of total stuff to be indexed in MB after which 256K hash tables make more sense with the -B option */ #define hashword(word, wordlen) (((total_size < LIMIT_64K_HASH*1024*1024) || !BigHashTable) ? (hash64k(word, wordlen)) : (hash256k(word, wordlen))); /* * Just stores the word, wordlength and offset present in a line of the index in a structure (when made with -o or -b). * Doesn't store the attribute since we just need a hint into .glimpse_index from where agrep should begin search. */ #define WORD_SORTED 0 #if WORD_SORTED struct mini { char *word; long offset; }; /* Region searched with strcmp. #of regions = mini_array_len = (`wc -l .glimpse_index` - 3) / WORDS_PER_REGION */ #define WORDS_PER_REGION 128 #else /* WORD_SORTED */ struct mini { long offset; }; /* Range of each mini_array entry is words with same hash32k value => 32K offsets into the index need to be stored */ #define MINI_ARRAY_LEN (64*1024) #endif /* WORD_SORTED */ /* For incremental indexing only */ typedef struct _name_hashelement { struct _name_hashelement *next; char *name; int name_len; int index; } name_hashelement; /* * Limit on number of files is MaxNum24bPartition. To change it, you need * to add encode/decode code everywhere, INDEX_ELEM_FREE and MAXNUM_INDIRECT. * * Limit on number of attributes is MaxNum16bPartition. To change it, you * need to add encode/decode code everywhere. That is: merge_splits(), * save_data_structures(), traverse(), merge_in() and scanword() * in glimpseindex; get_set() in glimpse; and printx.c. * * No need to change any other data structures. */ /* Names of various system commands used in glimpseindex: use mv/rm etc rather than rename()/unlink() since former don't return unless parent-dir is sync-ed */ #define SYSTEM_SORT "sort" /* replace with different sort with longer lines. Later write a procedure for sort that doesn't need system() */ #define SYSTEM_LS "ls" #define SYSTEM_MV "mv" /* this doesn't work with SFS */ #define SYSTEM_RM "rm" /* this doesn't work with SFS */ #define SYSTEM_CAT "cat" #define SYSTEM_HEAD "head" #define SYSTEM_CP "cp" #define SYSTEM_ECHO "echo" #define SYSTEM_WC "wc" #endif /* _GLIMPSE_H_ */