/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/* ./glimpse/index/getword.c */
#include "glimpse.h"

extern FILE *MESSAGEFILE;
extern int NextICurrentFileOffset, ICurrentFileOffset;

int StructuredIndex = 0;
int WORD_TOO_LONG = 0;
int IndexNumber = 0;
int CountWords = 0;
int InterpretSpecial = 0;
int indexable_char[256];
int GMAX_WORD_SIZE = MAX_WORD_SIZE;
int PrintedLongWordWarning = 0;

/* Capitalisation state used while scanning a word in CountWords mode. */
#define ALL_LOWER   0   /* default, what you start with: all are possible */
#define FIRST_UPPER 1   /* only first one seen is upper: 0 is impossible */
#define ALL_UPPER   2   /* all seen so far are upper: 2 and 3 are possible */
#define MIXED       3   /* neither of the above 3 */

#define ALPHANUM    1
#define ALPHAONLY   2
#define NUMONLY     3

/* -------------------------------------------------------------------------
getword(): extract the next word from the byte range [buffer, buffer_end).

A word is a run of characters for which INDEXABLE() is true.  The word is
copied into `word` and NUL-terminated; the return value is the position in
the buffer at which scanning stopped, i.e. where the next call should
resume.  When the range starts with non-indexable characters, the call
only skips them and leaves `word` empty.

Parameters:
  filename   - used only in the "very long word" warning message.
  word       - output buffer.  Up to GMAX_WORD_SIZE + 1 bytes are written
               (indices 0 .. GMAX_WORD_SIZE), so it must be at least that
               large.
  buffer     - start of the text to scan.
  buffer_end - one past the last byte that may be read.
  pattr      - if not NULL, set to 0 and then, when StructuredIndex is set
               and at least one character was copied, to the value of
               region_identify(ICurrentFileOffset, 0).

Globals:
  ICurrentFileOffset is set from NextICurrentFileOffset on entry and, in
  the non-CountWords path, advanced past leading delimiters.
  NextICurrentFileOffset is advanced by the number of bytes consumed (at
  least 1) on the normal exit path.
  WORD_TOO_LONG is used as a scratch flag and is always reset to 0.

Modes:
  CountWords != 0 : case is preserved and special characters are not
      interpreted.  A word with mixed capitalisation (neither all lower,
      first-upper, nor all upper) is discarded: `word` is set empty and
      the function returns early without touching NextICurrentFileOffset.
  CountWords == 0 : upper case is folded to lower case; '[' ... ']', '^',
      '-' and (with InterpretSpecial) '\' get special treatment.

A word longer than GMAX_WORD_SIZE characters is dropped (empty `word`),
the rest of it is skipped, and a warning is printed once per process.
--------------------------------------------------------------------------*/
unsigned char *
getword(filename, word, buffer, buffer_end, pattr)
    unsigned char *filename;
    unsigned char *word;
    unsigned char *buffer;
    unsigned char *buffer_end;
    int *pattr;
{
    int word_length = 0;
    unsigned char c, *wp = word;
    unsigned char *oldword = word;
    unsigned char *old_buffer = buffer;
    int previslsq = 0;      /* previous character was '[' */
    int withinsq = 0;       /* currently between '[' and ']' */

    ICurrentFileOffset = NextICurrentFileOffset;
    if (pattr != NULL) *pattr = 0;

    if (CountWords) {
        /* don't convert case, ignore special, don't bother about offsets. */
        unsigned char *temp_buffer;
        int flag = ALL_LOWER;

        /* Look ahead (at most GMAX_WORD_SIZE chars) to classify capitalisation. */
        for (temp_buffer = buffer;
             (temp_buffer - buffer < GMAX_WORD_SIZE) && (temp_buffer < buffer_end);
             temp_buffer++) {
            if (!INDEXABLE(*temp_buffer)) break;
            if (isupper(*temp_buffer)) {
                if (flag == ALL_LOWER) {
                    if (temp_buffer == buffer) flag = FIRST_UPPER;
                    else { flag = MIXED; break; }
                }
                else if (flag == FIRST_UPPER) {
                    if (temp_buffer == buffer + 1) flag = ALL_UPPER;
                    else { flag = MIXED; break; }
                }
                else continue;  /* must be ALL_UPPER -> let it remain so */
            }
            else if (islower(*temp_buffer)) {
                if (flag == ALL_LOWER) continue;
                else if (flag == FIRST_UPPER) continue;
                else if (flag == ALL_UPPER) { flag = MIXED; break; }
            }
            /* else, not alphabet: ignore */
        }

        if (flag == MIXED) {
            /* discard mixed words since they cannot be indexed */
            word[0] = '\0';
            /*
             * Skip the rest of the word plus the one character that ends
             * it, exactly as before, but never read at or beyond
             * buffer_end (the original loops had no bound).
             */
            while (temp_buffer < buffer_end) {
                c = *temp_buffer++;
                if (!(IndexNumber ? isalnum(c) : isalpha(c))) break;
            }
            return temp_buffer;
        }

        while (buffer < buffer_end) {
            if (INDEXABLE(*buffer)) {
                *word++ = *buffer++;
                word_length++;
            }
            else {
                /* end of word (or leading junk): skip all delimiters */
                while ((buffer < buffer_end) && !(INDEXABLE(*buffer))) buffer++;
                break;
            }
            if (word_length > GMAX_WORD_SIZE) {
                word = wp;
                WORD_TOO_LONG = ON;
                while ((buffer < buffer_end) && INDEXABLE(*buffer)) buffer++;   /* skip current long word */
                break;
            }
        }
    }
    else {
        /* convert case, maybe interpret special */
        while (buffer < buffer_end) {
            if (INDEXABLE(*buffer)) {
                /* ICurrentFileOffset is in the right place */
                if (*buffer == '[') {
                    previslsq = 1;
                    withinsq = 1;
                }
                else {
                    previslsq = 0;
                    if (*buffer == ']') withinsq = 0;
                }
                if ((*buffer == '-') && !withinsq) {
                    /* terminate word here */
                    /*
                     * NOTE(review): unlike the delimiter paths below, this
                     * bumps ICurrentFileOffset even when characters have
                     * already been copied (word_length > 0) -- confirm
                     * that this is intended.
                     */
                    buffer++;
                    ICurrentFileOffset++;
                    break;
                }
                if (isupper(*buffer)) *word++ = tolower(*buffer++);
                else *word++ = *buffer++;
                word_length++;
            }
            else if (INDEXABLE('[') && (*buffer == '^') && previslsq) {
                /* '^' directly after '[' is kept as part of the word */
                *word++ = *buffer++;
                word_length++;
                previslsq = 0;
            }
            else {
                previslsq = 0;
                if (InterpretSpecial && (*buffer == '\\')) {
                    /* skip two things AND terminate word HERE */
                    if (buffer < buffer_end - 1) {
                        buffer += 2;
                        if (word_length <= 0) ICurrentFileOffset += 2;
                    }
                    else if (buffer < buffer_end) {
                        buffer++;
                        if (word_length <= 0) ICurrentFileOffset++;
                    }
                }
                else {
                    /*
                     * Skip delimiters.  Only leading delimiters (nothing
                     * copied yet) move ICurrentFileOffset.
                     */
                    if (word_length <= 0)
                        while ((buffer < buffer_end) && !(INDEXABLE(*buffer))) {
                            ICurrentFileOffset++;
                            buffer++;
                        }
                    else
                        while ((buffer < buffer_end) && !(INDEXABLE(*buffer))) buffer++;
                }
                break;
            }
            if (word_length > GMAX_WORD_SIZE) {
                word = wp;
                WORD_TOO_LONG = ON;
                while ((buffer < buffer_end) && INDEXABLE(*buffer)) buffer++;   /* skip current long word */
                break;
            }
        }
    }

    if (WORD_TOO_LONG) {
        /* Temporarily terminate the truncated word so it can be printed. */
        c = wp[GMAX_WORD_SIZE];
        wp[GMAX_WORD_SIZE] = '\0';
        if (!PrintedLongWordWarning) {
            fprintf(MESSAGEFILE, "Warning: ignoring very long word '%s' (with > %d chars) in %s\n", oldword, GMAX_WORD_SIZE, filename);
            PrintedLongWordWarning = 1;
        }
        wp[GMAX_WORD_SIZE] = c;
        *wp = '\0';
    }
    *word = '\0';
    WORD_TOO_LONG = 0;

    if ((pattr != NULL) && (word_length > 0) && (StructuredIndex))
        *pattr = region_identify(ICurrentFileOffset, 0);
    NextICurrentFileOffset += (buffer <= old_buffer) ? 1 : (buffer - old_buffer);   /* beginning of next word, at least 1 */
    return(buffer);
}

/*
 * Fill the 256-entry table `indexable_char` (the parameter, which shadows
 * the global of the same name) with a non-zero value for every byte that
 * may be part of a word:
 *   - bytes that are neither ISASCII nor alphabetic are never indexable;
 *   - otherwise alphanumerics are indexable when IndexNumber is set, and
 *     only alphabetics when it is not;
 *   - '_' is always indexable.
 * Entries hold whatever non-zero value the <ctype.h> test returned, not
 * necessarily 1.  Always returns 0.
 */
int
set_indexable_char(indexable_char)
    int indexable_char[256];
{
    int i;

    /* Saves a lot of calls during run-time! */
    for (i = 0; i < 256; i++) {
        if (!ISASCII((unsigned char)i) && !isalpha((unsigned char)i)) indexable_char[i] = 0;
        else if (IndexNumber) indexable_char[i] = isalnum(i);
        else indexable_char[i] = isalpha((unsigned char)i);
    }
    indexable_char['_'] = 1;
    return 0;
}

/*
 * Mark '-', '[' and ']' in the 256-entry table `special_char`.  No other
 * entry is touched.  Always returns 0.
 */
int
set_special_char(special_char)
    int special_char[256];
{
    /*
     * Set all special characters interpreted by agrep to 1.
     * Assume set_indexable_char has been done on it.
     */
    special_char['-'] = 1;
    /* special_char[','] = 1; */
    /* special_char[';'] = 1; */
    /* special_char['.'] = 1; */
    /* special_char['#'] = 1; */
    /* special_char['|'] = 1; */
    special_char['['] = 1;
    special_char[']'] = 1;
    /* special_char['('] = 1; */
    /* special_char[')'] = 1; */
    /* special_char['>'] = 1; */
    /* special_char['<'] = 1; */
    /* special_char['^'] = 1; */
    /* special_char['$'] = 1; */
    /* special_char['+'] = 1; */
    return 0;
}