/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */ #include #include #include "agrep.h" #undef MAXSYM #define MAXSYM 256 #define MAXMEMBER 8192 #define CHARTYPE unsigned char #undef MaxError /* don't use agrep.h definition */ #define MaxError 20 #define MAXPATT 256 #undef MAXLINE #define MAXLINE 1024 #undef MAXNAME #define MAXNAME 256 #undef MaxCan /* don't use agrep.h definition */ #define MaxCan 2048 #define BLOCKSIZE 16384 #define MAX_SHIFT_2 4096 #undef ON #define ON 1 #undef OFF #define OFF 0 #define LOG_ASCII 8 #define LOG_DNA 3 #define MAXMEMBER_1 65536 #define LONG_EXAC 20 #define LONG_APPX 24 #if ISO_CHAR_SET #define W_DELIM 256 #else #define W_DELIM 128 #endif #include extern int tuncompressible(); extern int quick_tcompress(); extern int quick_tuncompress(); extern int DELIMITER, OUTTAIL; extern int D_length, tc_D_length; extern unsigned char D_pattern[MaxDelimit *2], tc_D_pattern[MaxDelimit *2]; extern int LIMITOUTPUT, LIMITPERFILE, INVERSE; extern int CurrentByteOffset; extern int BYTECOUNT; extern int PRINTOFFSET; extern int PRINTRECORD; extern int CONSTANT, COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched; extern int DNA ; /* DNA flag is set in checksg when pattern is DNA pattern and p_size > 16 */ extern WORDBOUND, WHOLELINE, NOUPPER; extern unsigned char CurrentFileName[], Progname[]; extern unsigned Mask[]; extern unsigned endposition; extern int agrep_inlen; extern CHARTYPE *agrep_inbuffer; extern int agrep_initialfd; extern FILE *agrep_finalfp; extern int agrep_outpointer; extern int agrep_outlen; extern CHARTYPE * agrep_outbuffer; extern int NEW_FILE, POST_FILTER; extern int EXITONERROR; extern int errno; extern int TCOMPRESSED; extern int EASYSEARCH; extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN]; #if MEASURE_TIMES /* timing variables */ extern int OUTFILTER_ms; extern int FILTERALGO_ms; extern int INFILTER_ms; #endif /*MEASURE_TIMES*/ unsigned char BSize; /* log_c m */ unsigned char char_map[MAXSYM]; /* data area */ int shift_1; CHARTYPE SHIFT[MAXSYM]; CHARTYPE MEMBER[MAXMEMBER]; CHARTYPE pat[MAXPATT]; unsigned Hashmask; char MEMBER_1[MAXMEMBER_1]; CHARTYPE TR[MAXSYM]; static void initmask(); static void am_preprocess(); static void m_preprocess(); static void prep(); static void prep4(); static void prep_bm(); /* * General idea behind output processing with delimiters, inverse, compression, etc. * CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,. * Attempts to search for complex patterns / with errors might lead to spurious matches. * 1. Once we find the match, go back and forward to get the delimiters that surround * the matched region. * 2. If it is a compressed file, verify that the match is "real" (compressed files * can have pseudo matches hence this filtering step is required). * 3. Increment num_of_matched. * 4. Process some output options which print stuff before the matched region is * printed. * 5. If there is compression, decomress and output the matched region. Otherwise * just output it as is. Remember, from step (1) we know the matched region. * 6. If inverse is set, then we must keep track of the end of the last matched region * in the variable lastout. When there is a match, we must print everything from * lastout to the beginning of the current matched region (curtextbegin) and then * update lastout to point to the end of the current matched region (curtextend). * ALSO: if we exit from the main loops, we must output everything from the end * of the last matched region to the end of the input buffer. * 7. Delimiter handling in complex patterns is different: there the search is done * for a boolean and of the delimiter pattern and the actual pattern. */ /* skips over escaped characters */ unsigned char * mystrchr(s, c) unsigned char *s; int c; { unsigned char *t = s; while (*t) { if (*t == '\\') t++; else if (c == *t) return t; t ++; } return NULL; } void char_tr(pat, m) unsigned char *pat; int *m; { int i; unsigned char temp[MAXPATT]; for(i=0; i1) && (pat[m-2] != '\\') && ((pat[m-1] == '^') || (pat[m-1] == '$'))) pat[m-1] = '\n';\ }\ /* whether constant or not, interpret the escape character */\ for (k=0; k= MAXPATT) {\ fprintf(stderr, "%s: pattern too long (has > %d chars)\n", Progname, MAXPATT);\ if (!EXITONERROR) {\ errno = AGREP_ERROR;\ return -1;\ }\ else exit(2);\ }\ if(D == 0) {\ if(m > LONG_EXAC) m_preprocess(pat);\ else prep_bm(pat, m);\ }\ else if (DNA) prep4(pat, m);\ else if(m >= LONG_APPX) am_preprocess(pat);\ else {\ prep(pat, m, D);\ initmask(pat, Mask, m, 0, &endposition);\ } #if AGREP_POINTER if (fd != -1) { #endif /*AGREP_POINTER*/ alloc_buf(fd, &text, 2*BLOCKSIZE+2*MAXLINE+MAXPATT); text[offset-1] = '\n'; /* initial case */ for(i=0; i < MAXLINE; i++) text[i] = 0; /* security zone */ start = offset; if(WHOLELINE) { start--; CurrentByteOffset --; } while( (num_read = fill_buf(fd, text+offset, 2*BLOCKSIZE)) > 0) { buf_end = end = offset + num_read -1 ; oldCurrentByteOffset = CurrentByteOffset; if (first_time) { if ((TCOMPRESSED == ON) && tuncompressible(text+offset, num_read)) { EASYSEARCH = text[offset+SIGNATURE_LEN-1]; start += SIGNATURE_LEN; CurrentByteOffset += SIGNATURE_LEN; if (!EASYSEARCH) { fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName); } #if MEASURE_TIMES gettimeofday(&initt, NULL); #endif /*MEASURE_TIMES*/ if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, MAXLINE-8, EASYSEARCH)) > 0)) { oldm = m; oldpat = pat; m = newm; pat = newpat; } #if MEASURE_TIMES gettimeofday(&finalt, NULL); INFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else TCOMPRESSED = OFF; PROCESS_PATTERN /* must be AFTER we know that it is a compressed pattern... */ for(i=1; i<=m; i++) text[2*BLOCKSIZE+offset+i] = pat[m-1]; /* to make sure the skip loop in bm() won't go out of bound in later iterations */ first_time = 0; } if (!DELIMITER) { while ((text[end] != '\n') && (end > offset)) end--; text[start-1] = '\n'; } else { unsigned char *newbuf = text + end + 1; newbuf = backward_delimiter(newbuf, text+offset, D_pattern, D_length, OUTTAIL); /* see agrep.c/'d' */ if (newbuf < text+offset+D_length) newbuf = text + end + 1; end = newbuf - text - 1; memcpy(text+start-D_length, D_pattern, D_length); } residue = buf_end - end + 1 ; /* SGREP_PROCESS */ /* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */ if(D==0) { if(m > LONG_EXAC) { if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } } else { if(DNA) { if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) { free_buf(fd, text); return -1; } } else { if(m >= LONG_APPX) { if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); return -1; } } } } if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s\n", CurrentFileName); else { int outindex; for(outindex=0; (outindex+agrep_outpointer=agrep_outlen)) { OUTPUT_OVERFLOW; free_buf(fd, text); return -1; } else agrep_outbuffer[agrep_outpointer+outindex++] = '\n'; agrep_outpointer += outindex; } free_buf(fd, text); NEW_FILE = OFF; return 0; } CurrentByteOffset = oldCurrentByteOffset + end - start + 1; /* for a new iteration: avoid complicated calculations below */ start = offset - residue ; if(start < MAXLINE) { start = MAXLINE; } strncpy(text+start, text+end, residue); start++; if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) { free_buf(fd, text); return 0; /* done */ } } /* end of while(num_read = ...) */ if (!DELIMITER) { text[start-1] = '\n'; text[start+residue] = '\n'; } else { if (start > D_length) memcpy(text+start-D_length, D_pattern, D_length); memcpy(text+start+residue, D_pattern, D_length); } end = start + residue - 2; if(residue > 1) { /* SGREP_PROCESS */ /* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */ if(D==0) { if(m > LONG_EXAC) { if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } } else { if(DNA) { if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) { free_buf(fd, text); return -1; } } else { if(m >= LONG_APPX) { if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); return -1; } } } } if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s\n", CurrentFileName); else { int outindex; for(outindex=0; (outindex+agrep_outpointer=agrep_outlen)) { OUTPUT_OVERFLOW; free_buf(fd, text); return -1; } else agrep_outbuffer[agrep_outpointer+outindex++] = '\n'; agrep_outpointer += outindex; } free_buf(fd, text); NEW_FILE = OFF; return 0; } } free_buf(fd, text); return 0; #if AGREP_POINTER } else { /* as if only one iteration of the while-loop and offset = 0 */ tempbuf = (CHARTYPE*)malloc(m); text = (CHARTYPE *)agrep_inbuffer; num_read = agrep_inlen; start = 0; buf_end = end = num_read - 1; #if 0 if (WHOLELINE) { start --; CurrentByteOffset --; } #endif if ((TCOMPRESSED == ON) && tuncompressible(text+1, num_read)) { EASYSEARCH = text[offset+SIGNATURE_LEN-1]; start += SIGNATURE_LEN; CurrentByteOffset += SIGNATURE_LEN; if (!EASYSEARCH) { fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName); } #if MEASURE_TIMES gettimeofday(&initt, NULL); #endif /*MEASURE_TIMES*/ if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, MAXLINE-8, EASYSEARCH)) > 0)) { oldm = m; oldpat = pat; m = newm; pat = newpat; } #if MEASURE_TIMES gettimeofday(&finalt, NULL); INFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else TCOMPRESSED = OFF; PROCESS_PATTERN /* must be after we know whether it is compressed or not */ memcpy(tempbuf, text+end+1, m); /* save portion being overwritten */ for(i=1; i<=m; i++) text[end+i] = pat[m-1]; /* to make sure the skip loop in bm() won't go out of bound in later iterations */ if (!DELIMITER) while(text[end] != '\n' && end > 1) end--; else { unsigned char *newbuf = text + end + 1; newbuf = backward_delimiter(newbuf, text, D_pattern, D_length, OUTTAIL); /* see agrep.c/'d' */ if (newbuf < text+offset+D_length) newbuf = text + end + 1; end = newbuf - text - 1; } /* text[0] = text[end] = r_newline; : the user must ensure that the delimiter is there at text[0] and occurs somewhere before text[end ] */ /* An exact copy of the above SGREP_PROCESS */ /* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */ if(D==0) { if(m > LONG_EXAC) { if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return -1; } } else { if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return -1; } } } else { if(DNA) { if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) { free_buf(fd, text); memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return -1; } } else { if(m >= LONG_APPX) { if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return -1; } } else { if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return -1; } } } } if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) { /* externally set */ if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s\n", CurrentFileName); else { int outindex; for(outindex=0; (outindex+agrep_outpointer=agrep_outlen)) { OUTPUT_OVERFLOW; free_buf(fd, text); memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return -1; } else agrep_outbuffer[agrep_outpointer+outindex++] = '\n'; agrep_outpointer += outindex; } free_buf(fd, text); NEW_FILE = OFF; } memcpy(text+end+1, tempbuf, m); /* restore */ free(tempbuf); return 0; } #endif /*AGREP_POINTER*/ } /* end sgrep */ /* SUN: bm assumes that the content of text[n]...text[n+m-1] is pat[m-1] such that the skip loop is guaranteed to terminated */ int bm(pat, m, text, textend, oldpat, oldm) CHARTYPE *text, *textend, *pat, *oldpat; int m, oldm; { int PRINTED = 0; register int shift; register int m1, j, d1; CHARTYPE *textbegin = text; int newlen; CHARTYPE *textstart; CHARTYPE *curtextbegin; CHARTYPE *curtextend; #if MEASURE_TIMES struct timeval initt, finalt; #endif CHARTYPE *lastout = text; d1 = shift_1; /* at least 1 */ m1 = m - 1; shift = 0; while (text <= textend) { textstart = text; shift = SHIFT[*(text += shift)]; while(shift) { shift = SHIFT[*(text += shift)]; shift = SHIFT[*(text += shift)]; shift = SHIFT[*(text += shift)]; } CurrentByteOffset += text - textstart; j = 0; while(TR[pat[m1 - j]] == TR[*(text - j)]) { if(++j == m) break; /* if statement can be saved, but for safty ... */ } if (j == m ) { if(text > textend) return 0; if(WORDBOUND) { if(isalnum(*(text+1))) goto CONT; /* as if there was no match */ if(isalnum(*(text-m))) goto CONT; /* as if there was no match */ /* changed by Udi 11/7/94 to avoid having to set TR[] to W_delim */ } if (TCOMPRESSED == ON) { /* Don't update CurrentByteOffset here: only before outputting properly */ if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text+1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL); curtextend = forward_delimiter(text+1, textend, tc_D_pattern, tc_D_length, OUTTAIL); } } else { /* Don't update CurrentByteOffset here: only before outputting properly */ if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text+1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL); curtextend = forward_delimiter(text+1, textend, D_pattern, D_length, OUTTAIL); } } if (TCOMPRESSED == ON) { #if MEASURE_TIMES gettimeofday(&initt, NULL); #endif /*MEASURE_TIMES*/ if (-1 == exists_tcompressed_word(pat, m, curtextbegin, text - curtextbegin + m, EASYSEARCH)) goto CONT; /* as if there was no match */ #if MEASURE_TIMES gettimeofday(&finalt, NULL); FILTERALGO_ms += (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } textbegin = curtextend; /* (curtextend - 1 > textbegin ? curtextend - 1 : curtextend); */ num_of_matched++; if(FILENAMEONLY) return 0; if(!COUNT) { if (!INVERSE) { if(FNAME && (NEW_FILE || !POST_FILTER)) { char nextchar = (POST_FILTER == ON)?'\n':' '; char *prevstring = (POST_FILTER == ON)?"\n":""; if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar); else { int outindex; if (prevstring[0] != '\0') { if(agrep_outpointer + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } else agrep_outbuffer[agrep_outpointer ++] = prevstring[0]; } for(outindex=0; (outindex+agrep_outpointer=agrep_outlen)) { OUTPUT_OVERFLOW; return -1; } else { agrep_outbuffer[agrep_outpointer+outindex++] = ':'; agrep_outbuffer[agrep_outpointer+outindex++] = nextchar; } agrep_outpointer += outindex; } NEW_FILE = OFF; PRINTED = 1; } if(BYTECOUNT) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%d= ", CurrentByteOffset); else { char s[32]; int outindex; sprintf(s, "%d=", CurrentByteOffset); for(outindex=0; (outindex+agrep_outpointer 0) { if (agrep_outpointer + newlen + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } #if MEASURE_TIMES gettimeofday(&finalt, NULL); OUTFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else { if (agrep_finalfp != NULL) { fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp); } else { if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, curtextbegin, curtextend-curtextbegin); agrep_outpointer += curtextend - curtextbegin; } } } else if (PRINTED) { if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp); else agrep_outbuffer[agrep_outpointer ++] = '\n'; PRINTED = 0; } } else { /* INVERSE */ if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp); else { if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout); agrep_outpointer += (curtextbegin - lastout); } lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } /* TCOMPRESSED */ } /* INVERSE */ } else { /* COUNT */ CurrentByteOffset += textbegin - text; text = textbegin; } if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */ CONT: shift = 1; } else shift = d1; } if (INVERSE && !COUNT && (lastout <= textend)) { if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp); else { if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1); agrep_outpointer += (textend - lastout + 1); } } /* TCOMPRESSED */ } return 0; } /* initmask() initializes the mask table for the pattern */ /* endposition is a mask for the endposition of the pattern */ /* endposition will contain k mask bits if the pattern contains k fragments */ static void initmask(pattern, Mask, m, D, endposition) CHARTYPE *pattern; unsigned *Mask; register int m, D; unsigned *endposition; { register unsigned Bit1, c; register int i, j, frag_num; /* Bit1 = 1 << 31;*/ /* the first bit of Bit1 is 1, others 0. */ Bit1 = (unsigned)0x80000000; frag_num = D+1; *endposition = 0; for (i = 0; i < frag_num; i++) *endposition = *endposition | (Bit1 >> i); *endposition = *endposition >> (m - frag_num); for(i = 0; i < m; i++) if (pattern[i] == '^' || pattern[i] == '$') { pattern[i] = '\n'; } for(i = 0; i < MAXSYM; i++) Mask[i] = ~0; for(i = 0; i < m; i++) /* initialize the mask table */ { c = pattern[i]; for ( j = 0; j < m; j++) if( c == pattern[j] ) Mask[c] = Mask[c] & ~( Bit1 >> j ) ; } } static void prep(Pattern, M, D) /* preprocessing for partitioning_bm */ CHARTYPE *Pattern; /* can be fine-tuned to choose a better partition */ register int M, D; { register int i, j, k, p, shift; register unsigned m; unsigned hash, b_size = 3; m = M/(D+1); p = M - m*(D+1); for (i = 0; i < MAXSYM; i++) SHIFT[i] = m; for (i = M-1; i>=p ; i--) { shift = (M-1-i)%m; hash = Pattern[i]; if((int)(SHIFT[hash]) > (int)(shift)) SHIFT[hash] = shift; } #ifdef DEBUG for(i=0; i Candidate[cdx][1]) { Candidate[++cdx][0] = i-M-D-2; Candidate[cdx][1] = i+M+D; } else Candidate[cdx][1] = i+M+D; shift = d1; } else shift = d1; } CurrentByteOffset += (textbegin - text); text = textbegin; n = textend - textbegin; r_newline = '\n'; /* for those candidate areas, find the D-error matches */ if(Candidate[1][0] < 0) Candidate[1][0] = 0; endpos = endposition; /* the mask table and the endposition */ /* Bit1 = (1 << 31); */ Bit1 = (unsigned)0x80000000; oldbyteoffset = CurrentByteOffset; for(round = 0; round <= cdx; round++) { i = Candidate[round][0] ; if(Candidate[round][1] > n) Candidate[round][1] = n; if(i < 0) i = 0; CurrentByteOffset = oldbyteoffset+i; R1[0] = R2[0] = ~0; R1[1] = R2[1] = ~Bit1; for(k = 1; k <= D; k++) R1[k] = R2[k] = (R1[k-1] >> 1) & R1[k-1]; while (i < Candidate[round][1]) { c = text[i++]; CurrentByteOffset ++; if(c == r_newline) { for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 ); } r1 = Mask[c]; R1[0] = (R2[0] >> 1) | r1; for(k=1; k<=D; k++) R1[k] = ((R2[k] >> 1) | r1) & R2[k-1] & ((R1[k-1] & R2[k-1]) >> 1); if((R1[D] & endpos) == 0) { num_of_matched++; if(FILENAMEONLY) return 0; currentpos = i; if(i <= lastend) { CurrentByteOffset += lastend - i; i = lastend; } else { int oldcurrentpos = currentpos; if (-1 == s_output(text, ¤tpos, textbegin, textend, &lastout, pat, M, oldpat, oldM)) return -1; CurrentByteOffset += currentpos - oldcurrentpos; i = currentpos; } lastend = i; for(k=0; k<=D; k++) R1[k] = R2[k] = ~0; if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */ } /* copying the code to save a few instructions. you need to understand the shift-or algorithm to figure this one... */ c = text[i++]; CurrentByteOffset ++; if(c == r_newline) { for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 ); } r1 = Mask[c]; R2[0] = (R1[0] >> 1) | r1; for(k = 1; k <= D; k++) R2[k] = ((R1[k] >> 1) | r1) & R1[k-1] & ((R1[k-1] & R2[k-1]) >> 1); if((R2[D] & endpos) == 0) { currentpos = i; num_of_matched++; if(FILENAMEONLY) return 0; if(i <= lastend) { CurrentByteOffset += lastend - i; i = lastend; } else { int oldcurrentpos = currentpos; if (-1 == s_output(text, ¤tpos, textbegin, textend, &lastout, pat, M, oldpat, oldM)) return -1; CurrentByteOffset += currentpos - oldcurrentpos; i = currentpos; } lastend = i; for(k=0; k<=D; k++) R1[k] = R2[k] = ~0; if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */ } } } if (INVERSE && !COUNT && (lastout <= textend)) { if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp); else { if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1); agrep_outpointer += (textend - lastout + 1); } } /* TCOMPRESSED */ } return 0; } /* Don't update CurrentByteOffset here: done by caller */ int s_output(text, i, textbegin, textend, lastout, pat, m, oldpat, oldm) int *i; /* in, out */ int m, oldm; CHARTYPE *text, *textbegin, *textend, *pat, *oldpat; CHARTYPE **lastout; /* in, out */ { int PRINTED = 0; int newlen; int oldi; CHARTYPE *curtextbegin; CHARTYPE *curtextend; #if MEASURE_TIMES struct timeval initt, finalt; #endif if(SILENT) return 0; if (TCOMPRESSED == ON) { if (!DELIMITER) { curtextbegin = text + *i; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text + *i /* + 1 agrep() has i++ */; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text + *i, text, tc_D_pattern, tc_D_length, OUTTAIL); curtextend = forward_delimiter(text + *i /* + 1 agrep() has i++ */, textend, tc_D_pattern, tc_D_length, OUTTAIL); } } else { if (!DELIMITER) { curtextbegin = text + *i; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text + *i /* + 1 agrep() has i++ */; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text + *i, text, D_pattern, D_length, OUTTAIL); curtextend = forward_delimiter(text + *i /* + 1 agrep() has i++ */, textend, D_pattern, D_length, OUTTAIL); } } if (TCOMPRESSED == ON) { #if MEASURE_TIMES gettimeofday(&initt, NULL); #endif /*MEASURE_TIMES*/ if (-1 == exists_tcompressed_word(pat, m, curtextbegin, text + *i - curtextbegin + m, EASYSEARCH)) { num_of_matched --; return 0; } #if MEASURE_TIMES gettimeofday(&finalt, NULL); FILTERALGO_ms += (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } textbegin = curtextend; /*(curtextend - 1 > textbegin ? curtextend - 1 : curtextend); */ oldi = *i; *i += textbegin - (text + *i); if(COUNT) return 0; if (INVERSE) { if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, *lastout, curtextbegin - *lastout, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, *lastout, curtextbegin - *lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } *lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(*lastout, 1, curtextbegin-*lastout, agrep_finalfp); else { if (curtextbegin - *lastout + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, *lastout, curtextbegin-*lastout); agrep_outpointer += (curtextbegin - *lastout); } *lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } /* TCOMPRESSED */ return 0; } if(FNAME && (NEW_FILE || !POST_FILTER)) { char nextchar = (POST_FILTER == ON)?'\n':' '; char *prevstring = (POST_FILTER == ON)?"\n":""; if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar); else { int outindex; if (prevstring[0] != '\0') { if(agrep_outpointer + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } else agrep_outbuffer[agrep_outpointer ++] = prevstring[0]; } for(outindex=0; (outindex+agrep_outpointer= agrep_outlen)) { OUTPUT_OVERFLOW; return -1; } agrep_outbuffer[agrep_outpointer + outindex++] = ':'; agrep_outbuffer[agrep_outpointer + outindex++] = nextchar; agrep_outpointer += outindex; } NEW_FILE = OFF; PRINTED = 1; } if(BYTECOUNT) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%d= ", CurrentByteOffset); else { char s[32]; int outindex; sprintf(s, "%d= ", CurrentByteOffset); for(outindex=0; (outindex+agrep_outpointer 0) { if (agrep_outpointer + newlen + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } #if MEASURE_TIMES gettimeofday(&finalt, NULL); OUTFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else { if (agrep_finalfp != NULL) { fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp); } else { if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer + agrep_outpointer, curtextbegin, curtextend - curtextbegin); agrep_outpointer += curtextend - curtextbegin; } } } else if (PRINTED) { if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp); else agrep_outbuffer[agrep_outpointer ++] = '\n'; PRINTED = 0; } return 0; } static void prep_bm(Pattern, m) unsigned char *Pattern; register m; { int i; unsigned hash; unsigned char lastc; for (i = 0; i < MAXSYM; i++) SHIFT[i] = m; for (i = m-1; i>=0; i--) { hash = TR[Pattern[i]]; if((int)(SHIFT[hash]) >= (int)(m - 1)) SHIFT[hash] = m-1-i; } shift_1 = m-1; /* shift_1 records the previous occurrence of the last character of the pattern. When we match this last character but do not have a match, we can shift until we reach the next occurrence from the right. */ lastc = TR[Pattern[m-1]]; for (i= m-2; i>=0; i--) { if(TR[Pattern[i]] == lastc ) { shift_1 = m-1 - i; i = -1; } } if(shift_1 == 0) shift_1 = 1; /* can never happen - Udi 11/7/94 */ if(NOUPPER) for(i=0; i textend) return 0; /* Udi: used to be >= for some reason */ /* added by Udi 11/7/94 */ if(WORDBOUND) { if(isalnum(*(text+1))) goto CONT; /* as if there was no match */ if(isalnum(*(text-m))) goto CONT; /* as if there was no match */ /* changed by Udi 11/7/94 to avoid having to set TR[] to W_delim */ } if (TCOMPRESSED == ON) { /* Don't update CurrentByteOffset here: only before outputting properly */ if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text+1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL); curtextend = forward_delimiter(text + 1, textend, tc_D_pattern, tc_D_length, OUTTAIL); } } else { /* Don't update CurrentByteOffset here: only before outputting properly */ if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text+1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL); curtextend = forward_delimiter(text + 1, textend, D_pattern, D_length, OUTTAIL); } } if (TCOMPRESSED == ON) { #if MEASURE_TIMES gettimeofday(&initt, NULL); #endif /*MEASURE_TIMES*/ if (-1 == exists_tcompressed_word(pat, m, curtextbegin, text - curtextbegin + m, EASYSEARCH)) goto CONT; /* as if there was no match */ #if MEASURE_TIMES gettimeofday(&finalt, NULL); FILTERALGO_ms += (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } textbegin = curtextend; /*(curtextend - 1 > textbegin ? curtextend - 1 : curtextend); */ num_of_matched++; if(FILENAMEONLY) return 0; if (!COUNT) { if (!INVERSE) { if(FNAME && (NEW_FILE || !POST_FILTER)) { char nextchar = (POST_FILTER == ON)?'\n':' '; char *prevstring = (POST_FILTER == ON)?"\n":""; if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar); else { int outindex; if (prevstring[0] != '\0') { if(agrep_outpointer + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } else agrep_outbuffer[agrep_outpointer ++] = prevstring[0]; } for(outindex=0; (outindex+agrep_outpointer= agrep_outlen)) { OUTPUT_OVERFLOW; return -1; } agrep_outbuffer[agrep_outpointer + outindex++] = ':'; agrep_outbuffer[agrep_outpointer + outindex++] = nextchar; agrep_outpointer += outindex; } NEW_FILE = OFF; PRINTED = 1; } if(BYTECOUNT) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%d= ", CurrentByteOffset); else { char s[32]; int outindex; sprintf(s, "%d= ", CurrentByteOffset); for(outindex=0; (outindex+agrep_outpointer 0) { if (agrep_outpointer + newlen + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } #if MEASURE_TIMES gettimeofday(&finalt, NULL); OUTFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else { if (agrep_finalfp != NULL) { fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp); } else { if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, curtextbegin, curtextend-curtextbegin); agrep_outpointer += curtextend - curtextbegin; } } } else if (PRINTED) { if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp); else agrep_outbuffer[agrep_outpointer ++] = '\n'; PRINTED = 0; } } else { /* INVERSE */ if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp); else { if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout); agrep_outpointer += (curtextbegin - lastout); } lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } /* TCOMPRESSED */ } /* INVERSE */ } else { /* COUNT */ CurrentByteOffset += textbegin - text; text = textbegin; } /* Counteract the ++ below */ text --; CurrentByteOffset --; if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */ } CONT: text++; CurrentByteOffset ++; } if (INVERSE && !COUNT && (lastout <= textend)) { if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp); else { if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1); agrep_outpointer += (textend - lastout + 1); } } /* TCOMPRESSED */ } return 0; } /* a_monkey() the approximate monkey move */ int a_monkey( pat, m, text, textend, D ) register int m, D ; register CHARTYPE *text, *textend, *pat; { int PRINTED = 0; register CHARTYPE *oldtext; CHARTYPE *curtextbegin; CHARTYPE *curtextend; register unsigned hash, hashmask, suffix_error; register int m1 = m-1-D, pos; CHARTYPE *textbegin = text; CHARTYPE *textstart; CHARTYPE *lastout = text; int newlen; hashmask = Hashmask; oldtext = text; while (text < textend) { textstart = text; text = text+m1; suffix_error = 0; while(suffix_error <= D) { hash = *text--; while(MEMBER_1[hash]) { hash = ((hash << LOG_ASCII) + *(text--)) & hashmask; } suffix_error++; } CurrentByteOffset += text - textstart; if(text <= oldtext) { if((pos = verify(m, 2*m+D, D, pat, oldtext)) > 0) { CurrentByteOffset += (oldtext+pos - text); text = oldtext+pos; if(text > textend) return 0; /* Don't update CurrentByteOffset here: only before outputting properly */ if (TCOMPRESSED == ON) { if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text + 1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL); curtextend = forward_delimiter(text + 1, textend, tc_D_pattern, tc_D_length, OUTTAIL); } } else { if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text + 1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL); curtextend = forward_delimiter(text + 1, textend, D_pattern, D_length, OUTTAIL); } } textbegin = curtextend; /* (curtextend - 1 > textbegin ? curtextend - 1 : curtextend); */ num_of_matched++; if(FILENAMEONLY) return 0; if(!COUNT) { if (!INVERSE) { if(FNAME && (NEW_FILE || !POST_FILTER)) { char nextchar = (POST_FILTER == ON)?'\n':' '; char *prevstring = (POST_FILTER == ON)?"\n":""; if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar); else { int outindex; if (prevstring[0] != '\0') { if(agrep_outpointer + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } else agrep_outbuffer[agrep_outpointer ++] = prevstring[0]; } for(outindex=0; (outindex+agrep_outpointer= agrep_outlen)) { OUTPUT_OVERFLOW; return -1; } agrep_outbuffer[agrep_outpointer + outindex++] = ':'; agrep_outbuffer[agrep_outpointer + outindex++] = nextchar; agrep_outpointer += outindex; } NEW_FILE = OFF; PRINTED = 1; } if(BYTECOUNT) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%d= ", CurrentByteOffset); else { char s[32]; int outindex; sprintf(s, "%d= ", CurrentByteOffset); for(outindex=0; (outindex+agrep_outpointer 0) { if (agrep_outpointer + newlen + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } #if MEASURE_TIMES gettimeofday(&finalt, NULL); OUTFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else { if (agrep_finalfp != NULL) { fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp); } else { if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, curtextbegin, curtextend-curtextbegin); agrep_outpointer += curtextend - curtextbegin; } } } else if (PRINTED) { if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp); else agrep_outbuffer[agrep_outpointer ++] = '\n'; PRINTED = 0; } } else { /* INVERSE */ if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp); else { if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout); agrep_outpointer += (curtextbegin - lastout); } lastout=textbegin; CurrentByteOffset += textbegin - text; text = textbegin; } /* TCOMPRESSED */ } } else { /* COUNT */ CurrentByteOffset += textbegin - text; text = textbegin; } if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */ } else { CurrentByteOffset += (oldtext + m - text); text = oldtext + m; } } oldtext = text; } if (INVERSE && !COUNT && (lastout <= textend)) { if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp); else { if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1); agrep_outpointer += (textend - lastout + 1); } } /* TCOMPRESSED */ } return 0; } static void am_preprocess(Pattern) CHARTYPE *Pattern; { int i, m; m = strlen(Pattern); for (i = 1, Hashmask = 1 ; i<16 ; i++) Hashmask = (Hashmask << 1) + 1 ; for (i = 0; i < MAXMEMBER_1; i++) MEMBER_1[i] = 0; for (i = m-1; i>=0; i--) { MEMBER_1[Pattern[i]] = 1; } for (i = m-1; i > 0; i--) { MEMBER_1[(Pattern[i] << LOG_ASCII) + Pattern[i-1]] = 1; } } int verify(m, n, D, pat, text) register int m, n, D; CHARTYPE *pat, *text; { int A[MAXPATT], B[MAXPATT]; register int last = D; register int cost = 0; register int k, i, c; register int m1 = m+1; CHARTYPE *textend = text+n; CHARTYPE *textbegin = text; for (i = 0; i <= m1; i++) A[i] = B[i] = i; while (text < textend) { for (k = 1; k <= last; k++) { cost = B[k-1]+1; if (pat[k-1] != *text) { if (B[k]+1 < cost) cost = B[k]+1; if (A[k-1]+1 < cost) cost = A[k-1]+1; } else cost = cost -1; A[k] = cost; } if(pat[last] == *text++) { A[last+1] = B[last]; last++; } if(A[last] < D) A[last+1] = A[last++]+1; while (A[last] > D) last = last - 1; if(last >= m) return(text - textbegin - 1); if(*text == '\n') { last = D; for(c = 0; c<=m1; c++) A[c] = B[c] = c; } for (k = 1; k <= last; k++) { cost = A[k-1]+1; if (pat[k-1] != *text) { if (A[k]+1 < cost) cost = A[k]+1; if (B[k-1]+1 < cost) cost = B[k-1]+1; } else cost = cost -1; B[k] = cost; } if(pat[last] == *text++) { B[last+1] = A[last]; last++; } if(B[last] < D) B[last+1] = B[last++]+1; while (B[last] > D) last = last -1; if(last >= m) return(text - textbegin - 1); if(*text == '\n') { last = D; for(c = 0; c<=m1; c++) A[c] = B[c] = c; } } return(0); } /* preprocessing for monkey() */ static void m_preprocess(Pattern) CHARTYPE *Pattern; { int i, j, m; unsigned hash; m = strlen(Pattern); for (i = 0; i < MAX_SHIFT_2; i++) SHIFT_2[i] = m; for (i = m-1; i>=1; i--) { hash = TR[Pattern[i]]; hash = hash << 3; for (j = 0; j< MAXSYM; j++) { if(SHIFT_2[hash+j] == m) SHIFT_2[hash+j] = m-1; } hash = hash + TR[Pattern[i-1]]; if((int)(SHIFT_2[hash]) >= (int)(m - 1)) SHIFT_2[hash] = m-1-i; } shift_1 = m-1; for (i= m-2; i>=0; i--) { if(TR[Pattern[i]] == TR[Pattern[m-1]] ) { shift_1 = m-1 - i; i = -1; } } if(shift_1 == 0) shift_1 = 1; SHIFT_2[0] = 0; } /* monkey4() the approximate monkey move */ char *MEMBER_D = NULL; int monkey4( pat, m, text, textend, D ) register int m, D ; register unsigned char *text, *pat, *textend; { int PRINTED = 0; register unsigned char *oldtext; register unsigned hash, hashmask, suffix_error; register int m1=m-1-D, pos; CHARTYPE *textbegin = text; CHARTYPE *textstart; CHARTYPE *curtextbegin; CHARTYPE *curtextend; CHARTYPE *lastout = text; int newlen; hashmask = Hashmask; oldtext = text ; while (text < textend) { textstart = text; text = text + m1; suffix_error = 0; while(suffix_error <= D) { hash = char_map[*text--]; hash = ((hash << LOG_DNA) + char_map[*(text--)]) & hashmask; while(MEMBER_D[hash]) { hash = ((hash << LOG_DNA) + char_map[*(text--)]) & hashmask; } suffix_error++; } CurrentByteOffset += text - textstart; if(text <= oldtext) { if((pos = verify(m, 2*m+D, D, pat, oldtext)) > 0) { CurrentByteOffset += (oldtext+pos - text); text = oldtext+pos; if(text > textend) return 0; if (TCOMPRESSED == ON) { /* Don't update CurrentByteOffset here: only before outputting properly */ if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text + 1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL); curtextend = forward_delimiter(text + 1, textend, tc_D_pattern, tc_D_length, OUTTAIL); } } else { /* Don't update CurrentByteOffset here: only before outputting properly */ if (!DELIMITER) { curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n')); if (*curtextbegin == '\n') curtextbegin ++; curtextend = text + 1; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++; if (*curtextend == '\n') curtextend ++; } else { curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL); curtextend = forward_delimiter(text + 1, textend, D_pattern, D_length, OUTTAIL); } } textbegin = curtextend; /*(curtextend - 1 > textbegin ? curtextend - 1 : curtextend); */ num_of_matched++; if(FILENAMEONLY) return 0; if(!COUNT) { if (!INVERSE) { if(FNAME && (NEW_FILE || !POST_FILTER)) { char nextchar = (POST_FILTER == ON)?'\n':' '; char *prevstring = (POST_FILTER == ON)?"\n":""; if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar); else { int outindex; if (prevstring[0] != '\0') { if(agrep_outpointer + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } else agrep_outbuffer[agrep_outpointer ++] = prevstring[0]; } for(outindex=0; (outindex+agrep_outpointer= agrep_outlen)) { OUTPUT_OVERFLOW; return -1; } agrep_outbuffer[agrep_outpointer + outindex++] = ':'; agrep_outbuffer[agrep_outpointer + outindex++] = nextchar; agrep_outpointer += outindex; } NEW_FILE = OFF; PRINTED = 1; } if(BYTECOUNT) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%d= ", CurrentByteOffset); else { char s[32]; int outindex; sprintf(s, "%d= ", CurrentByteOffset); for(outindex=0; (outindex+agrep_outpointer 0) { if (agrep_outpointer + newlen + 1 >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } #if MEASURE_TIMES gettimeofday(&finalt, NULL); OUTFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000); #endif /*MEASURE_TIMES*/ } else { if (agrep_finalfp != NULL) { fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp); } else { if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, curtextbegin, curtextend-curtextbegin); agrep_outpointer += curtextend - curtextbegin; } } } else if (PRINTED) { if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp); else agrep_outbuffer[agrep_outpointer ++] = '\n'; PRINTED = 0; } } else { /* INVERSE */ if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } lastout=textbegin; CurrentByteOffset += textbegin + 1 - text; text = textbegin + 1; } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp); else { if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout); agrep_outpointer += (curtextbegin - lastout); } lastout=textbegin; CurrentByteOffset += textbegin + 1 - text; text = textbegin + 1; } /* TCOMPRESSED */ } } else { /* COUNT */ CurrentByteOffset += textbegin + 1 - text; text = textbegin + 1 ; } if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */ } else { CurrentByteOffset += (oldtext + m - text); text = oldtext + m; } } oldtext = text; } if (INVERSE && !COUNT && (lastout <= textend)) { if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */ if (agrep_finalfp != NULL) newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH); else { if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) { if (newlen + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } agrep_outpointer += newlen; } } } else { /* NOT TCOMPRESSED */ if (agrep_finalfp != NULL) fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp); else { if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) { OUTPUT_OVERFLOW; return -1; } memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1); agrep_outpointer += (textend - lastout + 1); } } /* TCOMPRESSED */ } return 0; } static void prep4(Pattern, m) char *Pattern; int m; { int i, j, k; unsigned hash; for(i=0; i< MAXSYM; i++) char_map[i] = 0; char_map['a'] = char_map['A'] = 4; char_map['g'] = char_map['g'] = 1; char_map['t'] = char_map['t'] = 2; char_map['c'] = char_map['c'] = 3; char_map['n'] = char_map['n'] = 5; BSize = blog(4, m); for (i = 1, Hashmask = 1 ; i<(int)(BSize*LOG_DNA); i++) Hashmask = (Hashmask << 1) + 1 ; if (MEMBER_D != NULL) free(MEMBER_D); MEMBER_D = (char *) malloc((Hashmask+1) * sizeof(char)); #ifdef DEBUG printf("BSize = %d", BSize); #endif for (i=0; i <= Hashmask; i++) MEMBER_D[i] = 0; for (j=0; j < (int)BSize; j++) { for(i=m-1; i >= j; i--) { hash = 0; for(k=0; k <= j; k++) hash = (hash << LOG_DNA) +char_map[Pattern[i-k]]; #ifdef DEBUG printf("< %d >, ", hash); #endif MEMBER_D[hash] = 1; } } } int blog(base, m ) int base, m; { int i, exp; exp = base; m = m + m/2; for (i = 1; exp < m; i++) exp = exp * base; return(i); }