/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* substitute metachar with special symbol                               */
/* if regular expression, then set flag REGEX                            */
/* if REGEX and MULTIPAT then report error message,                      */
/* -w only for single word pattern. If WORDBOUND & MULTIWORD error       */
/* process start of line, endof line symbol,                             */
/* process -w WORDBOUND option, append special symbol at begin&end of    */
/* process -d option before this routine                                 */
/* the delimiter pattern is in D_pattern (need to end with '; ')         */
/* if '-t' (suggestion: how about -B) the pattern is passed to sgrep     */
/* and doesn't go here                                                   */
/* in that case, -d is ignored? or not necessary                         */
/* upon return, Pattern contains the pattern to be processed by maskgen  */
/* D_pattern contains transformed D_pattern                              */

#include "agrep.h"

extern int PAT_FILE, PAT_BUFFER;
extern ParseTree *AParse;
extern int WHOLELINE, REGEX, FASTREGEX, RE_ERR, DELIMITER, TAIL, WORDBOUND;
extern int HEAD;
extern CHAR Progname[];
extern int D_length, tc_D_length;
extern CHAR tc_D_pattern[MaxDelimit * 2];
extern int table[WORD][WORD];
extern int agrep_initialfd;
extern int EXITONERROR;
extern int errno;

extern int multifd;
extern char *multibuf;
extern int multilen;
extern int anum_terminals;
extern ParseTree aterminals[MAXNUM_PAT];
extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];	/* interfacing with tcompress */
extern int AComplexBoolean;

/*
 * preprocess: rewrite the user-supplied Pattern (and the delimiter in
 * D_pattern) in place, replacing metacharacters by the internal symbols
 * (LANGLE, RANGLE, NOCARE, STAR, ORSYM, ANDPAT, ...).
 *
 * Parameters:
 *   D_pattern - delimiter pattern; overwritten with the transformed
 *               delimiter, and the global D_length is updated to match.
 *   Pattern   - NUL-terminated search pattern; overwritten with the
 *               transformed pattern.
 *
 * Returns 0 on success (including the early exits for an empty pattern
 * or when PAT_FILE/PAT_BUFFER is already set).  On error a message is
 * printed to stderr and then either the process exits with status 2
 * (EXITONERROR set) or errno is set to AGREP_ERROR and -1 is returned.
 *
 * Globals written: REGEX, FASTREGEX, HEAD, TAIL, RE_ERR, PAT_BUFFER,
 * AParse, multibuf, multilen, D_length, tc_D_length, tc_D_pattern.
 */
int
preprocess(D_pattern, Pattern)   /* need two parameters  */
CHAR D_pattern[], Pattern[];
{
	CHAR temp[Maxline], *r_pat = NULL, *old_pat;  /* r_pat for r.e. */
	CHAR old_D_pat[MaxDelimit*2];
	CHAR *copy_dst, *copy_src;
	size_t pat_len, delim_len, marker_len;
	int i, j=0, rp=0, m, t=0, num_pos, ANDON = 0;
	int d_end;
	int IN_RANGE=0;
	int ret1, ret2;

#if	DEBUG
	fprintf(stderr, "preprocess: m=%d, pat=%s, PAT_FILE=%d, PAT_BUFFER=%d\n", (int) strlen(Pattern), Pattern, PAT_FILE, PAT_BUFFER);
#endif
	if ((m = strlen(Pattern)) <= 0) return 0;
	if (PAT_FILE || PAT_BUFFER) return 0;
	REGEX = OFF;
	FASTREGEX = OFF;
	old_pat = Pattern; /* to remember the starting position */

	/* Check if pattern is a concatenation of ands OR ors of simple patterns */
	multibuf = (char *)malloc(m * 2 + 2);	/* worst case: a,a,a,a,a,a */
	if (multibuf == NULL) goto normal_processing;
	multilen = 0;
	AParse = 0;
	ret1 = ret2 = 0;
	if (((ret1 = asplit_pattern(Pattern, m, aterminals, &anum_terminals, &AParse)) <= 0) ||	/* can change the pattern if simple boolean with {} */
	    ((ret2 = asplit_terminal(0, anum_terminals, multibuf, &multilen)) <= 0) ||
	    ((ret2 == 1) && !(aterminals[0].op & NOTPAT))) {	/* must do normal processing */
		if (AComplexBoolean && (AParse != NULL)) destroy_tree(AParse);	/* so that direct exec invocations don't use AParse by mistake! */
#if	DEBUG
		fprintf(stderr, "preprocess: split_pat = %d, split_term = %d, #terms = %d\n", ret1, ret2, anum_terminals);
#endif	/*DEBUG*/
		m = strlen(Pattern);	/* asplit_pattern may have changed the pattern */
		AParse = 0;
		free(multibuf);
		multibuf = NULL;
		multilen = 0;
		goto normal_processing;
	}

	/* This is quick processing */
	if (AParse != 0) {	/* successfully converted to ANDPAT/ORPAT */
		PAT_BUFFER = 1;
		/* Now I have to process the delimiter if any */
		if (DELIMITER) {
			/* D_pattern is "; ", D_length is 1 + length of string PAT: see agrep.c/'d' */
			preprocess_delimiter(D_pattern+1, D_length - 1, D_pattern, &D_length);
			/* D_pattern is the exact stuff we want to match, D_length is its strlen */
			if ((tc_D_length = quick_tcompress(FREQ_FILE, HASH_FILE, D_pattern, D_length, tc_D_pattern, MaxDelimit*2, TC_EASYSEARCH)) <= 0) {
				strcpy(tc_D_pattern, D_pattern);
				tc_D_length = D_length;
			}
		}
		return 0;
	}
	/* else either unknown character, one simple pattern or none at all */

normal_processing:
	/* An unescaped '|' or '*' makes the pattern a regular expression. */
	for (i = 0; i < m; i++) {
		if (Pattern[i] == '\\') i++;
		else if (Pattern[i] == '|' || Pattern[i] == '*') REGEX = ON;
	}

	pat_len = strlen(Pattern);
	delim_len = strlen(D_pattern);
	/* -x / -w wrap the pattern in two 3-symbol markers (before and after). */
	marker_len = (WHOLELINE || WORDBOUND) ? 6 : 0;

	/* temp must hold delimiter + markers + pattern + terminating NUL. */
	if (delim_len + marker_len + pat_len >= (size_t) Maxline) {
		fprintf(stderr, "%s: pattern too long\n", Progname);
		goto fail;
	}

	/*
	 * Worst case for r_pat: ".(" + two bytes per augmented pattern
	 * symbol (a '#' in a regex expands to ".*") + ")." + NUL.
	 * The augmented pattern is at most pat_len + 6 symbols long.
	 */
	r_pat = (CHAR *) malloc(2*pat_len + 2*delim_len + 24);	/* bug-report, From: Chris Dalton <crd@hplb.hpl.hp.com> */
	if (r_pat == NULL) {
		fprintf(stderr, "%s: malloc failure\n", Progname);
		goto fail;
	}

	strcpy(temp, D_pattern);
	d_end = t = strlen(temp);  /* size of D_pattern, including '; ' */
	if (WHOLELINE) {
		temp[t++] = LANGLE;
		temp[t++] = NNLINE;
		temp[t++] = RANGLE;
		temp[t] = '\0';
		strcat(temp, Pattern);
		m = strlen(temp);
		temp[m++] = LANGLE;
		temp[m++] = '\n';
		temp[m++] = RANGLE;
		temp[m] = '\0';
	}
	else {
		if (WORDBOUND) {
			temp[t++] = LANGLE;
			temp[t++] = WORDB;
			temp[t++] = RANGLE;
			temp[t] = '\0';
		}
		strcat(temp, Pattern);
		m = strlen(temp);
		if (WORDBOUND) {
			temp[m++] = LANGLE;
			temp[m++] = WORDB;
			temp[m++] = RANGLE;
		}
		temp[m] = '\0';
	}
	/* now temp contains augmented pattern , m it's size */

	/* Translate the delimiter part (everything before the trailing "; "). */
	D_length = 0;
	for (i = 0, j = 0; i < d_end-2; i++) {
		switch (temp[i]) {
		case '\\':
			i++;
			Pattern[j++] = temp[i];
			old_D_pat[D_length++] = temp[i];
			break;
		case '<':
			Pattern[j++] = LANGLE;
			break;
		case '>':
			Pattern[j++] = RANGLE;
			break;
		case '^':
			Pattern[j++] = '\n';
			old_D_pat[D_length++] = temp[i];
			break;
		case '$':
			Pattern[j++] = '\n';
			old_D_pat[D_length++] = temp[i];
			break;
		default:
			Pattern[j++] = temp[i];
			old_D_pat[D_length++] = temp[i];
			break;
		}
	}
	if (D_length > MAXDELIM) {
		fprintf(stderr, "%s: delimiter pattern too long (has > %d chars)\n", Progname, MAXDELIM);
		goto fail;
	}

	Pattern[j++] = ANDPAT;
	old_D_pat[D_length] = '\0';
	strcpy(D_pattern, old_D_pat);
	D_length++;

	Pattern[j] = '\0';
	rp = 0;
	if (REGEX) {
		r_pat[rp++] = '.';    /* if REGEX: always append '.' in front */
		r_pat[rp++] = '(';
		Pattern[j++] = NOCARE;
		HEAD = ON;
	}

	/* Translate the pattern proper; r_pat receives the regex skeleton. */
	for (i = d_end; i < m; i++) {
		switch (temp[i]) {
		case '\\':
			i++;
			Pattern[j++] = temp[i];
			r_pat[rp++] = 'o';   /* the symbol doesn't matter */
			break;
		case '#':
			FASTREGEX = ON;
			if (REGEX) {
				Pattern[j++] = NOCARE;
				r_pat[rp++] = '.';
				r_pat[rp++] = '*';
				break;
			}
			Pattern[j++] = WILDCD;
			break;
		case '(':
			Pattern[j++] = LPARENT;
			r_pat[rp++] = '(';
			break;
		case ')':
			Pattern[j++] = RPARENT;
			r_pat[rp++] = ')';
			break;
		case '[':
			Pattern[j++] = LRANGE;
			r_pat[rp++] = '[';
			IN_RANGE = ON;
			break;
		case ']':
			Pattern[j++] = RRANGE;
			r_pat[rp++] = ']';
			IN_RANGE = OFF;
			break;
		case '<':
			Pattern[j++] = LANGLE;
			break;
		case '>':
			Pattern[j++] = RANGLE;
			break;
		case '^':
			/* '^' right after '[' negates the range; guard i so an
			   empty delimiter cannot make us read temp[-1]. */
			if (i > 0 && temp[i-1] == '[') Pattern[j++] = NOTSYM;
			else Pattern[j++] = '\n';
			r_pat[rp++] = '^';
			break;
		case '$':
			Pattern[j++] = '\n';
			r_pat[rp++] = '$';
			break;
		case '.':
			Pattern[j++] = NOCARE;
			r_pat[rp++] = '.';
			break;
		case '*':
			Pattern[j++] = STAR;
			r_pat[rp++] = '*';
			break;
		case '|':
			Pattern[j++] = ORSYM;
			r_pat[rp++] = '|';
			break;
		case ',':
			Pattern[j++] = ORPAT;
			RE_ERR = ON;
			break;
		case ';':
			if (ANDON) RE_ERR = ON;
			Pattern[j++] = ANDPAT;
			ANDON = ON;
			break;
		case '-':
			if (IN_RANGE) {
				Pattern[j++] = HYPHEN;
				r_pat[rp++] = '-';
			}
			else {
				Pattern[j++] = temp[i];
				r_pat[rp++] = temp[i];
			}
			break;
		case NNLINE:
			Pattern[j++] = temp[i];
			r_pat[rp++] = 'N';
			break;
		default:
			Pattern[j++] = temp[i];
			r_pat[rp++] = temp[i];
			break;
		}
	}
	if (REGEX) {           /* append ').' at end of regular expression */
		r_pat[rp++] = ')';
		r_pat[rp++] = '.';
		Pattern[j++] = NOCARE;
		TAIL = ON;
	}
	Pattern[j] = '\0';
	m = j;
	r_pat[rp] = '\0';

	if (REGEX) {
		if (DELIMITER || WORDBOUND) {
			fprintf(stderr, "%s: -d or -w option is not supported for this pattern\n", Progname);
			goto fail;
		}
		if (RE_ERR) {
			fprintf(stderr, "%s: illegal regular expression\n", Progname);
			goto fail;
		}
		while (*Pattern != NOCARE && m-- > 0) Pattern++;  /* point to . */
		num_pos = init(r_pat, table);
		if (num_pos <= 0) {
			fprintf(stderr, "%s: illegal regular expression\n", Progname);
			goto fail;
		}
		if (num_pos > 30) {
			fprintf(stderr, "%s: regular expression too long\n", Progname);
			goto fail;
		}
		/*
		 * Do the real change to the Pattern to be returned.  Pattern
		 * now points inside the buffer that old_pat starts, so source
		 * and destination overlap: strcpy() would be undefined here.
		 * A forward byte copy is safe because the destination never
		 * lies after the source.
		 */
		copy_dst = old_pat;
		copy_src = Pattern;
		while ((*copy_dst++ = *copy_src++) != '\0')
			;
		free(r_pat);
		return 0;
	} /* if regex */

	free(r_pat);
	return 0;

fail:
	/* Common error exit: the message has already been printed. */
	free(r_pat);	/* r_pat is NULL if we fail before allocating it */
	if (!EXITONERROR) {
		errno = AGREP_ERROR;
		return -1;
	}
	exit(2);
	return -1;	/* not reached */
}