diff -ru grep-0.16/binary.c grep-patched/binary.c --- grep-0.16/binary.c Fri Oct 29 11:02:36 1999 +++ grep-patched/binary.c Tue Jun 24 18:34:49 2003 @@ -40,7 +40,7 @@ char buf[BUFFER_SIZE]; int i, m; - if (fseek(f, SEEK_SET, 0) == -1) + if (fseek(f, 0L, SEEK_SET) == -1) return 0; if ((m = (int)fread(buf, 1, BUFFER_SIZE, f)) == 0) @@ -78,10 +78,10 @@ mmbin_file(mmf_t *f) { int i; - + /* XXX knows too much about mmf internals */ for (i = 0; i < BUFFER_SIZE && i < f->len; i++) - if (!isprint(f->base[i])) + if (!isprint(f->base[i]) && !isspace(f->base[i])) return 1; mmrewind(f); return 0; diff -ru grep-0.16/file.c grep-patched/file.c --- grep-0.16/file.c Fri Oct 29 11:02:36 1999 +++ grep-patched/file.c Tue Jun 24 18:34:49 2003 @@ -37,7 +37,7 @@ static char fname[MAXPATHLEN]; static char *lnbuf; -static int lnbuflen; +static size_t lnbuflen; #define FILE_STDIO 0 #define FILE_MMAP 1 @@ -64,7 +64,7 @@ if (gzeof(f)) break; - + gzerrstr = gzerror(f, &gzerr); if (gzerr == Z_ERRNO) err(1, "%s", fname); @@ -92,13 +92,13 @@ file_t *f; if (fd == 0) - sprintf(fname, "(standard input)"); + snprintf(fname, sizeof (fname), "(standard input)"); else - sprintf(fname, "(fd %d)", fd); - + snprintf(fname, sizeof (fname), "(fd %d)", fd); + f = grep_malloc(sizeof *f); - - if (Zflag) { + + if (cfs.Zflag) { f->type = FILE_GZIP; if ((f->gzf = gzdopen(fd, mode)) != NULL) return f; @@ -107,35 +107,31 @@ if ((f->f = fdopen(fd, mode)) != NULL) return f; } - + free(f); return NULL; } file_t * -grep_open(char *path, char *mode) +grep_open(char *path, struct stat *statp, char *mode) { file_t *f; - snprintf(fname, MAXPATHLEN, "%s", path); - + snprintf(fname, sizeof (fname), "%s", path); + f = grep_malloc(sizeof *f); - - if (Zflag) { + + if (cfs.Zflag) { f->type = FILE_GZIP; if ((f->gzf = gzopen(fname, mode)) != NULL) return f; - } else { - /* try mmap first; if it fails, try stdio */ - if ((f->mmf = mmopen(fname, mode)) != NULL) { - f->type = FILE_MMAP; - return f; - } - f->type = FILE_STDIO; - if ((f->f = fopen(path, mode)) != NULL) - return f; } - + else if ((f->mmf = mmopen(fname, statp, mode)) != NULL) + { + f->type = FILE_MMAP; + return f; + } + free(f); return NULL; } @@ -205,4 +201,8 @@ /* can't happen */ errx(1, "invalid file type"); } + + free(f); + + return; } diff -ru grep-0.16/grep.1 grep-patched/grep.1 --- grep-0.16/grep.1 Fri Oct 29 11:02:36 1999 +++ grep-patched/grep.1 Tue Jun 24 18:34:49 2003 @@ -35,12 +35,14 @@ .Dt GREP 1 .Os .Sh NAME -.Nm grep, egrep, fgrep. zgrep +.Nm grep, egrep, fgrep. zgrep, zegrep, zfgrep .Nd file pattern searcher .Sh SYNOPSIS .Nm grep .Op Fl AB Ar num -.Op Fl CEFGHLPRSVZabchilnoqsvwx +.Op Fl CEFGHILPRSUVZabchilnoqsvwx +.Op Fl -context Ns Op = Ns Ar num +.Op Fl -binary-files Ns = Ns Ar value .Op Fl e Ar pattern .Op Fl f Ar file .Op Ar @@ -73,8 +75,16 @@ or more lines, allowing any of the pattern lines to match a portion of the input. The -.Nm zgrep -utility acts like grep, but accepts input files compressed with the +.Nm zgrep , +.Nm zegrep +and +.Nm zfgrep +utilities act like +.Nm grep , +.Nm egrep +and +.Nm fgrep +respectively but accept input files compressed with the .Xr compress 1 or .Xr gzip 1 @@ -117,6 +127,8 @@ .Fl R is specified, follow symbolic links only if they were explictly listed on the command line. +.It Fl I +Ignore binary files. .It Fl L Only the names of files not containing selected lines are written to standard output. @@ -134,6 +146,8 @@ If .Fl R is specified, all symbolic links are followed. +.It Fl U +Search binary files but do not attempt to print them. .It Fl V Display version information. .It Fl Z @@ -142,7 +156,7 @@ to behave as .Nm zgrep . .It Fl a -Do not search in binary files. +Treat all files as text. .It Fl b The block number on the disk in which a matched pattern is located is displayed in front of the respective matched line. @@ -154,12 +168,8 @@ options can be used to specify multiple patterns; an input line is selected if it matches any of the specified patterns. .It Fl f Ar pattern_file -The pattern is read from the specified file. Trailing newlines in the +Read one or more newline-separated patterns from file. Trailing newlines in the pattern file are ignored. -.Pf ( Nm Egrep -and -.Nm fgrep -only). .It Fl h Never print filename headers with output lines. .It Fl i @@ -199,6 +209,25 @@ .It Fl x Only input lines selected against an entire fixed string or regular expression are considered to be matching lines. +.Sm off +.It Fl Fl context Op = Ar num +.Sm on +Print +.Ar num +lines of leading and trailing context. +Default is 2. +.Sm off +.It Fl Fl binary-files No = Ar value +.Sm on +Controls searching and printing of binary files. +Options are +.Ar binary , +the default, search binary files but do not print them; +.Ar without-match , +do not search binary files; +and +.Ar text , +treat all files as text. .Pp .El If no file arguments are specified, the standard input is used. diff -ru grep-0.16/grep.c grep-patched/grep.c --- grep-0.16/grep.c Fri Nov 5 10:44:56 1999 +++ grep-patched/grep.c Wed Jun 25 17:00:05 2003 @@ -45,44 +45,29 @@ int eflags = REG_STARTEND; int matchall; /* shortcut */ -int patterns, pattern_sz; +int patterns, totalPatterns, pattern_sz; char **pattern; regex_t *r_pattern; +fastgrep_t *fg_pattern; /* For regex errors */ char re_error[RE_ERROR_BUF + 1]; /* Command-line flags */ -int Aflag; /* -A x: print x lines trailing each match */ -int Bflag; /* -B x: print x lines leading each match */ -int Eflag; /* -E: interpret pattern as extended regexp */ -int Fflag; /* -F: interpret pattern as list of fixed strings */ -int Gflag; /* -G: interpret pattern as basic regexp */ -int Hflag; /* -H: if -R, follow explicitly listed symlinks */ -int Lflag; /* -L: only show names of files with no matches */ -int Pflag; /* -P: if -R, no symlinks are followed */ -int Rflag; /* -R: recursively search directory trees */ -int Sflag; /* -S: if -R, follow all symlinks */ -int Vflag; /* -V: display version information */ -int Zflag; /* -Z: decompress input before processing */ -int aflag; /* -a: only search ascii files */ -int bflag; /* -b: show block numbers for each match */ -int cflag; /* -c: only show a count of matching lines */ -int hflag; /* -h: don't print filename headers */ -int iflag; /* -i: ignore case */ -int lflag; /* -l: only show names of files with matches */ -int nflag; /* -n: show line numbers in front of matching lines */ -int oflag; /* -o: always print file name */ -int qflag; /* -q: quiet mode (don't output anything) */ -int sflag; /* -s: silent mode (ignore errors) */ -int vflag; /* -v: only show non-matching lines */ -int wflag; /* -w: pattern must start and end on word boundaries */ -int xflag; /* -x: pattern must match entire line */ +cmdflags_t cfs = { 0 }; + +enum { + BIN_OPT = CHAR_MAX + 1, + HELP_OPT, + MMAP_OPT +}; /* Housekeeping */ -int first; /* flag whether or not this is our fist match */ +int first; /* flag whether or not this is our first match */ int tail; /* lines left to print */ int lead; /* number of lines in leading context queue */ +int boleol; /* At least one pattern has an bol and/or an eol. */ +int maxPatternLen; /* Longest length of all patterns. */ char *progname; @@ -91,43 +76,46 @@ { fprintf(stderr, "usage: %s %s %s\n", progname, - "[-[AB] num] [-CEFGHLPRSVZabchilnoqsvwx]", - "[-e patttern] [-f file]"); + "[-[AB] num] [-CEFGHILPRSUVZabchilnoqsvwx]", + "[-e pattern] [-f file] [file ...]"); exit(2); } -static char *optstr = "0123456789A:B:CEFGHLPSRUVZabce:f:hilnoqrsuvwxy"; +static char *optstr = "0123456789A:B:CEFGHILPSRUVZabce:f:hilnoqrsuvwxy"; -struct option long_options[] = +struct option long_options[] = { - {"basic-regexp", no_argument, NULL, 'G'}, - {"extended-regexp", no_argument, NULL, 'E'}, - {"fixed-strings", no_argument, NULL, 'F'}, + {"binary-files", no_argument, NULL, BIN_OPT}, + {"help", no_argument, NULL, HELP_OPT}, + {"mmap", no_argument, NULL, MMAP_OPT}, {"after-context", required_argument, NULL, 'A'}, {"before-context", required_argument, NULL, 'B'}, {"context", optional_argument, NULL, 'C'}, + {"extended-regexp", no_argument, NULL, 'E'}, + {"fixed-strings", no_argument, NULL, 'F'}, + {"basic-regexp", no_argument, NULL, 'G'}, + {"files-without-match", no_argument, NULL, 'L'}, + {"binary", no_argument, NULL, 'U'}, {"version", no_argument, NULL, 'V'}, + {"decompress", no_argument, NULL, 'Z'}, + {"text", no_argument, NULL, 'a'}, {"byte-offset", no_argument, NULL, 'b'}, {"count", no_argument, NULL, 'c'}, {"regexp", required_argument, NULL, 'e'}, {"file", required_argument, NULL, 'f'}, {"no-filename", no_argument, NULL, 'h'}, {"ignore-case", no_argument, NULL, 'i'}, - {"files-without-match", no_argument, NULL, 'L'}, {"files-with-matches", no_argument, NULL, 'l'}, {"line-number", no_argument, NULL, 'n'}, {"quiet", no_argument, NULL, 'q'}, {"silent", no_argument, NULL, 'q'}, {"recursive", no_argument, NULL, 'r'}, {"no-messages", no_argument, NULL, 's'}, - {"text", no_argument, NULL, 'a'}, + {"unix-byte-offsets", no_argument, NULL, 'u'}, {"revert-match", no_argument, NULL, 'v'}, {"word-regexp", no_argument, NULL, 'w'}, {"line-regexp", no_argument, NULL, 'x'}, - {"binary", no_argument, NULL, 'U'}, - {"unix-byte-offsets", no_argument, NULL, 'u'}, - {"decompress", no_argument, NULL, 'Z'}, - + {NULL, no_argument, NULL, 0} }; @@ -141,14 +129,109 @@ } if (patterns == pattern_sz) { pattern_sz *= 2; - pattern = grep_realloc(pattern, ++pattern_sz); + pattern = grep_realloc(pattern, + ((++pattern_sz) * sizeof (*pattern))); } - if (pat[len-1] == '\n') + if (pat[len - 1] == '\n') --len; - pattern[patterns] = grep_malloc(len+1); - strncpy(pattern[patterns], pat, len); + pattern[patterns] = grep_malloc(len + 1); + memcpy(pattern[patterns], pat, len); pattern[patterns][len] = '\0'; ++patterns; + + /* Find the maximum pattern length of all patterns. */ + if (len > maxPatternLen) + { + maxPatternLen = len; + } + + return; +} + +/* + * Compile a pattern into a fast or regular RE. + */ +static void +grep_comp(char *pattern, + int patternNdx, + int len) +{ + char *termPattern = (char *) grep_malloc(len + 1); + + /* Create a null-terminated string. */ + memcpy(termPattern, pattern, len); + termPattern[len] = '\0'; + + /* Check if cheating is allowed. */ + if (fastcomp(&fg_pattern[patternNdx], termPattern)) + { + /* Fall back to full regex library. */ + char c; + if ((c = regcomp(&r_pattern[patternNdx], termPattern, cflags))) + { + regerror(c, &r_pattern[patternNdx], re_error, + RE_ERROR_BUF); + errx(1, "%s", re_error); + } + } + + free(termPattern); + + return; +} + +/* + * Compile all patterns watching out for fgrep-mode. In that mode, patterns + * will be split with embedded newlines. + */ +static void +comp_patterns(void) +{ + char *nlLoc; + int patLen; + char *pat; + int len; + int i; + + totalPatterns = 0; + for (i = 0; i < patterns; ++i) + { + pat = pattern[i]; + len = strlen(pattern[i]); + patLen = len; + + /* + * Running in fgrep-mode. Split apart patterns with embedded + * newlines. + */ + if (cfs.Fflag && memchr(pattern[i], (char)'\n', + strlen(pattern[i]))) + { + /* Add each pattern within a string. */ + while ((nlLoc = memchr(pat, (char)'\n', len))) + { + /* Determine length of pattern piece. */ + patLen = nlLoc - pat; + + /* Compile pattern. */ + grep_comp(pat, totalPatterns, patLen); + + /* Skip the newline to next pattern. */ + pat = nlLoc + 1; + len -= (patLen + 1); + + /* Pattern was split. */ + totalPatterns++; + } + } + + /* Add only pattern given or last pattern piece. */ + grep_comp(pat, totalPatterns, len); + + totalPatterns++; + } + + return; } static void @@ -179,180 +262,250 @@ fclose(f); } -int +static void +free_patterns() +{ + int i; + + for (i = 0; i < patterns; i++) + { + if (fg_pattern[i].pattern) + { + free(fg_pattern[i].pattern); + } + else + { + regfree(&r_pattern[i]); + } + } + + free(fg_pattern); + free(r_pattern); + free(pattern); + + return; +} + + int main(int argc, char *argv[]) { char *tmp; - int c, i; + int c; + + /* Initialize. */ + cfs.iflag = 0; + cfs.Iflag = BIN_FILE_BIN; + boleol = 0; + maxPatternLen = 0; if ((progname = strrchr(*argv, '/')) != NULL) ++progname; else progname = *argv; - while ((c = getopt_long(argc, argv, optstr, + while ((c = getopt_long(argc, argv, optstr, long_options, (int *)NULL)) != -1) { switch (c) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - tmp = argv[optind - 1]; - if (tmp[0] == '-' && tmp[1] == c && !tmp[2]) - Aflag = Bflag = strtol(++tmp, (char **)NULL, 10); - else - Aflag = Bflag = strtol(argv[optind] + 1, (char **)NULL, 10); - break; - case 'A': - Aflag = strtol(optarg, (char **)NULL, 10); - break; - case 'B': - Bflag = strtol(optarg, (char **)NULL, 10); - break; - case 'C': - if (optarg == NULL) - Aflag = Bflag = 2; - else - Aflag = Bflag = strtol(optarg, (char **)NULL, 10); - break; - case 'E': - Eflag++; - break; - case 'F': - Fflag++; - break; - case 'G': - Gflag++; - break; - case 'H': - Hflag++; - break; - case 'L': - lflag = 0; - Lflag = qflag = 1; - break; - case 'P': - Pflag++; - break; - case 'S': - Sflag++; - break; - case 'R': - case 'r': - Rflag++; - oflag++; - break; - case 'U': - case 'u': - /* these are here for compatability */ - break; - case 'V': - fprintf(stderr, "grep version %u.%u\n", VER_MAJ, VER_MIN); - fprintf(stderr, argv[0]); - usage(); - break; - case 'Z': - Zflag++; - break; - case 'a': - aflag = 1; - break; - case 'b': - bflag = 1; - break; - case 'c': - cflag = 1; - break; - case 'e': - add_pattern(optarg, strlen(optarg)); - break; - case 'f': - read_patterns(optarg); - break; - case 'h': - oflag = 0; - hflag = 1; - break; - case 'i': - case 'y': - cflags |= REG_ICASE; - break; - case 'l': - Lflag = 0; - lflag = qflag = 1; - break; - case 'n': - nflag = 1; - break; - case 'o': - hflag = 0; - oflag = 1; - break; - case 'q': - qflag = 1; - break; - case 's': - sflag = 1; - break; - case 'v': - vflag = 1; - break; - case 'w': - wflag = 1; - break; - case 'x': - xflag = 1; - break; - default: - usage(); + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + tmp = argv[optind - 1]; + if (tmp[0] == '-' && tmp[1] == c && !tmp[2]) + cfs.Aflag = strtol(++tmp, + (char **)NULL, 10); + else + cfs.Aflag = strtol(argv[optind] + 1, + (char **)NULL, 10); + cfs.Bflag = cfs.Aflag; + break; + case 'A': + cfs.Aflag = strtol(optarg, (char **)NULL, 10); + break; + case 'B': + cfs.Bflag = strtol(optarg, (char **)NULL, 10); + break; + case 'C': + if (optarg == NULL) + cfs.Aflag = 2; + else + cfs.Aflag = strtol(optarg, + (char **)NULL, 10); + cfs.Bflag = cfs.Aflag; + break; + case 'E': + cfs.Eflag++; + break; + case 'F': + cfs.Fflag++; + break; + case 'G': + cfs.Gflag++; + break; + case 'H': + cfs.Hflag++; + break; + case 'I': + cfs.Iflag = BIN_FILE_SKIP; + break; + case 'L': + cfs.lflag = 0; + cfs.Lflag = cfs.qflag = 1; + break; + case 'P': + cfs.Pflag++; + break; + case 'S': + cfs.Sflag++; + break; + case 'R': + case 'r': + cfs.Rflag++; + cfs.oflag++; + break; + case 'U': + cfs.Iflag = BIN_FILE_BIN; + break; + case 'V': + fprintf(stderr, "grep version %u.%u\n", + VER_MAJ, VER_MIN); + fprintf(stderr, argv[0]); + usage(); + exit(0); + break; + case 'Z': + cfs.Zflag++; + switch (progname[1]) { + case 'e': + cfs.Eflag++; + break; + case 'f': + cfs.Fflag++; + break; + case 'g': + cfs.Gflag++; + break; + } + break; + case 'a': + cfs.Iflag = BIN_FILE_TEXT; + break; + case 'b': + cfs.bflag = 1; + break; + case 'c': + cfs.cflag = 1; + break; + case 'e': + add_pattern(optarg, strlen(optarg)); + break; + case 'f': + read_patterns(optarg); + break; + case 'h': + cfs.oflag = 0; + cfs.hflag = 1; + break; + case 'i': + case 'y': + cfs.iflag = 1; + cflags |= REG_ICASE; + break; + case 'l': + cfs.Lflag = 0; + cfs.lflag = cfs.qflag = 1; + break; + case 'n': + cfs.nflag = 1; + break; + case 'o': + cfs.hflag = 0; + cfs.oflag = 1; + break; + case 'q': + cfs.qflag = 1; + break; + case 's': + cfs.sflag = 1; + break; + case 'v': + cfs.vflag = 1; + break; + case 'w': + cfs.wflag = 1; + break; + case 'x': + cfs.xflag = 1; + break; + case BIN_OPT: + if (strcmp("binary", optarg) == 0) + cfs.Iflag = BIN_FILE_BIN; + else if (strcmp("without-match", optarg) == 0) + cfs.Iflag = BIN_FILE_SKIP; + else if (strcmp("text", optarg) == 0) + cfs.Iflag = BIN_FILE_TEXT; + else + errx(2, "Unknown binary-files option"); + break; + case 'u': + case MMAP_OPT: + /* default, compatibility */ + break; + case HELP_OPT: + default: + usage(); } } argc -= optind; argv += optind; - if (argc == 0 && patterns == 0) - usage(); + /* Logic if no '-e' option was given. */ + if ((!matchall) && (patterns == 0)) + { + if (argc == 0) + usage(); - if (patterns == 0) { add_pattern(*argv, strlen(*argv)); --argc; ++argv; } - + switch (*progname) { - case 'e': - Eflag++; - break; - case 'f': - Fflag++; - break; - case 'g': - Gflag++; - break; - case 'z': - Zflag++; - break; + case 'e': + cfs.Eflag++; + break; + case 'f': + cfs.Fflag++; + break; + case 'g': + cfs.Gflag++; + break; + case 'z': + cfs.Zflag++; + break; } - cflags |= Eflag ? REG_EXTENDED : REG_BASIC; + cflags |= cfs.Eflag ? REG_EXTENDED : REG_BASIC; + fg_pattern = grep_malloc(patterns * sizeof(*fg_pattern)); r_pattern = grep_malloc(patterns * sizeof(regex_t)); - for (i = 0; i < patterns; ++i) { - if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) { - regerror(c, &r_pattern[i], re_error, RE_ERROR_BUF); - errx(1, "%s", re_error); - } - } + comp_patterns(); - if ((argc == 0 || argc == 1) && !oflag) - hflag = 1; + if ((argc == 0 || argc == 1) && !cfs.oflag) + cfs.hflag = 1; if (argc == 0) - exit(!procfile(NULL)); - - if (Rflag) - c = grep_tree(argv); + { + c = procfile(NULL, NULL); + } else - for (c = 0; argc--; ++argv) - c += procfile(*argv); + { + if (cfs.Rflag) + c = grep_tree(argv); + else + for (c = 0; argc--; ++argv) + c += procfile(*argv, NULL); + } + + free_patterns(); exit(!c); } diff -ru grep-0.16/grep.h grep-patched/grep.h --- grep-0.16/grep.h Fri Oct 29 11:02:36 1999 +++ grep-patched/grep.h Tue Jun 24 18:34:49 2003 @@ -25,14 +25,20 @@ */ #include +#include #include #include +#include #include #define VER_MAJ 0 #define VER_MIN 9 +#define BIN_FILE_BIN 0 +#define BIN_FILE_SKIP 1 +#define BIN_FILE_TEXT 2 + typedef struct { size_t len; int line_no; @@ -41,28 +47,71 @@ char *dat; } str_t; +typedef struct { + unsigned char *pattern; + int patternLen; + int qsBc[UCHAR_MAX + 1]; + + /* Flags. */ + int bol; + int eol; + int reversedSearch; +} fastgrep_t; + /* Flags passed to regcomp() and regexec() */ extern int cflags, eflags; /* Command line flags */ -extern int Aflag, Bflag, Hflag, Lflag, Pflag, Sflag, Rflag, Zflag, - aflag, bflag, cflag, hflag, lflag, nflag, qflag, sflag, - vflag, wflag, xflag; +typedef struct cmdflags +{ + int Aflag; /* -A x: print x lines trailing each match */ + int Bflag; /* -B x: print x lines leading each match */ + int Eflag; /* -E: interpret pattern as extended regexp */ + int Fflag; /* -F: interpret pattern as list of fixed strings */ + int Gflag; /* -G: interpret pattern as basic regexp */ + int Hflag; /* -H: if -R, follow explicitly listed symlinks */ + int Iflag; /* -I, -a, -U: how to interpret binary files */ + int Lflag; /* -L: only show names of files with no matches */ + int Pflag; /* -P: if -R, no symlinks are followed */ + int Rflag; /* -R: recursively search directory trees */ + int Sflag; /* -S: if -R, follow all symlinks */ + int Vflag; /* -V: display version information */ + int Zflag; /* -Z: decompress input before processing */ + int bflag; /* -b: show block numbers for each match */ + int cflag; /* -c: only show a count of matching lines */ + int hflag; /* -h: don't print filename headers */ + int iflag; /* -i: ignore case */ + int lflag; /* -l: only show names of files with matches */ + int nflag; /* -n: show line numbers in front of matching lines */ + int oflag; /* -o: always print file name */ + int qflag; /* -q: quiet mode (don't output anything) */ + int sflag; /* -s: silent mode (ignore errors) */ + int vflag; /* -v: only show non-matching lines */ + int wflag; /* -w: pattern must start and end on word boundaries */ + int xflag; /* -x: pattern must match entire line */ +} cmdflags_t; + +extern cmdflags_t cfs; +extern int boleol; +extern int maxPatternLen; -extern int first, lead, matchall, patterns, tail; +extern int first, lead, matchall, totalPatterns, tail; extern char **pattern; extern regex_t *r_pattern; +extern fastgrep_t *fg_pattern; /* For regex errors */ #define RE_ERROR_BUF 512 extern char re_error[RE_ERROR_BUF + 1]; /* Seems big enough */ /* util.c */ -int procfile(char *fn); +int procfile(char *fn, struct stat *statp); int grep_tree(char **argv); void *grep_malloc(size_t size); +unsigned char *grep_strdup(const char *str, int len); void *grep_realloc(void *ptr, size_t size); void printline(str_t *line, int sep); +int fastcomp(fastgrep_t *fg, const char *pattern); /* queue.c */ void initqueue(); @@ -77,7 +126,7 @@ char *base, *end, *ptr; } mmf_t; -mmf_t *mmopen(char *fn, char *mode); +mmf_t *mmopen(char *fn, struct stat *statp, char *mode); void mmclose(mmf_t *mmf); char *mmfgetln(mmf_t *mmf, size_t *l); long mmtell(mmf_t *mmf); @@ -88,7 +137,7 @@ typedef struct file file_t; file_t *grep_fdopen(int fd, char *mode); -file_t *grep_open(char *path, char *mode); +file_t *grep_open(char *path, struct stat *statp, char *mode); int grep_bin_file(file_t *f); long grep_tell(file_t *f); char *grep_fgetln(file_t *f, size_t *l); diff -ru grep-0.16/mmfile.c grep-patched/mmfile.c --- grep-0.16/mmfile.c Fri Oct 29 11:02:36 1999 +++ grep-patched/mmfile.c Tue Jun 24 18:34:49 2003 @@ -38,27 +38,33 @@ #include "grep.h" #define MAX_MAP_LEN 1048576 +#define BLOCKSIZE 32768 mmf_t * -mmopen(char *fn, char *mode) +mmopen(char *fn, struct stat *statp, char *mode) { mmf_t *mmf; - struct stat st; /* XXX ignore mode for now */ mode = mode; - - mmf = grep_malloc(sizeof *mmf); + + mmf = grep_malloc(sizeof *mmf); if ((mmf->fd = open(fn, O_RDONLY)) == -1) goto ouch1; - if (fstat(mmf->fd, &st) == -1) - goto ouch2; - if (st.st_size > SIZE_T_MAX) /* too big to mmap */ + /* Perform a new stat() if needed. */ + if ((! statp->st_mode) || (S_ISLNK(statp->st_mode))) + { + if (fstat(mmf->fd, statp) == -1) + goto ouch2; + } + /* only mmap regular files and links */ + if (!(S_ISREG(statp->st_mode) || S_ISLNK(statp->st_mode))) goto ouch2; - if ((st.st_mode & S_IFREG) == 0) /* only mmap regular files */ + if (statp->st_size > SIZE_T_MAX) /* too big to mmap */ goto ouch2; - mmf->len = (size_t)st.st_size; - mmf->base = mmap(NULL, mmf->len, PROT_READ, MAP_PRIVATE, mmf->fd, 0); + mmf->len = (size_t)statp->st_size; + mmf->base = mmap(NULL, mmf->len, PROT_READ, MAP_PRIVATE, mmf->fd, + (off_t)0); if (mmf->base == NULL) goto ouch2; mmf->ptr = mmf->base; @@ -79,6 +85,8 @@ munmap(mmf->base, mmf->len); close(mmf->fd); free(mmf); + + return; } char * @@ -88,22 +96,56 @@ if (mmf->ptr >= mmf->end) return NULL; - for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr) - if (*mmf->ptr == '\n') - break; + + if ((cfs.lflag || cfs.qflag) && !boleol) + { + /* Find starting point to search. */ + if (mmf->ptr == mmf->base) + { + p = mmf->ptr; + } + else + { + /* + * Move back enough to make sure that the pattern is not + * split between buffers. + */ + p = mmf->ptr - maxPatternLen; + } + + /* Move the start pointer ahead for next iteration. */ + if (mmf->end - mmf->ptr > BLOCKSIZE) + { + mmf->ptr += BLOCKSIZE; + } + else + { + mmf->ptr = mmf->end; + } + } + else + { + for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr) + if (*mmf->ptr == '\n') + break; + } + *l = mmf->ptr - p; ++mmf->ptr; + return p; } long mmtell(mmf_t *mmf) { - return mmf->ptr - mmf->base; + return mmf->ptr - mmf->base; } void mmrewind(mmf_t *mmf) { - mmf->ptr = mmf->base; + mmf->ptr = mmf->base; + + return; } diff -ru grep-0.16/queue.c grep-patched/queue.c --- grep-0.16/queue.c Fri Oct 29 11:02:36 1999 +++ grep-patched/queue.c Tue Jun 24 18:34:49 2003 @@ -81,7 +81,7 @@ q_tail = item; } - if (++count > Bflag) + if (++count > cfs.Bflag) free_item(dequeue()); } diff -ru grep-0.16/util.c grep-patched/util.c --- grep-0.16/util.c Fri Oct 29 11:02:36 1999 +++ grep-patched/util.c Wed Jun 25 17:19:04 2003 @@ -47,9 +47,15 @@ */ static int linesqueued; -static int procline(str_t *l); +static int procline(str_t *l, int nottext); +static int grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, + regmatch_t *pmatch); +static int grep_cmp(const unsigned char *pattern, + const unsigned char *data, size_t len); +static void grep_revstr(unsigned char *str, int len); -int + +int grep_tree(char **argv) { FTS *fts; @@ -58,14 +64,14 @@ c = fts_flags = 0; - if (Hflag) + if (cfs.Hflag) fts_flags = FTS_COMFOLLOW; - if (Pflag) + if (cfs.Pflag) fts_flags = FTS_PHYSICAL; - if (Sflag) + if (cfs.Sflag) fts_flags = FTS_LOGICAL; - fts_flags |= FTS_NOSTAT | FTS_NOCHDIR; + fts_flags |= FTS_NOCHDIR; if (!(fts = fts_open(argv, fts_flags, (int (*) ()) NULL))) err(1, NULL); @@ -73,13 +79,18 @@ switch (p->fts_info) { case FTS_DNR: break; + case FTS_NS: case FTS_ERR: + case FTS_SLNONE: errx(1, "%s: %s", p->fts_path, strerror(p->fts_errno)); break; case FTS_DP: break; default: - c += procfile(p->fts_path); + if (!S_ISDIR(p->fts_statp->st_mode)) + { + c += procfile(p->fts_path, p->fts_statp); + } break; } } @@ -88,24 +99,35 @@ } int -procfile(char *fn) +procfile(char *fn, + struct stat *statp) { str_t ln; file_t *f; - int c, t, z; + int c, t, z, nottext; + struct stat st = {0}; + + if (statp) + { + memcpy(&st, statp, sizeof (st)); + } if (fn == NULL) { fn = "(standard input)"; f = grep_fdopen(STDIN_FILENO, "r"); } else { - f = grep_open(fn, "r"); + f = grep_open(fn, &st, "r"); } if (f == NULL) { - if (!sflag) + if ((!cfs.sflag) && !S_ISDIR(st.st_mode)) + { warn("%s", fn); + } return 0; } - if (aflag && grep_bin_file(f)) { + + nottext = grep_bin_file(f); + if (nottext && cfs.Iflag == BIN_FILE_SKIP) { grep_close(f); return 0; } @@ -115,9 +137,9 @@ linesqueued = 0; ln.off = -1; - if (Bflag > 0) + if (cfs.Bflag > 0) initqueue(); - for (c = 0; !(lflag && c);) { + for (c = 0; !(cfs.qflag && c);) { ln.off += ln.len + 1; if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) break; @@ -126,30 +148,34 @@ ln.line_no++; z = tail; - - if ((t = procline(&ln)) == 0 && Bflag > 0 && z == 0) { + + if ((t = procline(&ln, nottext)) == 0 && cfs.Bflag > 0 && + z == 0) { enqueue(&ln); linesqueued++; } c += t; } - if (Bflag > 0) + if (cfs.Bflag > 0) clearqueue(); grep_close(f); - if (cflag) { - if (!hflag) + if (cfs.cflag) { + if (!cfs.hflag) printf("%s:", ln.file); printf("%u\n", c); } - if (lflag && c != 0) + if (cfs.lflag && c != 0) printf("%s\n", fn); - if (Lflag && c == 0) + if (cfs.Lflag && c == 0) printf("%s\n", fn); + if (c && !cfs.cflag && !cfs.lflag && !cfs.Lflag && + (cfs.Iflag == BIN_FILE_BIN) && !cfs.qflag && nottext) + printf("Binary file %s matches\n", fn); + return c; } - /* * Process an individual line in a file. Return non-zero if it matches. */ @@ -157,30 +183,40 @@ #define isword(x) (isalnum(x) || (x) == '_') static int -procline(str_t *l) +procline(str_t *l, int nottext) { regmatch_t pmatch; int c, i, r, t; if (matchall) { - c = !vflag; + c = !cfs.vflag; goto print; } - - t = vflag ? REG_NOMATCH : 0; - pmatch.rm_so = 0; - pmatch.rm_eo = l->len; - for (c = i = 0; i < patterns; i++) { - r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags); + + t = cfs.vflag ? REG_NOMATCH : 0; + for (c = i = 0; i < totalPatterns; i++) { + pmatch.rm_so = 0; + pmatch.rm_eo = l->len; + if (fg_pattern[i].pattern) + { + r = grep_search(&fg_pattern[i], (unsigned char *)l->dat, + l->len, &pmatch); + } + else + { + r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); + } if (r == REG_NOMATCH && t == 0) continue; if (r == 0) { - if (wflag) { - if ((pmatch.rm_so != 0 && isword(l->dat[pmatch.rm_so - 1])) - || (pmatch.rm_eo != l->len && isword(l->dat[pmatch.rm_eo]))) + if (cfs.wflag) { + if ((pmatch.rm_so != 0 && + isword(l->dat[pmatch.rm_so - 1])) || + (pmatch.rm_eo != l->len && + isword(l->dat[pmatch.rm_eo]))) r = REG_NOMATCH; } - if (xflag) { + if (cfs.xflag) { if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) r = REG_NOMATCH; } @@ -190,15 +226,20 @@ break; } } - + print: - if ((tail > 0 || c) && !cflag && !qflag) { + if (c && cfs.Iflag == BIN_FILE_BIN && nottext) + return c; /* Binary file */ + + if ((tail > 0 || c) && !cfs.cflag && !cfs.qflag) { if (c) { - if (first > 0 && tail == 0 && (Bflag < linesqueued) && (Aflag || Bflag)) + if (first > 0 && tail == 0 && + (cfs.Bflag < linesqueued) && + (cfs.Aflag || cfs.Bflag)) printf("--\n"); first = 1; - tail = Aflag; - if (Bflag > 0) + tail = cfs.Aflag; + if (cfs.Bflag > 0) printqueue(); linesqueued = 0; printline(l, ':'); @@ -210,13 +251,300 @@ return c; } +/* + * Returns: -1 on failure + * 0 on success + */ +int fastcomp(fastgrep_t *fg, + const char *pattern) +{ + int i; + int bol = 0; + int eol = 0; + int origPatternLen; + int shiftPatternLen; + int hasDot = 0; + int firstHalfDot = -1; + int firstLastHalfDot = -1; + int lastHalfDot = 0; + + /* Initialize. */ + origPatternLen = fg->patternLen = strlen(pattern); + fg->bol = 0; + fg->eol = 0; + fg->reversedSearch = 0; + + /* Remove end-of-line character ('$'). */ + if (pattern[fg->patternLen - 1] == '$') + { + eol++; + fg->eol = 1; + fg->patternLen--; + boleol = 1; + } + + /* Remove beginning-of-line character ('^'). */ + if (pattern[0] == '^') + { + bol++; + fg->bol = 1; + fg->patternLen--; + boleol = 1; + } + + /* + * Copy pattern minus '^' and '$' characters at the beginning and ending + * of the string respectively. + */ + fg->pattern = grep_strdup(pattern + bol, origPatternLen - (bol + eol)); + + /* Look for ways to cheat...er...avoid the full regex engine. */ + for (i = 0; i < fg->patternLen; i++) + { + /* Can still cheat? */ + if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || + (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || + (fg->pattern[i] == '^') || (fg->pattern[i] == '$') || + (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || + (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) + { + /* As long as it is good, upper case it for later. */ + if (cfs.iflag) + { + fg->pattern[i] = toupper(fg->pattern[i]); + } + } + else if (fg->pattern[i] == '.') + { + hasDot = i; + if (i < fg->patternLen / 2) + { + if (firstHalfDot < 0) + { + /* + * Closest dot to the beginning of the + * pattern. + */ + firstHalfDot = i; + } + } + else + { + /* Closest dot to the end of the pattern. */ + lastHalfDot = i; + if (firstLastHalfDot < 0) + { + firstLastHalfDot = i; + } + } + } + else + { + /* Free memory and let others know this is empty. */ + free(fg->pattern); + fg->pattern = NULL; + + return (-1); + } + } + + /* + * Determine if a reverse search would be faster based on the placement + * of the dots. + */ + if ((!(cfs.lflag || cfs.cflag)) && + ((!(bol || eol)) && + ((lastHalfDot) && + ((firstHalfDot < 0) || + ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) + { + fg->reversedSearch = 1; + hasDot = fg->patternLen - (firstHalfDot < 0 ? + firstLastHalfDot : firstHalfDot) - 1; + grep_revstr(fg->pattern, fg->patternLen); + } + + /* + * Normal Quick Search would require a shift based on the position the + * next character after the comparison is within the pattern. With + * wildcards, the position of the last dot effects the maximum shift + * distance. + * + * The closer to the end the wild card is the slower the search. A + * reverse version of this algorithm would be useful for wildcards near + * the end of the string. + * + * Examples: + * Pattern Max shift + * ------- --------- + * this 5 + * .his 4 + * t.is 3 + * th.s 2 + * thi. 1 + */ + + /* Adjust the shift based on location of the last dot ('.'). */ + shiftPatternLen = fg->patternLen - hasDot; + + /* Preprocess pattern. */ + for (i = 0; i <= UCHAR_MAX; i++) + { + fg->qsBc[i] = shiftPatternLen; + } + for (i = hasDot + 1; i < fg->patternLen; i++) + { + fg->qsBc[fg->pattern[i]] = fg->patternLen - i; + + /* + * If case is ignored, make the jump apply to both upper and + * lower cased characters. As the pattern is stored in upper + * case, apply the same to the lower case equivalents. + */ + if (cfs.iflag) + { + fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; + } + } + + /* + * Put pattern back to normal after pre-processing to allow for easy + * comparisons later. + */ + if (fg->reversedSearch) + { + grep_revstr(fg->pattern, fg->patternLen); + } + + return (0); +} + +/* + * Performs a search on a block of data. + */ +static int grep_search(fastgrep_t *fg, + unsigned char *data, + int dataLen, + regmatch_t *pmatch) +{ + int j; + int rtrnVal = REG_NOMATCH; + + /* No point in going farther if we do not have enough data. */ + if (dataLen < fg->patternLen) + { + return (rtrnVal); + } + + /* Only try once at the beginning or ending of the line. */ + if (fg->bol || fg->eol) + { + /* Simple text comparison. */ + + /* Verify data is >= pattern length before searching on it. */ + if (dataLen >= fg->patternLen) + { + /* Determine where in data to start search. */ + if (fg->eol) + { + j = dataLen - fg->patternLen; + } + else + { + j = 0; + } + + if (!((fg->bol && fg->eol) && + (dataLen != fg->patternLen))) + { + if (grep_cmp(fg->pattern, data + j, + fg->patternLen) == -1) + { + pmatch->rm_so = j; + pmatch->rm_eo = j + fg->patternLen; + rtrnVal = 0; + } + } + } + } + else if (fg->reversedSearch) + { + /* Quick Search algorithm. */ + j = dataLen; + do + { + if (grep_cmp(fg->pattern, data + j - fg->patternLen, + fg->patternLen) == -1) + { + pmatch->rm_so = j - fg->patternLen; + pmatch->rm_eo = j; + rtrnVal = 0; + + break; + } + + /* Shift if within bounds, otherwise, we are done. */ + if (j == 0) + { + break; + } + else + { + j -= fg->qsBc[data[j - fg->patternLen - 1]]; + } + } while (j >= 0); + } + else + { + /* Quick Search algorithm. */ + j = 0; + do + { + if (grep_cmp(fg->pattern, data + j, + fg->patternLen) == -1) + { + pmatch->rm_so = j; + pmatch->rm_eo = j + fg->patternLen; + rtrnVal = 0; + + break; + } + + /* Shift if within bounds, otherwise, we are done. */ + if (j + fg->patternLen == dataLen) + { + break; + } + else + { + j += fg->qsBc[data[j + fg->patternLen]]; + } + } while (j <= (dataLen - fg->patternLen)); + } + + return (rtrnVal); +} + void * grep_malloc(size_t size) { void *ptr; if ((ptr = malloc(size)) == NULL) - err(1, "malloc"); + errx(1, "malloc"); + return ptr; +} + +unsigned char * +grep_strdup(const char *str, + int len) +{ + unsigned char *ptr; + + /* Copy string and null character. */ + ptr = (unsigned char *) grep_malloc(len + 1); + memcpy(ptr, str, len + 1); + return ptr; } @@ -224,27 +552,75 @@ grep_realloc(void *ptr, size_t size) { if ((ptr = realloc(ptr, size)) == NULL) - err(1, "realloc"); + errx(1, "realloc"); return ptr; } +/* + * Character array comparison with case sensitive and insensitive capabilities + * depending on how iflag is set. + * + * Returns: i >= 0 on failure (position that it failed) + * -1 on success + */ +int +grep_cmp(const unsigned char *pattern, + const unsigned char *data, + size_t len) +{ + int i; + + for (i = 0; i < len; i++) + { + if (((pattern[i] == data[i]) || (pattern[i] == '.')) || + (cfs.iflag && pattern[i] == toupper(data[i]))) + { + continue; + } + + return (i); + } + + return (-1); +} + +/* + * Reverses a string within a buffer. + */ +static void +grep_revstr(unsigned char *str, + int len) +{ + int i; + char c; + + for (i = 0; i < len / 2; i++) + { + c = str[i]; + str[i] = str[len - i - 1]; + str[len - i - 1] = c; + } + + return; +} + void printline(str_t *line, int sep) { int n; - + n = 0; - if (!hflag) { + if (!cfs.hflag) { fputs(line->file, stdout); ++n; } - if (nflag) { + if (cfs.nflag) { if (n) putchar(sep); printf("%d", line->line_no); ++n; } - if (bflag) { + if (cfs.bflag) { if (n) putchar(sep); printf("%lu", (unsigned long)line->off);