grep: optimize built-in grep by skipping lines that do not hit

The internal "grep" engine we use checks for hits line-by-line, instead of
letting the underlying regexec()/fixmatch() routines scan for the first
match from the rest of the buffer.  This was a major source of overhead
compared to the external grep.

Introduce a "look-ahead" mechanism to find the next line that would
potentially match by using regexec()/fixmatch() in the remainder of the
text to skip unmatching lines, and use it when the query criteria is
simple enough (i.e. punt for an advanced grep boolean expression like
"lines that have both X and Y but not Z" for now) and we are not running
under "-v" (aka "--invert-match") option.

Note that "-L" (aka "--files-without-match") is not a reason to disable
this optimization.  Under the option, we are interested if the file has
any hit at all, and that is what we determine reliably with or without the
optimization.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Junio C Hamano 2010-01-10 22:39:36 -08:00
parent cb572206d9
commit a26345b608

75
grep.c
View File

@ -608,6 +608,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
}
}
static int should_lookahead(struct grep_opt *opt)
{
struct grep_pat *p;
if (opt->extended)
return 0; /* punt for too complex stuff */
if (opt->invert)
return 0;
for (p = opt->pattern_list; p; p = p->next) {
if (p->token != GREP_PATTERN)
return 0; /* punt for "header only" and stuff */
}
return 1;
}
static int look_ahead(struct grep_opt *opt,
unsigned long *left_p,
unsigned *lno_p,
char **bol_p)
{
unsigned lno = *lno_p;
char *bol = *bol_p;
struct grep_pat *p;
char *sp, *last_bol;
regoff_t earliest = -1;
for (p = opt->pattern_list; p; p = p->next) {
int hit;
regmatch_t m;
if (p->fixed)
hit = !fixmatch(p->pattern, bol, &m);
else
hit = !regexec(&p->regexp, bol, 1, &m, 0);
if (!hit || m.rm_so < 0 || m.rm_eo < 0)
continue;
if (earliest < 0 || m.rm_so < earliest)
earliest = m.rm_so;
}
if (earliest < 0) {
*bol_p = bol + *left_p;
*left_p = 0;
return 1;
}
for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
; /* find the beginning of the line */
last_bol = sp;
for (sp = bol; sp < last_bol; sp++) {
if (*sp == '\n')
lno++;
}
*left_p -= last_bol - bol;
*bol_p = last_bol;
*lno_p = lno;
return 0;
}
static int grep_buffer_1(struct grep_opt *opt, const char *name,
char *buf, unsigned long size, int collect_hits)
{
@ -617,6 +676,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
unsigned last_hit = 0;
int binary_match_only = 0;
unsigned count = 0;
int try_lookahead = 0;
enum grep_context ctx = GREP_CONTEXT_HEAD;
xdemitconf_t xecfg;
@ -645,11 +705,26 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
opt->priv = &xecfg;
}
}
try_lookahead = should_lookahead(opt);
while (left) {
char *eol, ch;
int hit;
/*
* look_ahead() skips quicly to the line that possibly
* has the next hit; don't call it if we need to do
* something more than just skipping the current line
* in response to an unmatch for the current line. E.g.
* inside a post-context window, we will show the current
* line as a context around the previous hit when it
* doesn't hit.
*/
if (try_lookahead
&& !(last_hit
&& lno <= last_hit + opt->post_context)
&& look_ahead(opt, &left, &lno, &bol))
break;
eol = end_of_line(bol, &left);
ch = *eol;
*eol = 0;