2006-09-18 01:02:52 +02:00
|
|
|
#include "cache.h"
|
2017-06-14 20:07:36 +02:00
|
|
|
#include "config.h"
|
2006-09-18 01:02:52 +02:00
|
|
|
#include "grep.h"
|
2018-05-16 01:42:15 +02:00
|
|
|
#include "object-store.h"
|
2009-07-02 00:07:24 +02:00
|
|
|
#include "userdiff.h"
|
2007-06-05 04:36:11 +02:00
|
|
|
#include "xdiff-interface.h"
|
2013-05-10 17:10:15 +02:00
|
|
|
#include "diff.h"
|
|
|
|
#include "diffcore.h"
|
2016-06-25 07:22:30 +02:00
|
|
|
#include "commit.h"
|
2016-06-25 07:22:31 +02:00
|
|
|
#include "quote.h"
|
2018-05-26 15:55:24 +02:00
|
|
|
#include "help.h"
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2012-09-15 23:04:36 +02:00
|
|
|
static int grep_source_load(struct grep_source *gs);
|
2018-09-21 17:57:33 +02:00
|
|
|
static int grep_source_is_binary(struct grep_source *gs,
|
|
|
|
struct index_state *istate);
|
2012-09-15 23:04:36 +02:00
|
|
|
|
2020-11-21 19:31:08 +01:00
|
|
|
static void std_output(struct grep_opt *opt, const void *buf, size_t size)
|
|
|
|
{
|
|
|
|
fwrite(buf, size, 1, stdout);
|
|
|
|
}
|
|
|
|
|
2018-05-26 15:55:22 +02:00
|
|
|
static const char *color_grep_slots[] = {
|
|
|
|
[GREP_COLOR_CONTEXT] = "context",
|
|
|
|
[GREP_COLOR_FILENAME] = "filename",
|
|
|
|
[GREP_COLOR_FUNCTION] = "function",
|
|
|
|
[GREP_COLOR_LINENO] = "lineNumber",
|
2018-07-18 21:20:31 +02:00
|
|
|
[GREP_COLOR_COLUMNNO] = "column",
|
2018-05-26 15:55:22 +02:00
|
|
|
[GREP_COLOR_MATCH_CONTEXT] = "matchContext",
|
|
|
|
[GREP_COLOR_MATCH_SELECTED] = "matchSelected",
|
|
|
|
[GREP_COLOR_SELECTED] = "selected",
|
|
|
|
[GREP_COLOR_SEP] = "separator",
|
|
|
|
};
|
|
|
|
|
2012-10-10 01:17:50 +02:00
|
|
|
static int parse_pattern_type_arg(const char *opt, const char *arg)
|
|
|
|
{
|
|
|
|
if (!strcmp(arg, "default"))
|
|
|
|
return GREP_PATTERN_TYPE_UNSPECIFIED;
|
|
|
|
else if (!strcmp(arg, "basic"))
|
|
|
|
return GREP_PATTERN_TYPE_BRE;
|
|
|
|
else if (!strcmp(arg, "extended"))
|
|
|
|
return GREP_PATTERN_TYPE_ERE;
|
|
|
|
else if (!strcmp(arg, "fixed"))
|
|
|
|
return GREP_PATTERN_TYPE_FIXED;
|
|
|
|
else if (!strcmp(arg, "perl"))
|
|
|
|
return GREP_PATTERN_TYPE_PCRE;
|
|
|
|
die("bad %s argument: %s", opt, arg);
|
|
|
|
}
|
|
|
|
|
2018-05-26 15:55:24 +02:00
|
|
|
define_list_config_array_extra(color_grep_slots, {"match"});
|
|
|
|
|
2012-10-10 01:17:50 +02:00
|
|
|
/*
|
|
|
|
* Read the configuration file once and store it in
|
|
|
|
* the grep_defaults template.
|
|
|
|
*/
|
|
|
|
int grep_config(const char *var, const char *value, void *cb)
|
|
|
|
{
|
2022-02-16 01:00:36 +01:00
|
|
|
struct grep_opt *opt = cb;
|
2018-05-26 15:55:22 +02:00
|
|
|
const char *slot;
|
2012-10-10 01:17:50 +02:00
|
|
|
|
|
|
|
if (userdiff_config(var, value) < 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (!strcmp(var, "grep.extendedregexp")) {
|
2017-06-30 00:22:18 +02:00
|
|
|
opt->extended_regexp_option = git_config_bool(var, value);
|
2012-10-10 01:17:50 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(var, "grep.patterntype")) {
|
|
|
|
opt->pattern_type_option = parse_pattern_type_arg(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(var, "grep.linenumber")) {
|
|
|
|
opt->linenum = git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
2018-06-22 17:49:49 +02:00
|
|
|
if (!strcmp(var, "grep.column")) {
|
|
|
|
opt->columnnum = git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
2012-10-10 01:17:50 +02:00
|
|
|
|
2014-03-17 20:16:05 +01:00
|
|
|
if (!strcmp(var, "grep.fullname")) {
|
|
|
|
opt->relative = !git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-10-10 01:17:50 +02:00
|
|
|
if (!strcmp(var, "color.grep"))
|
|
|
|
opt->color = git_config_colorbool(var, value);
|
2018-05-26 15:55:22 +02:00
|
|
|
if (!strcmp(var, "color.grep.match")) {
|
|
|
|
if (grep_config("color.grep.matchcontext", value, cb) < 0)
|
|
|
|
return -1;
|
|
|
|
if (grep_config("color.grep.matchselected", value, cb) < 0)
|
|
|
|
return -1;
|
|
|
|
} else if (skip_prefix(var, "color.grep.", &slot)) {
|
|
|
|
int i = LOOKUP_CONFIG(color_grep_slots, slot);
|
|
|
|
char *color;
|
|
|
|
|
|
|
|
if (i < 0)
|
|
|
|
return -1;
|
|
|
|
color = opt->colors[i];
|
2012-10-10 01:17:50 +02:00
|
|
|
if (!value)
|
|
|
|
return config_error_nonbool(var);
|
2014-10-07 21:33:09 +02:00
|
|
|
return color_parse(value, color);
|
2012-10-10 01:17:50 +02:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a subsequent change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 01:00:34 +01:00
|
|
|
void grep_init(struct grep_opt *opt, struct repository *repo)
|
2012-10-10 01:17:50 +02:00
|
|
|
{
|
2022-02-16 01:00:36 +01:00
|
|
|
struct grep_opt blank = GREP_OPT_INIT;
|
|
|
|
memcpy(opt, &blank, sizeof(*opt));
|
2020-11-29 20:52:21 +01:00
|
|
|
|
2018-09-21 17:57:23 +02:00
|
|
|
opt->repo = repo;
|
2012-10-10 01:17:50 +02:00
|
|
|
opt->pattern_tail = &opt->pattern_list;
|
|
|
|
opt->header_tail = &opt->header_list;
|
|
|
|
}
|
2012-09-15 23:04:36 +02:00
|
|
|
|
2012-05-20 16:32:39 +02:00
|
|
|
static struct grep_pat *create_grep_pat(const char *pat, size_t patlen,
|
|
|
|
const char *origin, int no,
|
|
|
|
enum grep_pat_token t,
|
|
|
|
enum grep_header_field field)
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p = xcalloc(1, sizeof(*p));
|
2012-05-20 16:33:07 +02:00
|
|
|
p->pattern = xmemdupz(pat, patlen);
|
2012-05-20 16:32:39 +02:00
|
|
|
p->patternlen = patlen;
|
|
|
|
p->origin = origin;
|
|
|
|
p->no = no;
|
|
|
|
p->token = t;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
p->field = field;
|
2012-05-20 16:32:39 +02:00
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2012-05-20 16:32:54 +02:00
|
|
|
static void do_append_grep_pat(struct grep_pat ***tail, struct grep_pat *p)
|
|
|
|
{
|
|
|
|
**tail = p;
|
|
|
|
*tail = &p->next;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
p->next = NULL;
|
2012-05-20 16:33:07 +02:00
|
|
|
|
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
|
|
|
for (;;) {
|
|
|
|
struct grep_pat *new_pat;
|
|
|
|
size_t len = 0;
|
|
|
|
char *cp = p->pattern + p->patternlen, *nl = NULL;
|
|
|
|
while (++len <= p->patternlen) {
|
|
|
|
if (*(--cp) == '\n') {
|
|
|
|
nl = cp;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!nl)
|
|
|
|
break;
|
|
|
|
new_pat = create_grep_pat(nl + 1, len - 1, p->origin,
|
|
|
|
p->no, p->token, p->field);
|
|
|
|
new_pat->next = p->next;
|
|
|
|
if (!p->next)
|
|
|
|
*tail = &new_pat->next;
|
|
|
|
p->next = new_pat;
|
|
|
|
*nl = '\0';
|
|
|
|
p->patternlen -= len;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
2012-05-20 16:32:54 +02:00
|
|
|
}
|
|
|
|
|
2012-05-20 16:32:39 +02:00
|
|
|
void append_header_grep_pattern(struct grep_opt *opt,
|
|
|
|
enum grep_header_field field, const char *pat)
|
|
|
|
{
|
|
|
|
struct grep_pat *p = create_grep_pat(pat, strlen(pat), "header", 0,
|
|
|
|
GREP_PATTERN_HEAD, field);
|
2012-09-29 20:59:52 +02:00
|
|
|
if (field == GREP_HEADER_REFLOG)
|
|
|
|
opt->use_reflog_filter = 1;
|
2012-05-20 16:32:54 +02:00
|
|
|
do_append_grep_pat(&opt->header_tail, p);
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
}
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
void append_grep_pattern(struct grep_opt *opt, const char *pat,
|
|
|
|
const char *origin, int no, enum grep_pat_token t)
|
2010-05-22 23:43:43 +02:00
|
|
|
{
|
|
|
|
append_grep_pat(opt, pat, strlen(pat), origin, no, t);
|
|
|
|
}
|
|
|
|
|
|
|
|
void append_grep_pat(struct grep_opt *opt, const char *pat, size_t patlen,
|
|
|
|
const char *origin, int no, enum grep_pat_token t)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
2012-05-20 16:32:39 +02:00
|
|
|
struct grep_pat *p = create_grep_pat(pat, patlen, origin, no, t, 0);
|
2012-05-20 16:32:54 +02:00
|
|
|
do_append_grep_pat(&opt->pattern_tail, p);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2010-01-25 23:51:39 +01:00
|
|
|
struct grep_opt *grep_opt_dup(const struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct grep_pat *pat;
|
|
|
|
struct grep_opt *ret = xmalloc(sizeof(struct grep_opt));
|
|
|
|
*ret = *opt;
|
|
|
|
|
|
|
|
ret->pattern_list = NULL;
|
|
|
|
ret->pattern_tail = &ret->pattern_list;
|
|
|
|
|
|
|
|
for(pat = opt->pattern_list; pat != NULL; pat = pat->next)
|
|
|
|
{
|
|
|
|
if(pat->token == GREP_PATTERN_HEAD)
|
|
|
|
append_header_grep_pattern(ret, pat->field,
|
|
|
|
pat->pattern);
|
|
|
|
else
|
2010-05-22 23:43:43 +02:00
|
|
|
append_grep_pat(ret, pat->pattern, pat->patternlen,
|
|
|
|
pat->origin, pat->no, pat->token);
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-05-09 23:52:04 +02:00
|
|
|
static NORETURN void compile_regexp_failed(const struct grep_pat *p,
|
|
|
|
const char *error)
|
|
|
|
{
|
|
|
|
char where[1024];
|
|
|
|
|
|
|
|
if (p->no)
|
2015-09-24 23:06:51 +02:00
|
|
|
xsnprintf(where, sizeof(where), "In '%s' at %d, ", p->origin, p->no);
|
2011-05-09 23:52:04 +02:00
|
|
|
else if (p->origin)
|
2015-09-24 23:06:51 +02:00
|
|
|
xsnprintf(where, sizeof(where), "%s, ", p->origin);
|
2011-05-09 23:52:04 +02:00
|
|
|
else
|
|
|
|
where[0] = 0;
|
|
|
|
|
|
|
|
die("%s'%s': %s", where, p->pattern, error);
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:30 +02:00
|
|
|
static int is_fixed(const char *s, size_t len)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
if (is_regex_special(s[i]))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
#ifdef USE_LIBPCRE2
|
2021-02-18 01:07:28 +01:00
|
|
|
#define GREP_PCRE2_DEBUG_MALLOC 0
|
|
|
|
|
|
|
|
static void *pcre2_malloc(PCRE2_SIZE size, MAYBE_UNUSED void *memory_data)
|
|
|
|
{
|
|
|
|
void *pointer = malloc(size);
|
|
|
|
#if GREP_PCRE2_DEBUG_MALLOC
|
|
|
|
static int count = 1;
|
|
|
|
fprintf(stderr, "PCRE2:%p -> #%02d: alloc(%lu)\n", pointer, count++, size);
|
|
|
|
#endif
|
|
|
|
return pointer;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pcre2_free(void *pointer, MAYBE_UNUSED void *memory_data)
|
|
|
|
{
|
|
|
|
#if GREP_PCRE2_DEBUG_MALLOC
|
|
|
|
static int count = 1;
|
|
|
|
if (pointer)
|
|
|
|
fprintf(stderr, "PCRE2:%p -> #%02d: free()\n", pointer, count++);
|
|
|
|
#endif
|
|
|
|
free(pointer);
|
|
|
|
}
|
|
|
|
|
grep: fall back to interpreter if JIT memory allocation fails
Under Linux systems with SELinux's 'deny_execmem' or PaX's MPROTECT
enabled, the allocation of PCRE2's JIT rwx memory may be prohibited,
making pcre2_jit_compile() fail with PCRE2_ERROR_NOMEMORY (-48):
[user@fedora git]$ git grep -c PCRE2_JIT
grep.c:1
[user@fedora git]$ # Enable SELinux's W^X policy
[user@fedora git]$ sudo semanage boolean -m -1 deny_execmem
[user@fedora git]$ # JIT memory allocation fails, breaking 'git grep'
[user@fedora git]$ git grep -c PCRE2_JIT
fatal: Couldn't JIT the PCRE2 pattern 'PCRE2_JIT', got '-48'
Instead of failing hard in this case and making 'git grep' unusable on
such systems, simply fall back to interpreter mode, leading to a much
better user experience.
As having a functional PCRE2 JIT compiler is a legitimate use case for
performance reasons, we'll only do the fallback if the supposedly
available JIT is found to be non-functional by attempting to JIT compile
a very simple pattern. If this fails, JIT is deemed to be non-functional
and we do the interpreter fallback. For all other cases, i.e. the simple
pattern can be compiled but the user provided cannot, we fail hard as we
do now as the reason for the failure must be the pattern itself. To aid
users in helping themselves change the error message to include a hint
about the '(*NO_JIT)' prefix. Also clip the pattern at 64 characters to
ensure the hint will be seen by the user and not internally truncated by
the die() function.
Cc: Carlo Marcelo Arenas Belón <carenas@gmail.com>
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-01-31 19:56:11 +01:00
|
|
|
static int pcre2_jit_functional(void)
|
|
|
|
{
|
|
|
|
static int jit_working = -1;
|
|
|
|
pcre2_code *code;
|
|
|
|
size_t off;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (jit_working != -1)
|
|
|
|
return jit_working;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to JIT compile a simple pattern to probe if the JIT is
|
|
|
|
* working in general. It might fail for systems where creating
|
|
|
|
* memory mappings for runtime code generation is restricted.
|
|
|
|
*/
|
|
|
|
code = pcre2_compile((PCRE2_SPTR)".", 1, 0, &err, &off, NULL);
|
|
|
|
if (!code)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
jit_working = pcre2_jit_compile(code, PCRE2_JIT_COMPLETE) == 0;
|
|
|
|
pcre2_code_free(code);
|
|
|
|
|
|
|
|
return jit_working;
|
|
|
|
}
|
|
|
|
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
PCRE2_UCHAR errbuf[256];
|
|
|
|
PCRE2_SIZE erroffset;
|
|
|
|
int options = PCRE2_MULTILINE;
|
|
|
|
int jitret;
|
2017-11-23 15:16:58 +01:00
|
|
|
int patinforet;
|
|
|
|
size_t jitsizearg;
|
2021-12-18 20:53:15 +01:00
|
|
|
int literal = !opt->ignore_case && (p->fixed || p->is_fixed);
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
|
2021-02-18 01:07:27 +01:00
|
|
|
/*
|
|
|
|
* Call pcre2_general_context_create() before calling any
|
|
|
|
* other pcre2_*(). It sets up our malloc()/free() functions
|
|
|
|
* with which everything else is allocated.
|
|
|
|
*/
|
|
|
|
p->pcre2_general_context = pcre2_general_context_create(
|
|
|
|
pcre2_malloc, pcre2_free, NULL);
|
|
|
|
if (!p->pcre2_general_context)
|
|
|
|
die("Couldn't allocate PCRE2 general context");
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
|
|
|
|
if (opt->ignore_case) {
|
2019-06-28 01:39:05 +02:00
|
|
|
if (!opt->ignore_locale && has_non_ascii(p->pattern)) {
|
2021-02-18 01:07:27 +01:00
|
|
|
p->pcre2_tables = pcre2_maketables(p->pcre2_general_context);
|
|
|
|
p->pcre2_compile_context = pcre2_compile_context_create(p->pcre2_general_context);
|
2019-10-16 14:10:24 +02:00
|
|
|
pcre2_set_character_tables(p->pcre2_compile_context,
|
|
|
|
p->pcre2_tables);
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
}
|
|
|
|
options |= PCRE2_CASELESS;
|
|
|
|
}
|
2021-12-18 20:53:15 +01:00
|
|
|
if (!opt->ignore_locale && is_utf8_locale() && !literal)
|
2023-01-08 16:52:17 +01:00
|
|
|
options |= (PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_INVALID_UTF);
|
grep/pcre2: better support invalid UTF-8 haystacks
Improve the support for invalid UTF-8 haystacks given a non-ASCII
needle when using the PCREv2 backend.
This is a more complete fix for a bug I started to fix in
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we
can make use of it.
This fixes the sort of case described in 8a5999838e (grep: stess test
PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.:
- The subject string is non-ASCII (e.g. "ævar")
- We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
- We are using --ignore-case, or we're a non-fixed pattern
If those conditions were satisfied and we matched found non-valid
UTF-8 data PCREv2 might bark on it, in practice this only happened
under the JIT backend (turned on by default on most platforms).
Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2
for optimized fixed-string search", 2019-07-01), I'm putting that in
scare-quotes because before then we wouldn't properly support these
complex case-folding, locale etc. cases either, it just broke in
different ways.
There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed
in PCREv2 10.36. It can be worked around by setting the
PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add
tests for the bug.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-24 18:28:13 +01:00
|
|
|
|
2022-02-17 22:14:29 +01:00
|
|
|
#ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER
|
grep/pcre2: better support invalid UTF-8 haystacks
Improve the support for invalid UTF-8 haystacks given a non-ASCII
needle when using the PCREv2 backend.
This is a more complete fix for a bug I started to fix in
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we
can make use of it.
This fixes the sort of case described in 8a5999838e (grep: stess test
PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.:
- The subject string is non-ASCII (e.g. "ævar")
- We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
- We are using --ignore-case, or we're a non-fixed pattern
If those conditions were satisfied and we matched found non-valid
UTF-8 data PCREv2 might bark on it, in practice this only happened
under the JIT backend (turned on by default on most platforms).
Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2
for optimized fixed-string search", 2019-07-01), I'm putting that in
scare-quotes because before then we wouldn't properly support these
complex case-folding, locale etc. cases either, it just broke in
different ways.
There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed
in PCREv2 10.36. It can be worked around by setting the
PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add
tests for the bug.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-24 18:28:13 +01:00
|
|
|
/* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */
|
2021-02-18 01:07:24 +01:00
|
|
|
if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS))
|
|
|
|
options |= PCRE2_NO_START_OPTIMIZE;
|
|
|
|
#endif
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
|
|
|
|
p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
|
|
|
|
p->patternlen, options, &error, &erroffset,
|
|
|
|
p->pcre2_compile_context);
|
|
|
|
|
|
|
|
if (p->pcre2_pattern) {
|
2021-02-18 01:07:27 +01:00
|
|
|
p->pcre2_match_data = pcre2_match_data_create_from_pattern(p->pcre2_pattern, p->pcre2_general_context);
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
if (!p->pcre2_match_data)
|
|
|
|
die("Couldn't allocate PCRE2 match data");
|
|
|
|
} else {
|
|
|
|
pcre2_get_error_message(error, errbuf, sizeof(errbuf));
|
|
|
|
compile_regexp_failed(p, (const char *)&errbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on);
|
2019-07-26 17:08:11 +02:00
|
|
|
if (p->pcre2_jit_on) {
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE);
|
grep: fall back to interpreter if JIT memory allocation fails
Under Linux systems with SELinux's 'deny_execmem' or PaX's MPROTECT
enabled, the allocation of PCRE2's JIT rwx memory may be prohibited,
making pcre2_jit_compile() fail with PCRE2_ERROR_NOMEMORY (-48):
[user@fedora git]$ git grep -c PCRE2_JIT
grep.c:1
[user@fedora git]$ # Enable SELinux's W^X policy
[user@fedora git]$ sudo semanage boolean -m -1 deny_execmem
[user@fedora git]$ # JIT memory allocation fails, breaking 'git grep'
[user@fedora git]$ git grep -c PCRE2_JIT
fatal: Couldn't JIT the PCRE2 pattern 'PCRE2_JIT', got '-48'
Instead of failing hard in this case and making 'git grep' unusable on
such systems, simply fall back to interpreter mode, leading to a much
better user experience.
As having a functional PCRE2 JIT compiler is a legitimate use case for
performance reasons, we'll only do the fallback if the supposedly
available JIT is found to be non-functional by attempting to JIT compile
a very simple pattern. If this fails, JIT is deemed to be non-functional
and we do the interpreter fallback. For all other cases, i.e. the simple
pattern can be compiled but the user provided cannot, we fail hard as we
do now as the reason for the failure must be the pattern itself. To aid
users in helping themselves change the error message to include a hint
about the '(*NO_JIT)' prefix. Also clip the pattern at 64 characters to
ensure the hint will be seen by the user and not internally truncated by
the die() function.
Cc: Carlo Marcelo Arenas Belón <carenas@gmail.com>
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-01-31 19:56:11 +01:00
|
|
|
if (jitret == PCRE2_ERROR_NOMEMORY && !pcre2_jit_functional()) {
|
|
|
|
/*
|
|
|
|
* Even though pcre2_config(PCRE2_CONFIG_JIT, ...)
|
|
|
|
* indicated JIT support, the library might still
|
|
|
|
* fail to generate JIT code for various reasons,
|
|
|
|
* e.g. when SELinux's 'deny_execmem' or PaX's
|
|
|
|
* MPROTECT prevent creating W|X memory mappings.
|
|
|
|
*
|
|
|
|
* Instead of faling hard, fall back to interpreter
|
|
|
|
* mode, just as if the pattern was prefixed with
|
|
|
|
* '(*NO_JIT)'.
|
|
|
|
*/
|
|
|
|
p->pcre2_jit_on = 0;
|
|
|
|
return;
|
|
|
|
} else if (jitret) {
|
|
|
|
int need_clip = p->patternlen > 64;
|
|
|
|
int clip_len = need_clip ? 64 : p->patternlen;
|
|
|
|
die("Couldn't JIT the PCRE2 pattern '%.*s'%s, got '%d'%s",
|
|
|
|
clip_len, p->pattern, need_clip ? "..." : "", jitret,
|
|
|
|
pcre2_jit_functional()
|
|
|
|
? "\nPerhaps prefix (*NO_JIT) to your pattern?"
|
|
|
|
: "");
|
|
|
|
}
|
2017-11-23 15:16:58 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The pcre2_config(PCRE2_CONFIG_JIT, ...) call just
|
|
|
|
* tells us whether the library itself supports JIT,
|
|
|
|
* but to see whether we're going to be actually using
|
|
|
|
* JIT we need to extract PCRE2_INFO_JITSIZE from the
|
|
|
|
* pattern *after* we do pcre2_jit_compile() above.
|
|
|
|
*
|
|
|
|
* This is because if the pattern contains the
|
|
|
|
* (*NO_JIT) verb (see pcre2syntax(3))
|
|
|
|
* pcre2_jit_compile() will exit early with 0. If we
|
|
|
|
* then proceed to call pcre2_jit_match() further down
|
|
|
|
* the line instead of pcre2_match() we'll either
|
|
|
|
* segfault (pre PCRE 10.31) or run into a fatal error
|
|
|
|
* (post PCRE2 10.31)
|
|
|
|
*/
|
|
|
|
patinforet = pcre2_pattern_info(p->pcre2_pattern, PCRE2_INFO_JITSIZE, &jitsizearg);
|
|
|
|
if (patinforet)
|
|
|
|
BUG("pcre2_pattern_info() failed: %d", patinforet);
|
|
|
|
if (jitsizearg == 0) {
|
|
|
|
p->pcre2_jit_on = 0;
|
|
|
|
return;
|
|
|
|
}
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
|
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
int ret, flags = 0;
|
|
|
|
PCRE2_SIZE *ovector;
|
|
|
|
PCRE2_UCHAR errbuf[256];
|
|
|
|
|
|
|
|
if (eflags & REG_NOTBOL)
|
|
|
|
flags |= PCRE2_NOTBOL;
|
|
|
|
|
|
|
|
if (p->pcre2_jit_on)
|
|
|
|
ret = pcre2_jit_match(p->pcre2_pattern, (unsigned char *)line,
|
|
|
|
eol - line, 0, flags, p->pcre2_match_data,
|
|
|
|
NULL);
|
|
|
|
else
|
|
|
|
ret = pcre2_match(p->pcre2_pattern, (unsigned char *)line,
|
|
|
|
eol - line, 0, flags, p->pcre2_match_data,
|
|
|
|
NULL);
|
|
|
|
|
|
|
|
if (ret < 0 && ret != PCRE2_ERROR_NOMATCH) {
|
|
|
|
pcre2_get_error_message(ret, errbuf, sizeof(errbuf));
|
|
|
|
die("%s failed with error code %d: %s",
|
|
|
|
(p->pcre2_jit_on ? "pcre2_jit_match" : "pcre2_match"), ret,
|
|
|
|
errbuf);
|
|
|
|
}
|
|
|
|
if (ret > 0) {
|
|
|
|
ovector = pcre2_get_ovector_pointer(p->pcre2_match_data);
|
|
|
|
ret = 0;
|
|
|
|
match->rm_so = (int)ovector[0];
|
|
|
|
match->rm_eo = (int)ovector[1];
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_pcre2_pattern(struct grep_pat *p)
|
|
|
|
{
|
|
|
|
pcre2_compile_context_free(p->pcre2_compile_context);
|
|
|
|
pcre2_code_free(p->pcre2_pattern);
|
|
|
|
pcre2_match_data_free(p->pcre2_match_data);
|
2021-02-18 01:07:25 +01:00
|
|
|
#ifdef GIT_PCRE2_VERSION_10_34_OR_HIGHER
|
2021-02-18 01:07:27 +01:00
|
|
|
pcre2_maketables_free(p->pcre2_general_context, p->pcre2_tables);
|
2021-02-18 01:07:25 +01:00
|
|
|
#else
|
2019-10-16 14:10:24 +02:00
|
|
|
free((void *)p->pcre2_tables);
|
2021-02-18 01:07:25 +01:00
|
|
|
#endif
|
2021-02-18 01:07:27 +01:00
|
|
|
pcre2_general_context_free(p->pcre2_general_context);
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
}
|
|
|
|
#else /* !USE_LIBPCRE2 */
|
|
|
|
static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
die("cannot use Perl-compatible regexes when not compiled with USE_LIBPCRE");
|
|
|
|
}
|
|
|
|
|
|
|
|
static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
|
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_pcre2_pattern(struct grep_pat *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2016-06-25 07:22:31 +02:00
|
|
|
static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
int err;
|
2017-06-30 00:22:22 +02:00
|
|
|
int regflags = 0;
|
2016-06-25 07:22:31 +02:00
|
|
|
|
|
|
|
basic_regex_quote_buf(&sb, p->pattern);
|
|
|
|
if (opt->ignore_case)
|
|
|
|
regflags |= REG_ICASE;
|
|
|
|
err = regcomp(&p->regexp, sb.buf, regflags);
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err) {
|
|
|
|
char errbuf[1024];
|
|
|
|
regerror(err, &p->regexp, errbuf, sizeof(errbuf));
|
|
|
|
compile_regexp_failed(p, errbuf);
|
|
|
|
}
|
|
|
|
}
|
grep: use PCRE v2 for optimized fixed-string search
Bring back optimized fixed-string search for "grep", this time with
PCRE v2 as an optional backend. As noted in [1] with kwset we were
slower than PCRE v1 and v2 JIT with the kwset backend, so that
optimization was counterproductive.
This brings back the optimization for "--fixed-strings", without
changing the semantics of having a NUL-byte in patterns. As seen in
previous commits in this series we could support it now, but I'd
rather just leave that edge-case aside so we don't have one behavior
or the other depending what "--fixed-strings" backend we're using. It
makes the behavior harder to understand and document, and makes tests
for the different backends more painful.
This does change the behavior under non-C locales when "log"'s
"--encoding" option is used and the heystack/needle in the
content/command-line doesn't have a matching encoding. See the recent
change in "t4210: skip more command-line encoding tests on MinGW" in
this series. I think that's OK. We did nothing sensible before
then (just compared raw bytes that had no hope of matching). At least
now the user will get some idea why their grep/log never matches in
that edge case.
I could also support the PCRE v1 backend here, but that would make the
code more complex. I'd rather aim for simplicity here and in future
changes to the diffcore. We're not going to have someone who
absolutely must have faster search, but for whom building PCRE v2
isn't acceptable.
The difference between this series of commits and the current "master"
is, using the same t/perf commands shown in the last commit:
plain grep:
Test origin/master HEAD
-------------------------------------------------------------------------
7821.1: fixed grep int 0.55(1.67+0.56) 0.41(0.98+0.60) -25.5%
7821.2: basic grep int 0.58(1.65+0.52) 0.41(0.96+0.57) -29.3%
7821.3: extended grep int 0.57(1.66+0.49) 0.42(0.93+0.60) -26.3%
7821.4: perl grep int 0.54(1.67+0.50) 0.43(0.88+0.65) -20.4%
7821.6: fixed grep uncommon 0.21(0.52+0.42) 0.16(0.24+0.51) -23.8%
7821.7: basic grep uncommon 0.20(0.49+0.45) 0.17(0.28+0.47) -15.0%
7821.8: extended grep uncommon 0.20(0.54+0.39) 0.16(0.25+0.50) -20.0%
7821.9: perl grep uncommon 0.20(0.58+0.36) 0.16(0.23+0.50) -20.0%
7821.11: fixed grep æ 0.35(1.24+0.43) 0.16(0.23+0.50) -54.3%
7821.12: basic grep æ 0.36(1.29+0.38) 0.16(0.20+0.54) -55.6%
7821.13: extended grep æ 0.35(1.23+0.44) 0.16(0.24+0.50) -54.3%
7821.14: perl grep æ 0.35(1.33+0.34) 0.16(0.28+0.46) -54.3%
grep with -i:
Test origin/master HEAD
----------------------------------------------------------------------------
7821.1: fixed grep -i int 0.62(1.81+0.70) 0.47(1.11+0.64) -24.2%
7821.2: basic grep -i int 0.67(1.90+0.53) 0.46(1.07+0.62) -31.3%
7821.3: extended grep -i int 0.62(1.92+0.53) 0.53(1.12+0.58) -14.5%
7821.4: perl grep -i int 0.66(1.85+0.58) 0.45(1.10+0.59) -31.8%
7821.6: fixed grep -i uncommon 0.21(0.54+0.43) 0.17(0.20+0.55) -19.0%
7821.7: basic grep -i uncommon 0.20(0.52+0.45) 0.17(0.29+0.48) -15.0%
7821.8: extended grep -i uncommon 0.21(0.52+0.44) 0.17(0.26+0.50) -19.0%
7821.9: perl grep -i uncommon 0.21(0.53+0.44) 0.17(0.20+0.56) -19.0%
7821.11: fixed grep -i æ 0.26(0.79+0.44) 0.16(0.29+0.46) -38.5%
7821.12: basic grep -i æ 0.26(0.79+0.42) 0.16(0.20+0.54) -38.5%
7821.13: extended grep -i æ 0.26(0.84+0.39) 0.16(0.24+0.50) -38.5%
7821.14: perl grep -i æ 0.16(0.24+0.49) 0.17(0.25+0.51) +6.3%
plain log:
Test origin/master HEAD
--------------------------------------------------------------------------------
4221.1: fixed log --grep='int' 7.24(6.95+0.28) 7.20(6.95+0.18) -0.6%
4221.2: basic log --grep='int' 7.31(6.97+0.22) 7.20(6.93+0.21) -1.5%
4221.3: extended log --grep='int' 7.37(7.04+0.24) 7.22(6.91+0.25) -2.0%
4221.4: perl log --grep='int' 7.31(7.04+0.21) 7.19(6.89+0.21) -1.6%
4221.6: fixed log --grep='uncommon' 6.93(6.59+0.32) 7.04(6.66+0.37) +1.6%
4221.7: basic log --grep='uncommon' 6.92(6.58+0.29) 7.08(6.75+0.29) +2.3%
4221.8: extended log --grep='uncommon' 6.92(6.55+0.31) 7.00(6.68+0.31) +1.2%
4221.9: perl log --grep='uncommon' 7.03(6.59+0.33) 7.12(6.73+0.34) +1.3%
4221.11: fixed log --grep='æ' 7.41(7.08+0.28) 7.05(6.76+0.29) -4.9%
4221.12: basic log --grep='æ' 7.39(6.99+0.33) 7.00(6.68+0.25) -5.3%
4221.13: extended log --grep='æ' 7.34(7.00+0.25) 7.15(6.81+0.31) -2.6%
4221.14: perl log --grep='æ' 7.43(7.13+0.26) 7.01(6.60+0.36) -5.7%
log with -i:
Test origin/master HEAD
------------------------------------------------------------------------------------
4221.1: fixed log -i --grep='int' 7.31(7.07+0.24) 7.23(7.00+0.22) -1.1%
4221.2: basic log -i --grep='int' 7.40(7.08+0.28) 7.19(6.92+0.20) -2.8%
4221.3: extended log -i --grep='int' 7.43(7.13+0.25) 7.27(6.99+0.21) -2.2%
4221.4: perl log -i --grep='int' 7.34(7.10+0.24) 7.10(6.90+0.19) -3.3%
4221.6: fixed log -i --grep='uncommon' 7.07(6.71+0.32) 7.11(6.77+0.28) +0.6%
4221.7: basic log -i --grep='uncommon' 6.99(6.64+0.28) 7.12(6.69+0.38) +1.9%
4221.8: extended log -i --grep='uncommon' 7.11(6.74+0.32) 7.10(6.77+0.27) -0.1%
4221.9: perl log -i --grep='uncommon' 6.98(6.60+0.29) 7.05(6.64+0.34) +1.0%
4221.11: fixed log -i --grep='æ' 7.85(7.45+0.34) 7.03(6.68+0.32) -10.4%
4221.12: basic log -i --grep='æ' 7.87(7.49+0.29) 7.06(6.69+0.31) -10.3%
4221.13: extended log -i --grep='æ' 7.87(7.54+0.31) 7.09(6.69+0.31) -9.9%
4221.14: perl log -i --grep='æ' 7.06(6.77+0.28) 6.91(6.57+0.31) -2.1%
So as with e05b027627 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-06-26) there's a huge improvement in performance for
"grep", but in "log" most of our time is spent elsewhere, so we don't
notice it that much.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-01 23:21:00 +02:00
|
|
|
#endif /* !USE_LIBPCRE2 */
|
2016-06-25 07:22:31 +02:00
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
|
|
|
|
{
|
2009-01-10 00:18:34 +01:00
|
|
|
int err;
|
2017-06-30 00:22:21 +02:00
|
|
|
int regflags = REG_NEWLINE;
|
2009-01-10 00:18:34 +01:00
|
|
|
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variable, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which select ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 01:00:39 +01:00
|
|
|
if (opt->pattern_type_option == GREP_PATTERN_TYPE_UNSPECIFIED)
|
|
|
|
opt->pattern_type_option = (opt->extended_regexp_option
|
|
|
|
? GREP_PATTERN_TYPE_ERE
|
|
|
|
: GREP_PATTERN_TYPE_BRE);
|
|
|
|
|
2009-03-07 13:28:40 +01:00
|
|
|
p->word_regexp = opt->word_regexp;
|
2009-11-06 10:22:35 +01:00
|
|
|
p->ignore_case = opt->ignore_case;
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variable, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which select ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 01:00:39 +01:00
|
|
|
p->fixed = opt->pattern_type_option == GREP_PATTERN_TYPE_FIXED;
|
2009-03-07 13:28:40 +01:00
|
|
|
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variable, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which select ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 01:00:39 +01:00
|
|
|
if (opt->pattern_type_option != GREP_PATTERN_TYPE_PCRE &&
|
2022-02-16 01:00:38 +01:00
|
|
|
memchr(p->pattern, 0, p->patternlen))
|
2019-07-01 23:20:58 +02:00
|
|
|
die(_("given pattern contains NULL byte (via -f <file>). This is only supported with -P under PCRE v2"));
|
|
|
|
|
2019-07-26 17:08:15 +02:00
|
|
|
p->is_fixed = is_fixed(p->pattern, p->patternlen);
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
#ifdef USE_LIBPCRE2
|
|
|
|
if (!p->fixed && !p->is_fixed) {
|
|
|
|
const char *no_jit = "(*NO_JIT)";
|
|
|
|
const int no_jit_len = strlen(no_jit);
|
|
|
|
if (starts_with(p->pattern, no_jit) &&
|
|
|
|
is_fixed(p->pattern + no_jit_len,
|
|
|
|
p->patternlen - no_jit_len))
|
|
|
|
p->is_fixed = 1;
|
|
|
|
}
|
|
|
|
#endif
|
2019-07-26 17:08:15 +02:00
|
|
|
if (p->fixed || p->is_fixed) {
|
grep: use PCRE v2 for optimized fixed-string search
Bring back optimized fixed-string search for "grep", this time with
PCRE v2 as an optional backend. As noted in [1] with kwset we were
slower than PCRE v1 and v2 JIT with the kwset backend, so that
optimization was counterproductive.
This brings back the optimization for "--fixed-strings", without
changing the semantics of having a NUL-byte in patterns. As seen in
previous commits in this series we could support it now, but I'd
rather just leave that edge-case aside so we don't have one behavior
or the other depending what "--fixed-strings" backend we're using. It
makes the behavior harder to understand and document, and makes tests
for the different backends more painful.
This does change the behavior under non-C locales when "log"'s
"--encoding" option is used and the heystack/needle in the
content/command-line doesn't have a matching encoding. See the recent
change in "t4210: skip more command-line encoding tests on MinGW" in
this series. I think that's OK. We did nothing sensible before
then (just compared raw bytes that had no hope of matching). At least
now the user will get some idea why their grep/log never matches in
that edge case.
I could also support the PCRE v1 backend here, but that would make the
code more complex. I'd rather aim for simplicity here and in future
changes to the diffcore. We're not going to have someone who
absolutely must have faster search, but for whom building PCRE v2
isn't acceptable.
The difference between this series of commits and the current "master"
is, using the same t/perf commands shown in the last commit:
plain grep:
Test origin/master HEAD
-------------------------------------------------------------------------
7821.1: fixed grep int 0.55(1.67+0.56) 0.41(0.98+0.60) -25.5%
7821.2: basic grep int 0.58(1.65+0.52) 0.41(0.96+0.57) -29.3%
7821.3: extended grep int 0.57(1.66+0.49) 0.42(0.93+0.60) -26.3%
7821.4: perl grep int 0.54(1.67+0.50) 0.43(0.88+0.65) -20.4%
7821.6: fixed grep uncommon 0.21(0.52+0.42) 0.16(0.24+0.51) -23.8%
7821.7: basic grep uncommon 0.20(0.49+0.45) 0.17(0.28+0.47) -15.0%
7821.8: extended grep uncommon 0.20(0.54+0.39) 0.16(0.25+0.50) -20.0%
7821.9: perl grep uncommon 0.20(0.58+0.36) 0.16(0.23+0.50) -20.0%
7821.11: fixed grep æ 0.35(1.24+0.43) 0.16(0.23+0.50) -54.3%
7821.12: basic grep æ 0.36(1.29+0.38) 0.16(0.20+0.54) -55.6%
7821.13: extended grep æ 0.35(1.23+0.44) 0.16(0.24+0.50) -54.3%
7821.14: perl grep æ 0.35(1.33+0.34) 0.16(0.28+0.46) -54.3%
grep with -i:
Test origin/master HEAD
----------------------------------------------------------------------------
7821.1: fixed grep -i int 0.62(1.81+0.70) 0.47(1.11+0.64) -24.2%
7821.2: basic grep -i int 0.67(1.90+0.53) 0.46(1.07+0.62) -31.3%
7821.3: extended grep -i int 0.62(1.92+0.53) 0.53(1.12+0.58) -14.5%
7821.4: perl grep -i int 0.66(1.85+0.58) 0.45(1.10+0.59) -31.8%
7821.6: fixed grep -i uncommon 0.21(0.54+0.43) 0.17(0.20+0.55) -19.0%
7821.7: basic grep -i uncommon 0.20(0.52+0.45) 0.17(0.29+0.48) -15.0%
7821.8: extended grep -i uncommon 0.21(0.52+0.44) 0.17(0.26+0.50) -19.0%
7821.9: perl grep -i uncommon 0.21(0.53+0.44) 0.17(0.20+0.56) -19.0%
7821.11: fixed grep -i æ 0.26(0.79+0.44) 0.16(0.29+0.46) -38.5%
7821.12: basic grep -i æ 0.26(0.79+0.42) 0.16(0.20+0.54) -38.5%
7821.13: extended grep -i æ 0.26(0.84+0.39) 0.16(0.24+0.50) -38.5%
7821.14: perl grep -i æ 0.16(0.24+0.49) 0.17(0.25+0.51) +6.3%
plain log:
Test origin/master HEAD
--------------------------------------------------------------------------------
4221.1: fixed log --grep='int' 7.24(6.95+0.28) 7.20(6.95+0.18) -0.6%
4221.2: basic log --grep='int' 7.31(6.97+0.22) 7.20(6.93+0.21) -1.5%
4221.3: extended log --grep='int' 7.37(7.04+0.24) 7.22(6.91+0.25) -2.0%
4221.4: perl log --grep='int' 7.31(7.04+0.21) 7.19(6.89+0.21) -1.6%
4221.6: fixed log --grep='uncommon' 6.93(6.59+0.32) 7.04(6.66+0.37) +1.6%
4221.7: basic log --grep='uncommon' 6.92(6.58+0.29) 7.08(6.75+0.29) +2.3%
4221.8: extended log --grep='uncommon' 6.92(6.55+0.31) 7.00(6.68+0.31) +1.2%
4221.9: perl log --grep='uncommon' 7.03(6.59+0.33) 7.12(6.73+0.34) +1.3%
4221.11: fixed log --grep='æ' 7.41(7.08+0.28) 7.05(6.76+0.29) -4.9%
4221.12: basic log --grep='æ' 7.39(6.99+0.33) 7.00(6.68+0.25) -5.3%
4221.13: extended log --grep='æ' 7.34(7.00+0.25) 7.15(6.81+0.31) -2.6%
4221.14: perl log --grep='æ' 7.43(7.13+0.26) 7.01(6.60+0.36) -5.7%
log with -i:
Test origin/master HEAD
------------------------------------------------------------------------------------
4221.1: fixed log -i --grep='int' 7.31(7.07+0.24) 7.23(7.00+0.22) -1.1%
4221.2: basic log -i --grep='int' 7.40(7.08+0.28) 7.19(6.92+0.20) -2.8%
4221.3: extended log -i --grep='int' 7.43(7.13+0.25) 7.27(6.99+0.21) -2.2%
4221.4: perl log -i --grep='int' 7.34(7.10+0.24) 7.10(6.90+0.19) -3.3%
4221.6: fixed log -i --grep='uncommon' 7.07(6.71+0.32) 7.11(6.77+0.28) +0.6%
4221.7: basic log -i --grep='uncommon' 6.99(6.64+0.28) 7.12(6.69+0.38) +1.9%
4221.8: extended log -i --grep='uncommon' 7.11(6.74+0.32) 7.10(6.77+0.27) -0.1%
4221.9: perl log -i --grep='uncommon' 6.98(6.60+0.29) 7.05(6.64+0.34) +1.0%
4221.11: fixed log -i --grep='æ' 7.85(7.45+0.34) 7.03(6.68+0.32) -10.4%
4221.12: basic log -i --grep='æ' 7.87(7.49+0.29) 7.06(6.69+0.31) -10.3%
4221.13: extended log -i --grep='æ' 7.87(7.54+0.31) 7.09(6.69+0.31) -9.9%
4221.14: perl log -i --grep='æ' 7.06(6.77+0.28) 6.91(6.57+0.31) -2.1%
So as with e05b027627 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-06-26) there's a huge improvement in performance for
"grep", but in "log" most of our time is spent elsewhere, so we don't
notice it that much.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-01 23:21:00 +02:00
|
|
|
#ifdef USE_LIBPCRE2
|
2019-07-26 17:08:15 +02:00
|
|
|
if (p->is_fixed) {
|
grep: use PCRE v2 for optimized fixed-string search
Bring back optimized fixed-string search for "grep", this time with
PCRE v2 as an optional backend. As noted in [1] with kwset we were
slower than PCRE v1 and v2 JIT with the kwset backend, so that
optimization was counterproductive.
This brings back the optimization for "--fixed-strings", without
changing the semantics of having a NUL-byte in patterns. As seen in
previous commits in this series we could support it now, but I'd
rather just leave that edge-case aside so we don't have one behavior
or the other depending what "--fixed-strings" backend we're using. It
makes the behavior harder to understand and document, and makes tests
for the different backends more painful.
This does change the behavior under non-C locales when "log"'s
"--encoding" option is used and the heystack/needle in the
content/command-line doesn't have a matching encoding. See the recent
change in "t4210: skip more command-line encoding tests on MinGW" in
this series. I think that's OK. We did nothing sensible before
then (just compared raw bytes that had no hope of matching). At least
now the user will get some idea why their grep/log never matches in
that edge case.
I could also support the PCRE v1 backend here, but that would make the
code more complex. I'd rather aim for simplicity here and in future
changes to the diffcore. We're not going to have someone who
absolutely must have faster search, but for whom building PCRE v2
isn't acceptable.
The difference between this series of commits and the current "master"
is, using the same t/perf commands shown in the last commit:
plain grep:
Test origin/master HEAD
-------------------------------------------------------------------------
7821.1: fixed grep int 0.55(1.67+0.56) 0.41(0.98+0.60) -25.5%
7821.2: basic grep int 0.58(1.65+0.52) 0.41(0.96+0.57) -29.3%
7821.3: extended grep int 0.57(1.66+0.49) 0.42(0.93+0.60) -26.3%
7821.4: perl grep int 0.54(1.67+0.50) 0.43(0.88+0.65) -20.4%
7821.6: fixed grep uncommon 0.21(0.52+0.42) 0.16(0.24+0.51) -23.8%
7821.7: basic grep uncommon 0.20(0.49+0.45) 0.17(0.28+0.47) -15.0%
7821.8: extended grep uncommon 0.20(0.54+0.39) 0.16(0.25+0.50) -20.0%
7821.9: perl grep uncommon 0.20(0.58+0.36) 0.16(0.23+0.50) -20.0%
7821.11: fixed grep æ 0.35(1.24+0.43) 0.16(0.23+0.50) -54.3%
7821.12: basic grep æ 0.36(1.29+0.38) 0.16(0.20+0.54) -55.6%
7821.13: extended grep æ 0.35(1.23+0.44) 0.16(0.24+0.50) -54.3%
7821.14: perl grep æ 0.35(1.33+0.34) 0.16(0.28+0.46) -54.3%
grep with -i:
Test origin/master HEAD
----------------------------------------------------------------------------
7821.1: fixed grep -i int 0.62(1.81+0.70) 0.47(1.11+0.64) -24.2%
7821.2: basic grep -i int 0.67(1.90+0.53) 0.46(1.07+0.62) -31.3%
7821.3: extended grep -i int 0.62(1.92+0.53) 0.53(1.12+0.58) -14.5%
7821.4: perl grep -i int 0.66(1.85+0.58) 0.45(1.10+0.59) -31.8%
7821.6: fixed grep -i uncommon 0.21(0.54+0.43) 0.17(0.20+0.55) -19.0%
7821.7: basic grep -i uncommon 0.20(0.52+0.45) 0.17(0.29+0.48) -15.0%
7821.8: extended grep -i uncommon 0.21(0.52+0.44) 0.17(0.26+0.50) -19.0%
7821.9: perl grep -i uncommon 0.21(0.53+0.44) 0.17(0.20+0.56) -19.0%
7821.11: fixed grep -i æ 0.26(0.79+0.44) 0.16(0.29+0.46) -38.5%
7821.12: basic grep -i æ 0.26(0.79+0.42) 0.16(0.20+0.54) -38.5%
7821.13: extended grep -i æ 0.26(0.84+0.39) 0.16(0.24+0.50) -38.5%
7821.14: perl grep -i æ 0.16(0.24+0.49) 0.17(0.25+0.51) +6.3%
plain log:
Test origin/master HEAD
--------------------------------------------------------------------------------
4221.1: fixed log --grep='int' 7.24(6.95+0.28) 7.20(6.95+0.18) -0.6%
4221.2: basic log --grep='int' 7.31(6.97+0.22) 7.20(6.93+0.21) -1.5%
4221.3: extended log --grep='int' 7.37(7.04+0.24) 7.22(6.91+0.25) -2.0%
4221.4: perl log --grep='int' 7.31(7.04+0.21) 7.19(6.89+0.21) -1.6%
4221.6: fixed log --grep='uncommon' 6.93(6.59+0.32) 7.04(6.66+0.37) +1.6%
4221.7: basic log --grep='uncommon' 6.92(6.58+0.29) 7.08(6.75+0.29) +2.3%
4221.8: extended log --grep='uncommon' 6.92(6.55+0.31) 7.00(6.68+0.31) +1.2%
4221.9: perl log --grep='uncommon' 7.03(6.59+0.33) 7.12(6.73+0.34) +1.3%
4221.11: fixed log --grep='æ' 7.41(7.08+0.28) 7.05(6.76+0.29) -4.9%
4221.12: basic log --grep='æ' 7.39(6.99+0.33) 7.00(6.68+0.25) -5.3%
4221.13: extended log --grep='æ' 7.34(7.00+0.25) 7.15(6.81+0.31) -2.6%
4221.14: perl log --grep='æ' 7.43(7.13+0.26) 7.01(6.60+0.36) -5.7%
log with -i:
Test origin/master HEAD
------------------------------------------------------------------------------------
4221.1: fixed log -i --grep='int' 7.31(7.07+0.24) 7.23(7.00+0.22) -1.1%
4221.2: basic log -i --grep='int' 7.40(7.08+0.28) 7.19(6.92+0.20) -2.8%
4221.3: extended log -i --grep='int' 7.43(7.13+0.25) 7.27(6.99+0.21) -2.2%
4221.4: perl log -i --grep='int' 7.34(7.10+0.24) 7.10(6.90+0.19) -3.3%
4221.6: fixed log -i --grep='uncommon' 7.07(6.71+0.32) 7.11(6.77+0.28) +0.6%
4221.7: basic log -i --grep='uncommon' 6.99(6.64+0.28) 7.12(6.69+0.38) +1.9%
4221.8: extended log -i --grep='uncommon' 7.11(6.74+0.32) 7.10(6.77+0.27) -0.1%
4221.9: perl log -i --grep='uncommon' 6.98(6.60+0.29) 7.05(6.64+0.34) +1.0%
4221.11: fixed log -i --grep='æ' 7.85(7.45+0.34) 7.03(6.68+0.32) -10.4%
4221.12: basic log -i --grep='æ' 7.87(7.49+0.29) 7.06(6.69+0.31) -10.3%
4221.13: extended log -i --grep='æ' 7.87(7.54+0.31) 7.09(6.69+0.31) -9.9%
4221.14: perl log -i --grep='æ' 7.06(6.77+0.28) 6.91(6.57+0.31) -2.1%
So as with e05b027627 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-06-26) there's a huge improvement in performance for
"grep", but in "log" most of our time is spent elsewhere, so we don't
notice it that much.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-01 23:21:00 +02:00
|
|
|
compile_pcre2_pattern(p, opt);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* E.g. t7811-grep-open.sh relies on the
|
|
|
|
* pattern being restored.
|
|
|
|
*/
|
|
|
|
char *old_pattern = p->pattern;
|
|
|
|
size_t old_patternlen = p->patternlen;
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There is the PCRE2_LITERAL flag, but it's
|
|
|
|
* only in PCRE v2 10.30 and later. Needing to
|
|
|
|
* ifdef our way around that and dealing with
|
|
|
|
* it + PCRE2_MULTILINE being an error is more
|
|
|
|
* complex than just quoting this ourselves.
|
|
|
|
*/
|
|
|
|
strbuf_add(&sb, "\\Q", 2);
|
|
|
|
strbuf_add(&sb, p->pattern, p->patternlen);
|
|
|
|
strbuf_add(&sb, "\\E", 2);
|
|
|
|
|
|
|
|
p->pattern = sb.buf;
|
|
|
|
p->patternlen = sb.len;
|
|
|
|
compile_pcre2_pattern(p, opt);
|
|
|
|
p->pattern = old_pattern;
|
|
|
|
p->patternlen = old_patternlen;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
}
|
|
|
|
#else /* !USE_LIBPCRE2 */
|
2016-06-25 07:22:31 +02:00
|
|
|
compile_fixed_regexp(p, opt);
|
grep: use PCRE v2 for optimized fixed-string search
Bring back optimized fixed-string search for "grep", this time with
PCRE v2 as an optional backend. As noted in [1] with kwset we were
slower than PCRE v1 and v2 JIT with the kwset backend, so that
optimization was counterproductive.
This brings back the optimization for "--fixed-strings", without
changing the semantics of having a NUL-byte in patterns. As seen in
previous commits in this series we could support it now, but I'd
rather just leave that edge-case aside so we don't have one behavior
or the other depending what "--fixed-strings" backend we're using. It
makes the behavior harder to understand and document, and makes tests
for the different backends more painful.
This does change the behavior under non-C locales when "log"'s
"--encoding" option is used and the heystack/needle in the
content/command-line doesn't have a matching encoding. See the recent
change in "t4210: skip more command-line encoding tests on MinGW" in
this series. I think that's OK. We did nothing sensible before
then (just compared raw bytes that had no hope of matching). At least
now the user will get some idea why their grep/log never matches in
that edge case.
I could also support the PCRE v1 backend here, but that would make the
code more complex. I'd rather aim for simplicity here and in future
changes to the diffcore. We're not going to have someone who
absolutely must have faster search, but for whom building PCRE v2
isn't acceptable.
The difference between this series of commits and the current "master"
is, using the same t/perf commands shown in the last commit:
plain grep:
Test origin/master HEAD
-------------------------------------------------------------------------
7821.1: fixed grep int 0.55(1.67+0.56) 0.41(0.98+0.60) -25.5%
7821.2: basic grep int 0.58(1.65+0.52) 0.41(0.96+0.57) -29.3%
7821.3: extended grep int 0.57(1.66+0.49) 0.42(0.93+0.60) -26.3%
7821.4: perl grep int 0.54(1.67+0.50) 0.43(0.88+0.65) -20.4%
7821.6: fixed grep uncommon 0.21(0.52+0.42) 0.16(0.24+0.51) -23.8%
7821.7: basic grep uncommon 0.20(0.49+0.45) 0.17(0.28+0.47) -15.0%
7821.8: extended grep uncommon 0.20(0.54+0.39) 0.16(0.25+0.50) -20.0%
7821.9: perl grep uncommon 0.20(0.58+0.36) 0.16(0.23+0.50) -20.0%
7821.11: fixed grep æ 0.35(1.24+0.43) 0.16(0.23+0.50) -54.3%
7821.12: basic grep æ 0.36(1.29+0.38) 0.16(0.20+0.54) -55.6%
7821.13: extended grep æ 0.35(1.23+0.44) 0.16(0.24+0.50) -54.3%
7821.14: perl grep æ 0.35(1.33+0.34) 0.16(0.28+0.46) -54.3%
grep with -i:
Test origin/master HEAD
----------------------------------------------------------------------------
7821.1: fixed grep -i int 0.62(1.81+0.70) 0.47(1.11+0.64) -24.2%
7821.2: basic grep -i int 0.67(1.90+0.53) 0.46(1.07+0.62) -31.3%
7821.3: extended grep -i int 0.62(1.92+0.53) 0.53(1.12+0.58) -14.5%
7821.4: perl grep -i int 0.66(1.85+0.58) 0.45(1.10+0.59) -31.8%
7821.6: fixed grep -i uncommon 0.21(0.54+0.43) 0.17(0.20+0.55) -19.0%
7821.7: basic grep -i uncommon 0.20(0.52+0.45) 0.17(0.29+0.48) -15.0%
7821.8: extended grep -i uncommon 0.21(0.52+0.44) 0.17(0.26+0.50) -19.0%
7821.9: perl grep -i uncommon 0.21(0.53+0.44) 0.17(0.20+0.56) -19.0%
7821.11: fixed grep -i æ 0.26(0.79+0.44) 0.16(0.29+0.46) -38.5%
7821.12: basic grep -i æ 0.26(0.79+0.42) 0.16(0.20+0.54) -38.5%
7821.13: extended grep -i æ 0.26(0.84+0.39) 0.16(0.24+0.50) -38.5%
7821.14: perl grep -i æ 0.16(0.24+0.49) 0.17(0.25+0.51) +6.3%
plain log:
Test origin/master HEAD
--------------------------------------------------------------------------------
4221.1: fixed log --grep='int' 7.24(6.95+0.28) 7.20(6.95+0.18) -0.6%
4221.2: basic log --grep='int' 7.31(6.97+0.22) 7.20(6.93+0.21) -1.5%
4221.3: extended log --grep='int' 7.37(7.04+0.24) 7.22(6.91+0.25) -2.0%
4221.4: perl log --grep='int' 7.31(7.04+0.21) 7.19(6.89+0.21) -1.6%
4221.6: fixed log --grep='uncommon' 6.93(6.59+0.32) 7.04(6.66+0.37) +1.6%
4221.7: basic log --grep='uncommon' 6.92(6.58+0.29) 7.08(6.75+0.29) +2.3%
4221.8: extended log --grep='uncommon' 6.92(6.55+0.31) 7.00(6.68+0.31) +1.2%
4221.9: perl log --grep='uncommon' 7.03(6.59+0.33) 7.12(6.73+0.34) +1.3%
4221.11: fixed log --grep='æ' 7.41(7.08+0.28) 7.05(6.76+0.29) -4.9%
4221.12: basic log --grep='æ' 7.39(6.99+0.33) 7.00(6.68+0.25) -5.3%
4221.13: extended log --grep='æ' 7.34(7.00+0.25) 7.15(6.81+0.31) -2.6%
4221.14: perl log --grep='æ' 7.43(7.13+0.26) 7.01(6.60+0.36) -5.7%
log with -i:
Test origin/master HEAD
------------------------------------------------------------------------------------
4221.1: fixed log -i --grep='int' 7.31(7.07+0.24) 7.23(7.00+0.22) -1.1%
4221.2: basic log -i --grep='int' 7.40(7.08+0.28) 7.19(6.92+0.20) -2.8%
4221.3: extended log -i --grep='int' 7.43(7.13+0.25) 7.27(6.99+0.21) -2.2%
4221.4: perl log -i --grep='int' 7.34(7.10+0.24) 7.10(6.90+0.19) -3.3%
4221.6: fixed log -i --grep='uncommon' 7.07(6.71+0.32) 7.11(6.77+0.28) +0.6%
4221.7: basic log -i --grep='uncommon' 6.99(6.64+0.28) 7.12(6.69+0.38) +1.9%
4221.8: extended log -i --grep='uncommon' 7.11(6.74+0.32) 7.10(6.77+0.27) -0.1%
4221.9: perl log -i --grep='uncommon' 6.98(6.60+0.29) 7.05(6.64+0.34) +1.0%
4221.11: fixed log -i --grep='æ' 7.85(7.45+0.34) 7.03(6.68+0.32) -10.4%
4221.12: basic log -i --grep='æ' 7.87(7.49+0.29) 7.06(6.69+0.31) -10.3%
4221.13: extended log -i --grep='æ' 7.87(7.54+0.31) 7.09(6.69+0.31) -9.9%
4221.14: perl log -i --grep='æ' 7.06(6.77+0.28) 6.91(6.57+0.31) -2.1%
So as with e05b027627 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-06-26) there's a huge improvement in performance for
"grep", but in "log" most of our time is spent elsewhere, so we don't
notice it that much.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-01 23:21:00 +02:00
|
|
|
#endif /* !USE_LIBPCRE2 */
|
2016-06-25 07:22:31 +02:00
|
|
|
return;
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
}
|
2009-01-10 00:18:34 +01:00
|
|
|
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variable, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which select ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 01:00:39 +01:00
|
|
|
if (opt->pattern_type_option == GREP_PATTERN_TYPE_PCRE) {
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
compile_pcre2_pattern(p, opt);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-06-30 00:22:21 +02:00
|
|
|
if (p->ignore_case)
|
|
|
|
regflags |= REG_ICASE;
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variable, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which select ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 01:00:39 +01:00
|
|
|
if (opt->pattern_type_option == GREP_PATTERN_TYPE_ERE)
|
2017-06-30 00:22:21 +02:00
|
|
|
regflags |= REG_EXTENDED;
|
|
|
|
err = regcomp(&p->regexp, p->pattern, regflags);
|
2006-09-18 01:02:52 +02:00
|
|
|
if (err) {
|
|
|
|
char errbuf[1024];
|
|
|
|
regerror(err, &p->regexp, errbuf, 1024);
|
2011-05-09 23:52:04 +02:00
|
|
|
compile_regexp_failed(p, errbuf);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-06 10:54:19 +01:00
|
|
|
static struct grep_expr *grep_not_expr(struct grep_expr *expr)
|
|
|
|
{
|
|
|
|
struct grep_expr *z = xcalloc(1, sizeof(*z));
|
|
|
|
z->node = GREP_NODE_NOT;
|
|
|
|
z->u.unary = expr;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
2022-01-06 20:50:12 +01:00
|
|
|
static struct grep_expr *grep_binexp(enum grep_expr_node kind,
|
|
|
|
struct grep_expr *left,
|
|
|
|
struct grep_expr *right)
|
2022-01-06 10:51:00 +01:00
|
|
|
{
|
|
|
|
struct grep_expr *z = xcalloc(1, sizeof(*z));
|
2022-01-06 20:50:12 +01:00
|
|
|
z->node = kind;
|
2022-01-06 10:51:00 +01:00
|
|
|
z->u.binary.left = left;
|
|
|
|
z->u.binary.right = right;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
2022-01-06 20:50:12 +01:00
|
|
|
static struct grep_expr *grep_or_expr(struct grep_expr *left, struct grep_expr *right)
|
|
|
|
{
|
|
|
|
return grep_binexp(GREP_NODE_OR, left, right);
|
|
|
|
}
|
|
|
|
|
2022-01-06 20:50:15 +01:00
|
|
|
static struct grep_expr *grep_and_expr(struct grep_expr *left, struct grep_expr *right)
|
|
|
|
{
|
|
|
|
return grep_binexp(GREP_NODE_AND, left, right);
|
|
|
|
}
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
static struct grep_expr *compile_pattern_or(struct grep_pat **);
|
2006-09-18 01:02:52 +02:00
|
|
|
static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *x;
|
|
|
|
|
|
|
|
p = *list;
|
2009-04-27 20:10:24 +02:00
|
|
|
if (!p)
|
|
|
|
return NULL;
|
2006-09-18 01:02:52 +02:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
2006-09-20 21:39:46 +02:00
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
2021-03-13 17:17:22 +01:00
|
|
|
CALLOC_ARRAY(x, 1);
|
2006-09-18 01:02:52 +02:00
|
|
|
x->node = GREP_NODE_ATOM;
|
|
|
|
x->u.atom = p;
|
|
|
|
*list = p->next;
|
|
|
|
return x;
|
|
|
|
case GREP_OPEN_PAREN:
|
|
|
|
*list = p->next;
|
2006-09-28 02:50:52 +02:00
|
|
|
x = compile_pattern_or(list);
|
2006-09-18 01:02:52 +02:00
|
|
|
if (!*list || (*list)->token != GREP_CLOSE_PAREN)
|
|
|
|
die("unmatched parenthesis");
|
|
|
|
*list = (*list)->next;
|
|
|
|
return x;
|
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_not(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *x;
|
|
|
|
|
|
|
|
p = *list;
|
2009-04-27 20:10:24 +02:00
|
|
|
if (!p)
|
|
|
|
return NULL;
|
2006-09-18 01:02:52 +02:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_NOT:
|
|
|
|
if (!p->next)
|
|
|
|
die("--not not followed by pattern expression");
|
|
|
|
*list = p->next;
|
2022-01-06 10:54:19 +01:00
|
|
|
x = compile_pattern_not(list);
|
|
|
|
if (!x)
|
2006-09-18 01:02:52 +02:00
|
|
|
die("--not followed by non pattern expression");
|
2022-01-06 10:54:19 +01:00
|
|
|
return grep_not_expr(x);
|
2006-09-18 01:02:52 +02:00
|
|
|
default:
|
|
|
|
return compile_pattern_atom(list);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_and(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
2022-01-06 20:50:15 +01:00
|
|
|
struct grep_expr *x, *y;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
|
|
|
x = compile_pattern_not(list);
|
|
|
|
p = *list;
|
|
|
|
if (p && p->token == GREP_AND) {
|
2021-06-30 18:12:43 +02:00
|
|
|
if (!x)
|
|
|
|
die("--and not preceded by pattern expression");
|
2006-09-18 01:02:52 +02:00
|
|
|
if (!p->next)
|
|
|
|
die("--and not followed by pattern expression");
|
|
|
|
*list = p->next;
|
|
|
|
y = compile_pattern_and(list);
|
|
|
|
if (!y)
|
|
|
|
die("--and not followed by pattern expression");
|
2022-01-06 20:50:15 +01:00
|
|
|
return grep_and_expr(x, y);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_or(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
2022-01-06 10:51:00 +01:00
|
|
|
struct grep_expr *x, *y;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
|
|
|
x = compile_pattern_and(list);
|
|
|
|
p = *list;
|
|
|
|
if (x && p && p->token != GREP_CLOSE_PAREN) {
|
|
|
|
y = compile_pattern_or(list);
|
|
|
|
if (!y)
|
|
|
|
die("not a pattern expression %s", p->pattern);
|
2022-01-06 10:51:00 +01:00
|
|
|
return grep_or_expr(x, y);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
return compile_pattern_or(list);
|
|
|
|
}
|
|
|
|
|
2010-09-13 07:15:35 +02:00
|
|
|
static struct grep_expr *grep_true_expr(void)
|
|
|
|
{
|
|
|
|
struct grep_expr *z = xcalloc(1, sizeof(*z));
|
|
|
|
z->node = GREP_NODE_TRUE;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
2010-09-13 04:30:48 +02:00
|
|
|
static struct grep_expr *prep_header_patterns(struct grep_opt *opt)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
2010-09-13 04:30:48 +02:00
|
|
|
struct grep_expr *header_expr;
|
2010-09-13 07:15:35 +02:00
|
|
|
struct grep_expr *(header_group[GREP_HEADER_FIELD_MAX]);
|
|
|
|
enum grep_header_field fld;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2010-09-13 04:30:48 +02:00
|
|
|
if (!opt->header_list)
|
|
|
|
return NULL;
|
2012-05-06 20:17:15 +02:00
|
|
|
|
2010-09-13 04:30:48 +02:00
|
|
|
for (p = opt->header_list; p; p = p->next) {
|
|
|
|
if (p->token != GREP_PATTERN_HEAD)
|
2018-05-02 11:38:39 +02:00
|
|
|
BUG("a non-header pattern in grep header list.");
|
2013-02-03 15:37:09 +01:00
|
|
|
if (p->field < GREP_HEADER_FIELD_MIN ||
|
|
|
|
GREP_HEADER_FIELD_MAX <= p->field)
|
2018-05-02 11:38:39 +02:00
|
|
|
BUG("unknown header field %d", p->field);
|
2010-09-13 04:30:48 +02:00
|
|
|
compile_regexp(p, opt);
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
}
|
2010-09-13 07:15:35 +02:00
|
|
|
|
|
|
|
for (fld = 0; fld < GREP_HEADER_FIELD_MAX; fld++)
|
|
|
|
header_group[fld] = NULL;
|
|
|
|
|
|
|
|
for (p = opt->header_list; p; p = p->next) {
|
|
|
|
struct grep_expr *h;
|
|
|
|
struct grep_pat *pp = p;
|
|
|
|
|
|
|
|
h = compile_pattern_atom(&pp);
|
|
|
|
if (!h || pp != p->next)
|
2018-05-02 11:38:39 +02:00
|
|
|
BUG("malformed header expr");
|
2010-09-13 07:15:35 +02:00
|
|
|
if (!header_group[p->field]) {
|
|
|
|
header_group[p->field] = h;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
header_group[p->field] = grep_or_expr(h, header_group[p->field]);
|
|
|
|
}
|
|
|
|
|
|
|
|
header_expr = NULL;
|
|
|
|
|
|
|
|
for (fld = 0; fld < GREP_HEADER_FIELD_MAX; fld++) {
|
|
|
|
if (!header_group[fld])
|
|
|
|
continue;
|
|
|
|
if (!header_expr)
|
|
|
|
header_expr = grep_true_expr();
|
|
|
|
header_expr = grep_or_expr(header_group[fld], header_expr);
|
|
|
|
}
|
2010-09-13 04:30:48 +02:00
|
|
|
return header_expr;
|
|
|
|
}
|
|
|
|
|
log --grep/--author: honor --all-match honored for multiple --grep patterns
When we have both header expression (which has to be an OR node by
construction) and a pattern expression (which could be anything), we
create a new top-level OR node to bind them together, and the
resulting expression structure looks like this:
OR
/ \
/ \
pattern OR
/ \ / \
..... committer OR
/ \
author TRUE
The three elements on the top-level backbone that are inspected by
the "all-match" logic are "pattern", "committer" and "author". When
there are more than one elements in the "pattern", the top-level
node of the "pattern" part of the subtree is an OR, and that node is
inspected by "all-match".
The result ends up ignoring the "--all-match" given from the command
line. A match on either side of the pattern is considered a match,
hence:
git log --grep=A --grep=B --author=C --all-match
shows the same "authored by C and has either A or B" that is correct
only when run without "--all-match".
Fix this by turning the resulting expression around when "--all-match"
is in effect, like this:
OR
/ \
/ \
/ OR
committer / \
author \
pattern
The set of nodes on the top-level backbone in the resulting
expression becomes "committer", "author", and the nodes that are on
the top-level backbone of the "pattern" subexpression. This makes
the "all-match" logic inspect the same nodes in "pattern" as the
case without the author and/or the committer restriction, and makes
the earlier "log" example to show "authored by C and has A and has
B", which is what the command line expects.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-09-14 01:26:57 +02:00
|
|
|
static struct grep_expr *grep_splice_or(struct grep_expr *x, struct grep_expr *y)
|
|
|
|
{
|
|
|
|
struct grep_expr *z = x;
|
|
|
|
|
|
|
|
while (x) {
|
|
|
|
assert(x->node == GREP_NODE_OR);
|
|
|
|
if (x->u.binary.right &&
|
|
|
|
x->u.binary.right->node == GREP_NODE_TRUE) {
|
|
|
|
x->u.binary.right = y;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
x = x->u.binary.right;
|
|
|
|
}
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
grep/log: remove hidden --debug and --grep-debug options
Remove the hidden "grep --debug" and "log --grep-debug" options added
in 17bf35a3c7b (grep: teach --debug option to dump the parse tree,
2012-09-13).
At the time these options seem to have been intended to go along with
a documentation discussion and to help the author of relevant tests to
perform ad-hoc debugging on them[1].
Reasons to want this gone:
1. They were never documented, and the only (rather trivial) use of
them in our own codebase for testing is something I removed back
in e01b4dab01e (grep: change non-ASCII -i test to stop using
--debug, 2017-05-20).
2. Googling around doesn't show any in-the-wild uses I could dig up,
and on the Git ML the only mentions after the original discussion
seem to have been when they came up in unrelated diff contexts, or
that test commit of mine.
3. An exception to that is c581e4a7499 (grep: under --debug, show
whether PCRE JIT is enabled, 2019-08-18) where we added the
ability to dump out when PCREv2 has the JIT in effect.
The combination of that and my earlier b65abcafc7a (grep: use PCRE
v2 for optimized fixed-string search, 2019-07-01) means Git prints
this out in its most common in-the-wild configuration:
$ git log --grep-debug --grep=foo --grep=bar --grep=baz --all-match
pcre2_jit_on=1
pcre2_jit_on=1
pcre2_jit_on=1
[all-match]
(or
pattern_body<body>foo
(or
pattern_body<body>bar
pattern_body<body>baz
)
)
$ git grep --debug \( -e foo --and -e bar \) --or -e baz
pcre2_jit_on=1
pcre2_jit_on=1
pcre2_jit_on=1
(or
(and
patternfoo
patternbar
)
patternbaz
)
I.e. for each pattern we're considering for the and/or/--all-match
etc. debugging we'll now diligently spew out another identical line
saying whether the PCREv2 JIT is on or not.
I think that nobody's complained about that rather glaringly obviously
bad output says something about how much this is used, i.e. it's
not.
The need for this debugging aid for the composed grep/log patterns
seems to have passed, and the desire to dump the JIT config seems to
have been another one-off around the time we had JIT-related issues on
the PCREv2 codepath. That the original author of this debugging
facility seemingly hasn't noticed the bad output since then[2] is
probably some indicator.
1. https://lore.kernel.org/git/cover.1347615361.git.git@drmicha.warpmail.net/
2. https://lore.kernel.org/git/xmqqk1b8x0ac.fsf@gitster-ct.c.googlers.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-26 00:36:51 +01:00
|
|
|
void compile_grep_patterns(struct grep_opt *opt)
|
2010-09-13 04:30:48 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *header_expr = prep_header_patterns(opt);
|
grep.c: remove "extended" in favor of "pattern_expression", fix segfault
Since 79d3696cfb4 (git-grep: boolean expression on pattern matching.,
2006-06-30) the "pattern_expression" member has been used for complex
queries (AND/OR...), with "pattern_list" being used for the simple OR
queries. Since then we've used both "pattern_expression" and its
associated boolean "extended" member to see if we have a complex
expression.
Since f41fb662f57 (revisions API: have release_revisions() release
"grep_filter", 2022-04-13) we've had a subtle bug relating to that: If
we supplied options that were only used for "complex queries", but
didn't supply the query itself we'd set "opt->extended", but would
have a NULL "pattern_expression". As a result these would segfault as
we tried to call "free_grep_patterns()" from "release_revisions()":
git -P log -1 --invert-grep
git -P log -1 --all-match
The root cause of this is that we were conflating the state management
we needed in "compile_grep_patterns()" itself with whether or not we
had an "opt->pattern_expression" later on.
In this cases as we're going through "compile_grep_patterns()" we have
no "opt->pattern_list" but have "opt->no_body_match" or
"opt->all_match". So we'd set "opt->extended = 1", but not "return" on
"opt->extended" as that's an "else if" in the same "if" statement.
That behavior is intentional and required, as the common case is that
we have an "opt->pattern_list" that we're about to parse into the
"opt->pattern_expression".
But we don't need to keep track of this "extended" flag beyond the
state management in compile_grep_patterns() itself. It needs it, but
once we're out of that function we can rely on
"opt->pattern_expression" being non-NULL instead for using these
extended patterns.
As 79d3696cfb4 itself shows we've assumed that there's a one-to-one
mapping between the two since the very beginning. I.e. "match_line()"
would check "opt->extended" to see if it should call "match_expr()",
and the first thing we do in that function is assume that we have a
"opt->pattern_expression". We'd then call "match_expr_eval()", which
would have died if that "opt->pattern_expression" was NULL.
The "die" was added in c922b01f54c (grep: fix segfault when "git grep
'('" is given, 2009-04-27), and can now be removed as it's now clearly
unreachable. We still do the right thing in the case that prompted
that fix:
git grep '('
fatal: unmatched parenthesis
Arguably neither the "--invert-grep" option added in [1] nor the
earlier "--all-match" option added in [2] were intended to be used
stand-alone, and another approach[3] would be to error out in those
cases. But since we've been treating them as a NOOP when given without
--grep for a long time let's keep doing that.
We could also return in "free_pattern_expr()" if the argument is
non-NULL, as an alternative fix for this segfault does [4]. That would
be more elegant in making the "free_*()" function behave like
"free()", but it would also remove a sanity check: The
"free_pattern_expr()" function calls itself recursively, and only the
top-level is allowed to be NULL, let's not conflate those two
conditions.
1. 22dfa8a23de (log: teach --invert-grep option, 2015-01-12)
2. 0ab7befa31d (grep --all-match, 2006-09-27)
3. https://lore.kernel.org/git/patch-1.1-f4b90799fce-20221010T165711Z-avarab@gmail.com/
4. http://lore.kernel.org/git/7e094882c2a71894416089f894557a9eae07e8f8.1665423686.git.me@ttaylorr.com
Reported-by: orygaw <orygaw@protonmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-10-11 11:48:45 +02:00
|
|
|
int extended = 0;
|
2006-09-28 02:50:52 +02:00
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
2006-09-20 21:39:46 +02:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
2009-01-10 00:18:34 +01:00
|
|
|
compile_regexp(p, opt);
|
2006-09-20 21:39:46 +02:00
|
|
|
break;
|
|
|
|
default:
|
grep.c: remove "extended" in favor of "pattern_expression", fix segfault
Since 79d3696cfb4 (git-grep: boolean expression on pattern matching.,
2006-06-30) the "pattern_expression" member has been used for complex
queries (AND/OR...), with "pattern_list" being used for the simple OR
queries. Since then we've used both "pattern_expression" and its
associated boolean "extended" member to see if we have a complex
expression.
Since f41fb662f57 (revisions API: have release_revisions() release
"grep_filter", 2022-04-13) we've had a subtle bug relating to that: If
we supplied options that were only used for "complex queries", but
didn't supply the query itself we'd set "opt->extended", but would
have a NULL "pattern_expression". As a result these would segfault as
we tried to call "free_grep_patterns()" from "release_revisions()":
git -P log -1 --invert-grep
git -P log -1 --all-match
The root cause of this is that we were conflating the state management
we needed in "compile_grep_patterns()" itself with whether or not we
had an "opt->pattern_expression" later on.
In this cases as we're going through "compile_grep_patterns()" we have
no "opt->pattern_list" but have "opt->no_body_match" or
"opt->all_match". So we'd set "opt->extended = 1", but not "return" on
"opt->extended" as that's an "else if" in the same "if" statement.
That behavior is intentional and required, as the common case is that
we have an "opt->pattern_list" that we're about to parse into the
"opt->pattern_expression".
But we don't need to keep track of this "extended" flag beyond the
state management in compile_grep_patterns() itself. It needs it, but
once we're out of that function we can rely on
"opt->pattern_expression" being non-NULL instead for using these
extended patterns.
As 79d3696cfb4 itself shows we've assumed that there's a one-to-one
mapping between the two since the very beginning. I.e. "match_line()"
would check "opt->extended" to see if it should call "match_expr()",
and the first thing we do in that function is assume that we have a
"opt->pattern_expression". We'd then call "match_expr_eval()", which
would have died if that "opt->pattern_expression" was NULL.
The "die" was added in c922b01f54c (grep: fix segfault when "git grep
'('" is given, 2009-04-27), and can now be removed as it's now clearly
unreachable. We still do the right thing in the case that prompted
that fix:
git grep '('
fatal: unmatched parenthesis
Arguably neither the "--invert-grep" option added in [1] nor the
earlier "--all-match" option added in [2] were intended to be used
stand-alone, and another approach[3] would be to error out in those
cases. But since we've been treating them as a NOOP when given without
--grep for a long time let's keep doing that.
We could also return in "free_pattern_expr()" if the argument is
non-NULL, as an alternative fix for this segfault does [4]. That would
be more elegant in making the "free_*()" function behave like
"free()", but it would also remove a sanity check: The
"free_pattern_expr()" function calls itself recursively, and only the
top-level is allowed to be NULL, let's not conflate those two
conditions.
1. 22dfa8a23de (log: teach --invert-grep option, 2015-01-12)
2. 0ab7befa31d (grep --all-match, 2006-09-27)
3. https://lore.kernel.org/git/patch-1.1-f4b90799fce-20221010T165711Z-avarab@gmail.com/
4. http://lore.kernel.org/git/7e094882c2a71894416089f894557a9eae07e8f8.1665423686.git.me@ttaylorr.com
Reported-by: orygaw <orygaw@protonmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-10-11 11:48:45 +02:00
|
|
|
extended = 1;
|
2006-09-20 21:39:46 +02:00
|
|
|
break;
|
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2021-12-17 17:48:49 +01:00
|
|
|
if (opt->all_match || opt->no_body_match || header_expr)
|
grep.c: remove "extended" in favor of "pattern_expression", fix segfault
Since 79d3696cfb4 (git-grep: boolean expression on pattern matching.,
2006-06-30) the "pattern_expression" member has been used for complex
queries (AND/OR...), with "pattern_list" being used for the simple OR
queries. Since then we've used both "pattern_expression" and its
associated boolean "extended" member to see if we have a complex
expression.
Since f41fb662f57 (revisions API: have release_revisions() release
"grep_filter", 2022-04-13) we've had a subtle bug relating to that: If
we supplied options that were only used for "complex queries", but
didn't supply the query itself we'd set "opt->extended", but would
have a NULL "pattern_expression". As a result these would segfault as
we tried to call "free_grep_patterns()" from "release_revisions()":
git -P log -1 --invert-grep
git -P log -1 --all-match
The root cause of this is that we were conflating the state management
we needed in "compile_grep_patterns()" itself with whether or not we
had an "opt->pattern_expression" later on.
In this cases as we're going through "compile_grep_patterns()" we have
no "opt->pattern_list" but have "opt->no_body_match" or
"opt->all_match". So we'd set "opt->extended = 1", but not "return" on
"opt->extended" as that's an "else if" in the same "if" statement.
That behavior is intentional and required, as the common case is that
we have an "opt->pattern_list" that we're about to parse into the
"opt->pattern_expression".
But we don't need to keep track of this "extended" flag beyond the
state management in compile_grep_patterns() itself. It needs it, but
once we're out of that function we can rely on
"opt->pattern_expression" being non-NULL instead for using these
extended patterns.
As 79d3696cfb4 itself shows we've assumed that there's a one-to-one
mapping between the two since the very beginning. I.e. "match_line()"
would check "opt->extended" to see if it should call "match_expr()",
and the first thing we do in that function is assume that we have a
"opt->pattern_expression". We'd then call "match_expr_eval()", which
would have died if that "opt->pattern_expression" was NULL.
The "die" was added in c922b01f54c (grep: fix segfault when "git grep
'('" is given, 2009-04-27), and can now be removed as it's now clearly
unreachable. We still do the right thing in the case that prompted
that fix:
git grep '('
fatal: unmatched parenthesis
Arguably neither the "--invert-grep" option added in [1] nor the
earlier "--all-match" option added in [2] were intended to be used
stand-alone, and another approach[3] would be to error out in those
cases. But since we've been treating them as a NOOP when given without
--grep for a long time let's keep doing that.
We could also return in "free_pattern_expr()" if the argument is
non-NULL, as an alternative fix for this segfault does [4]. That would
be more elegant in making the "free_*()" function behave like
"free()", but it would also remove a sanity check: The
"free_pattern_expr()" function calls itself recursively, and only the
top-level is allowed to be NULL, let's not conflate those two
conditions.
1. 22dfa8a23de (log: teach --invert-grep option, 2015-01-12)
2. 0ab7befa31d (grep --all-match, 2006-09-27)
3. https://lore.kernel.org/git/patch-1.1-f4b90799fce-20221010T165711Z-avarab@gmail.com/
4. http://lore.kernel.org/git/7e094882c2a71894416089f894557a9eae07e8f8.1665423686.git.me@ttaylorr.com
Reported-by: orygaw <orygaw@protonmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-10-11 11:48:45 +02:00
|
|
|
extended = 1;
|
|
|
|
else if (!extended)
|
2006-09-18 01:02:52 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
p = opt->pattern_list;
|
2009-03-18 21:53:27 +01:00
|
|
|
if (p)
|
|
|
|
opt->pattern_expression = compile_pattern_expr(&p);
|
2006-09-18 01:02:52 +02:00
|
|
|
if (p)
|
|
|
|
die("incomplete pattern expression: %s", p->pattern);
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
|
2021-12-17 17:48:49 +01:00
|
|
|
if (opt->no_body_match && opt->pattern_expression)
|
|
|
|
opt->pattern_expression = grep_not_expr(opt->pattern_expression);
|
|
|
|
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
if (!header_expr)
|
|
|
|
return;
|
|
|
|
|
2010-09-13 07:15:35 +02:00
|
|
|
if (!opt->pattern_expression)
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
opt->pattern_expression = header_expr;
|
log --grep/--author: honor --all-match honored for multiple --grep patterns
When we have both header expression (which has to be an OR node by
construction) and a pattern expression (which could be anything), we
create a new top-level OR node to bind them together, and the
resulting expression structure looks like this:
OR
/ \
/ \
pattern OR
/ \ / \
..... committer OR
/ \
author TRUE
The three elements on the top-level backbone that are inspected by
the "all-match" logic are "pattern", "committer" and "author". When
there are more than one elements in the "pattern", the top-level
node of the "pattern" part of the subtree is an OR, and that node is
inspected by "all-match".
The result ends up ignoring the "--all-match" given from the command
line. A match on either side of the pattern is considered a match,
hence:
git log --grep=A --grep=B --author=C --all-match
shows the same "authored by C and has either A or B" that is correct
only when run without "--all-match".
Fix this by turning the resulting expression around when "--all-match"
is in effect, like this:
OR
/ \
/ \
/ OR
committer / \
author \
pattern
The set of nodes on the top-level backbone in the resulting
expression becomes "committer", "author", and the nodes that are on
the top-level backbone of the "pattern" subexpression. This makes
the "all-match" logic inspect the same nodes in "pattern" as the
case without the author and/or the committer restriction, and makes
the earlier "log" example to show "authored by C and has A and has
B", which is what the command line expects.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-09-14 01:26:57 +02:00
|
|
|
else if (opt->all_match)
|
|
|
|
opt->pattern_expression = grep_splice_or(header_expr,
|
|
|
|
opt->pattern_expression);
|
2010-09-13 07:15:35 +02:00
|
|
|
else
|
|
|
|
opt->pattern_expression = grep_or_expr(opt->pattern_expression,
|
|
|
|
header_expr);
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
opt->all_match = 1;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2006-09-28 01:27:10 +02:00
|
|
|
static void free_pattern_expr(struct grep_expr *x)
|
|
|
|
{
|
|
|
|
switch (x->node) {
|
2010-09-13 07:15:35 +02:00
|
|
|
case GREP_NODE_TRUE:
|
2006-09-28 01:27:10 +02:00
|
|
|
case GREP_NODE_ATOM:
|
|
|
|
break;
|
|
|
|
case GREP_NODE_NOT:
|
|
|
|
free_pattern_expr(x->u.unary);
|
|
|
|
break;
|
|
|
|
case GREP_NODE_AND:
|
|
|
|
case GREP_NODE_OR:
|
|
|
|
free_pattern_expr(x->u.binary.left);
|
|
|
|
free_pattern_expr(x->u.binary.right);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
free(x);
|
|
|
|
}
|
|
|
|
|
|
|
|
void free_grep_patterns(struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct grep_pat *p, *n;
|
|
|
|
|
|
|
|
for (p = opt->pattern_list; p; p = n) {
|
|
|
|
n = p->next;
|
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
2021-01-24 02:58:33 +01:00
|
|
|
if (p->pcre2_pattern)
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
free_pcre2_pattern(p);
|
2011-05-09 23:52:05 +02:00
|
|
|
else
|
|
|
|
regfree(&p->regexp);
|
2012-05-20 16:33:07 +02:00
|
|
|
free(p->pattern);
|
2006-09-28 01:27:10 +02:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
free(p);
|
|
|
|
}
|
|
|
|
|
grep.c: remove "extended" in favor of "pattern_expression", fix segfault
Since 79d3696cfb4 (git-grep: boolean expression on pattern matching.,
2006-06-30) the "pattern_expression" member has been used for complex
queries (AND/OR...), with "pattern_list" being used for the simple OR
queries. Since then we've used both "pattern_expression" and its
associated boolean "extended" member to see if we have a complex
expression.
Since f41fb662f57 (revisions API: have release_revisions() release
"grep_filter", 2022-04-13) we've had a subtle bug relating to that: If
we supplied options that were only used for "complex queries", but
didn't supply the query itself we'd set "opt->extended", but would
have a NULL "pattern_expression". As a result these would segfault as
we tried to call "free_grep_patterns()" from "release_revisions()":
git -P log -1 --invert-grep
git -P log -1 --all-match
The root cause of this is that we were conflating the state management
we needed in "compile_grep_patterns()" itself with whether or not we
had an "opt->pattern_expression" later on.
In this cases as we're going through "compile_grep_patterns()" we have
no "opt->pattern_list" but have "opt->no_body_match" or
"opt->all_match". So we'd set "opt->extended = 1", but not "return" on
"opt->extended" as that's an "else if" in the same "if" statement.
That behavior is intentional and required, as the common case is that
we have an "opt->pattern_list" that we're about to parse into the
"opt->pattern_expression".
But we don't need to keep track of this "extended" flag beyond the
state management in compile_grep_patterns() itself. It needs it, but
once we're out of that function we can rely on
"opt->pattern_expression" being non-NULL instead for using these
extended patterns.
As 79d3696cfb4 itself shows we've assumed that there's a one-to-one
mapping between the two since the very beginning. I.e. "match_line()"
would check "opt->extended" to see if it should call "match_expr()",
and the first thing we do in that function is assume that we have a
"opt->pattern_expression". We'd then call "match_expr_eval()", which
would have died if that "opt->pattern_expression" was NULL.
The "die" was added in c922b01f54c (grep: fix segfault when "git grep
'('" is given, 2009-04-27), and can now be removed as it's now clearly
unreachable. We still do the right thing in the case that prompted
that fix:
git grep '('
fatal: unmatched parenthesis
Arguably neither the "--invert-grep" option added in [1] nor the
earlier "--all-match" option added in [2] were intended to be used
stand-alone, and another approach[3] would be to error out in those
cases. But since we've been treating them as a NOOP when given without
--grep for a long time let's keep doing that.
We could also return in "free_pattern_expr()" if the argument is
non-NULL, as an alternative fix for this segfault does [4]. That would
be more elegant in making the "free_*()" function behave like
"free()", but it would also remove a sanity check: The
"free_pattern_expr()" function calls itself recursively, and only the
top-level is allowed to be NULL, let's not conflate those two
conditions.
1. 22dfa8a23de (log: teach --invert-grep option, 2015-01-12)
2. 0ab7befa31d (grep --all-match, 2006-09-27)
3. https://lore.kernel.org/git/patch-1.1-f4b90799fce-20221010T165711Z-avarab@gmail.com/
4. http://lore.kernel.org/git/7e094882c2a71894416089f894557a9eae07e8f8.1665423686.git.me@ttaylorr.com
Reported-by: orygaw <orygaw@protonmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-10-11 11:48:45 +02:00
|
|
|
if (!opt->pattern_expression)
|
2006-09-28 01:27:10 +02:00
|
|
|
return;
|
|
|
|
free_pattern_expr(opt->pattern_expression);
|
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static const char *end_of_line(const char *cp, unsigned long *left)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
unsigned long l = *left;
|
|
|
|
while (l && *cp != '\n') {
|
|
|
|
l--;
|
|
|
|
cp++;
|
|
|
|
}
|
|
|
|
*left = l;
|
|
|
|
return cp;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int word_char(char ch)
|
|
|
|
{
|
|
|
|
return isalnum(ch) || ch == '_';
|
|
|
|
}
|
|
|
|
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
static void output_color(struct grep_opt *opt, const void *data, size_t size,
|
|
|
|
const char *color)
|
|
|
|
{
|
color: delay auto-color decision until point of use
When we read a color value either from a config file or from
the command line, we use git_config_colorbool to convert it
from the tristate always/never/auto into a single yes/no
boolean value.
This has some timing implications with respect to starting
a pager.
If we start (or decide not to start) the pager before
checking the colorbool, everything is fine. Either isatty(1)
will give us the right information, or we will properly
check for pager_in_use().
However, if we decide to start a pager after we have checked
the colorbool, things are not so simple. If stdout is a tty,
then we will have already decided to use color. However, the
user may also have configured color.pager not to use color
with the pager. In this case, we need to actually turn off
color. Unfortunately, the pager code has no idea which color
variables were turned on (and there are many of them
throughout the code, and they may even have been manipulated
after the colorbool selection by something like "--color" on
the command line).
This bug can be seen any time a pager is started after
config and command line options are checked. This has
affected "git diff" since 89d07f7 (diff: don't run pager if
user asked for a diff style exit code, 2007-08-12). It has
also affect the log family since 1fda91b (Fix 'git log'
early pager startup error case, 2010-08-24).
This patch splits the notion of parsing a colorbool and
actually checking the configuration. The "use_color"
variables now have an additional possible value,
GIT_COLOR_AUTO. Users of the variable should use the new
"want_color()" wrapper, which will lazily determine and
cache the auto-color decision.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-18 07:04:23 +02:00
|
|
|
if (want_color(opt->color) && color && color[0]) {
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
opt->output(opt, color, strlen(color));
|
|
|
|
opt->output(opt, data, size);
|
|
|
|
opt->output(opt, GIT_COLOR_RESET, strlen(GIT_COLOR_RESET));
|
|
|
|
} else
|
|
|
|
opt->output(opt, data, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void output_sep(struct grep_opt *opt, char sign)
|
|
|
|
{
|
|
|
|
if (opt->null_following_name)
|
|
|
|
opt->output(opt, "\0", 1);
|
|
|
|
else
|
2018-05-26 15:55:22 +02:00
|
|
|
output_color(opt, &sign, 1, opt->colors[GREP_COLOR_SEP]);
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
}
|
|
|
|
|
2008-10-01 18:11:15 +02:00
|
|
|
static void show_name(struct grep_opt *opt, const char *name)
|
|
|
|
{
|
2018-05-26 15:55:22 +02:00
|
|
|
output_color(opt, name, strlen(name), opt->colors[GREP_COLOR_FILENAME]);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, opt->null_following_name ? "\0" : "\n", 1);
|
2008-10-01 18:11:15 +02:00
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static int patmatch(struct grep_pat *p,
|
|
|
|
const char *line, const char *eol,
|
2011-05-05 00:00:19 +02:00
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
int hit;
|
|
|
|
|
2021-01-24 02:58:33 +01:00
|
|
|
if (p->pcre2_pattern)
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
hit = !pcre2match(p, line, eol, match, eflags);
|
2011-05-05 00:00:19 +02:00
|
|
|
else
|
2016-09-21 20:24:14 +02:00
|
|
|
hit = !regexec_buf(&p->regexp, line, eol - line, 1, match,
|
|
|
|
eflags);
|
2011-05-05 00:00:19 +02:00
|
|
|
|
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static void strip_timestamp(const char *bol, const char **eol_p)
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
{
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *eol = *eol_p;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
|
|
|
|
while (bol < --eol) {
|
|
|
|
if (*eol != '>')
|
|
|
|
continue;
|
|
|
|
*eol_p = ++eol;
|
grep: stop modifying buffer in strip_timestamp
When grepping for headers in commit objects, we receive individual
lines (e.g., "author Name <email> 1234 -0000"), and then strip off the
timestamp to do our match. We do so by writing a NUL byte over the
whitespace separator, and then remembering to restore it later.
We had to do it this way when this was added back in a4d7d2c6db (log
--author/--committer: really match only with name part, 2008-09-04),
because we fed the result directly to regexec(), which expects a
NUL-terminated string. But since b7d36ffca0 (regex: use regexec_buf(),
2016-09-21), we have a function which can match part of a buffer.
So instead of modifying the string, we can instead just move the "eol"
pointer, and the rest of the code will do the right thing. This will let
further patches mark more buffers as "const".
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-21 05:46:56 +02:00
|
|
|
break;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct {
|
|
|
|
const char *field;
|
|
|
|
size_t len;
|
|
|
|
} header_field[] = {
|
|
|
|
{ "author ", 7 },
|
|
|
|
{ "committer ", 10 },
|
2012-09-29 06:41:28 +02:00
|
|
|
{ "reflog ", 7 },
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
};
|
|
|
|
|
2021-09-29 13:57:15 +02:00
|
|
|
static int headerless_match_one_pattern(struct grep_pat *p,
|
|
|
|
const char *bol, const char *eol,
|
|
|
|
enum grep_context ctx,
|
|
|
|
regmatch_t *pmatch, int eflags)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
int hit = 0;
|
2009-05-20 23:31:53 +02:00
|
|
|
const char *start = bol;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2006-09-20 21:39:46 +02:00
|
|
|
if ((p->token != GREP_PATTERN) &&
|
|
|
|
((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
|
|
|
|
return 0;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
again:
|
2011-05-05 00:00:19 +02:00
|
|
|
hit = patmatch(p, bol, eol, pmatch, eflags);
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2009-03-07 13:28:40 +01:00
|
|
|
if (hit && p->word_regexp) {
|
2006-09-18 01:02:52 +02:00
|
|
|
if ((pmatch[0].rm_so < 0) ||
|
2009-06-03 18:19:01 +02:00
|
|
|
(eol - bol) < pmatch[0].rm_so ||
|
2006-09-18 01:02:52 +02:00
|
|
|
(pmatch[0].rm_eo < 0) ||
|
|
|
|
(eol - bol) < pmatch[0].rm_eo)
|
|
|
|
die("regexp returned nonsense");
|
|
|
|
|
|
|
|
/* Match beginning must be either beginning of the
|
|
|
|
* line, or at word boundary (i.e. the last char must
|
|
|
|
* not be a word char). Similarly, match end must be
|
|
|
|
* either end of the line, or at word boundary
|
|
|
|
* (i.e. the next char must not be a word char).
|
|
|
|
*/
|
2009-01-10 00:08:40 +01:00
|
|
|
if ( ((pmatch[0].rm_so == 0) ||
|
2006-09-18 01:02:52 +02:00
|
|
|
!word_char(bol[pmatch[0].rm_so-1])) &&
|
|
|
|
((pmatch[0].rm_eo == (eol-bol)) ||
|
|
|
|
!word_char(bol[pmatch[0].rm_eo])) )
|
|
|
|
;
|
|
|
|
else
|
|
|
|
hit = 0;
|
|
|
|
|
2009-06-03 18:19:01 +02:00
|
|
|
/* Words consist of at least one character. */
|
|
|
|
if (pmatch->rm_so == pmatch->rm_eo)
|
|
|
|
hit = 0;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
|
|
|
|
/* There could be more than one match on the
|
|
|
|
* line, and the first match might not be
|
|
|
|
* strict word match. But later ones could be!
|
2009-01-10 00:08:40 +01:00
|
|
|
* Forward to the next possible start, i.e. the
|
|
|
|
* next position following a non-word char.
|
2006-09-18 01:02:52 +02:00
|
|
|
*/
|
|
|
|
bol = pmatch[0].rm_so + bol + 1;
|
2009-01-10 00:08:40 +01:00
|
|
|
while (word_char(bol[-1]) && bol < eol)
|
|
|
|
bol++;
|
2009-05-23 13:45:26 +02:00
|
|
|
eflags |= REG_NOTBOL;
|
2009-01-10 00:08:40 +01:00
|
|
|
if (bol < eol)
|
|
|
|
goto again;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
}
|
2009-05-20 23:31:53 +02:00
|
|
|
if (hit) {
|
|
|
|
pmatch[0].rm_so += bol - start;
|
|
|
|
pmatch[0].rm_eo += bol - start;
|
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2021-09-29 13:57:15 +02:00
|
|
|
static int match_one_pattern(struct grep_pat *p,
|
|
|
|
const char *bol, const char *eol,
|
|
|
|
enum grep_context ctx, regmatch_t *pmatch,
|
|
|
|
int eflags)
|
|
|
|
{
|
|
|
|
const char *field;
|
|
|
|
size_t len;
|
|
|
|
|
|
|
|
if (p->token == GREP_PATTERN_HEAD) {
|
|
|
|
assert(p->field < ARRAY_SIZE(header_field));
|
|
|
|
field = header_field[p->field].field;
|
|
|
|
len = header_field[p->field].len;
|
|
|
|
if (strncmp(bol, field, len))
|
|
|
|
return 0;
|
|
|
|
bol += len;
|
|
|
|
|
|
|
|
switch (p->field) {
|
|
|
|
case GREP_HEADER_AUTHOR:
|
|
|
|
case GREP_HEADER_COMMITTER:
|
|
|
|
strip_timestamp(bol, &eol);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return headerless_match_one_pattern(p, bol, eol, ctx, pmatch, eflags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static int match_expr_eval(struct grep_opt *opt, struct grep_expr *x,
|
|
|
|
const char *bol, const char *eol,
|
|
|
|
enum grep_context ctx, ssize_t *col,
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
ssize_t *icol, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
2006-09-28 02:50:52 +02:00
|
|
|
int h = 0;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
switch (x->node) {
|
2010-09-13 07:15:35 +02:00
|
|
|
case GREP_NODE_TRUE:
|
|
|
|
h = 1;
|
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
case GREP_NODE_ATOM:
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
{
|
|
|
|
regmatch_t tmp;
|
|
|
|
h = match_one_pattern(x->u.atom, bol, eol, ctx,
|
|
|
|
&tmp, 0);
|
|
|
|
if (h && (*col < 0 || tmp.rm_so < *col))
|
|
|
|
*col = tmp.rm_so;
|
|
|
|
}
|
2021-12-17 17:48:49 +01:00
|
|
|
if (x->u.atom->token == GREP_PATTERN_BODY)
|
|
|
|
opt->body_hit |= h;
|
2006-09-18 01:02:52 +02:00
|
|
|
break;
|
|
|
|
case GREP_NODE_NOT:
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
/*
|
|
|
|
* Upon visiting a GREP_NODE_NOT, col and icol become swapped.
|
|
|
|
*/
|
|
|
|
h = !match_expr_eval(opt, x->u.unary, bol, eol, ctx, icol, col,
|
|
|
|
0);
|
2006-09-28 02:50:52 +02:00
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
case GREP_NODE_AND:
|
2018-06-22 17:49:39 +02:00
|
|
|
h = match_expr_eval(opt, x->u.binary.left, bol, eol, ctx, col,
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
icol, 0);
|
2018-06-22 17:49:39 +02:00
|
|
|
if (h || opt->columnnum) {
|
|
|
|
/*
|
|
|
|
* Don't short-circuit AND when given --column, since a
|
|
|
|
* NOT earlier in the tree may turn this into an OR. In
|
|
|
|
* this case, see the below comment.
|
|
|
|
*/
|
|
|
|
h &= match_expr_eval(opt, x->u.binary.right, bol, eol,
|
|
|
|
ctx, col, icol, 0);
|
|
|
|
}
|
2006-09-28 02:50:52 +02:00
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
case GREP_NODE_OR:
|
2018-06-22 17:49:39 +02:00
|
|
|
if (!(collect_hits || opt->columnnum)) {
|
|
|
|
/*
|
|
|
|
* Don't short-circuit OR when given --column (or
|
|
|
|
* collecting hits) to ensure we don't skip a later
|
|
|
|
* child that would produce an earlier match.
|
|
|
|
*/
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
return (match_expr_eval(opt, x->u.binary.left, bol, eol,
|
|
|
|
ctx, col, icol, 0) ||
|
|
|
|
match_expr_eval(opt, x->u.binary.right, bol,
|
|
|
|
eol, ctx, col, icol, 0));
|
2018-06-22 17:49:39 +02:00
|
|
|
}
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
h = match_expr_eval(opt, x->u.binary.left, bol, eol, ctx, col,
|
|
|
|
icol, 0);
|
2018-06-22 17:49:39 +02:00
|
|
|
if (collect_hits)
|
|
|
|
x->u.binary.left->hit |= h;
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
h |= match_expr_eval(opt, x->u.binary.right, bol, eol, ctx, col,
|
2018-06-22 17:49:39 +02:00
|
|
|
icol, collect_hits);
|
2006-09-28 02:50:52 +02:00
|
|
|
break;
|
|
|
|
default:
|
2009-01-04 19:38:41 +01:00
|
|
|
die("Unexpected node type (internal error) %d", x->node);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
2006-09-28 02:50:52 +02:00
|
|
|
if (collect_hits)
|
|
|
|
x->hit |= h;
|
|
|
|
return h;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static int match_expr(struct grep_opt *opt,
|
|
|
|
const char *bol, const char *eol,
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
enum grep_context ctx, ssize_t *col,
|
|
|
|
ssize_t *icol, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
struct grep_expr *x = opt->pattern_expression;
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
return match_expr_eval(opt, x, bol, eol, ctx, col, icol, collect_hits);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static int match_line(struct grep_opt *opt,
|
|
|
|
const char *bol, const char *eol,
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
ssize_t *col, ssize_t *icol,
|
2006-09-28 02:50:52 +02:00
|
|
|
enum grep_context ctx, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
2018-06-22 17:49:39 +02:00
|
|
|
int hit = 0;
|
2009-03-07 13:30:27 +01:00
|
|
|
|
grep.c: remove "extended" in favor of "pattern_expression", fix segfault
Since 79d3696cfb4 (git-grep: boolean expression on pattern matching.,
2006-06-30) the "pattern_expression" member has been used for complex
queries (AND/OR...), with "pattern_list" being used for the simple OR
queries. Since then we've used both "pattern_expression" and its
associated boolean "extended" member to see if we have a complex
expression.
Since f41fb662f57 (revisions API: have release_revisions() release
"grep_filter", 2022-04-13) we've had a subtle bug relating to that: If
we supplied options that were only used for "complex queries", but
didn't supply the query itself we'd set "opt->extended", but would
have a NULL "pattern_expression". As a result these would segfault as
we tried to call "free_grep_patterns()" from "release_revisions()":
git -P log -1 --invert-grep
git -P log -1 --all-match
The root cause of this is that we were conflating the state management
we needed in "compile_grep_patterns()" itself with whether or not we
had an "opt->pattern_expression" later on.
In this cases as we're going through "compile_grep_patterns()" we have
no "opt->pattern_list" but have "opt->no_body_match" or
"opt->all_match". So we'd set "opt->extended = 1", but not "return" on
"opt->extended" as that's an "else if" in the same "if" statement.
That behavior is intentional and required, as the common case is that
we have an "opt->pattern_list" that we're about to parse into the
"opt->pattern_expression".
But we don't need to keep track of this "extended" flag beyond the
state management in compile_grep_patterns() itself. It needs it, but
once we're out of that function we can rely on
"opt->pattern_expression" being non-NULL instead for using these
extended patterns.
As 79d3696cfb4 itself shows we've assumed that there's a one-to-one
mapping between the two since the very beginning. I.e. "match_line()"
would check "opt->extended" to see if it should call "match_expr()",
and the first thing we do in that function is assume that we have a
"opt->pattern_expression". We'd then call "match_expr_eval()", which
would have died if that "opt->pattern_expression" was NULL.
The "die" was added in c922b01f54c (grep: fix segfault when "git grep
'('" is given, 2009-04-27), and can now be removed as it's now clearly
unreachable. We still do the right thing in the case that prompted
that fix:
git grep '('
fatal: unmatched parenthesis
Arguably neither the "--invert-grep" option added in [1] nor the
earlier "--all-match" option added in [2] were intended to be used
stand-alone, and another approach[3] would be to error out in those
cases. But since we've been treating them as a NOOP when given without
--grep for a long time let's keep doing that.
We could also return in "free_pattern_expr()" if the argument is
non-NULL, as an alternative fix for this segfault does [4]. That would
be more elegant in making the "free_*()" function behave like
"free()", but it would also remove a sanity check: The
"free_pattern_expr()" function calls itself recursively, and only the
top-level is allowed to be NULL, let's not conflate those two
conditions.
1. 22dfa8a23de (log: teach --invert-grep option, 2015-01-12)
2. 0ab7befa31d (grep --all-match, 2006-09-27)
3. https://lore.kernel.org/git/patch-1.1-f4b90799fce-20221010T165711Z-avarab@gmail.com/
4. http://lore.kernel.org/git/7e094882c2a71894416089f894557a9eae07e8f8.1665423686.git.me@ttaylorr.com
Reported-by: orygaw <orygaw@protonmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-10-11 11:48:45 +02:00
|
|
|
if (opt->pattern_expression)
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
return match_expr(opt, bol, eol, ctx, col, icol,
|
|
|
|
collect_hits);
|
2006-09-28 02:50:52 +02:00
|
|
|
|
|
|
|
/* we do not call with collect_hits without being extended */
|
2006-09-18 01:02:52 +02:00
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
regmatch_t tmp;
|
|
|
|
if (match_one_pattern(p, bol, eol, ctx, &tmp, 0)) {
|
2018-06-22 17:49:39 +02:00
|
|
|
hit |= 1;
|
|
|
|
if (!opt->columnnum) {
|
|
|
|
/*
|
|
|
|
* Without --column, any single match on a line
|
|
|
|
* is enough to know that it needs to be
|
|
|
|
* printed. With --column, scan _all_ patterns
|
|
|
|
* to find the earliest.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (*col < 0 || tmp.rm_so < *col)
|
|
|
|
*col = tmp.rm_so;
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
2018-06-22 17:49:39 +02:00
|
|
|
return hit;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static int match_next_pattern(struct grep_pat *p,
|
|
|
|
const char *bol, const char *eol,
|
2009-03-07 13:32:32 +01:00
|
|
|
enum grep_context ctx,
|
|
|
|
regmatch_t *pmatch, int eflags)
|
|
|
|
{
|
|
|
|
regmatch_t match;
|
|
|
|
|
2021-09-29 13:57:15 +02:00
|
|
|
if (!headerless_match_one_pattern(p, bol, eol, ctx, &match, eflags))
|
2009-03-07 13:32:32 +01:00
|
|
|
return 0;
|
|
|
|
if (match.rm_so < 0 || match.rm_eo < 0)
|
|
|
|
return 0;
|
|
|
|
if (pmatch->rm_so >= 0 && pmatch->rm_eo >= 0) {
|
|
|
|
if (match.rm_so > pmatch->rm_so)
|
|
|
|
return 1;
|
|
|
|
if (match.rm_so == pmatch->rm_so && match.rm_eo < pmatch->rm_eo)
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
pmatch->rm_so = match.rm_so;
|
|
|
|
pmatch->rm_eo = match.rm_eo;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2021-09-29 13:57:15 +02:00
|
|
|
int grep_next_match(struct grep_opt *opt,
|
|
|
|
const char *bol, const char *eol,
|
|
|
|
enum grep_context ctx, regmatch_t *pmatch,
|
|
|
|
enum grep_header_field field, int eflags)
|
2009-03-07 13:32:32 +01:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
int hit = 0;
|
|
|
|
|
|
|
|
pmatch->rm_so = pmatch->rm_eo = -1;
|
|
|
|
if (bol < eol) {
|
2021-09-29 13:57:15 +02:00
|
|
|
for (p = ((ctx == GREP_CONTEXT_HEAD)
|
|
|
|
? opt->header_list : opt->pattern_list);
|
|
|
|
p; p = p->next) {
|
2009-03-07 13:32:32 +01:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN_HEAD:
|
2021-09-29 13:57:15 +02:00
|
|
|
if ((field != GREP_HEADER_FIELD_MAX) &&
|
|
|
|
(p->field != field))
|
|
|
|
continue;
|
|
|
|
/* fall thru */
|
|
|
|
case GREP_PATTERN: /* atom */
|
2009-03-07 13:32:32 +01:00
|
|
|
case GREP_PATTERN_BODY:
|
|
|
|
hit |= match_next_pattern(p, bol, eol, ctx,
|
|
|
|
pmatch, eflags);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2018-07-03 23:51:56 +02:00
|
|
|
static void show_line_header(struct grep_opt *opt, const char *name,
|
|
|
|
unsigned lno, ssize_t cno, char sign)
|
2009-03-07 13:32:32 +01:00
|
|
|
{
|
2011-06-05 17:24:36 +02:00
|
|
|
if (opt->heading && opt->last_shown == 0) {
|
2018-05-26 15:55:22 +02:00
|
|
|
output_color(opt, name, strlen(name), opt->colors[GREP_COLOR_FILENAME]);
|
2011-06-05 17:24:36 +02:00
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
}
|
2009-07-02 00:02:38 +02:00
|
|
|
opt->last_shown = lno;
|
|
|
|
|
2011-06-05 17:24:36 +02:00
|
|
|
if (!opt->heading && opt->pathname) {
|
2018-05-26 15:55:22 +02:00
|
|
|
output_color(opt, name, strlen(name), opt->colors[GREP_COLOR_FILENAME]);
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_sep(opt, sign);
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
|
|
|
if (opt->linenum) {
|
|
|
|
char buf[32];
|
2017-03-28 21:46:56 +02:00
|
|
|
xsnprintf(buf, sizeof(buf), "%d", lno);
|
2018-05-26 15:55:22 +02:00
|
|
|
output_color(opt, buf, strlen(buf), opt->colors[GREP_COLOR_LINENO]);
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_sep(opt, sign);
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
2018-06-22 17:49:42 +02:00
|
|
|
/*
|
|
|
|
* Treat 'cno' as the 1-indexed offset from the start of a non-context
|
|
|
|
* line to its first match. Otherwise, 'cno' is 0 indicating that we are
|
|
|
|
* being called with a context line.
|
|
|
|
*/
|
|
|
|
if (opt->columnnum && cno) {
|
|
|
|
char buf[32];
|
|
|
|
xsnprintf(buf, sizeof(buf), "%"PRIuMAX, (uintmax_t)cno);
|
2018-07-18 21:20:31 +02:00
|
|
|
output_color(opt, buf, strlen(buf), opt->colors[GREP_COLOR_COLUMNNO]);
|
2018-06-22 17:49:42 +02:00
|
|
|
output_sep(opt, sign);
|
|
|
|
}
|
2018-07-03 23:51:56 +02:00
|
|
|
}
|
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static void show_line(struct grep_opt *opt,
|
|
|
|
const char *bol, const char *eol,
|
2018-07-03 23:51:56 +02:00
|
|
|
const char *name, unsigned lno, ssize_t cno, char sign)
|
|
|
|
{
|
|
|
|
int rest = eol - bol;
|
2018-07-09 22:33:47 +02:00
|
|
|
const char *match_color = NULL;
|
|
|
|
const char *line_color = NULL;
|
2018-07-03 23:51:56 +02:00
|
|
|
|
|
|
|
if (opt->file_break && opt->last_shown == 0) {
|
|
|
|
if (opt->show_hunk_mark)
|
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
} else if (opt->pre_context || opt->post_context || opt->funcbody) {
|
|
|
|
if (opt->last_shown == 0) {
|
|
|
|
if (opt->show_hunk_mark) {
|
2018-08-03 00:30:44 +02:00
|
|
|
output_color(opt, "--", 2, opt->colors[GREP_COLOR_SEP]);
|
2018-07-03 23:51:56 +02:00
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
}
|
|
|
|
} else if (lno > opt->last_shown + 1) {
|
2018-08-03 00:30:44 +02:00
|
|
|
output_color(opt, "--", 2, opt->colors[GREP_COLOR_SEP]);
|
2018-07-03 23:51:56 +02:00
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
}
|
|
|
|
}
|
2018-07-09 22:33:47 +02:00
|
|
|
if (!opt->only_matching) {
|
|
|
|
/*
|
|
|
|
* In case the line we're being called with contains more than
|
|
|
|
* one match, leave printing each header to the loop below.
|
|
|
|
*/
|
|
|
|
show_line_header(opt, name, lno, cno, sign);
|
|
|
|
}
|
|
|
|
if (opt->color || opt->only_matching) {
|
2009-03-07 13:32:32 +01:00
|
|
|
regmatch_t match;
|
|
|
|
enum grep_context ctx = GREP_CONTEXT_BODY;
|
|
|
|
int eflags = 0;
|
|
|
|
|
2018-07-09 22:33:47 +02:00
|
|
|
if (opt->color) {
|
|
|
|
if (sign == ':')
|
2018-08-03 00:30:44 +02:00
|
|
|
match_color = opt->colors[GREP_COLOR_MATCH_SELECTED];
|
2018-07-09 22:33:47 +02:00
|
|
|
else
|
2018-08-03 00:30:44 +02:00
|
|
|
match_color = opt->colors[GREP_COLOR_MATCH_CONTEXT];
|
2018-07-09 22:33:47 +02:00
|
|
|
if (sign == ':')
|
2018-08-03 00:30:44 +02:00
|
|
|
line_color = opt->colors[GREP_COLOR_SELECTED];
|
2018-07-09 22:33:47 +02:00
|
|
|
else if (sign == '-')
|
2018-08-03 00:30:44 +02:00
|
|
|
line_color = opt->colors[GREP_COLOR_CONTEXT];
|
2018-07-09 22:33:47 +02:00
|
|
|
else if (sign == '=')
|
2018-08-03 00:30:44 +02:00
|
|
|
line_color = opt->colors[GREP_COLOR_FUNCTION];
|
2018-07-09 22:33:47 +02:00
|
|
|
}
|
2021-09-29 13:57:15 +02:00
|
|
|
while (grep_next_match(opt, bol, eol, ctx, &match,
|
|
|
|
GREP_HEADER_FIELD_MAX, eflags)) {
|
2009-06-01 23:53:05 +02:00
|
|
|
if (match.rm_so == match.rm_eo)
|
|
|
|
break;
|
2010-01-25 23:51:39 +01:00
|
|
|
|
2018-07-09 22:33:47 +02:00
|
|
|
if (opt->only_matching)
|
|
|
|
show_line_header(opt, name, lno, cno, sign);
|
|
|
|
else
|
|
|
|
output_color(opt, bol, match.rm_so, line_color);
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_color(opt, bol + match.rm_so,
|
2014-10-27 19:23:05 +01:00
|
|
|
match.rm_eo - match.rm_so, match_color);
|
2018-07-09 22:33:47 +02:00
|
|
|
if (opt->only_matching)
|
|
|
|
opt->output(opt, "\n", 1);
|
2009-03-07 13:32:32 +01:00
|
|
|
bol += match.rm_eo;
|
2018-07-09 22:33:47 +02:00
|
|
|
cno += match.rm_eo;
|
2009-03-07 13:32:32 +01:00
|
|
|
rest -= match.rm_eo;
|
|
|
|
eflags = REG_NOTBOL;
|
|
|
|
}
|
|
|
|
}
|
2018-07-09 22:33:47 +02:00
|
|
|
if (!opt->only_matching) {
|
|
|
|
output_color(opt, bol, rest, line_color);
|
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
}
|
2009-03-07 13:32:32 +01:00
|
|
|
}
|
|
|
|
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
int grep_use_locks;
|
|
|
|
|
2011-12-12 22:16:07 +01:00
|
|
|
/*
|
|
|
|
* This lock protects access to the gitattributes machinery, which is
|
|
|
|
* not thread-safe.
|
|
|
|
*/
|
|
|
|
pthread_mutex_t grep_attr_mutex;
|
|
|
|
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
static inline void grep_attr_lock(void)
|
2011-12-12 22:16:07 +01:00
|
|
|
{
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
if (grep_use_locks)
|
2011-12-12 22:16:07 +01:00
|
|
|
pthread_mutex_lock(&grep_attr_mutex);
|
|
|
|
}
|
|
|
|
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
static inline void grep_attr_unlock(void)
|
2011-12-12 22:16:07 +01:00
|
|
|
{
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
if (grep_use_locks)
|
2011-12-12 22:16:07 +01:00
|
|
|
pthread_mutex_unlock(&grep_attr_mutex);
|
|
|
|
}
|
2012-02-02 09:18:41 +01:00
|
|
|
|
2021-09-21 05:49:49 +02:00
|
|
|
static int match_funcname(struct grep_opt *opt, struct grep_source *gs,
|
|
|
|
const char *bol, const char *eol)
|
2009-07-02 00:06:34 +02:00
|
|
|
{
|
2009-07-02 00:07:24 +02:00
|
|
|
xdemitconf_t *xecfg = opt->priv;
|
2011-12-12 22:16:07 +01:00
|
|
|
if (xecfg && !xecfg->find_func) {
|
2018-09-21 17:57:33 +02:00
|
|
|
grep_source_load_driver(gs, opt->repo->index);
|
2012-02-02 09:20:43 +01:00
|
|
|
if (gs->driver->funcname.pattern) {
|
|
|
|
const struct userdiff_funcname *pe = &gs->driver->funcname;
|
2011-12-12 22:16:07 +01:00
|
|
|
xdiff_set_find_func(xecfg, pe->pattern, pe->cflags);
|
|
|
|
} else {
|
|
|
|
xecfg = opt->priv = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xecfg) {
|
2009-07-02 00:07:24 +02:00
|
|
|
char buf[1];
|
|
|
|
return xecfg->find_func(bol, eol - bol, buf, 1,
|
|
|
|
xecfg->find_func_priv) >= 0;
|
|
|
|
}
|
|
|
|
|
2009-07-02 00:06:34 +02:00
|
|
|
if (bol == eol)
|
|
|
|
return 0;
|
|
|
|
if (isalpha(*bol) || *bol == '_' || *bol == '$')
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static void show_funcname_line(struct grep_opt *opt, struct grep_source *gs,
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *bol, unsigned lno)
|
2009-07-02 00:06:34 +02:00
|
|
|
{
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf) {
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *eol = --bol;
|
2009-07-02 00:06:34 +02:00
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf && bol[-1] != '\n')
|
2009-07-02 00:06:34 +02:00
|
|
|
bol--;
|
|
|
|
lno--;
|
|
|
|
|
|
|
|
if (lno <= opt->last_shown)
|
|
|
|
break;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (match_funcname(opt, gs, bol, eol)) {
|
2018-06-22 17:49:42 +02:00
|
|
|
show_line(opt, bol, eol, gs->name, lno, 0, '=');
|
2009-07-02 00:06:34 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-18 19:08:08 +01:00
|
|
|
static int is_empty_line(const char *bol, const char *eol);
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static void show_pre_context(struct grep_opt *opt, struct grep_source *gs,
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *bol, const char *end, unsigned lno)
|
2009-07-02 00:05:17 +02:00
|
|
|
{
|
2017-11-18 19:07:13 +01:00
|
|
|
unsigned cur = lno, from = 1, funcname_lno = 0, orig_from;
|
2017-11-18 19:08:08 +01:00
|
|
|
int funcname_needed = !!opt->funcname, comment_needed = 0;
|
2011-08-01 19:20:53 +02:00
|
|
|
|
2009-07-02 00:05:17 +02:00
|
|
|
if (opt->pre_context < lno)
|
|
|
|
from = lno - opt->pre_context;
|
|
|
|
if (from <= opt->last_shown)
|
|
|
|
from = opt->last_shown + 1;
|
2017-11-18 19:07:13 +01:00
|
|
|
orig_from = from;
|
2017-11-18 19:08:08 +01:00
|
|
|
if (opt->funcbody) {
|
|
|
|
if (match_funcname(opt, gs, bol, end))
|
|
|
|
comment_needed = 1;
|
|
|
|
else
|
|
|
|
funcname_needed = 1;
|
2017-11-18 19:07:13 +01:00
|
|
|
from = opt->last_shown + 1;
|
|
|
|
}
|
2009-07-02 00:05:17 +02:00
|
|
|
|
|
|
|
/* Rewind. */
|
2017-11-18 19:07:13 +01:00
|
|
|
while (bol > gs->buf && cur > from) {
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *next_bol = bol;
|
|
|
|
const char *eol = --bol;
|
2009-07-02 00:06:34 +02:00
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf && bol[-1] != '\n')
|
2009-07-02 00:05:17 +02:00
|
|
|
bol--;
|
|
|
|
cur--;
|
2017-11-18 19:08:08 +01:00
|
|
|
if (comment_needed && (is_empty_line(bol, eol) ||
|
|
|
|
match_funcname(opt, gs, bol, eol))) {
|
|
|
|
comment_needed = 0;
|
|
|
|
from = orig_from;
|
|
|
|
if (cur < from) {
|
|
|
|
cur++;
|
|
|
|
bol = next_bol;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (funcname_needed && match_funcname(opt, gs, bol, eol)) {
|
2009-07-02 00:06:34 +02:00
|
|
|
funcname_lno = cur;
|
|
|
|
funcname_needed = 0;
|
2017-11-18 19:08:08 +01:00
|
|
|
if (opt->funcbody)
|
|
|
|
comment_needed = 1;
|
|
|
|
else
|
|
|
|
from = orig_from;
|
2009-07-02 00:06:34 +02:00
|
|
|
}
|
2009-07-02 00:05:17 +02:00
|
|
|
}
|
|
|
|
|
2009-07-02 00:06:34 +02:00
|
|
|
/* We need to look even further back to find a function signature. */
|
|
|
|
if (opt->funcname && funcname_needed)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_funcname_line(opt, gs, bol, cur);
|
2009-07-02 00:06:34 +02:00
|
|
|
|
2009-07-02 00:05:17 +02:00
|
|
|
/* Back forward. */
|
|
|
|
while (cur < lno) {
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *eol = bol, sign = (cur == funcname_lno) ? '=' : '-';
|
2009-07-02 00:05:17 +02:00
|
|
|
|
|
|
|
while (*eol != '\n')
|
|
|
|
eol++;
|
2018-06-22 17:49:42 +02:00
|
|
|
show_line(opt, bol, eol, gs->name, cur, 0, sign);
|
2009-07-02 00:05:17 +02:00
|
|
|
bol = eol + 1;
|
|
|
|
cur++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-01-11 07:39:36 +01:00
|
|
|
static int should_lookahead(struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
|
grep.c: remove "extended" in favor of "pattern_expression", fix segfault
Since 79d3696cfb4 (git-grep: boolean expression on pattern matching.,
2006-06-30) the "pattern_expression" member has been used for complex
queries (AND/OR...), with "pattern_list" being used for the simple OR
queries. Since then we've used both "pattern_expression" and its
associated boolean "extended" member to see if we have a complex
expression.
Since f41fb662f57 (revisions API: have release_revisions() release
"grep_filter", 2022-04-13) we've had a subtle bug relating to that: If
we supplied options that were only used for "complex queries", but
didn't supply the query itself we'd set "opt->extended", but would
have a NULL "pattern_expression". As a result these would segfault as
we tried to call "free_grep_patterns()" from "release_revisions()":
git -P log -1 --invert-grep
git -P log -1 --all-match
The root cause of this is that we were conflating the state management
we needed in "compile_grep_patterns()" itself with whether or not we
had an "opt->pattern_expression" later on.
In this cases as we're going through "compile_grep_patterns()" we have
no "opt->pattern_list" but have "opt->no_body_match" or
"opt->all_match". So we'd set "opt->extended = 1", but not "return" on
"opt->extended" as that's an "else if" in the same "if" statement.
That behavior is intentional and required, as the common case is that
we have an "opt->pattern_list" that we're about to parse into the
"opt->pattern_expression".
But we don't need to keep track of this "extended" flag beyond the
state management in compile_grep_patterns() itself. It needs it, but
once we're out of that function we can rely on
"opt->pattern_expression" being non-NULL instead for using these
extended patterns.
As 79d3696cfb4 itself shows we've assumed that there's a one-to-one
mapping between the two since the very beginning. I.e. "match_line()"
would check "opt->extended" to see if it should call "match_expr()",
and the first thing we do in that function is assume that we have a
"opt->pattern_expression". We'd then call "match_expr_eval()", which
would have died if that "opt->pattern_expression" was NULL.
The "die" was added in c922b01f54c (grep: fix segfault when "git grep
'('" is given, 2009-04-27), and can now be removed as it's now clearly
unreachable. We still do the right thing in the case that prompted
that fix:
git grep '('
fatal: unmatched parenthesis
Arguably neither the "--invert-grep" option added in [1] nor the
earlier "--all-match" option added in [2] were intended to be used
stand-alone, and another approach[3] would be to error out in those
cases. But since we've been treating them as a NOOP when given without
--grep for a long time let's keep doing that.
We could also return in "free_pattern_expr()" if the argument is
non-NULL, as an alternative fix for this segfault does [4]. That would
be more elegant in making the "free_*()" function behave like
"free()", but it would also remove a sanity check: The
"free_pattern_expr()" function calls itself recursively, and only the
top-level is allowed to be NULL, let's not conflate those two
conditions.
1. 22dfa8a23de (log: teach --invert-grep option, 2015-01-12)
2. 0ab7befa31d (grep --all-match, 2006-09-27)
3. https://lore.kernel.org/git/patch-1.1-f4b90799fce-20221010T165711Z-avarab@gmail.com/
4. http://lore.kernel.org/git/7e094882c2a71894416089f894557a9eae07e8f8.1665423686.git.me@ttaylorr.com
Reported-by: orygaw <orygaw@protonmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-10-11 11:48:45 +02:00
|
|
|
if (opt->pattern_expression)
|
2010-01-11 07:39:36 +01:00
|
|
|
return 0; /* punt for too complex stuff */
|
|
|
|
if (opt->invert)
|
|
|
|
return 0;
|
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
|
|
|
if (p->token != GREP_PATTERN)
|
|
|
|
return 0; /* punt for "header only" and stuff */
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int look_ahead(struct grep_opt *opt,
|
|
|
|
unsigned long *left_p,
|
|
|
|
unsigned *lno_p,
|
2021-09-21 05:49:49 +02:00
|
|
|
const char **bol_p)
|
2010-01-11 07:39:36 +01:00
|
|
|
{
|
|
|
|
unsigned lno = *lno_p;
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *bol = *bol_p;
|
2010-01-11 07:39:36 +01:00
|
|
|
struct grep_pat *p;
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *sp, *last_bol;
|
2010-01-11 07:39:36 +01:00
|
|
|
regoff_t earliest = -1;
|
|
|
|
|
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
|
|
|
int hit;
|
|
|
|
regmatch_t m;
|
|
|
|
|
2011-05-05 00:00:19 +02:00
|
|
|
hit = patmatch(p, bol, bol + *left_p, &m, 0);
|
2010-01-11 07:39:36 +01:00
|
|
|
if (!hit || m.rm_so < 0 || m.rm_eo < 0)
|
|
|
|
continue;
|
|
|
|
if (earliest < 0 || m.rm_so < earliest)
|
|
|
|
earliest = m.rm_so;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (earliest < 0) {
|
|
|
|
*bol_p = bol + *left_p;
|
|
|
|
*left_p = 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
|
|
|
|
; /* find the beginning of the line */
|
|
|
|
last_bol = sp;
|
|
|
|
|
|
|
|
for (sp = bol; sp < last_bol; sp++) {
|
|
|
|
if (*sp == '\n')
|
|
|
|
lno++;
|
|
|
|
}
|
|
|
|
*left_p -= last_bol - bol;
|
|
|
|
*bol_p = last_bol;
|
|
|
|
*lno_p = lno;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-09-21 17:57:23 +02:00
|
|
|
static int fill_textconv_grep(struct repository *r,
|
|
|
|
struct userdiff_driver *driver,
|
2013-05-10 17:10:15 +02:00
|
|
|
struct grep_source *gs)
|
|
|
|
{
|
|
|
|
struct diff_filespec *df;
|
|
|
|
char *buf;
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
if (!driver || !driver->textconv)
|
|
|
|
return grep_source_load(gs);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The textconv interface is intimately tied to diff_filespecs, so we
|
|
|
|
* have to pretend to be one. If we could unify the grep_source
|
|
|
|
* and diff_filespec structs, this mess could just go away.
|
|
|
|
*/
|
|
|
|
df = alloc_filespec(gs->path);
|
|
|
|
switch (gs->type) {
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
2013-05-10 17:10:15 +02:00
|
|
|
fill_filespec(df, gs->identifier, 1, 0100644);
|
|
|
|
break;
|
|
|
|
case GREP_SOURCE_FILE:
|
2021-04-26 03:02:56 +02:00
|
|
|
fill_filespec(df, null_oid(), 0, 0100644);
|
2013-05-10 17:10:15 +02:00
|
|
|
break;
|
|
|
|
default:
|
2018-05-02 11:38:39 +02:00
|
|
|
BUG("attempt to textconv something without a path?");
|
2013-05-10 17:10:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
* fill_textconv is not remotely thread-safe; it modifies the global
|
|
|
|
* diff tempfile structure, writes to the_repo's odb and might
|
|
|
|
* internally call thread-unsafe functions such as the
|
|
|
|
* prepare_packed_git() lazy-initializator. Because of the last two, we
|
|
|
|
* must ensure mutual exclusion between this call and the object reading
|
|
|
|
* API, thus we use obj_read_lock() here.
|
|
|
|
*
|
|
|
|
* TODO: allowing text conversion to run in parallel with object
|
|
|
|
* reading operations might increase performance in the multithreaded
|
|
|
|
* non-worktreee git-grep with --textconv.
|
2013-05-10 17:10:15 +02:00
|
|
|
*/
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
obj_read_lock();
|
2018-09-21 17:57:23 +02:00
|
|
|
size = fill_textconv(r, driver, df, &buf);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
obj_read_unlock();
|
2013-05-10 17:10:15 +02:00
|
|
|
free_filespec(df);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The normal fill_textconv usage by the diff machinery would just keep
|
|
|
|
* the textconv'd buf separate from the diff_filespec. But much of the
|
|
|
|
* grep code passes around a grep_source and assumes that its "buf"
|
|
|
|
* pointer is the beginning of the thing we are searching. So let's
|
|
|
|
* install our textconv'd version into the grep_source, taking care not
|
|
|
|
* to leak any existing buffer.
|
|
|
|
*/
|
|
|
|
grep_source_clear_data(gs);
|
|
|
|
gs->buf = buf;
|
|
|
|
gs->size = size;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-28 17:06:19 +02:00
|
|
|
static int is_empty_line(const char *bol, const char *eol)
|
|
|
|
{
|
|
|
|
while (bol < eol && isspace(*bol))
|
|
|
|
bol++;
|
|
|
|
return bol == eol;
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *bol;
|
|
|
|
const char *peek_bol = NULL;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
unsigned long left;
|
2006-09-18 01:02:52 +02:00
|
|
|
unsigned lno = 1;
|
|
|
|
unsigned last_hit = 0;
|
|
|
|
int binary_match_only = 0;
|
|
|
|
unsigned count = 0;
|
2010-01-11 07:39:36 +01:00
|
|
|
int try_lookahead = 0;
|
2011-08-01 19:20:53 +02:00
|
|
|
int show_function = 0;
|
2013-05-10 17:10:15 +02:00
|
|
|
struct userdiff_driver *textconv = NULL;
|
2006-09-20 21:39:46 +02:00
|
|
|
enum grep_context ctx = GREP_CONTEXT_HEAD;
|
2009-07-02 00:07:24 +02:00
|
|
|
xdemitconf_t xecfg;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
grep: fail if call could output and name is null
grep_source(), which performs much of the work for Git's grep library,
allows passing an arbitrary struct grep_source which represents the text
which grep_source() should search to match a pattern in the provided
struct grep_opt. In most callers, the grep_source::name field is set to
an appropriate prefix to print before a colon when a result matches:
README:Git is an Open Source project covered by the GNU General
One caller, grep_buffer(), leaves the grep_source::name field set to
NULL because there isn't enough context to determine an appropriate name
for this kind of output line. In practice, this has been fine: the only
caller of grep_buffer() is "git log --grep", and that caller sets
grep_opt::status_only, which disables output and only checks whether a
match exists. But this is brittle: a future caller can call
grep_buffer() without grep_opt::status_only set, and as soon as it hits
a match, grep_source() will try to print the match and segfault:
(null):Git is an Open Source project covered by the GNU General
For example, a future caller might want to print all matching lines from
commits which match a regex.
Futureproof by diagnosing early a use of the API that could trigger that
condition, before we know whether the pattern matches:
BUG: grep.c:1783: grep call which could print a name requires
grep_source.name be non-NULL
Aborted
This way, the caller's author gets an indication of how to fix the issue
- by providing grep_source::name or setting grep_opt::status_only - and
they are warned of the potential for a segfault unconditionally, rather
than only if there is a match.
Noticed while adding such a call to a tutorial on revision walks.
Signed-off-by: Emily Shaffer <emilyshaffer@google.com>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-05-23 22:23:56 +02:00
|
|
|
if (!opt->status_only && gs->name == NULL)
|
|
|
|
BUG("grep call which could print a name requires "
|
|
|
|
"grep_source.name be non-NULL");
|
|
|
|
|
2010-01-25 23:51:39 +01:00
|
|
|
if (!opt->output)
|
|
|
|
opt->output = std_output;
|
|
|
|
|
2011-08-01 19:20:53 +02:00
|
|
|
if (opt->pre_context || opt->post_context || opt->file_break ||
|
|
|
|
opt->funcbody) {
|
2011-06-05 17:24:15 +02:00
|
|
|
/* Show hunk marks, except for the first file. */
|
|
|
|
if (opt->last_shown)
|
|
|
|
opt->show_hunk_mark = 1;
|
|
|
|
/*
|
|
|
|
* If we're using threads then we can't easily identify
|
|
|
|
* the first file. Always put hunk marks in that case
|
|
|
|
* and skip the very first one later in work_done().
|
|
|
|
*/
|
|
|
|
if (opt->output != std_output)
|
|
|
|
opt->show_hunk_mark = 1;
|
|
|
|
}
|
2010-03-15 17:21:10 +01:00
|
|
|
opt->last_shown = 0;
|
|
|
|
|
2013-05-10 17:10:15 +02:00
|
|
|
if (opt->allow_textconv) {
|
2018-09-21 17:57:33 +02:00
|
|
|
grep_source_load_driver(gs, opt->repo->index);
|
2013-05-10 17:10:15 +02:00
|
|
|
/*
|
|
|
|
* We might set up the shared textconv cache data here, which
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
* is not thread-safe. Also, get_oid_with_context() and
|
|
|
|
* parse_object() might be internally called. As they are not
|
2020-07-29 05:33:28 +02:00
|
|
|
* currently thread-safe and might be racy with object reading,
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
* obj_read_lock() must be called.
|
2013-05-10 17:10:15 +02:00
|
|
|
*/
|
|
|
|
grep_attr_lock();
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
obj_read_lock();
|
2018-11-10 06:49:06 +01:00
|
|
|
textconv = userdiff_get_textconv(opt->repo, gs->driver);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
obj_read_unlock();
|
2013-05-10 17:10:15 +02:00
|
|
|
grep_attr_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We know the result of a textconv is text, so we only have to care
|
|
|
|
* about binary handling if we are not using it.
|
|
|
|
*/
|
|
|
|
if (!textconv) {
|
|
|
|
switch (opt->binary) {
|
|
|
|
case GREP_BINARY_DEFAULT:
|
2018-09-21 17:57:33 +02:00
|
|
|
if (grep_source_is_binary(gs, opt->repo->index))
|
2013-05-10 17:10:15 +02:00
|
|
|
binary_match_only = 1;
|
|
|
|
break;
|
|
|
|
case GREP_BINARY_NOMATCH:
|
2018-09-21 17:57:33 +02:00
|
|
|
if (grep_source_is_binary(gs, opt->repo->index))
|
2013-05-10 17:10:15 +02:00
|
|
|
return 0; /* Assume unmatch */
|
|
|
|
break;
|
|
|
|
case GREP_BINARY_TEXT:
|
|
|
|
break;
|
|
|
|
default:
|
2018-05-02 11:38:39 +02:00
|
|
|
BUG("unknown binary handling mode");
|
2013-05-10 17:10:15 +02:00
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2009-07-02 00:07:24 +02:00
|
|
|
memset(&xecfg, 0, sizeof(xecfg));
|
2011-12-12 22:16:07 +01:00
|
|
|
opt->priv = &xecfg;
|
|
|
|
|
2010-01-11 07:39:36 +01:00
|
|
|
try_lookahead = should_lookahead(opt);
|
2009-07-02 00:07:24 +02:00
|
|
|
|
2018-09-21 17:57:23 +02:00
|
|
|
if (fill_textconv_grep(opt->repo, textconv, gs) < 0)
|
2012-02-02 09:21:11 +01:00
|
|
|
return 0;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
bol = gs->buf;
|
|
|
|
left = gs->size;
|
2006-09-18 01:02:52 +02:00
|
|
|
while (left) {
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *eol;
|
2006-09-28 02:50:52 +02:00
|
|
|
int hit;
|
2018-06-22 17:49:42 +02:00
|
|
|
ssize_t cno;
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
ssize_t col = -1, icol = -1;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2010-01-11 07:39:36 +01:00
|
|
|
/*
|
2011-05-09 23:52:03 +02:00
|
|
|
* look_ahead() skips quickly to the line that possibly
|
2010-01-11 07:39:36 +01:00
|
|
|
* has the next hit; don't call it if we need to do
|
|
|
|
* something more than just skipping the current line
|
|
|
|
* in response to an unmatch for the current line. E.g.
|
|
|
|
* inside a post-context window, we will show the current
|
|
|
|
* line as a context around the previous hit when it
|
|
|
|
* doesn't hit.
|
|
|
|
*/
|
|
|
|
if (try_lookahead
|
|
|
|
&& !(last_hit
|
2011-08-01 19:20:53 +02:00
|
|
|
&& (show_function ||
|
|
|
|
lno <= last_hit + opt->post_context))
|
2010-01-11 07:39:36 +01:00
|
|
|
&& look_ahead(opt, &left, &lno, &bol))
|
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
eol = end_of_line(bol, &left);
|
|
|
|
|
2006-09-20 21:39:46 +02:00
|
|
|
if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
|
|
|
|
ctx = GREP_CONTEXT_BODY;
|
|
|
|
|
grep.c: expose {,inverted} match column in match_line()
When calling match_line(), callers presently cannot determine the
relative offset of the match because match_line() discards the
'regmatch_t' that contains this information.
Instead, teach match_line() to take in two 'ssize_t's. Fill the first
with the offset of the match produced by the given expression. If
extended, fill the later with the offset of the match produced as if
--invert were given.
For instance, matching "--not -e x" on this line produces a columnar
offset of 0, (i.e., the whole line does not contain an x), but "--invert
--not -e -x" will fill the later ssize_t of the column containing an
"x", because this expression is semantically equivalent to "-e x".
To determine the column for the inverted and non-inverted case, do the
following:
- If matching an atom, the non-inverted column is as given from
match_one_pattern(), and the inverted column is unset.
- If matching a --not, the inverted column and non-inverted column
swap.
- If matching an --and, or --or, the non-inverted column is the
minimum of the two children.
Presently, the existing short-circuiting logic for AND and OR applies as
before. This will change in the following commit when we add options to
configure the --column flag. Taken together, this and the forthcoming
change will always yield the earlier column on a given line.
This patch will become useful when we later pick between the two new
results in order to display the column number of the first match on a
line with --column.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-06-22 17:49:35 +02:00
|
|
|
hit = match_line(opt, bol, eol, &col, &icol, ctx, collect_hits);
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
if (collect_hits)
|
|
|
|
goto next_line;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
/* "grep -v -e foo -e bla" should list lines
|
|
|
|
* that do not have either, so inversion should
|
|
|
|
* be done outside.
|
|
|
|
*/
|
|
|
|
if (opt->invert)
|
|
|
|
hit = !hit;
|
|
|
|
if (opt->unmatch_name_only) {
|
|
|
|
if (hit)
|
|
|
|
return 0;
|
|
|
|
goto next_line;
|
|
|
|
}
|
2022-06-22 21:47:32 +02:00
|
|
|
if (hit && (opt->max_count < 0 || count < opt->max_count)) {
|
2006-09-18 01:02:52 +02:00
|
|
|
count++;
|
|
|
|
if (opt->status_only)
|
|
|
|
return 1;
|
2010-05-22 23:30:48 +02:00
|
|
|
if (opt->name_only) {
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_name(opt, gs->name);
|
2010-05-22 23:30:48 +02:00
|
|
|
return 1;
|
|
|
|
}
|
2010-05-22 23:29:35 +02:00
|
|
|
if (opt->count)
|
|
|
|
goto next_line;
|
2006-09-18 01:02:52 +02:00
|
|
|
if (binary_match_only) {
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, "Binary file ", 12);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
output_color(opt, gs->name, strlen(gs->name),
|
2018-05-26 15:55:22 +02:00
|
|
|
opt->colors[GREP_COLOR_FILENAME]);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, " matches\n", 9);
|
2006-09-18 01:02:52 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
/* Hit at this line. If we haven't shown the
|
|
|
|
* pre-context lines, we would need to show them.
|
|
|
|
*/
|
2011-08-01 19:20:53 +02:00
|
|
|
if (opt->pre_context || opt->funcbody)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_pre_context(opt, gs, bol, eol, lno);
|
2009-07-02 00:06:34 +02:00
|
|
|
else if (opt->funcname)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_funcname_line(opt, gs, bol, lno);
|
2018-06-22 17:49:42 +02:00
|
|
|
cno = opt->invert ? icol : col;
|
|
|
|
if (cno < 0) {
|
|
|
|
/*
|
|
|
|
* A negative cno indicates that there was no
|
|
|
|
* match on the line. We are thus inverted and
|
|
|
|
* being asked to show all lines that _don't_
|
|
|
|
* match a given expression. Therefore, set cno
|
|
|
|
* to 0 to suggest the whole line matches.
|
|
|
|
*/
|
|
|
|
cno = 0;
|
|
|
|
}
|
|
|
|
show_line(opt, bol, eol, gs->name, lno, cno + 1, ':');
|
2009-07-02 00:02:38 +02:00
|
|
|
last_hit = lno;
|
2011-08-01 19:20:53 +02:00
|
|
|
if (opt->funcbody)
|
|
|
|
show_function = 1;
|
|
|
|
goto next_line;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
2016-05-28 17:06:19 +02:00
|
|
|
if (show_function && (!peek_bol || peek_bol < bol)) {
|
|
|
|
unsigned long peek_left = left;
|
2021-09-21 05:49:49 +02:00
|
|
|
const char *peek_eol = eol;
|
2016-05-28 17:06:19 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Trailing empty lines are not interesting.
|
|
|
|
* Peek past them to see if they belong to the
|
|
|
|
* body of the current function.
|
|
|
|
*/
|
|
|
|
peek_bol = bol;
|
|
|
|
while (is_empty_line(peek_bol, peek_eol)) {
|
|
|
|
peek_bol = peek_eol + 1;
|
|
|
|
peek_eol = end_of_line(peek_bol, &peek_left);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (match_funcname(opt, gs, peek_bol, peek_eol))
|
|
|
|
show_function = 0;
|
|
|
|
}
|
2011-08-01 19:20:53 +02:00
|
|
|
if (show_function ||
|
|
|
|
(last_hit && lno <= last_hit + opt->post_context)) {
|
2006-09-18 01:02:52 +02:00
|
|
|
/* If the last hit is within the post context,
|
|
|
|
* we need to show this line.
|
|
|
|
*/
|
2018-06-22 17:49:42 +02:00
|
|
|
show_line(opt, bol, eol, gs->name, lno, col + 1, '-');
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
next_line:
|
|
|
|
bol = eol + 1;
|
|
|
|
if (!left)
|
|
|
|
break;
|
|
|
|
left--;
|
|
|
|
lno++;
|
|
|
|
}
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
if (collect_hits)
|
|
|
|
return 0;
|
2006-09-28 01:27:10 +02:00
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
if (opt->status_only)
|
2017-08-18 03:38:51 +02:00
|
|
|
return opt->unmatch_name_only;
|
2006-09-18 01:02:52 +02:00
|
|
|
if (opt->unmatch_name_only) {
|
|
|
|
/* We did not see any hit, so we want to show this */
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_name(opt, gs->name);
|
2006-09-18 01:02:52 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-07-02 00:07:24 +02:00
|
|
|
xdiff_clear_find_func(&xecfg);
|
|
|
|
opt->priv = NULL;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
/* NEEDSWORK:
|
|
|
|
* The real "grep -c foo *.c" gives many "bar.c:0" lines,
|
|
|
|
* which feels mostly useless but sometimes useful. Maybe
|
|
|
|
* make it another option? For now suppress them.
|
|
|
|
*/
|
2010-01-25 23:51:39 +01:00
|
|
|
if (opt->count && count) {
|
|
|
|
char buf[32];
|
2014-03-11 22:15:49 +01:00
|
|
|
if (opt->pathname) {
|
|
|
|
output_color(opt, gs->name, strlen(gs->name),
|
2018-05-26 15:55:22 +02:00
|
|
|
opt->colors[GREP_COLOR_FILENAME]);
|
2014-03-11 22:15:49 +01:00
|
|
|
output_sep(opt, ':');
|
|
|
|
}
|
2017-03-28 21:46:56 +02:00
|
|
|
xsnprintf(buf, sizeof(buf), "%u\n", count);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, buf, strlen(buf));
|
2010-05-22 23:29:35 +02:00
|
|
|
return 1;
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
return !!last_hit;
|
|
|
|
}
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
static void clr_hit_marker(struct grep_expr *x)
|
|
|
|
{
|
|
|
|
/* All-hit markers are meaningful only at the very top level
|
|
|
|
* OR node.
|
|
|
|
*/
|
|
|
|
while (1) {
|
|
|
|
x->hit = 0;
|
|
|
|
if (x->node != GREP_NODE_OR)
|
|
|
|
return;
|
|
|
|
x->u.binary.left->hit = 0;
|
|
|
|
x = x->u.binary.right;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int chk_hit_marker(struct grep_expr *x)
|
|
|
|
{
|
|
|
|
/* Top level nodes have hit markers. See if they all are hits */
|
|
|
|
while (1) {
|
|
|
|
if (x->node != GREP_NODE_OR)
|
|
|
|
return x->hit;
|
|
|
|
if (!x->u.binary.left->hit)
|
|
|
|
return 0;
|
|
|
|
x = x->u.binary.right;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
int grep_source(struct grep_opt *opt, struct grep_source *gs)
|
2006-09-28 02:50:52 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* we do not have to do the two-pass grep when we do not check
|
|
|
|
* buffer-wide "all-match".
|
|
|
|
*/
|
2021-12-17 17:48:49 +01:00
|
|
|
if (!opt->all_match && !opt->no_body_match)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return grep_source_1(opt, gs, 0);
|
2006-09-28 02:50:52 +02:00
|
|
|
|
|
|
|
/* Otherwise the toplevel "or" terms hit a bit differently.
|
|
|
|
* We first clear hit markers from them.
|
|
|
|
*/
|
|
|
|
clr_hit_marker(opt->pattern_expression);
|
2021-12-17 17:48:49 +01:00
|
|
|
opt->body_hit = 0;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
grep_source_1(opt, gs, 1);
|
2006-09-28 02:50:52 +02:00
|
|
|
|
2021-12-17 17:48:49 +01:00
|
|
|
if (opt->all_match && !chk_hit_marker(opt->pattern_expression))
|
|
|
|
return 0;
|
|
|
|
if (opt->no_body_match && opt->body_hit)
|
2006-09-28 02:50:52 +02:00
|
|
|
return 0;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return grep_source_1(opt, gs, 0);
|
|
|
|
}
|
|
|
|
|
2021-09-21 05:51:28 +02:00
|
|
|
static void grep_source_init_buf(struct grep_source *gs,
|
|
|
|
const char *buf,
|
2021-08-16 23:09:53 +02:00
|
|
|
unsigned long size)
|
|
|
|
{
|
|
|
|
gs->type = GREP_SOURCE_BUF;
|
|
|
|
gs->name = NULL;
|
|
|
|
gs->path = NULL;
|
|
|
|
gs->buf = buf;
|
|
|
|
gs->size = size;
|
|
|
|
gs->driver = NULL;
|
|
|
|
gs->identifier = NULL;
|
|
|
|
}
|
|
|
|
|
2021-09-21 05:51:28 +02:00
|
|
|
int grep_buffer(struct grep_opt *opt, const char *buf, unsigned long size)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
struct grep_source gs;
|
|
|
|
int r;
|
|
|
|
|
2021-08-16 23:09:53 +02:00
|
|
|
grep_source_init_buf(&gs, buf, size);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
|
|
|
|
r = grep_source(opt, &gs);
|
|
|
|
|
|
|
|
grep_source_clear(&gs);
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2021-08-16 23:09:53 +02:00
|
|
|
void grep_source_init_file(struct grep_source *gs, const char *name,
|
|
|
|
const char *path)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
2021-08-16 23:09:53 +02:00
|
|
|
gs->type = GREP_SOURCE_FILE;
|
2015-01-13 02:59:09 +01:00
|
|
|
gs->name = xstrdup_or_null(name);
|
|
|
|
gs->path = xstrdup_or_null(path);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
gs->buf = NULL;
|
|
|
|
gs->size = 0;
|
2012-02-02 09:20:43 +01:00
|
|
|
gs->driver = NULL;
|
2021-08-16 23:09:53 +02:00
|
|
|
gs->identifier = xstrdup(path);
|
|
|
|
}
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
|
2021-08-16 23:09:53 +02:00
|
|
|
void grep_source_init_oid(struct grep_source *gs, const char *name,
|
2021-08-16 23:09:56 +02:00
|
|
|
const char *path, const struct object_id *oid,
|
|
|
|
struct repository *repo)
|
2021-08-16 23:09:53 +02:00
|
|
|
{
|
|
|
|
gs->type = GREP_SOURCE_OID;
|
|
|
|
gs->name = xstrdup_or_null(name);
|
|
|
|
gs->path = xstrdup_or_null(path);
|
|
|
|
gs->buf = NULL;
|
|
|
|
gs->size = 0;
|
|
|
|
gs->driver = NULL;
|
|
|
|
gs->identifier = oiddup(oid);
|
2021-08-16 23:09:56 +02:00
|
|
|
gs->repo = repo;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void grep_source_clear(struct grep_source *gs)
|
|
|
|
{
|
2017-06-16 01:15:49 +02:00
|
|
|
FREE_AND_NULL(gs->name);
|
|
|
|
FREE_AND_NULL(gs->path);
|
|
|
|
FREE_AND_NULL(gs->identifier);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
grep_source_clear_data(gs);
|
|
|
|
}
|
|
|
|
|
|
|
|
void grep_source_clear_data(struct grep_source *gs)
|
|
|
|
{
|
|
|
|
switch (gs->type) {
|
|
|
|
case GREP_SOURCE_FILE:
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
2021-09-21 05:51:28 +02:00
|
|
|
/* these types own the buffer */
|
|
|
|
free((char *)gs->buf);
|
|
|
|
gs->buf = NULL;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
gs->size = 0;
|
|
|
|
break;
|
|
|
|
case GREP_SOURCE_BUF:
|
|
|
|
/* leave user-provided buf intact */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-30 19:30:44 +02:00
|
|
|
static int grep_source_load_oid(struct grep_source *gs)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
enum object_type type;
|
|
|
|
|
2021-08-16 23:09:56 +02:00
|
|
|
gs->buf = repo_read_object_file(gs->repo, gs->identifier, &type,
|
|
|
|
&gs->size);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (!gs->buf)
|
|
|
|
return error(_("'%s': unable to read %s"),
|
|
|
|
gs->name,
|
2017-05-30 19:30:44 +02:00
|
|
|
oid_to_hex(gs->identifier));
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int grep_source_load_file(struct grep_source *gs)
|
|
|
|
{
|
|
|
|
const char *filename = gs->identifier;
|
|
|
|
struct stat st;
|
|
|
|
char *data;
|
|
|
|
size_t size;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (lstat(filename, &st) < 0) {
|
|
|
|
err_ret:
|
|
|
|
if (errno != ENOENT)
|
2016-05-08 11:47:47 +02:00
|
|
|
error_errno(_("failed to stat '%s'"), filename);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!S_ISREG(st.st_mode))
|
|
|
|
return -1;
|
|
|
|
size = xsize_t(st.st_size);
|
|
|
|
i = open(filename, O_RDONLY);
|
|
|
|
if (i < 0)
|
|
|
|
goto err_ret;
|
2016-02-22 23:44:28 +01:00
|
|
|
data = xmallocz(size);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (st.st_size != read_in_full(i, data, size)) {
|
2016-05-08 11:47:47 +02:00
|
|
|
error_errno(_("'%s': short read"), filename);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
close(i);
|
|
|
|
free(data);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
close(i);
|
|
|
|
|
|
|
|
gs->buf = data;
|
|
|
|
gs->size = size;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-09-20 23:20:09 +02:00
|
|
|
static int grep_source_load(struct grep_source *gs)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
if (gs->buf)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
switch (gs->type) {
|
|
|
|
case GREP_SOURCE_FILE:
|
|
|
|
return grep_source_load_file(gs);
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
|
|
|
return grep_source_load_oid(gs);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
case GREP_SOURCE_BUF:
|
|
|
|
return gs->buf ? 0 : -1;
|
|
|
|
}
|
2018-05-02 11:38:39 +02:00
|
|
|
BUG("invalid grep_source type to load");
|
2006-09-28 02:50:52 +02:00
|
|
|
}
|
2012-02-02 09:20:43 +01:00
|
|
|
|
2018-09-21 17:57:33 +02:00
|
|
|
void grep_source_load_driver(struct grep_source *gs,
|
|
|
|
struct index_state *istate)
|
2012-02-02 09:20:43 +01:00
|
|
|
{
|
|
|
|
if (gs->driver)
|
|
|
|
return;
|
|
|
|
|
|
|
|
grep_attr_lock();
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:54 +01:00
|
|
|
if (gs->path)
|
2018-09-21 17:57:33 +02:00
|
|
|
gs->driver = userdiff_find_by_path(istate, gs->path);
|
2012-02-02 09:20:43 +01:00
|
|
|
if (!gs->driver)
|
|
|
|
gs->driver = userdiff_find_by_name("default");
|
|
|
|
grep_attr_unlock();
|
|
|
|
}
|
2012-02-02 09:21:02 +01:00
|
|
|
|
2018-09-21 17:57:33 +02:00
|
|
|
static int grep_source_is_binary(struct grep_source *gs,
|
|
|
|
struct index_state *istate)
|
2012-02-02 09:21:02 +01:00
|
|
|
{
|
2018-09-21 17:57:33 +02:00
|
|
|
grep_source_load_driver(gs, istate);
|
2012-02-02 09:21:02 +01:00
|
|
|
if (gs->driver->binary != -1)
|
|
|
|
return gs->driver->binary;
|
|
|
|
|
|
|
|
if (!grep_source_load(gs))
|
|
|
|
return buffer_is_binary(gs->buf, gs->size);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|