From 6a301345a545ce5faf1a054d6c9bf1558dd46b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 1 Feb 2012 22:17:18 +0700 Subject: [PATCH 001/148] pack-objects: do not accept "--index-version=version," MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 2 +- t/t5302-pack-index.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 0f2e7b8f5c..297f792851 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2471,7 +2471,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) pack_idx_opts.version = strtoul(arg + 16, &c, 10); if (pack_idx_opts.version > 2) die("bad %s", arg); - if (*c == ',') + if (*c == ',' && c[1]) pack_idx_opts.off32_limit = strtoul(c+1, &c, 0); if (*c || pack_idx_opts.off32_limit & 0x80000000) die("bad %s", arg); diff --git a/t/t5302-pack-index.sh b/t/t5302-pack-index.sh index f8fa92446c..fe82025d4a 100755 --- a/t/t5302-pack-index.sh +++ b/t/t5302-pack-index.sh @@ -73,6 +73,10 @@ test_expect_success 'index-pack --verify on index version 2' ' git index-pack --verify "test-2-${pack2}.pack" ' +test_expect_success \ + 'pack-objects --index-version=2, is not accepted' \ + 'test_must_fail git pack-objects --index-version=2, test-3 Date: Wed, 1 Feb 2012 22:17:19 +0700 Subject: [PATCH 002/148] pack-objects: remove bogus comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment was introduced in b5d97e6 (pack-objects: run rev-list equivalent internally. - 2006-09-04), stating that git pack-objects [options] base-name is acceptable and refs should be passed into rev-list. But that's not true. All arguments after base-name are ignored. Remove the comment and reject this syntax (i.e. no more arguments after base name) Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 15 +-------------- t/t5300-pack-object.sh | 4 ++++ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 297f792851..80e3114573 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2484,23 +2484,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) usage(pack_usage); } - /* Traditionally "pack-objects [options] base extra" failed; - * we would however want to take refs parameter that would - * have been given to upstream rev-list ourselves, which means - * we somehow want to say what the base name is. So the - * syntax would be: - * - * pack-objects [options] base - * - * in other words, we would treat the first non-option as the - * base_name and send everything else to the internal revision - * walker. - */ - if (!pack_to_stdout) base_name = argv[i++]; - if (pack_to_stdout != !base_name) + if (pack_to_stdout != !base_name || argv[i]) usage(pack_usage); if (!pack_to_stdout && !pack_size_limit) diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh index 602806d09c..d9d856b87b 100755 --- a/t/t5300-pack-object.sh +++ b/t/t5300-pack-object.sh @@ -38,6 +38,10 @@ test_expect_success \ 'pack without delta' \ 'packname_1=$(git pack-objects --window=0 test-1 Date: Wed, 1 Feb 2012 22:17:20 +0700 Subject: [PATCH 003/148] pack-objects: convert to use parse_options() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 317 ++++++++++++++++++----------------------- 1 file changed, 140 insertions(+), 177 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 80e3114573..e21e5af8f9 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -18,16 +18,11 @@ #include "refs.h" #include "thread-utils.h" -static const char pack_usage[] = - "git pack-objects [ -q | --progress | --all-progress ]\n" - " [--all-progress-implied]\n" - " [--max-pack-size=] [--local] [--incremental]\n" - " [--window=] [--window-memory=] [--depth=]\n" - " [--no-reuse-delta] [--no-reuse-object] [--delta-base-offset]\n" - " [--threads=] [--non-empty] [--revs [--unpacked | --all]]\n" - " [--reflog] [--stdout | base-name] [--include-tag]\n" - " [--keep-unreachable | --unpack-unreachable]\n" - " [< ref-list | < object-list]"; +static const char *pack_usage[] = { + "git pack-objects --stdout [options...] [< ref-list | < object-list]", + "git pack-objects [options...] base-name [< ref-list | < object-list]", + NULL +}; struct object_entry { struct pack_idx_entry idx; @@ -2305,191 +2300,159 @@ static void get_object_list(int ac, const char **av) loosen_unused_packed_objects(&revs); } +static int option_parse_index_version(const struct option *opt, + const char *arg, int unset) +{ + char *c; + const char *val = arg; + pack_idx_opts.version = strtoul(val, &c, 10); + if (pack_idx_opts.version > 2) + die(_("unsupported index version %s"), val); + if (*c == ',' && c[1]) + pack_idx_opts.off32_limit = strtoul(c+1, &c, 0); + if (*c || pack_idx_opts.off32_limit & 0x80000000) + die(_("bad index version '%s'"), val); + return 0; +} + +static int option_parse_ulong(const struct option *opt, + const char *arg, int unset) +{ + if (unset) + die(_("option %s does not accept negative form"), + opt->long_name); + + if (!git_parse_ulong(arg, opt->value)) + die(_("unable to parse value '%s' for option %s"), + arg, opt->long_name); + return 0; +} + +#define OPT_ULONG(s, l, v, h) \ + { OPTION_CALLBACK, (s), (l), (v), "n", (h), \ + PARSE_OPT_NONEG, option_parse_ulong } + int cmd_pack_objects(int argc, const char **argv, const char *prefix) { int use_internal_rev_list = 0; int thin = 0; int all_progress_implied = 0; - uint32_t i; - const char **rp_av; - int rp_ac_alloc = 64; - int rp_ac; + const char *rp_av[6]; + int rp_ac = 0; + int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; + struct option pack_objects_options[] = { + OPT_SET_INT('q', "quiet", &progress, + "do not show progress meter", 0), + OPT_SET_INT(0, "progress", &progress, + "show progress meter", 1), + OPT_SET_INT(0, "all-progress", &progress, + "show progress meter during object writing phase", 2), + OPT_BOOL(0, "all-progress-implied", + &all_progress_implied, + "similar to --all-progress when progress meter is shown"), + { OPTION_CALLBACK, 0, "index-version", NULL, "version[,offset]", + "write the pack index file in the specified idx format version", + 0, option_parse_index_version }, + OPT_ULONG(0, "max-pack-size", &pack_size_limit, + "maximum size of each output pack file"), + OPT_BOOL(0, "local", &local, + "ignore borrowed objects from alternate object store"), + OPT_BOOL(0, "incremental", &incremental, + "ignore packed objects"), + OPT_INTEGER(0, "window", &window, + "limit pack window by objects"), + OPT_ULONG(0, "window-memory", &window_memory_limit, + "limit pack window by memory in addition to object limit"), + OPT_INTEGER(0, "depth", &depth, + "maximum length of delta chain allowed in the resulting pack"), + OPT_BOOL(0, "reuse-delta", &reuse_delta, + "reuse existing deltas"), + OPT_BOOL(0, "reuse-object", &reuse_object, + "reuse existing objects"), + OPT_BOOL(0, "delta-base-offset", &allow_ofs_delta, + "use OFS_DELTA objects"), + OPT_INTEGER(0, "threads", &delta_search_threads, + "use threads when searching for best delta matches"), + OPT_BOOL(0, "non-empty", &non_empty, + "do not create an empty pack output"), + OPT_BOOL(0, "revs", &use_internal_rev_list, + "read revision arguments from standard input"), + { OPTION_SET_INT, 0, "unpacked", &rev_list_unpacked, NULL, + "limit the objects to those that are not yet packed", + PARSE_OPT_NOARG | PARSE_OPT_NONEG, NULL, 1 }, + { OPTION_SET_INT, 0, "all", &rev_list_all, NULL, + "include objects reachable from any reference", + PARSE_OPT_NOARG | PARSE_OPT_NONEG, NULL, 1 }, + { OPTION_SET_INT, 0, "reflog", &rev_list_reflog, NULL, + "include objects referred by reflog entries", + PARSE_OPT_NOARG | PARSE_OPT_NONEG, NULL, 1 }, + OPT_BOOL(0, "stdout", &pack_to_stdout, + "output pack to stdout"), + OPT_BOOL(0, "include-tag", &include_tag, + "include tag objects that refer to objects to be packed"), + OPT_BOOL(0, "keep-unreachable", &keep_unreachable, + "keep unreachable objects"), + OPT_BOOL(0, "unpack-unreachable", &unpack_unreachable, + "unpack unreachable objects"), + OPT_BOOL(0, "thin", &thin, + "create thin packs"), + OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep, + "ignore packs that have companion .keep file"), + OPT_INTEGER(0, "compression", &pack_compression_level, + "pack compression level"), + OPT_SET_INT(0, "keep-true-parents", &grafts_replace_parents, + "do not hide commits by grafts", 0), + OPT_END(), + }; read_replace_refs = 0; - rp_av = xcalloc(rp_ac_alloc, sizeof(*rp_av)); - - rp_av[0] = "pack-objects"; - rp_av[1] = "--objects"; /* --thin will make it --objects-edge */ - rp_ac = 2; - reset_pack_idx_option(&pack_idx_opts); git_config(git_pack_config, NULL); if (!pack_compression_seen && core_compression_seen) pack_compression_level = core_compression_level; progress = isatty(2); - for (i = 1; i < argc; i++) { - const char *arg = argv[i]; + argc = parse_options(argc, argv, prefix, pack_objects_options, + pack_usage, 0); - if (*arg != '-') - break; + if (argc) { + base_name = argv[0]; + argc--; + } + if (pack_to_stdout != !base_name || argc) + usage_with_options(pack_usage, pack_objects_options); - if (!strcmp("--non-empty", arg)) { - non_empty = 1; - continue; - } - if (!strcmp("--local", arg)) { - local = 1; - continue; - } - if (!strcmp("--incremental", arg)) { - incremental = 1; - continue; - } - if (!strcmp("--honor-pack-keep", arg)) { - ignore_packed_keep = 1; - continue; - } - if (!prefixcmp(arg, "--compression=")) { - char *end; - int level = strtoul(arg+14, &end, 0); - if (!arg[14] || *end) - usage(pack_usage); - if (level == -1) - level = Z_DEFAULT_COMPRESSION; - else if (level < 0 || level > Z_BEST_COMPRESSION) - die("bad pack compression level %d", level); - pack_compression_level = level; - continue; - } - if (!prefixcmp(arg, "--max-pack-size=")) { - pack_size_limit_cfg = 0; - if (!git_parse_ulong(arg+16, &pack_size_limit)) - usage(pack_usage); - continue; - } - if (!prefixcmp(arg, "--window=")) { - char *end; - window = strtoul(arg+9, &end, 0); - if (!arg[9] || *end) - usage(pack_usage); - continue; - } - if (!prefixcmp(arg, "--window-memory=")) { - if (!git_parse_ulong(arg+16, &window_memory_limit)) - usage(pack_usage); - continue; - } - if (!prefixcmp(arg, "--threads=")) { - char *end; - delta_search_threads = strtoul(arg+10, &end, 0); - if (!arg[10] || *end || delta_search_threads < 0) - usage(pack_usage); -#ifdef NO_PTHREADS - if (delta_search_threads != 1) - warning("no threads support, " - "ignoring %s", arg); -#endif - continue; - } - if (!prefixcmp(arg, "--depth=")) { - char *end; - depth = strtoul(arg+8, &end, 0); - if (!arg[8] || *end) - usage(pack_usage); - continue; - } - if (!strcmp("--progress", arg)) { - progress = 1; - continue; - } - if (!strcmp("--all-progress", arg)) { - progress = 2; - continue; - } - if (!strcmp("--all-progress-implied", arg)) { - all_progress_implied = 1; - continue; - } - if (!strcmp("-q", arg)) { - progress = 0; - continue; - } - if (!strcmp("--no-reuse-delta", arg)) { - reuse_delta = 0; - continue; - } - if (!strcmp("--no-reuse-object", arg)) { - reuse_object = reuse_delta = 0; - continue; - } - if (!strcmp("--delta-base-offset", arg)) { - allow_ofs_delta = 1; - continue; - } - if (!strcmp("--stdout", arg)) { - pack_to_stdout = 1; - continue; - } - if (!strcmp("--revs", arg)) { - use_internal_rev_list = 1; - continue; - } - if (!strcmp("--keep-unreachable", arg)) { - keep_unreachable = 1; - continue; - } - if (!strcmp("--unpack-unreachable", arg)) { - unpack_unreachable = 1; - continue; - } - if (!strcmp("--include-tag", arg)) { - include_tag = 1; - continue; - } - if (!strcmp("--unpacked", arg) || - !strcmp("--reflog", arg) || - !strcmp("--all", arg)) { - use_internal_rev_list = 1; - if (rp_ac >= rp_ac_alloc - 1) { - rp_ac_alloc = alloc_nr(rp_ac_alloc); - rp_av = xrealloc(rp_av, - rp_ac_alloc * sizeof(*rp_av)); - } - rp_av[rp_ac++] = arg; - continue; - } - if (!strcmp("--thin", arg)) { - use_internal_rev_list = 1; - thin = 1; - rp_av[1] = "--objects-edge"; - continue; - } - if (!prefixcmp(arg, "--index-version=")) { - char *c; - pack_idx_opts.version = strtoul(arg + 16, &c, 10); - if (pack_idx_opts.version > 2) - die("bad %s", arg); - if (*c == ',' && c[1]) - pack_idx_opts.off32_limit = strtoul(c+1, &c, 0); - if (*c || pack_idx_opts.off32_limit & 0x80000000) - die("bad %s", arg); - continue; - } - if (!strcmp(arg, "--keep-true-parents")) { - grafts_replace_parents = 0; - continue; - } - usage(pack_usage); + rp_av[rp_ac++] = "pack-objects"; + if (thin) { + use_internal_rev_list = 1; + rp_av[rp_ac++] = "--objects-edge"; + } else + rp_av[rp_ac++] = "--objects"; + + if (rev_list_all) { + use_internal_rev_list = 1; + rp_av[rp_ac++] = "--all"; + } + if (rev_list_reflog) { + use_internal_rev_list = 1; + rp_av[rp_ac++] = "--reflog"; + } + if (rev_list_unpacked) { + use_internal_rev_list = 1; + rp_av[rp_ac++] = "--unpacked"; } - if (!pack_to_stdout) - base_name = argv[i++]; - - if (pack_to_stdout != !base_name || argv[i]) - usage(pack_usage); - + if (!reuse_object) + reuse_delta = 0; + if (pack_compression_level == -1) + pack_compression_level = Z_DEFAULT_COMPRESSION; + else if (pack_compression_level < 0 || pack_compression_level > Z_BEST_COMPRESSION) + die("bad pack compression level %d", pack_compression_level); +#ifdef NO_PTHREADS + if (delta_search_threads != 1) + warning("no threads support, ignoring %s", arg); +#endif if (!pack_to_stdout && !pack_size_limit) pack_size_limit = pack_size_limit_cfg; if (pack_to_stdout && pack_size_limit) From 95099731bf2c79ccf5870655e36caa4215f0ced0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 1 Feb 2012 20:48:54 +0700 Subject: [PATCH 004/148] sha1_file.c: move the core logic of find_pack_entry() into fill_pack_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new helper function implements the logic to find the offset for the object in one pack and fill a pack_entry structure. The next patch will restructure the loop and will call the helper from two places. Signed-off-by: Nguyễn Thái Ngọc Duy Acked-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- sha1_file.c | 61 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/sha1_file.c b/sha1_file.c index 88f2151ff3..61e51edc42 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -2010,11 +2010,44 @@ int is_pack_valid(struct packed_git *p) return !open_packed_git(p); } +static int fill_pack_entry(const unsigned char *sha1, + struct pack_entry *e, + struct packed_git *p) +{ + off_t offset; + + if (p->num_bad_objects) { + unsigned i; + for (i = 0; i < p->num_bad_objects; i++) + if (!hashcmp(sha1, p->bad_object_sha1 + 20 * i)) + return 0; + } + + offset = find_pack_entry_one(sha1, p); + if (!offset) + return 0; + + /* + * We are about to tell the caller where they can locate the + * requested object. We better make sure the packfile is + * still here and can be accessed before supplying that + * answer, as it may have been deleted since the index was + * loaded! + */ + if (!is_pack_valid(p)) { + warning("packfile %s cannot be accessed", p->pack_name); + return 0; + } + e->offset = offset; + e->p = p; + hashcpy(e->sha1, sha1); + return 1; +} + static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e) { static struct packed_git *last_found = (void *)1; struct packed_git *p; - off_t offset; prepare_packed_git(); if (!packed_git) @@ -2022,35 +2055,11 @@ static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e) p = (last_found == (void *)1) ? packed_git : last_found; do { - if (p->num_bad_objects) { - unsigned i; - for (i = 0; i < p->num_bad_objects; i++) - if (!hashcmp(sha1, p->bad_object_sha1 + 20 * i)) - goto next; - } - - offset = find_pack_entry_one(sha1, p); - if (offset) { - /* - * We are about to tell the caller where they can - * locate the requested object. We better make - * sure the packfile is still here and can be - * accessed before supplying that answer, as - * it may have been deleted since the index - * was loaded! - */ - if (!is_pack_valid(p)) { - warning("packfile %s cannot be accessed", p->pack_name); - goto next; - } - e->offset = offset; - e->p = p; - hashcpy(e->sha1, sha1); + if (fill_pack_entry(sha1, e, p)) { last_found = p; return 1; } - next: if (p == last_found) p = packed_git; else From c01f51cc750dbd76e50919bf4e3b94e1b47d2e23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 1 Feb 2012 20:48:55 +0700 Subject: [PATCH 005/148] find_pack_entry(): do not keep packed_git pointer locally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f7c22cc (always start looking up objects in the last used pack first - 2007-05-30) introduce a static packed_git* pointer as an optimization. The kept pointer however may become invalid if free_pack_by_name() happens to free that particular pack. Current code base does not access packs after calling free_pack_by_name() so it should not be a problem. Anyway, move the pointer out so that free_pack_by_name() can reset it to avoid running into troubles in future. Signed-off-by: Nguyễn Thái Ngọc Duy Acked-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- sha1_file.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/sha1_file.c b/sha1_file.c index 61e51edc42..6b1b5125c8 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -54,6 +54,8 @@ static struct cached_object empty_tree = { 0 }; +static struct packed_git *last_found_pack; + static struct cached_object *find_cached_object(const unsigned char *sha1) { int i; @@ -720,6 +722,8 @@ void free_pack_by_name(const char *pack_name) close_pack_index(p); free(p->bad_object_sha1); *pp = p->next; + if (last_found_pack == p) + last_found_pack = NULL; free(p); return; } @@ -2046,27 +2050,22 @@ static int fill_pack_entry(const unsigned char *sha1, static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e) { - static struct packed_git *last_found = (void *)1; struct packed_git *p; prepare_packed_git(); if (!packed_git) return 0; - p = (last_found == (void *)1) ? packed_git : last_found; - do { - if (fill_pack_entry(sha1, e, p)) { - last_found = p; - return 1; - } + if (last_found_pack && fill_pack_entry(sha1, e, last_found_pack)) + return 1; - if (p == last_found) - p = packed_git; - else - p = p->next; - if (p == last_found) - p = p->next; - } while (p); + for (p = packed_git; p; p = p->next) { + if (p == last_found_pack || !fill_pack_entry(sha1, e, p)) + continue; + + last_found_pack = p; + return 1; + } return 0; } From 78db6ea9dc1a872f9d07a36fe7aec700a5c963b9 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:18:29 -0500 Subject: [PATCH 006/148] grep: make locking flag global The low-level grep code traditionally didn't care about threading, as it doesn't do any threading itself and didn't call out to other non-thread-safe code. That changed with 0579f91 (grep: enable threading with -p and -W using lazy attribute lookup, 2011-12-12), which pushed the lookup of funcname attributes (which is not thread-safe) into the low-level grep code. As a result, the low-level code learned about a new global "grep_attr_mutex" to serialize access to the attribute code. A multi-threaded caller (e.g., builtin/grep.c) is expected to initialize the mutex and set "use_threads" in the grep_opt structure. The low-level code only uses the lock if use_threads is set. However, putting the use_threads flag into the grep_opt struct is not the most logical place. Whether threading is in use is not something that matters for each call to grep_buffer, but is instead global to the whole program (i.e., if any thread is doing multi-threaded grep, every other thread, even if it thinks it is doing its own single-threaded grep, would need to use the locking). In practice, this distinction isn't a problem for us, because the only user of multi-threaded grep is "git-grep", which does nothing except call grep. This patch turns the opt->use_threads flag into a global flag. More important than the nit-picking semantic argument above is that this means that the locking functions don't need to actually have access to a grep_opt to know whether to lock. Which in turn can make adding new locks simpler, as we don't need to pass around a grep_opt. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/grep.c | 4 ++-- grep.c | 18 ++++++++++-------- grep.h | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/builtin/grep.c b/builtin/grep.c index 9ce064ac11..5b881870f6 100644 --- a/builtin/grep.c +++ b/builtin/grep.c @@ -259,6 +259,7 @@ static void start_threads(struct grep_opt *opt) pthread_cond_init(&cond_add, NULL); pthread_cond_init(&cond_write, NULL); pthread_cond_init(&cond_result, NULL); + grep_use_locks = 1; for (i = 0; i < ARRAY_SIZE(todo); i++) { strbuf_init(&todo[i].out, 0); @@ -307,6 +308,7 @@ static int wait_all(void) pthread_cond_destroy(&cond_add); pthread_cond_destroy(&cond_write); pthread_cond_destroy(&cond_result); + grep_use_locks = 0; return hit; } @@ -1030,8 +1032,6 @@ int cmd_grep(int argc, const char **argv, const char *prefix) use_threads = 0; #endif - opt.use_threads = use_threads; - #ifndef NO_PTHREADS if (use_threads) { if (opt.pre_context || opt.post_context || opt.file_break || diff --git a/grep.c b/grep.c index 486230b511..7a67c2ff6f 100644 --- a/grep.c +++ b/grep.c @@ -807,26 +807,28 @@ static void show_line(struct grep_opt *opt, char *bol, char *eol, } #ifndef NO_PTHREADS +int grep_use_locks; + /* * This lock protects access to the gitattributes machinery, which is * not thread-safe. */ pthread_mutex_t grep_attr_mutex; -static inline void grep_attr_lock(struct grep_opt *opt) +static inline void grep_attr_lock(void) { - if (opt->use_threads) + if (grep_use_locks) pthread_mutex_lock(&grep_attr_mutex); } -static inline void grep_attr_unlock(struct grep_opt *opt) +static inline void grep_attr_unlock(void) { - if (opt->use_threads) + if (grep_use_locks) pthread_mutex_unlock(&grep_attr_mutex); } #else -#define grep_attr_lock(opt) -#define grep_attr_unlock(opt) +#define grep_attr_lock() +#define grep_attr_unlock() #endif static int match_funcname(struct grep_opt *opt, const char *name, char *bol, char *eol) @@ -834,9 +836,9 @@ static int match_funcname(struct grep_opt *opt, const char *name, char *bol, cha xdemitconf_t *xecfg = opt->priv; if (xecfg && !xecfg->find_func) { struct userdiff_driver *drv; - grep_attr_lock(opt); + grep_attr_lock(); drv = userdiff_find_by_path(name); - grep_attr_unlock(opt); + grep_attr_unlock(); if (drv && drv->funcname.pattern) { const struct userdiff_funcname *pe = &drv->funcname; xdiff_set_find_func(xecfg, pe->pattern, pe->cflags); diff --git a/grep.h b/grep.h index fb205f3542..3653bb333c 100644 --- a/grep.h +++ b/grep.h @@ -116,7 +116,6 @@ struct grep_opt { int show_hunk_mark; int file_break; int heading; - int use_threads; void *priv; void (*output)(struct grep_opt *opt, const void *data, size_t size); @@ -138,6 +137,7 @@ extern int grep_threads_ok(const struct grep_opt *opt); * Mutex used around access to the attributes machinery if * opt->use_threads. Must be initialized/destroyed by callers! */ +extern int grep_use_locks; extern pthread_mutex_t grep_attr_mutex; #endif From b3aeb285d0ac1dcb4d578a61a68e08646f96501f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:18:41 -0500 Subject: [PATCH 007/148] grep: move sha1-reading mutex into low-level code The multi-threaded git-grep code needs to serialize access to the thread-unsafe read_sha1_file call. It does this with a mutex that is local to builtin/grep.c. Let's instead push this down into grep.c, where it can be used by both builtin/grep.c and grep.c. This will let us safely teach the low-level grep.c code tricks that involve reading from the object db. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/grep.c | 29 ++++++----------------------- grep.c | 6 ++++++ grep.h | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/builtin/grep.c b/builtin/grep.c index 5b881870f6..aeb361663a 100644 --- a/builtin/grep.c +++ b/builtin/grep.c @@ -85,21 +85,6 @@ static inline void grep_unlock(void) pthread_mutex_unlock(&grep_mutex); } -/* Used to serialize calls to read_sha1_file. */ -static pthread_mutex_t read_sha1_mutex; - -static inline void read_sha1_lock(void) -{ - if (use_threads) - pthread_mutex_lock(&read_sha1_mutex); -} - -static inline void read_sha1_unlock(void) -{ - if (use_threads) - pthread_mutex_unlock(&read_sha1_mutex); -} - /* Signalled when a new work_item is added to todo. */ static pthread_cond_t cond_add; @@ -254,7 +239,7 @@ static void start_threads(struct grep_opt *opt) int i; pthread_mutex_init(&grep_mutex, NULL); - pthread_mutex_init(&read_sha1_mutex, NULL); + pthread_mutex_init(&grep_read_mutex, NULL); pthread_mutex_init(&grep_attr_mutex, NULL); pthread_cond_init(&cond_add, NULL); pthread_cond_init(&cond_write, NULL); @@ -303,7 +288,7 @@ static int wait_all(void) } pthread_mutex_destroy(&grep_mutex); - pthread_mutex_destroy(&read_sha1_mutex); + pthread_mutex_destroy(&grep_read_mutex); pthread_mutex_destroy(&grep_attr_mutex); pthread_cond_destroy(&cond_add); pthread_cond_destroy(&cond_write); @@ -313,8 +298,6 @@ static int wait_all(void) return hit; } #else /* !NO_PTHREADS */ -#define read_sha1_lock() -#define read_sha1_unlock() static int wait_all(void) { @@ -376,9 +359,9 @@ static void *lock_and_read_sha1_file(const unsigned char *sha1, enum object_type { void *data; - read_sha1_lock(); + grep_read_lock(); data = read_sha1_file(sha1, type, size); - read_sha1_unlock(); + grep_read_unlock(); return data; } @@ -617,10 +600,10 @@ static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec, struct strbuf base; int hit, len; - read_sha1_lock(); + grep_read_lock(); data = read_object_with_reference(obj->sha1, tree_type, &size, NULL); - read_sha1_unlock(); + grep_read_unlock(); if (!data) die(_("unable to read tree (%s)"), sha1_to_hex(obj->sha1)); diff --git a/grep.c b/grep.c index 7a67c2ff6f..db58a29c2b 100644 --- a/grep.c +++ b/grep.c @@ -826,6 +826,12 @@ static inline void grep_attr_unlock(void) if (grep_use_locks) pthread_mutex_unlock(&grep_attr_mutex); } + +/* + * Same as git_attr_mutex, but protecting the thread-unsafe object db access. + */ +pthread_mutex_t grep_read_mutex; + #else #define grep_attr_lock() #define grep_attr_unlock() diff --git a/grep.h b/grep.h index 3653bb333c..4f1b0251b0 100644 --- a/grep.h +++ b/grep.h @@ -139,6 +139,23 @@ extern int grep_threads_ok(const struct grep_opt *opt); */ extern int grep_use_locks; extern pthread_mutex_t grep_attr_mutex; +extern pthread_mutex_t grep_read_mutex; + +static inline void grep_read_lock(void) +{ + if (grep_use_locks) + pthread_mutex_lock(&grep_read_mutex); +} + +static inline void grep_read_unlock(void) +{ + if (grep_use_locks) + pthread_mutex_unlock(&grep_read_mutex); +} + +#else +#define grep_read_lock() +#define grep_read_unlock() #endif #endif From e1327023ea22c3bf57e7d28596da356043f073fc Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:19:28 -0500 Subject: [PATCH 008/148] grep: refactor the concept of "grep source" into an object The main interface to the low-level grep code is grep_buffer, which takes a pointer to a buffer and a size. This is convenient and flexible (we use it to grep commit bodies, files on disk, and blobs by sha1), but it makes it hard to pass extra information about what we are grepping (either for correctness, like overriding binary auto-detection, or for optimizations, like lazily loading blob contents). Instead, let's encapsulate the idea of a "grep source", including the buffer, its size, and where the data is coming from. This is similar to the diff_filespec structure used by the diff code (unsurprising, since future patches will implement some of the same optimizations found there). The diffstat is slightly scarier than the actual patch content. Most of the modified lines are simply replacing access to raw variables with their counterparts that are now in a "struct grep_source". Most of the added lines were taken from builtin/grep.c, which partially abstracted the idea of grep sources (for file vs sha1 sources). Instead of dropping the now-redundant code, this patch leaves builtin/grep.c using the traditional grep_buffer interface (which now wraps the grep_source interface). That makes it easy to test that there is no change of behavior (yet). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- grep.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++---------- grep.h | 22 +++++++ 2 files changed, 186 insertions(+), 34 deletions(-) diff --git a/grep.c b/grep.c index db58a29c2b..8204ca25e5 100644 --- a/grep.c +++ b/grep.c @@ -837,13 +837,13 @@ pthread_mutex_t grep_read_mutex; #define grep_attr_unlock() #endif -static int match_funcname(struct grep_opt *opt, const char *name, char *bol, char *eol) +static int match_funcname(struct grep_opt *opt, struct grep_source *gs, char *bol, char *eol) { xdemitconf_t *xecfg = opt->priv; if (xecfg && !xecfg->find_func) { struct userdiff_driver *drv; grep_attr_lock(); - drv = userdiff_find_by_path(name); + drv = userdiff_find_by_path(gs->name); grep_attr_unlock(); if (drv && drv->funcname.pattern) { const struct userdiff_funcname *pe = &drv->funcname; @@ -866,33 +866,33 @@ static int match_funcname(struct grep_opt *opt, const char *name, char *bol, cha return 0; } -static void show_funcname_line(struct grep_opt *opt, const char *name, - char *buf, char *bol, unsigned lno) +static void show_funcname_line(struct grep_opt *opt, struct grep_source *gs, + char *bol, unsigned lno) { - while (bol > buf) { + while (bol > gs->buf) { char *eol = --bol; - while (bol > buf && bol[-1] != '\n') + while (bol > gs->buf && bol[-1] != '\n') bol--; lno--; if (lno <= opt->last_shown) break; - if (match_funcname(opt, name, bol, eol)) { - show_line(opt, bol, eol, name, lno, '='); + if (match_funcname(opt, gs, bol, eol)) { + show_line(opt, bol, eol, gs->name, lno, '='); break; } } } -static void show_pre_context(struct grep_opt *opt, const char *name, char *buf, +static void show_pre_context(struct grep_opt *opt, struct grep_source *gs, char *bol, char *end, unsigned lno) { unsigned cur = lno, from = 1, funcname_lno = 0; int funcname_needed = !!opt->funcname; - if (opt->funcbody && !match_funcname(opt, name, bol, end)) + if (opt->funcbody && !match_funcname(opt, gs, bol, end)) funcname_needed = 2; if (opt->pre_context < lno) @@ -901,14 +901,14 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf, from = opt->last_shown + 1; /* Rewind. */ - while (bol > buf && + while (bol > gs->buf && cur > (funcname_needed == 2 ? opt->last_shown + 1 : from)) { char *eol = --bol; - while (bol > buf && bol[-1] != '\n') + while (bol > gs->buf && bol[-1] != '\n') bol--; cur--; - if (funcname_needed && match_funcname(opt, name, bol, eol)) { + if (funcname_needed && match_funcname(opt, gs, bol, eol)) { funcname_lno = cur; funcname_needed = 0; } @@ -916,7 +916,7 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf, /* We need to look even further back to find a function signature. */ if (opt->funcname && funcname_needed) - show_funcname_line(opt, name, buf, bol, cur); + show_funcname_line(opt, gs, bol, cur); /* Back forward. */ while (cur < lno) { @@ -924,7 +924,7 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf, while (*eol != '\n') eol++; - show_line(opt, bol, eol, name, cur, sign); + show_line(opt, bol, eol, gs->name, cur, sign); bol = eol + 1; cur++; } @@ -991,11 +991,10 @@ static void std_output(struct grep_opt *opt, const void *buf, size_t size) fwrite(buf, size, 1, stdout); } -static int grep_buffer_1(struct grep_opt *opt, const char *name, - char *buf, unsigned long size, int collect_hits) +static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits) { - char *bol = buf; - unsigned long left = size; + char *bol; + unsigned long left; unsigned lno = 1; unsigned last_hit = 0; int binary_match_only = 0; @@ -1023,13 +1022,16 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, } opt->last_shown = 0; + if (grep_source_load(gs) < 0) + return 0; + switch (opt->binary) { case GREP_BINARY_DEFAULT: - if (buffer_is_binary(buf, size)) + if (buffer_is_binary(gs->buf, gs->size)) binary_match_only = 1; break; case GREP_BINARY_NOMATCH: - if (buffer_is_binary(buf, size)) + if (buffer_is_binary(gs->buf, gs->size)) return 0; /* Assume unmatch */ break; case GREP_BINARY_TEXT: @@ -1043,6 +1045,8 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, try_lookahead = should_lookahead(opt); + bol = gs->buf; + left = gs->size; while (left) { char *eol, ch; int hit; @@ -1091,14 +1095,14 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, if (opt->status_only) return 1; if (opt->name_only) { - show_name(opt, name); + show_name(opt, gs->name); return 1; } if (opt->count) goto next_line; if (binary_match_only) { opt->output(opt, "Binary file ", 12); - output_color(opt, name, strlen(name), + output_color(opt, gs->name, strlen(gs->name), opt->color_filename); opt->output(opt, " matches\n", 9); return 1; @@ -1107,23 +1111,23 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, * pre-context lines, we would need to show them. */ if (opt->pre_context || opt->funcbody) - show_pre_context(opt, name, buf, bol, eol, lno); + show_pre_context(opt, gs, bol, eol, lno); else if (opt->funcname) - show_funcname_line(opt, name, buf, bol, lno); - show_line(opt, bol, eol, name, lno, ':'); + show_funcname_line(opt, gs, bol, lno); + show_line(opt, bol, eol, gs->name, lno, ':'); last_hit = lno; if (opt->funcbody) show_function = 1; goto next_line; } - if (show_function && match_funcname(opt, name, bol, eol)) + if (show_function && match_funcname(opt, gs, bol, eol)) show_function = 0; if (show_function || (last_hit && lno <= last_hit + opt->post_context)) { /* If the last hit is within the post context, * we need to show this line. */ - show_line(opt, bol, eol, name, lno, '-'); + show_line(opt, bol, eol, gs->name, lno, '-'); } next_line: @@ -1141,7 +1145,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, return 0; if (opt->unmatch_name_only) { /* We did not see any hit, so we want to show this */ - show_name(opt, name); + show_name(opt, gs->name); return 1; } @@ -1155,7 +1159,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, */ if (opt->count && count) { char buf[32]; - output_color(opt, name, strlen(name), opt->color_filename); + output_color(opt, gs->name, strlen(gs->name), opt->color_filename); output_sep(opt, ':'); snprintf(buf, sizeof(buf), "%u\n", count); opt->output(opt, buf, strlen(buf)); @@ -1190,23 +1194,149 @@ static int chk_hit_marker(struct grep_expr *x) } } -int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size) +int grep_source(struct grep_opt *opt, struct grep_source *gs) { /* * we do not have to do the two-pass grep when we do not check * buffer-wide "all-match". */ if (!opt->all_match) - return grep_buffer_1(opt, name, buf, size, 0); + return grep_source_1(opt, gs, 0); /* Otherwise the toplevel "or" terms hit a bit differently. * We first clear hit markers from them. */ clr_hit_marker(opt->pattern_expression); - grep_buffer_1(opt, name, buf, size, 1); + grep_source_1(opt, gs, 1); if (!chk_hit_marker(opt->pattern_expression)) return 0; - return grep_buffer_1(opt, name, buf, size, 0); + return grep_source_1(opt, gs, 0); +} + +int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size) +{ + struct grep_source gs; + int r; + + grep_source_init(&gs, GREP_SOURCE_BUF, name, NULL); + gs.buf = buf; + gs.size = size; + + r = grep_source(opt, &gs); + + grep_source_clear(&gs); + return r; +} + +void grep_source_init(struct grep_source *gs, enum grep_source_type type, + const char *name, const void *identifier) +{ + gs->type = type; + gs->name = name ? xstrdup(name) : NULL; + gs->buf = NULL; + gs->size = 0; + + switch (type) { + case GREP_SOURCE_FILE: + gs->identifier = xstrdup(identifier); + break; + case GREP_SOURCE_SHA1: + gs->identifier = xmalloc(20); + memcpy(gs->identifier, identifier, 20); + break; + case GREP_SOURCE_BUF: + gs->identifier = NULL; + } +} + +void grep_source_clear(struct grep_source *gs) +{ + free(gs->name); + gs->name = NULL; + free(gs->identifier); + gs->identifier = NULL; + grep_source_clear_data(gs); +} + +void grep_source_clear_data(struct grep_source *gs) +{ + switch (gs->type) { + case GREP_SOURCE_FILE: + case GREP_SOURCE_SHA1: + free(gs->buf); + gs->buf = NULL; + gs->size = 0; + break; + case GREP_SOURCE_BUF: + /* leave user-provided buf intact */ + break; + } +} + +static int grep_source_load_sha1(struct grep_source *gs) +{ + enum object_type type; + + grep_read_lock(); + gs->buf = read_sha1_file(gs->identifier, &type, &gs->size); + grep_read_unlock(); + + if (!gs->buf) + return error(_("'%s': unable to read %s"), + gs->name, + sha1_to_hex(gs->identifier)); + return 0; +} + +static int grep_source_load_file(struct grep_source *gs) +{ + const char *filename = gs->identifier; + struct stat st; + char *data; + size_t size; + int i; + + if (lstat(filename, &st) < 0) { + err_ret: + if (errno != ENOENT) + error(_("'%s': %s"), filename, strerror(errno)); + return -1; + } + if (!S_ISREG(st.st_mode)) + return -1; + size = xsize_t(st.st_size); + i = open(filename, O_RDONLY); + if (i < 0) + goto err_ret; + data = xmalloc(size + 1); + if (st.st_size != read_in_full(i, data, size)) { + error(_("'%s': short read %s"), filename, strerror(errno)); + close(i); + free(data); + return -1; + } + close(i); + data[size] = 0; + + gs->buf = data; + gs->size = size; + return 0; +} + +int grep_source_load(struct grep_source *gs) +{ + if (gs->buf) + return 0; + + switch (gs->type) { + case GREP_SOURCE_FILE: + return grep_source_load_file(gs); + case GREP_SOURCE_SHA1: + return grep_source_load_sha1(gs); + case GREP_SOURCE_BUF: + return gs->buf ? 0 : -1; + } + die("BUG: invalid grep_source type"); } diff --git a/grep.h b/grep.h index 4f1b0251b0..e386ca4c73 100644 --- a/grep.h +++ b/grep.h @@ -129,6 +129,28 @@ extern void compile_grep_patterns(struct grep_opt *opt); extern void free_grep_patterns(struct grep_opt *opt); extern int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size); +struct grep_source { + char *name; + + enum grep_source_type { + GREP_SOURCE_SHA1, + GREP_SOURCE_FILE, + GREP_SOURCE_BUF, + } type; + void *identifier; + + char *buf; + unsigned long size; +}; + +void grep_source_init(struct grep_source *gs, enum grep_source_type type, + const char *name, const void *identifier); +int grep_source_load(struct grep_source *gs); +void grep_source_clear_data(struct grep_source *gs); +void grep_source_clear(struct grep_source *gs); + +int grep_source(struct grep_opt *opt, struct grep_source *gs); + extern struct grep_opt *grep_opt_dup(const struct grep_opt *opt); extern int grep_threads_ok(const struct grep_opt *opt); From 8f24a6323ece9be1bf1a04b4b5856112438337f2 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:19:37 -0500 Subject: [PATCH 009/148] convert git-grep to use grep_source interface The grep_source interface (as opposed to grep_buffer) will eventually gives us a richer interface for telling the low-level grep code about our buffers. Eventually this will lead to things like better binary-file handling. For now, it lets us drop a lot of now-redundant code. The conversion is mostly straight-forward. One thing to note is that the memory ownership rules for "struct grep_source" are different than the "struct work_item" found here (the former will copy things like the filename, rather than taking ownership). Therefore you will also see some slight tweaking of when filename buffers are released. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/grep.c | 142 ++++++++----------------------------------------- 1 file changed, 23 insertions(+), 119 deletions(-) diff --git a/builtin/grep.c b/builtin/grep.c index aeb361663a..c9b6a138fe 100644 --- a/builtin/grep.c +++ b/builtin/grep.c @@ -29,25 +29,12 @@ static int use_threads = 1; #define THREADS 8 static pthread_t threads[THREADS]; -static void *load_sha1(const unsigned char *sha1, unsigned long *size, - const char *name); -static void *load_file(const char *filename, size_t *sz); - -enum work_type {WORK_SHA1, WORK_FILE}; - /* We use one producer thread and THREADS consumer * threads. The producer adds struct work_items to 'todo' and the * consumers pick work items from the same array. */ struct work_item { - enum work_type type; - char *name; - - /* if type == WORK_SHA1, then 'identifier' is a SHA1, - * otherwise type == WORK_FILE, and 'identifier' is a NUL - * terminated filename. - */ - void *identifier; + struct grep_source source; char done; struct strbuf out; }; @@ -98,7 +85,8 @@ static pthread_cond_t cond_result; static int skip_first_line; -static void add_work(enum work_type type, char *name, void *id) +static void add_work(enum grep_source_type type, const char *name, + const void *id) { grep_lock(); @@ -106,9 +94,7 @@ static void add_work(enum work_type type, char *name, void *id) pthread_cond_wait(&cond_write, &grep_mutex); } - todo[todo_end].type = type; - todo[todo_end].name = name; - todo[todo_end].identifier = id; + grep_source_init(&todo[todo_end].source, type, name, id); todo[todo_end].done = 0; strbuf_reset(&todo[todo_end].out); todo_end = (todo_end + 1) % ARRAY_SIZE(todo); @@ -136,21 +122,6 @@ static struct work_item *get_work(void) return ret; } -static void grep_sha1_async(struct grep_opt *opt, char *name, - const unsigned char *sha1) -{ - unsigned char *s; - s = xmalloc(20); - memcpy(s, sha1, 20); - add_work(WORK_SHA1, name, s); -} - -static void grep_file_async(struct grep_opt *opt, char *name, - const char *filename) -{ - add_work(WORK_FILE, name, xstrdup(filename)); -} - static void work_done(struct work_item *w) { int old_done; @@ -177,8 +148,7 @@ static void work_done(struct work_item *w) write_or_die(1, p, len); } - free(w->name); - free(w->identifier); + grep_source_clear(&w->source); } if (old_done != todo_done) @@ -201,25 +171,8 @@ static void *run(void *arg) break; opt->output_priv = w; - if (w->type == WORK_SHA1) { - unsigned long sz; - void* data = load_sha1(w->identifier, &sz, w->name); - - if (data) { - hit |= grep_buffer(opt, w->name, data, sz); - free(data); - } - } else if (w->type == WORK_FILE) { - size_t sz; - void* data = load_file(w->identifier, &sz); - if (data) { - hit |= grep_buffer(opt, w->name, data, sz); - free(data); - } - } else { - assert(0); - } - + hit |= grep_source(opt, &w->source); + grep_source_clear_data(&w->source); work_done(w); } free_grep_patterns(arg); @@ -365,23 +318,10 @@ static void *lock_and_read_sha1_file(const unsigned char *sha1, enum object_type return data; } -static void *load_sha1(const unsigned char *sha1, unsigned long *size, - const char *name) -{ - enum object_type type; - void *data = lock_and_read_sha1_file(sha1, &type, size); - - if (!data) - error(_("'%s': unable to read %s"), name, sha1_to_hex(sha1)); - - return data; -} - static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1, const char *filename, int tree_name_len) { struct strbuf pathbuf = STRBUF_INIT; - char *name; if (opt->relative && opt->prefix_length) { quote_path_relative(filename + tree_name_len, -1, &pathbuf, @@ -391,87 +331,51 @@ static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1, strbuf_addstr(&pathbuf, filename); } - name = strbuf_detach(&pathbuf, NULL); - #ifndef NO_PTHREADS if (use_threads) { - grep_sha1_async(opt, name, sha1); + add_work(GREP_SOURCE_SHA1, pathbuf.buf, sha1); + strbuf_release(&pathbuf); return 0; } else #endif { + struct grep_source gs; int hit; - unsigned long sz; - void *data = load_sha1(sha1, &sz, name); - if (!data) - hit = 0; - else - hit = grep_buffer(opt, name, data, sz); - free(data); - free(name); + grep_source_init(&gs, GREP_SOURCE_SHA1, pathbuf.buf, sha1); + strbuf_release(&pathbuf); + hit = grep_source(opt, &gs); + + grep_source_clear(&gs); return hit; } } -static void *load_file(const char *filename, size_t *sz) -{ - struct stat st; - char *data; - int i; - - if (lstat(filename, &st) < 0) { - err_ret: - if (errno != ENOENT) - error(_("'%s': %s"), filename, strerror(errno)); - return NULL; - } - if (!S_ISREG(st.st_mode)) - return NULL; - *sz = xsize_t(st.st_size); - i = open(filename, O_RDONLY); - if (i < 0) - goto err_ret; - data = xmalloc(*sz + 1); - if (st.st_size != read_in_full(i, data, *sz)) { - error(_("'%s': short read %s"), filename, strerror(errno)); - close(i); - free(data); - return NULL; - } - close(i); - data[*sz] = 0; - return data; -} - static int grep_file(struct grep_opt *opt, const char *filename) { struct strbuf buf = STRBUF_INIT; - char *name; if (opt->relative && opt->prefix_length) quote_path_relative(filename, -1, &buf, opt->prefix); else strbuf_addstr(&buf, filename); - name = strbuf_detach(&buf, NULL); #ifndef NO_PTHREADS if (use_threads) { - grep_file_async(opt, name, filename); + add_work(GREP_SOURCE_FILE, buf.buf, filename); + strbuf_release(&buf); return 0; } else #endif { + struct grep_source gs; int hit; - size_t sz; - void *data = load_file(filename, &sz); - if (!data) - hit = 0; - else - hit = grep_buffer(opt, name, data, sz); - free(data); - free(name); + grep_source_init(&gs, GREP_SOURCE_FILE, buf.buf, filename); + strbuf_release(&buf); + hit = grep_source(opt, &gs); + + grep_source_clear(&gs); return hit; } } From c876d6da88d9bf1f3377d4eed66fc7705e31c30e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:20:10 -0500 Subject: [PATCH 010/148] grep: drop grep_buffer's "name" parameter Before the grep_source interface existed, grep_buffer was used by two types of callers: 1. Ones which pulled a file into a buffer, and then wanted to supply the file's name for the output (i.e., git grep). 2. Ones which really just wanted to grep a buffer (i.e., git log --grep). Callers in set (1) should now be using grep_source. Callers in set (2) always pass NULL for the "name" parameter of grep_buffer. We can therefore get rid of this now-useless parameter. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- grep.c | 4 ++-- grep.h | 2 +- revision.c | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/grep.c b/grep.c index 8204ca25e5..2a3fe7ce6f 100644 --- a/grep.c +++ b/grep.c @@ -1215,12 +1215,12 @@ int grep_source(struct grep_opt *opt, struct grep_source *gs) return grep_source_1(opt, gs, 0); } -int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size) +int grep_buffer(struct grep_opt *opt, char *buf, unsigned long size) { struct grep_source gs; int r; - grep_source_init(&gs, GREP_SOURCE_BUF, name, NULL); + grep_source_init(&gs, GREP_SOURCE_BUF, NULL, NULL); gs.buf = buf; gs.size = size; diff --git a/grep.h b/grep.h index e386ca4c73..8bf3001417 100644 --- a/grep.h +++ b/grep.h @@ -127,7 +127,7 @@ extern void append_grep_pattern(struct grep_opt *opt, const char *pat, const cha extern void append_header_grep_pattern(struct grep_opt *, enum grep_header_field, const char *); extern void compile_grep_patterns(struct grep_opt *opt); extern void free_grep_patterns(struct grep_opt *opt); -extern int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size); +extern int grep_buffer(struct grep_opt *opt, char *buf, unsigned long size); struct grep_source { char *name; diff --git a/revision.c b/revision.c index 064e351084..89d3a8fafb 100644 --- a/revision.c +++ b/revision.c @@ -2128,7 +2128,6 @@ static int commit_match(struct commit *commit, struct rev_info *opt) if (!opt->grep_filter.pattern_list && !opt->grep_filter.header_list) return 1; return grep_buffer(&opt->grep_filter, - NULL, /* we say nothing, not even filename */ commit->buffer, strlen(commit->buffer)); } From 94ad9d9e075d3f7802cf56641ebb342d43fb46e3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:20:43 -0500 Subject: [PATCH 011/148] grep: cache userdiff_driver in grep_source Right now, grep only uses the userdiff_driver for one thing: looking up funcname patterns for "-p" and "-W". As new uses for userdiff drivers are added to the grep code, we want to minimize attribute lookups, which can be expensive. It might seem at first that this would also optimize multiple lookups when the funcname pattern for a file is needed multiple times. However, the compiled funcname pattern is already cached in struct grep_opt's "priv" member, so multiple lookups are already suppressed. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- grep.c | 22 ++++++++++++++++------ grep.h | 4 ++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/grep.c b/grep.c index 2a3fe7ce6f..bb1856985b 100644 --- a/grep.c +++ b/grep.c @@ -841,12 +841,9 @@ static int match_funcname(struct grep_opt *opt, struct grep_source *gs, char *bo { xdemitconf_t *xecfg = opt->priv; if (xecfg && !xecfg->find_func) { - struct userdiff_driver *drv; - grep_attr_lock(); - drv = userdiff_find_by_path(gs->name); - grep_attr_unlock(); - if (drv && drv->funcname.pattern) { - const struct userdiff_funcname *pe = &drv->funcname; + grep_source_load_driver(gs); + if (gs->driver->funcname.pattern) { + const struct userdiff_funcname *pe = &gs->driver->funcname; xdiff_set_find_func(xecfg, pe->pattern, pe->cflags); } else { xecfg = opt->priv = NULL; @@ -1237,6 +1234,7 @@ void grep_source_init(struct grep_source *gs, enum grep_source_type type, gs->name = name ? xstrdup(name) : NULL; gs->buf = NULL; gs->size = 0; + gs->driver = NULL; switch (type) { case GREP_SOURCE_FILE: @@ -1340,3 +1338,15 @@ int grep_source_load(struct grep_source *gs) } die("BUG: invalid grep_source type"); } + +void grep_source_load_driver(struct grep_source *gs) +{ + if (gs->driver) + return; + + grep_attr_lock(); + gs->driver = userdiff_find_by_path(gs->name); + if (!gs->driver) + gs->driver = userdiff_find_by_name("default"); + grep_attr_unlock(); +} diff --git a/grep.h b/grep.h index 8bf3001417..73b28c2df7 100644 --- a/grep.h +++ b/grep.h @@ -9,6 +9,7 @@ typedef int pcre_extra; #endif #include "kwset.h" #include "thread-utils.h" +#include "userdiff.h" enum grep_pat_token { GREP_PATTERN, @@ -141,6 +142,8 @@ struct grep_source { char *buf; unsigned long size; + + struct userdiff_driver *driver; }; void grep_source_init(struct grep_source *gs, enum grep_source_type type, @@ -148,6 +151,7 @@ void grep_source_init(struct grep_source *gs, enum grep_source_type type, int grep_source_load(struct grep_source *gs); void grep_source_clear_data(struct grep_source *gs); void grep_source_clear(struct grep_source *gs); +void grep_source_load_driver(struct grep_source *gs); int grep_source(struct grep_opt *opt, struct grep_source *gs); From 41b59bfcb16abb738e5c95c95fb462e717d47d4d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:21:02 -0500 Subject: [PATCH 012/148] grep: respect diff attributes for binary-ness There is currently no way for users to tell git-grep that a particular path is or is not a binary file; instead, grep always relies on its auto-detection (or the user specifying "-a" to treat all binary-looking files like text). This patch teaches git-grep to use the same attribute lookup that is used by git-diff. We could add a new "grep" flag, but that is unnecessarily complex and unlikely to be useful. Despite the name, the "-diff" attribute (or "diff=foo" and the associated diff.foo.binary config option) are really about describing the contents of the path. It's simply historical that diff was the only thing that cared about these attributes in the past. And if this simple approach turns out to be insufficient, we still have a backwards-compatible path forward: we can add a separate "grep" attribute, and fall back to respecting "diff" if it is unset. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- grep.c | 16 ++++++++++++++-- grep.h | 1 + t/t7008-grep-binary.sh | 24 ++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/grep.c b/grep.c index bb1856985b..a50d161721 100644 --- a/grep.c +++ b/grep.c @@ -1024,11 +1024,11 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle switch (opt->binary) { case GREP_BINARY_DEFAULT: - if (buffer_is_binary(gs->buf, gs->size)) + if (grep_source_is_binary(gs)) binary_match_only = 1; break; case GREP_BINARY_NOMATCH: - if (buffer_is_binary(gs->buf, gs->size)) + if (grep_source_is_binary(gs)) return 0; /* Assume unmatch */ break; case GREP_BINARY_TEXT: @@ -1350,3 +1350,15 @@ void grep_source_load_driver(struct grep_source *gs) gs->driver = userdiff_find_by_name("default"); grep_attr_unlock(); } + +int grep_source_is_binary(struct grep_source *gs) +{ + grep_source_load_driver(gs); + if (gs->driver->binary != -1) + return gs->driver->binary; + + if (!grep_source_load(gs)) + return buffer_is_binary(gs->buf, gs->size); + + return 0; +} diff --git a/grep.h b/grep.h index 73b28c2df7..36e49d8255 100644 --- a/grep.h +++ b/grep.h @@ -152,6 +152,7 @@ int grep_source_load(struct grep_source *gs); void grep_source_clear_data(struct grep_source *gs); void grep_source_clear(struct grep_source *gs); void grep_source_load_driver(struct grep_source *gs); +int grep_source_is_binary(struct grep_source *gs); int grep_source(struct grep_opt *opt, struct grep_source *gs); diff --git a/t/t7008-grep-binary.sh b/t/t7008-grep-binary.sh index 917a264eea..fd6410fc71 100755 --- a/t/t7008-grep-binary.sh +++ b/t/t7008-grep-binary.sh @@ -99,4 +99,28 @@ test_expect_success 'git grep yx a' " test_must_fail git grep -f f a " +test_expect_success 'grep respects binary diff attribute' ' + echo text >t && + git add t && + echo t:text >expect && + git grep text t >actual && + test_cmp expect actual && + echo "t -diff" >.gitattributes && + echo "Binary file t matches" >expect && + git grep text t >actual && + test_cmp expect actual +' + +test_expect_success 'grep respects not-binary diff attribute' ' + echo binQary | q_to_nul >b && + git add b && + echo "Binary file b matches" >expect && + git grep bin b >actual && + test_cmp expect actual && + echo "b diff" >.gitattributes && + echo "b:binQary" >expect && + git grep bin b | nul_to_q >actual && + test_cmp expect actual +' + test_done From 08265798e1ff6abc1b0aaff31c1471f83bd51425 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:21:11 -0500 Subject: [PATCH 013/148] grep: load file data after checking binary-ness Usually we load each file to grep into memory, check whether it's binary, and then either grep it (the default) or not (if "-I" was given). In the "-I" case, we can skip loading the file entirely if it is marked as binary via gitattributes. On my giant 3-gigabyte media repository, doing "git grep -I foo" went from: real 0m0.712s user 0m0.044s sys 0m4.780s to: real 0m0.026s user 0m0.016s sys 0m0.020s Obviously this is an extreme example. The repo is almost entirely binary files, and you can see that we spent all of our time asking the kernel to read() the data. However, with a cold disk cache, even avoiding a few binary files can have an impact. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- grep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grep.c b/grep.c index a50d161721..3821400966 100644 --- a/grep.c +++ b/grep.c @@ -1019,9 +1019,6 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle } opt->last_shown = 0; - if (grep_source_load(gs) < 0) - return 0; - switch (opt->binary) { case GREP_BINARY_DEFAULT: if (grep_source_is_binary(gs)) @@ -1042,6 +1039,9 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle try_lookahead = should_lookahead(opt); + if (grep_source_load(gs) < 0) + return 0; + bol = gs->buf; left = gs->size; while (left) { From 9dd5245c1043dd18fd7b3f44b9e51eef7e4b58d8 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 03:24:28 -0500 Subject: [PATCH 014/148] grep: pre-load userdiff drivers when threaded The low-level grep_source code will automatically load the userdiff driver to see whether a file is binary. However, when we are threaded, it will load the drivers in a non-deterministic order, handling each one as its assigned thread happens to be scheduled. Meanwhile, the attribute lookup code (which underlies the userdiff driver lookup) is optimized to handle paths in sequential order (because they tend to share the same gitattributes files). Multi-threading the lookups destroys the locality and makes this optimization less effective. We can fix this by pre-loading the userdiff driver in the main thread, before we hand off the file to a worker thread. My best-of-five for "git grep foo" on the linux-2.6 repository went from: real 0m0.391s user 0m1.708s sys 0m0.584s to: real 0m0.360s user 0m1.576s sys 0m0.572s Not a huge speedup, but it's quite easy to do. The only trick is that we shouldn't perform this optimization if "-a" was used, in which case we won't bother checking whether the files are binary at all. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/grep.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/builtin/grep.c b/builtin/grep.c index c9b6a138fe..e741aca18c 100644 --- a/builtin/grep.c +++ b/builtin/grep.c @@ -85,8 +85,8 @@ static pthread_cond_t cond_result; static int skip_first_line; -static void add_work(enum grep_source_type type, const char *name, - const void *id) +static void add_work(struct grep_opt *opt, enum grep_source_type type, + const char *name, const void *id) { grep_lock(); @@ -95,6 +95,8 @@ static void add_work(enum grep_source_type type, const char *name, } grep_source_init(&todo[todo_end].source, type, name, id); + if (opt->binary != GREP_BINARY_TEXT) + grep_source_load_driver(&todo[todo_end].source); todo[todo_end].done = 0; strbuf_reset(&todo[todo_end].out); todo_end = (todo_end + 1) % ARRAY_SIZE(todo); @@ -333,7 +335,7 @@ static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1, #ifndef NO_PTHREADS if (use_threads) { - add_work(GREP_SOURCE_SHA1, pathbuf.buf, sha1); + add_work(opt, GREP_SOURCE_SHA1, pathbuf.buf, sha1); strbuf_release(&pathbuf); return 0; } else @@ -362,7 +364,7 @@ static int grep_file(struct grep_opt *opt, const char *filename) #ifndef NO_PTHREADS if (use_threads) { - add_work(GREP_SOURCE_FILE, buf.buf, filename); + add_work(opt, GREP_SOURCE_FILE, buf.buf, filename); strbuf_release(&buf); return 0; } else From b3256eb8b35937192e85725d0c2bcb422295790c Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Feb 2012 16:59:13 -0500 Subject: [PATCH 015/148] standardize and improve lookup rules for external local repos When you specify a local repository on the command line of clone, ls-remote, upload-pack, receive-pack, or upload-archive, or in a request to git-daemon, we perform a little bit of lookup magic, doing things like looking in working trees for .git directories and appending ".git" for bare repos. For clone, this magic happens in get_repo_path. For everything else, it happens in enter_repo. In both cases, there are some ambiguous or confusing cases that aren't handled well, and there is one case that is not handled the same by both methods. This patch tries to provide (and test!) standard, sensible lookup rules for both code paths. The intended changes are: 1. When looking up "foo", we have always preferred a working tree "foo" (containing "foo/.git" over the bare "foo.git". But we did not prefer a bare "foo" over "foo.git". With this patch, we do so. 2. We would select directories that existed but didn't actually look like git repositories. With this patch, we make sure a selected directory looks like a git repo. Not only is this more sensible in general, but it will help anybody who is negatively affected by change (1) negatively (e.g., if they had "foo.git" next to its separate work tree "foo", and expect to keep finding "foo.git" when they reference "foo"). 3. The enter_repo code path would, given "foo", look for "foo.git/.git" (i.e., do the ".git" append magic even for a repo with working tree). The clone code path did not; with this patch, they now behave the same. In the unlikely case of a working tree overlaying a bare repo (i.e., a ".git" directory _inside_ a bare repo), we continue to treat it as a working tree (prefering the "inner" .git over the bare repo). This is mainly because the combination seems nonsensical, and I'd rather stick with existing behavior on the off chance that somebody is relying on it. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/clone.c | 4 +- cache.h | 1 + path.c | 7 ++- setup.c | 2 +- t/t5900-repo-selection.sh | 100 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 5 deletions(-) create mode 100755 t/t5900-repo-selection.sh diff --git a/builtin/clone.c b/builtin/clone.c index efe8b6cce5..b11dd6dd61 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -105,7 +105,7 @@ static const char *argv_submodule[] = { static char *get_repo_path(const char *repo, int *is_bundle) { - static char *suffix[] = { "/.git", ".git", "" }; + static char *suffix[] = { "/.git", "", ".git/.git", ".git" }; static char *bundle_suffix[] = { ".bundle", "" }; struct stat st; int i; @@ -115,7 +115,7 @@ static char *get_repo_path(const char *repo, int *is_bundle) path = mkpath("%s%s", repo, suffix[i]); if (stat(path, &st)) continue; - if (S_ISDIR(st.st_mode)) { + if (S_ISDIR(st.st_mode) && is_git_directory(path)) { *is_bundle = 0; return xstrdup(absolute_path(path)); } else if (S_ISREG(st.st_mode) && st.st_size > 8) { diff --git a/cache.h b/cache.h index 2e6ad3604e..3dfb95d8aa 100644 --- a/cache.h +++ b/cache.h @@ -432,6 +432,7 @@ extern char *git_work_tree_cfg; extern int is_inside_work_tree(void); extern int have_git_dir(void); extern const char *get_git_dir(void); +extern int is_git_directory(const char *path); extern char *get_object_directory(void); extern char *get_index_file(void); extern char *get_graft_file(void); diff --git a/path.c b/path.c index b6f71d1086..6f2aa699ad 100644 --- a/path.c +++ b/path.c @@ -293,7 +293,7 @@ const char *enter_repo(const char *path, int strict) if (!strict) { static const char *suffix[] = { - ".git/.git", "/.git", ".git", "", NULL, + "/.git", "", ".git/.git", ".git", NULL, }; const char *gitfile; int len = strlen(path); @@ -324,8 +324,11 @@ const char *enter_repo(const char *path, int strict) return NULL; len = strlen(used_path); for (i = 0; suffix[i]; i++) { + struct stat st; strcpy(used_path + len, suffix[i]); - if (!access(used_path, F_OK)) { + if (!stat(used_path, &st) && + (S_ISREG(st.st_mode) || + (S_ISDIR(st.st_mode) && is_git_directory(used_path)))) { strcat(validated_path, suffix[i]); break; } diff --git a/setup.c b/setup.c index 61c22e6bec..7a3618fab7 100644 --- a/setup.c +++ b/setup.c @@ -247,7 +247,7 @@ const char **get_pathspec(const char *prefix, const char **pathspec) * a proper "ref:", or a regular file HEAD that has a properly * formatted sha1 object name. */ -static int is_git_directory(const char *suspect) +int is_git_directory(const char *suspect) { char path[PATH_MAX]; size_t len = strlen(suspect); diff --git a/t/t5900-repo-selection.sh b/t/t5900-repo-selection.sh new file mode 100755 index 0000000000..3d5b418bb4 --- /dev/null +++ b/t/t5900-repo-selection.sh @@ -0,0 +1,100 @@ +#!/bin/sh + +test_description='selecting remote repo in ambiguous cases' +. ./test-lib.sh + +reset() { + rm -rf foo foo.git fetch clone +} + +make_tree() { + git init "$1" && + (cd "$1" && test_commit "$1") +} + +make_bare() { + git init --bare "$1" && + (cd "$1" && + tree=`git hash-object -w -t tree /dev/null` && + commit=$(echo "$1" | git commit-tree $tree) && + git update-ref HEAD $commit + ) +} + +get() { + git init --bare fetch && + (cd fetch && git fetch "../$1") && + git clone "$1" clone +} + +check() { + echo "$1" >expect && + (cd fetch && git log -1 --format=%s FETCH_HEAD) >actual.fetch && + (cd clone && git log -1 --format=%s HEAD) >actual.clone && + test_cmp expect actual.fetch && + test_cmp expect actual.clone +} + +test_expect_success 'find .git dir in worktree' ' + reset && + make_tree foo && + get foo && + check foo +' + +test_expect_success 'automagically add .git suffix' ' + reset && + make_bare foo.git && + get foo && + check foo.git +' + +test_expect_success 'automagically add .git suffix to worktree' ' + reset && + make_tree foo.git && + get foo && + check foo.git +' + +test_expect_success 'prefer worktree foo over bare foo.git' ' + reset && + make_tree foo && + make_bare foo.git && + get foo && + check foo +' + +test_expect_success 'prefer bare foo over bare foo.git' ' + reset && + make_bare foo && + make_bare foo.git && + get foo && + check foo +' + +test_expect_success 'disambiguate with full foo.git' ' + reset && + make_bare foo && + make_bare foo.git && + get foo.git && + check foo.git +' + +test_expect_success 'we are not fooled by non-git foo directory' ' + reset && + make_bare foo.git && + mkdir foo && + get foo && + check foo.git +' + +test_expect_success 'prefer inner .git over outer bare' ' + reset && + make_tree foo && + make_bare foo.git && + mv foo/.git foo.git && + get foo.git && + check foo +' + +test_done From 84d9e2d50ca9fbcf34e31cb74797fc182187c7b5 Mon Sep 17 00:00:00 2001 From: Jakub Narebski Date: Fri, 3 Feb 2012 13:44:54 +0100 Subject: [PATCH 016/148] gitweb: Allow UTF-8 encoded CGI query parameters and path_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gitweb forgot to turn query parameters into UTF-8. This results in a bug that one cannot search for a string with characters outside US-ASCII. For example searching for "Michał Kiedrowicz" (containing letter 'ł' - LATIN SMALL LETTER L WITH STROKE, with Unicode codepoint U+0142, represented with 0xc5 0x82 bytes in UTF-8 and percent-encoded as %C5%82) result in the following incorrect data in search field MichaÅ\202 Kiedrowicz This is caused by CGI by default treating '0xc5 0x82' bytes as two characters in Perl legacy encoding latin-1 (iso-8859-1), because 's' query parameter is not processed explicitly as UTF-8 encoded string. The solution used here follows "Using Unicode in a Perl CGI script" article on http://www.lemoda.net/cgi/perl-unicode/index.html: use CGI; use Encode 'decode_utf8; my $value = params('input'); $value = decode_utf8($value); Decoding UTF-8 is done when filling %input_params hash and $path_info variable; the former requires to move from explicit $cgi->param(