diff --git a/Documentation/git-fast-export.txt b/Documentation/git-fast-export.txt index e8950de3ba..1978dbdc6a 100644 --- a/Documentation/git-fast-export.txt +++ b/Documentation/git-fast-export.txt @@ -119,6 +119,11 @@ by keeping the marks the same across runs. the shape of the history and stored tree. See the section on `ANONYMIZING` below. +--anonymize-map=[:]:: + Convert token `` to `` in the anonymized output. If + `` is omitted, map `` to itself (i.e., do not + anonymize it). See the section on `ANONYMIZING` below. + --reference-excluded-parents:: By default, running a command such as `git fast-export master~5..master` will not include the commit master{tilde}5 @@ -238,6 +243,30 @@ collapse "User 0", "User 1", etc into "User X"). This produces a much smaller output, and it is usually easy to quickly confirm that there is no private data in the stream. +Reproducing some bugs may require referencing particular commits or +paths, which becomes challenging after refnames and paths have been +anonymized. You can ask for a particular token to be left as-is or +mapped to a new value. For example, if you have a bug which reproduces +with `git rev-list sensitive -- secret.c`, you can run: + +--------------------------------------------------- +$ git fast-export --anonymize --all \ + --anonymize-map=sensitive:foo \ + --anonymize-map=secret.c:bar.c \ + >stream +--------------------------------------------------- + +After importing the stream, you can then run `git rev-list foo -- bar.c` +in the anonymized repository. + +Note that paths and refnames are split into tokens at slash boundaries. +The command above would anonymize `subdir/secret.c` as something like +`path123/bar.c`; you could then search for `bar.c` in the anonymized +repository to determine the final pathname. + +To make referencing the final pathname simpler, you can map each path +component; so if you also anonymize `subdir` to `publicdir`, then the +final pathname would be `publicdir/bar.c`. LIMITATIONS ----------- diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 85868162ee..9f37895d4c 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -45,6 +45,7 @@ static struct string_list extra_refs = STRING_LIST_INIT_NODUP; static struct string_list tag_refs = STRING_LIST_INIT_NODUP; static struct refspec refspecs = REFSPEC_INIT_FETCH; static int anonymize; +static struct hashmap anonymized_seeds; static struct revision_sources revision_sources; static int parse_opt_signed_tag_mode(const struct option *opt, @@ -119,25 +120,34 @@ static int has_unshown_parent(struct commit *commit) } struct anonymized_entry { + struct hashmap_entry hash; + const char *anon; + const char orig[FLEX_ARRAY]; +}; + +struct anonymized_entry_key { struct hashmap_entry hash; const char *orig; size_t orig_len; - const char *anon; - size_t anon_len; }; static int anonymized_entry_cmp(const void *unused_cmp_data, const struct hashmap_entry *eptr, const struct hashmap_entry *entry_or_key, - const void *unused_keydata) + const void *keydata) { const struct anonymized_entry *a, *b; a = container_of(eptr, const struct anonymized_entry, hash); - b = container_of(entry_or_key, const struct anonymized_entry, hash); + if (keydata) { + const struct anonymized_entry_key *key = keydata; + int equal = !strncmp(a->orig, key->orig, key->orig_len) && + !a->orig[key->orig_len]; + return !equal; + } - return a->orig_len != b->orig_len || - memcmp(a->orig, b->orig, a->orig_len); + b = container_of(entry_or_key, const struct anonymized_entry, hash); + return strcmp(a->orig, b->orig); } /* @@ -145,31 +155,39 @@ static int anonymized_entry_cmp(const void *unused_cmp_data, * the same anonymized string with another. The actual generation * is farmed out to the generate function. */ -static const void *anonymize_mem(struct hashmap *map, - void *(*generate)(const void *, size_t *), - const void *orig, size_t *len) +static const char *anonymize_str(struct hashmap *map, + char *(*generate)(void *), + const char *orig, size_t len, + void *data) { - struct anonymized_entry key, *ret; + struct anonymized_entry_key key; + struct anonymized_entry *ret; if (!map->cmpfn) hashmap_init(map, anonymized_entry_cmp, NULL, 0); - hashmap_entry_init(&key.hash, memhash(orig, *len)); + hashmap_entry_init(&key.hash, memhash(orig, len)); key.orig = orig; - key.orig_len = *len; - ret = hashmap_get_entry(map, &key, hash, NULL); + key.orig_len = len; + /* First check if it's a token the user configured manually... */ + if (anonymized_seeds.cmpfn) + ret = hashmap_get_entry(&anonymized_seeds, &key, hash, &key); + else + ret = NULL; + + /* ...otherwise check if we've already seen it in this context... */ + if (!ret) + ret = hashmap_get_entry(map, &key, hash, &key); + + /* ...and finally generate a new mapping if necessary */ if (!ret) { - ret = xmalloc(sizeof(*ret)); + FLEX_ALLOC_MEM(ret, orig, orig, len); hashmap_entry_init(&ret->hash, key.hash.hash); - ret->orig = xstrdup(orig); - ret->orig_len = *len; - ret->anon = generate(orig, len); - ret->anon_len = *len; + ret->anon = generate(data); hashmap_put(map, &ret->hash); } - *len = ret->anon_len; return ret->anon; } @@ -181,13 +199,13 @@ static const void *anonymize_mem(struct hashmap *map, */ static void anonymize_path(struct strbuf *out, const char *path, struct hashmap *map, - void *(*generate)(const void *, size_t *)) + char *(*generate)(void *)) { while (*path) { const char *end_of_component = strchrnul(path, '/'); size_t len = end_of_component - path; - const char *c = anonymize_mem(map, generate, path, &len); - strbuf_add(out, c, len); + const char *c = anonymize_str(map, generate, path, len, NULL); + strbuf_addstr(out, c); path = end_of_component; if (*path) strbuf_addch(out, *path++); @@ -361,12 +379,12 @@ static void print_path_1(const char *path) printf("%s", path); } -static void *anonymize_path_component(const void *path, size_t *len) +static char *anonymize_path_component(void *data) { static int counter; struct strbuf out = STRBUF_INIT; strbuf_addf(&out, "path%d", counter++); - return strbuf_detach(&out, len); + return strbuf_detach(&out, NULL); } static void print_path(const char *path) @@ -383,20 +401,23 @@ static void print_path(const char *path) } } -static void *generate_fake_oid(const void *old, size_t *len) +static char *generate_fake_oid(void *data) { static uint32_t counter = 1; /* avoid null oid */ const unsigned hashsz = the_hash_algo->rawsz; - unsigned char *out = xcalloc(hashsz, 1); + unsigned char out[GIT_MAX_RAWSZ]; + char *hex = xmallocz(GIT_MAX_HEXSZ); + + hashclr(out); put_be32(out + hashsz - 4, counter++); - return out; + return hash_to_hex_algop_r(hex, out, the_hash_algo); } -static const struct object_id *anonymize_oid(const struct object_id *oid) +static const char *anonymize_oid(const char *oid_hex) { static struct hashmap objs; - size_t len = the_hash_algo->rawsz; - return anonymize_mem(&objs, generate_fake_oid, oid, &len); + size_t len = strlen(oid_hex); + return anonymize_str(&objs, generate_fake_oid, oid_hex, len, NULL); } static void show_filemodify(struct diff_queue_struct *q, @@ -455,9 +476,9 @@ static void show_filemodify(struct diff_queue_struct *q, */ if (no_data || S_ISGITLINK(spec->mode)) printf("M %06o %s ", spec->mode, - oid_to_hex(anonymize ? - anonymize_oid(&spec->oid) : - &spec->oid)); + anonymize ? + anonymize_oid(oid_to_hex(&spec->oid)) : + oid_to_hex(&spec->oid)); else { struct object *object = lookup_object(the_repository, &spec->oid); @@ -493,12 +514,12 @@ static const char *find_encoding(const char *begin, const char *end) return bol; } -static void *anonymize_ref_component(const void *old, size_t *len) +static char *anonymize_ref_component(void *data) { static int counter; struct strbuf out = STRBUF_INIT; strbuf_addf(&out, "ref%d", counter++); - return strbuf_detach(&out, len); + return strbuf_detach(&out, NULL); } static const char *anonymize_refname(const char *refname) @@ -517,13 +538,6 @@ static const char *anonymize_refname(const char *refname) static struct strbuf anon = STRBUF_INIT; int i; - /* - * We also leave "master" as a special case, since it does not reveal - * anything interesting. - */ - if (!strcmp(refname, "refs/heads/master")) - return refname; - strbuf_reset(&anon); for (i = 0; i < ARRAY_SIZE(prefixes); i++) { if (skip_prefix(refname, prefixes[i], &refname)) { @@ -546,14 +560,13 @@ static char *anonymize_commit_message(const char *old) return xstrfmt("subject %d\n\nbody\n", counter++); } -static struct hashmap idents; -static void *anonymize_ident(const void *old, size_t *len) +static char *anonymize_ident(void *data) { static int counter; struct strbuf out = STRBUF_INIT; strbuf_addf(&out, "User %d ", counter, counter); counter++; - return strbuf_detach(&out, len); + return strbuf_detach(&out, NULL); } /* @@ -563,6 +576,7 @@ static void *anonymize_ident(const void *old, size_t *len) */ static void anonymize_ident_line(const char **beg, const char **end) { + static struct hashmap idents; static struct strbuf buffers[] = { STRBUF_INIT, STRBUF_INIT }; static unsigned which_buffer; @@ -588,9 +602,9 @@ static void anonymize_ident_line(const char **beg, const char **end) size_t len; len = split.mail_end - split.name_begin; - ident = anonymize_mem(&idents, anonymize_ident, - split.name_begin, &len); - strbuf_add(out, ident, len); + ident = anonymize_str(&idents, anonymize_ident, + split.name_begin, len, NULL); + strbuf_addstr(out, ident); strbuf_addch(out, ' '); strbuf_add(out, split.date_begin, split.tz_end - split.date_begin); } else { @@ -712,9 +726,10 @@ static void handle_commit(struct commit *commit, struct rev_info *rev, if (mark) printf(":%d\n", mark); else - printf("%s\n", oid_to_hex(anonymize ? - anonymize_oid(&obj->oid) : - &obj->oid)); + printf("%s\n", + anonymize ? + anonymize_oid(oid_to_hex(&obj->oid)) : + oid_to_hex(&obj->oid)); i++; } @@ -729,12 +744,12 @@ static void handle_commit(struct commit *commit, struct rev_info *rev, show_progress(); } -static void *anonymize_tag(const void *old, size_t *len) +static char *anonymize_tag(void *data) { static int counter; struct strbuf out = STRBUF_INIT; strbuf_addf(&out, "tag message %d", counter++); - return strbuf_detach(&out, len); + return strbuf_detach(&out, NULL); } static void handle_tail(struct object_array *commits, struct rev_info *revs, @@ -804,8 +819,8 @@ static void handle_tag(const char *name, struct tag *tag) name = anonymize_refname(name); if (message) { static struct hashmap tags; - message = anonymize_mem(&tags, anonymize_tag, - message, &message_size); + message = anonymize_str(&tags, anonymize_tag, + message, message_size, NULL); } } @@ -1136,6 +1151,37 @@ static void handle_deletes(void) } } +static char *anonymize_seed(void *data) +{ + return xstrdup(data); +} + +static int parse_opt_anonymize_map(const struct option *opt, + const char *arg, int unset) +{ + struct hashmap *map = opt->value; + const char *delim, *value; + size_t keylen; + + BUG_ON_OPT_NEG(unset); + + delim = strchr(arg, ':'); + if (delim) { + keylen = delim - arg; + value = delim + 1; + } else { + keylen = strlen(arg); + value = arg; + } + + if (!keylen || !*value) + return error(_("--anonymize-map token cannot be empty")); + + anonymize_str(map, anonymize_seed, arg, keylen, (void *)value); + + return 0; +} + int cmd_fast_export(int argc, const char **argv, const char *prefix) { struct rev_info revs; @@ -1177,6 +1223,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix) OPT_STRING_LIST(0, "refspec", &refspecs_list, N_("refspec"), N_("Apply refspec to exported refs")), OPT_BOOL(0, "anonymize", &anonymize, N_("anonymize output")), + OPT_CALLBACK_F(0, "anonymize-map", &anonymized_seeds, N_("from:to"), + N_("convert to in anonymized output"), + PARSE_OPT_NONEG, parse_opt_anonymize_map), OPT_BOOL(0, "reference-excluded-parents", &reference_excluded_commits, N_("Reference parents which are not in fast-export stream by object id")), OPT_BOOL(0, "show-original-ids", &show_original_ids, @@ -1204,6 +1253,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix) if (argc > 1) usage_with_options (fast_export_usage, options); + if (anonymized_seeds.cmpfn && !anonymize) + die(_("--anonymize-map without --anonymize does not make sense")); + if (refspecs_list.nr) { int i; diff --git a/t/t9351-fast-export-anonymize.sh b/t/t9351-fast-export-anonymize.sh index 897dc50907..5ac2c3b5ee 100755 --- a/t/t9351-fast-export-anonymize.sh +++ b/t/t9351-fast-export-anonymize.sh @@ -6,15 +6,24 @@ test_description='basic tests for fast-export --anonymize' test_expect_success 'setup simple repo' ' test_commit base && test_commit foo && + test_commit retain-me && git checkout -b other HEAD^ && mkdir subdir && test_commit subdir/bar && test_commit subdir/xyzzy && + fake_commit=$(echo $ZERO_OID | sed s/0/a/) && + git update-index --add --cacheinfo 160000,$fake_commit,link1 && + git update-index --add --cacheinfo 160000,$fake_commit,link2 && + git commit -m "add gitlink" && git tag -m "annotated tag" mytag ' test_expect_success 'export anonymized stream' ' - git fast-export --anonymize --all >stream + git fast-export --anonymize --all \ + --anonymize-map=retain-me \ + --anonymize-map=xyzzy:custom-name \ + --anonymize-map=other \ + >stream ' # this also covers commit messages @@ -26,12 +35,23 @@ test_expect_success 'stream omits path names' ' ! grep xyzzy stream ' -test_expect_success 'stream allows master as refname' ' - grep master stream +test_expect_success 'stream contains user-specified names' ' + grep retain-me stream && + grep custom-name stream +' + +test_expect_success 'stream omits gitlink oids' ' + # avoid relying on the whole oid to remain hash-agnostic; this is + # plenty to be unique within our test case + ! grep a000000000000000000 stream +' + +test_expect_success 'stream retains other as refname' ' + grep other stream ' test_expect_success 'stream omits other refnames' ' - ! grep other stream && + ! grep master stream && ! grep mytag stream ' @@ -57,7 +77,8 @@ test_expect_success 'import stream to new repository' ' test_expect_success 'result has two branches' ' git for-each-ref --format="%(refname)" refs/heads >branches && test_line_count = 2 branches && - other_branch=$(grep -v refs/heads/master branches) + other_branch=refs/heads/other && + main_branch=$(grep -v $other_branch branches) ' test_expect_success 'repo has original shape and timestamps' ' @@ -65,34 +86,35 @@ test_expect_success 'repo has original shape and timestamps' ' git log --format="%m %ct" --left-right --boundary "$@" } && (cd .. && shape master...other) >expect && - shape master...$other_branch >actual && + shape $main_branch...$other_branch >actual && test_cmp expect actual ' test_expect_success 'root tree has original shape' ' # the output entries are not necessarily in the same - # order, but we know at least that we will have one tree - # and one blob, so just check the sorted order - cat >expect <<-\EOF && - blob - tree - EOF + # order, but we should at least have the same set of + # object types. + git -C .. ls-tree HEAD >orig-root && + cut -d" " -f2 expect && git ls-tree $other_branch >root && cut -d" " -f2 actual && test_cmp expect actual ' test_expect_success 'paths in subdir ended up in one tree' ' - cat >expect <<-\EOF && - blob - blob - EOF + git -C .. ls-tree other:subdir >orig-subdir && + cut -d" " -f2 expect && tree=$(grep tree root | cut -f2) && git ls-tree $other_branch:$tree >tree && cut -d" " -f2 actual && test_cmp expect actual ' +test_expect_success 'identical gitlinks got identical oid' ' + awk "/commit/ { print \$3 }" commits && + test_line_count = 1 commits +' + test_expect_success 'tag points to branch tip' ' git rev-parse $other_branch >expect && git for-each-ref --format="%(*objectname)" | grep . >actual &&