From b8c0689bb97b3ef3027e4d3670538d4cdacd638d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 23 Jun 2020 11:24:47 -0400 Subject: t9351: derive anonymized tree checks from original repo Our tests of the anonymized repo just hard-code the expected set of objects in the root and subdirectory trees. This makes them brittle to the test setup changing (e.g., adding new paths that need tested). Let's look at the original repo to compute our expected set of objects. Note that this isn't completely perfect (e.g., we still rely on there being only one tree in the root), but it does simplify later patches. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- t/t9351-fast-export-anonymize.sh | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 't') diff --git a/t/t9351-fast-export-anonymize.sh b/t/t9351-fast-export-anonymize.sh index 897dc50907..e772cf9930 100755 --- a/t/t9351-fast-export-anonymize.sh +++ b/t/t9351-fast-export-anonymize.sh @@ -71,22 +71,18 @@ test_expect_success 'repo has original shape and timestamps' ' test_expect_success 'root tree has original shape' ' # the output entries are not necessarily in the same - # order, but we know at least that we will have one tree - # and one blob, so just check the sorted order - cat >expect <<-\EOF && - blob - tree - EOF + # order, but we should at least have the same set of + # object types. + git -C .. ls-tree HEAD >orig-root && + cut -d" " -f2 expect && git ls-tree $other_branch >root && cut -d" " -f2 actual && test_cmp expect actual ' test_expect_success 'paths in subdir ended up in one tree' ' - cat >expect <<-\EOF && - blob - blob - EOF + git -C .. ls-tree other:subdir >orig-subdir && + cut -d" " -f2 expect && tree=$(grep tree root | cut -f2) && git ls-tree $other_branch:$tree >tree && cut -d" " -f2 actual && -- cgit v1.3-5-g9baa From b897bf5f37f81d6a9303c4542422cf33d08b7cf0 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 23 Jun 2020 11:24:49 -0400 Subject: fast-export: use xmemdupz() for anonymizing oids Our anonymize_mem() function is careful to take a ptr/len pair to allow storing binary tokens like object ids, as well as partial strings (e.g., just "foo" of "foo/bar"). But it duplicates the hash key using xstrdup()! That means that: - for a partial string, we'd store all bytes up to the NUL, even though we'd never look at anything past "len". This didn't produce wrong behavior, but was wasteful. - for a binary oid that doesn't contain a zero byte, we'd copy garbage bytes off the end of the array (though as long as nothing complained about reading uninitialized bytes, further reads would be limited by "len", and we'd produce the correct results) - for a binary oid that does contain a zero byte, we'd copy _fewer_ bytes than intended into the hashmap struct. When we later try to look up a value, we'd access uninitialized memory and potentially falsely claim that a particular oid is not present. The most common reason to store an oid is an anonymized gitlink, but our test case doesn't have any gitlinks at all. So let's add one whose oid contains a NUL and is present at two different paths. ASan catches the memory error, but even without it we can detect the bug because the oid is not anonymized the same way for both paths. And of course the fix is to copy the correct number of bytes. We don't technically need the appended NUL from xmemdupz(), but it doesn't hurt as an extra protection against anybody treating it like a string (plus a future patch will push us more in that direction). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/fast-export.c | 2 +- t/t9351-fast-export-anonymize.sh | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 't') diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 85868162ee..289395a131 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -162,7 +162,7 @@ static const void *anonymize_mem(struct hashmap *map, if (!ret) { ret = xmalloc(sizeof(*ret)); hashmap_entry_init(&ret->hash, key.hash.hash); - ret->orig = xstrdup(orig); + ret->orig = xmemdupz(orig, *len); ret->orig_len = *len; ret->anon = generate(orig, len); ret->anon_len = *len; diff --git a/t/t9351-fast-export-anonymize.sh b/t/t9351-fast-export-anonymize.sh index e772cf9930..dc5d75cd19 100755 --- a/t/t9351-fast-export-anonymize.sh +++ b/t/t9351-fast-export-anonymize.sh @@ -10,6 +10,10 @@ test_expect_success 'setup simple repo' ' mkdir subdir && test_commit subdir/bar && test_commit subdir/xyzzy && + fake_commit=$(echo $ZERO_OID | sed s/0/a/) && + git update-index --add --cacheinfo 160000,$fake_commit,link1 && + git update-index --add --cacheinfo 160000,$fake_commit,link2 && + git commit -m "add gitlink" && git tag -m "annotated tag" mytag ' @@ -26,6 +30,12 @@ test_expect_success 'stream omits path names' ' ! grep xyzzy stream ' +test_expect_success 'stream omits gitlink oids' ' + # avoid relying on the whole oid to remain hash-agnostic; this is + # plenty to be unique within our test case + ! grep a000000000000000000 stream +' + test_expect_success 'stream allows master as refname' ' grep master stream ' @@ -89,6 +99,11 @@ test_expect_success 'paths in subdir ended up in one tree' ' test_cmp expect actual ' +test_expect_success 'identical gitlinks got identical oid' ' + awk "/commit/ { print \$3 }" commits && + test_line_count = 1 commits +' + test_expect_success 'tag points to branch tip' ' git rev-parse $other_branch >expect && git for-each-ref --format="%(*objectname)" | grep . >actual && -- cgit v1.3-5-g9baa From 65b5d9fae7684a282f48295b645c2f9da77c2736 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 25 Jun 2020 15:48:32 -0400 Subject: fast-export: allow seeding the anonymized mapping After you anonymize a repository, it can be hard to find which commits correspond between the original and the result, and thus hard to reproduce commands that triggered bugs in the original. Let's make it possible to seed the anonymization map. This lets users either: - mark names to be retained as-is, if they don't consider them secret (in which case their original commands would just work) - map names to new values, which lets them adapt the reproduction recipe to the new names without revealing the originals The implementation is fairly straight-forward. We already store each anonymized token in a hashmap (so that the same token appearing twice is converted to the same result). We can just introduce a new "seed" hashmap which is consulted first. This does make a few more promises to the user about how we'll anonymize things (e.g., token-splitting pathnames). But it's unlikely that we'd want to change those rules, even if the actual anonymization of a single token changes. And it makes things much easier for the user, who can unblind only a directory name without having to specify each path within it. One alternative to this approach would be to anonymize as we see fit, and then dump the whole refname and pathname mappings to a file. This does work, but it's a bit awkward to use (you have to manually dig the items you care about out of the mapping). Helped-by: Eric Sunshine Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Documentation/git-fast-export.txt | 29 +++++++++++++++++++++++ builtin/fast-export.c | 50 ++++++++++++++++++++++++++++++++++++++- t/t9351-fast-export-anonymize.sh | 11 ++++++++- 3 files changed, 88 insertions(+), 2 deletions(-) (limited to 't') diff --git a/Documentation/git-fast-export.txt b/Documentation/git-fast-export.txt index e8950de3ba..1978dbdc6a 100644 --- a/Documentation/git-fast-export.txt +++ b/Documentation/git-fast-export.txt @@ -119,6 +119,11 @@ by keeping the marks the same across runs. the shape of the history and stored tree. See the section on `ANONYMIZING` below. +--anonymize-map=[:]:: + Convert token `` to `` in the anonymized output. If + `` is omitted, map `` to itself (i.e., do not + anonymize it). See the section on `ANONYMIZING` below. + --reference-excluded-parents:: By default, running a command such as `git fast-export master~5..master` will not include the commit master{tilde}5 @@ -238,6 +243,30 @@ collapse "User 0", "User 1", etc into "User X"). This produces a much smaller output, and it is usually easy to quickly confirm that there is no private data in the stream. +Reproducing some bugs may require referencing particular commits or +paths, which becomes challenging after refnames and paths have been +anonymized. You can ask for a particular token to be left as-is or +mapped to a new value. For example, if you have a bug which reproduces +with `git rev-list sensitive -- secret.c`, you can run: + +--------------------------------------------------- +$ git fast-export --anonymize --all \ + --anonymize-map=sensitive:foo \ + --anonymize-map=secret.c:bar.c \ + >stream +--------------------------------------------------- + +After importing the stream, you can then run `git rev-list foo -- bar.c` +in the anonymized repository. + +Note that paths and refnames are split into tokens at slash boundaries. +The command above would anonymize `subdir/secret.c` as something like +`path123/bar.c`; you could then search for `bar.c` in the anonymized +repository to determine the final pathname. + +To make referencing the final pathname simpler, you can map each path +component; so if you also anonymize `subdir` to `publicdir`, then the +final pathname would be `publicdir/bar.c`. LIMITATIONS ----------- diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 1cbca5b4b4..b0b09bca30 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -45,6 +45,7 @@ static struct string_list extra_refs = STRING_LIST_INIT_NODUP; static struct string_list tag_refs = STRING_LIST_INIT_NODUP; static struct refspec refspecs = REFSPEC_INIT_FETCH; static int anonymize; +static struct hashmap anonymized_seeds; static struct revision_sources revision_sources; static int parse_opt_signed_tag_mode(const struct option *opt, @@ -168,8 +169,18 @@ static const char *anonymize_str(struct hashmap *map, hashmap_entry_init(&key.hash, memhash(orig, len)); key.orig = orig; key.orig_len = len; - ret = hashmap_get_entry(map, &key, hash, &key); + /* First check if it's a token the user configured manually... */ + if (anonymized_seeds.cmpfn) + ret = hashmap_get_entry(&anonymized_seeds, &key, hash, &key); + else + ret = NULL; + + /* ...otherwise check if we've already seen it in this context... */ + if (!ret) + ret = hashmap_get_entry(map, &key, hash, &key); + + /* ...and finally generate a new mapping if necessary */ if (!ret) { FLEX_ALLOC_MEM(ret, orig, orig, len); hashmap_entry_init(&ret->hash, key.hash.hash); @@ -1147,6 +1158,37 @@ static void handle_deletes(void) } } +static char *anonymize_seed(void *data) +{ + return xstrdup(data); +} + +static int parse_opt_anonymize_map(const struct option *opt, + const char *arg, int unset) +{ + struct hashmap *map = opt->value; + const char *delim, *value; + size_t keylen; + + BUG_ON_OPT_NEG(unset); + + delim = strchr(arg, ':'); + if (delim) { + keylen = delim - arg; + value = delim + 1; + } else { + keylen = strlen(arg); + value = arg; + } + + if (!keylen || !*value) + return error(_("--anonymize-map token cannot be empty")); + + anonymize_str(map, anonymize_seed, arg, keylen, (void *)value); + + return 0; +} + int cmd_fast_export(int argc, const char **argv, const char *prefix) { struct rev_info revs; @@ -1188,6 +1230,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix) OPT_STRING_LIST(0, "refspec", &refspecs_list, N_("refspec"), N_("Apply refspec to exported refs")), OPT_BOOL(0, "anonymize", &anonymize, N_("anonymize output")), + OPT_CALLBACK_F(0, "anonymize-map", &anonymized_seeds, N_("from:to"), + N_("convert to in anonymized output"), + PARSE_OPT_NONEG, parse_opt_anonymize_map), OPT_BOOL(0, "reference-excluded-parents", &reference_excluded_commits, N_("Reference parents which are not in fast-export stream by object id")), OPT_BOOL(0, "show-original-ids", &show_original_ids, @@ -1215,6 +1260,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix) if (argc > 1) usage_with_options (fast_export_usage, options); + if (anonymized_seeds.cmpfn && !anonymize) + die(_("--anonymize-map without --anonymize does not make sense")); + if (refspecs_list.nr) { int i; diff --git a/t/t9351-fast-export-anonymize.sh b/t/t9351-fast-export-anonymize.sh index dc5d75cd19..5a21c71568 100755 --- a/t/t9351-fast-export-anonymize.sh +++ b/t/t9351-fast-export-anonymize.sh @@ -6,6 +6,7 @@ test_description='basic tests for fast-export --anonymize' test_expect_success 'setup simple repo' ' test_commit base && test_commit foo && + test_commit retain-me && git checkout -b other HEAD^ && mkdir subdir && test_commit subdir/bar && @@ -18,7 +19,10 @@ test_expect_success 'setup simple repo' ' ' test_expect_success 'export anonymized stream' ' - git fast-export --anonymize --all >stream + git fast-export --anonymize --all \ + --anonymize-map=retain-me \ + --anonymize-map=xyzzy:custom-name \ + >stream ' # this also covers commit messages @@ -30,6 +34,11 @@ test_expect_success 'stream omits path names' ' ! grep xyzzy stream ' +test_expect_success 'stream contains user-specified names' ' + grep retain-me stream && + grep custom-name stream +' + test_expect_success 'stream omits gitlink oids' ' # avoid relying on the whole oid to remain hash-agnostic; this is # plenty to be unique within our test case -- cgit v1.3-5-g9baa From 8a494955835e3635926d4f6f4607c1ed95b011fc Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 25 Jun 2020 15:48:35 -0400 Subject: fast-export: anonymize "master" refname Running "fast-export --anonymize" will leave "refs/heads/master" untouched in the output, for two reasons: - it helped to have some known reference point between the original and anonymized repository - since it's historically the default branch name, it doesn't leak any information Now that we can ask fast-export to retain particular tokens, we have a much better tool for the first one (because it works for any ref, not just master). For the second, the notion of "default branch name" is likely to become configurable soon, at which point the name _does_ leak information. Let's drop this special case in preparation. Note that we have to adjust the test a bit, since it relied on using the name "master" in the anonymized repos. We could just use --anonymize-map=master to keep the same output, but then we wouldn't know if it works because of our hard-coded master or because of the explicit map. So let's flip the test a bit, and confirm that we anonymize "master", but keep "other" in the output. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/fast-export.c | 7 ------- t/t9351-fast-export-anonymize.sh | 12 +++++++----- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 't') diff --git a/builtin/fast-export.c b/builtin/fast-export.c index b0b09bca30..c6ecf404d7 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -538,13 +538,6 @@ static const char *anonymize_refname(const char *refname) static struct strbuf anon = STRBUF_INIT; int i; - /* - * We also leave "master" as a special case, since it does not reveal - * anything interesting. - */ - if (!strcmp(refname, "refs/heads/master")) - return refname; - strbuf_reset(&anon); for (i = 0; i < ARRAY_SIZE(prefixes); i++) { if (skip_prefix(refname, prefixes[i], &refname)) { diff --git a/t/t9351-fast-export-anonymize.sh b/t/t9351-fast-export-anonymize.sh index 5a21c71568..5ac2c3b5ee 100755 --- a/t/t9351-fast-export-anonymize.sh +++ b/t/t9351-fast-export-anonymize.sh @@ -22,6 +22,7 @@ test_expect_success 'export anonymized stream' ' git fast-export --anonymize --all \ --anonymize-map=retain-me \ --anonymize-map=xyzzy:custom-name \ + --anonymize-map=other \ >stream ' @@ -45,12 +46,12 @@ test_expect_success 'stream omits gitlink oids' ' ! grep a000000000000000000 stream ' -test_expect_success 'stream allows master as refname' ' - grep master stream +test_expect_success 'stream retains other as refname' ' + grep other stream ' test_expect_success 'stream omits other refnames' ' - ! grep other stream && + ! grep master stream && ! grep mytag stream ' @@ -76,7 +77,8 @@ test_expect_success 'import stream to new repository' ' test_expect_success 'result has two branches' ' git for-each-ref --format="%(refname)" refs/heads >branches && test_line_count = 2 branches && - other_branch=$(grep -v refs/heads/master branches) + other_branch=refs/heads/other && + main_branch=$(grep -v $other_branch branches) ' test_expect_success 'repo has original shape and timestamps' ' @@ -84,7 +86,7 @@ test_expect_success 'repo has original shape and timestamps' ' git log --format="%m %ct" --left-right --boundary "$@" } && (cd .. && shape master...other) >expect && - shape master...$other_branch >actual && + shape $main_branch...$other_branch >actual && test_cmp expect actual ' -- cgit v1.3-5-g9baa