fast-export: do automatic reencoding of commit messages only if requested

Automatic re-encoding of commit messages (and dropping of the encoding
header) hurts attempts to do reversible history rewrites (e.g. sha1sum
<-> sha256sum transitions, some subtree rewrites), and seems
inconsistent with the general principle followed elsewhere in
fast-export of requiring explicit user requests to modify the output
(e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite).  Add a
--reencode flag that the user can use to specify how such commits should
be handled and, like other fast-export flags, default it to 'abort'.

Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Elijah Newren 2019-05-13 21:31:02 -07:00 committed by Junio C Hamano
parent 57a8be2cb0
commit e80001f8fd
3 changed files with 85 additions and 6 deletions

View File

@ -129,6 +129,13 @@ marks the same across runs.
for intermediary filters (e.g. for rewriting commit messages for intermediary filters (e.g. for rewriting commit messages
which refer to older commits, or for stripping blobs by id). which refer to older commits, or for stripping blobs by id).
--reencode=(yes|no|abort)::
Specify how to handle the `encoding` header in commit objects. With
'abort' (which is the default), this program will die
when encountering such a commit object. With 'yes', the commit
message will be reencoded into UTF-8. With 'no', the original
encoding will be preserved.
--refspec:: --refspec::
Apply the specified refspec to each ref exported. Multiple of them can Apply the specified refspec to each ref exported. Multiple of them can
be specified. be specified.

View File

@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
static int progress; static int progress;
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT; static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT; static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
/*
 * What to do with a commit that carries its own encoding: die
 * (REENCODE_ABORT, the initial value), re-encode the message to UTF-8
 * (REENCODE_YES), or leave the message as it is (REENCODE_NO).
 */
static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
static int fake_missing_tagger; static int fake_missing_tagger;
static int use_done_feature; static int use_done_feature;
static int no_data; static int no_data;
@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
return 0; return 0;
} }
/*
 * Option callback that translates the argument of --reencode into a
 * value for the file-scope reencode_mode variable.
 *
 * opt   - the option being parsed (not used here)
 * arg   - the text supplied for the option
 * unset - non-zero when the parser reports the option as unset
 *
 * Returns 0 once reencode_mode has been updated; for an unrecognised
 * argument, returns whatever error() returns after reporting it.
 */
static int parse_opt_reencode_mode(const struct option *opt,
				   const char *arg, int unset)
{
	int as_bool;

	/* An unset option falls back to the abort mode. */
	if (unset) {
		reencode_mode = REENCODE_ABORT;
		return 0;
	}

	/*
	 * NOTE(review): presumably git_parse_maybe_bool() yields 1 or 0
	 * for recognised boolean spellings and some other value
	 * otherwise -- confirm against its definition.
	 */
	as_bool = git_parse_maybe_bool(arg);
	if (as_bool == 1) {
		reencode_mode = REENCODE_YES;
		return 0;
	}
	if (as_bool == 0) {
		reencode_mode = REENCODE_NO;
		return 0;
	}

	/* Not a boolean: the only other accepted spelling is "abort". */
	if (strcasecmp(arg, "abort"))
		return error("Unknown reencoding mode: %s", arg);

	reencode_mode = REENCODE_ABORT;
	return 0;
}
static struct decoration idnums; static struct decoration idnums;
static uint32_t last_idnum; static uint32_t last_idnum;
@ -633,10 +659,21 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
} }
mark_next_object(&commit->object); mark_next_object(&commit->object);
if (anonymize) if (anonymize) {
reencoded = anonymize_commit_message(message); reencoded = anonymize_commit_message(message);
else if (!is_encoding_utf8(encoding)) } else if (encoding) {
reencoded = reencode_string(message, "UTF-8", encoding); switch(reencode_mode) {
case REENCODE_YES:
reencoded = reencode_string(message, "UTF-8", encoding);
break;
case REENCODE_NO:
break;
case REENCODE_ABORT:
die("Encountered commit-specific encoding %s in commit "
"%s; use --reencode=[yes|no] to handle it",
encoding, oid_to_hex(&commit->object.oid));
}
}
if (!commit->parents) if (!commit->parents)
printf("reset %s\n", refname); printf("reset %s\n", refname);
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum); printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
@ -1091,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"), OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
N_("select handling of tags that tag filtered objects"), N_("select handling of tags that tag filtered objects"),
parse_opt_tag_of_filtered_mode), parse_opt_tag_of_filtered_mode),
OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
N_("select handling of commit messages in an alternate encoding"),
parse_opt_reencode_mode),
OPT_STRING(0, "export-marks", &export_filename, N_("file"), OPT_STRING(0, "export-marks", &export_filename, N_("file"),
N_("Dump marks to this file")), N_("Dump marks to this file")),
OPT_STRING(0, "import-marks", &import_filename, N_("file"), OPT_STRING(0, "import-marks", &import_filename, N_("file"),

View File

@ -94,14 +94,14 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
test $MUSS = $(git rev-parse --verify refs/tags/muss) test $MUSS = $(git rev-parse --verify refs/tags/muss)
' '
test_expect_success 'iso-8859-7' ' test_expect_success 'reencoding iso-8859-7' '
test_when_finished "git reset --hard HEAD~1" && test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 && test_config i18n.commitencoding iso-8859-7 &&
test_tick && test_tick &&
echo rosten >file && echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
git fast-export wer^..wer >iso-8859-7.fi && git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n/" iso-8859-7.fi | sed "s/wer/i18n/" iso-8859-7.fi |
(cd new && (cd new &&
git fast-import && git fast-import &&
@ -118,13 +118,45 @@ test_expect_success 'iso-8859-7' '
! grep ^encoding actual) ! grep ^encoding actual)
' '
test_expect_success 'aborting on iso-8859-7' '
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
'
test_expect_success 'preserving iso-8859-7' '
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
(cd new &&
git fast-import &&
# The commit object, if not re-encoded, is 240 bytes.
# Removing the "encoding iso-8859-7\n" header would drop 20
# bytes. Re-encoding the Pi character from \xF0 (\360) in
# iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte.
# Check for the expected size...
test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
# ...as well as the expected byte.
git cat-file commit i18n-no-recoding >actual &&
grep $(printf "\360") actual &&
# Also make sure the commit has the "encoding" header
grep ^encoding actual)
'
test_expect_success 'encoding preserved if reencoding fails' ' test_expect_success 'encoding preserved if reencoding fails' '
test_when_finished "git reset --hard HEAD~1" && test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 && test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file && echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file && git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
git fast-export wer^..wer >iso-8859-7.fi && git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n-invalid/" iso-8859-7.fi | sed "s/wer/i18n-invalid/" iso-8859-7.fi |
(cd new && (cd new &&
git fast-import && git fast-import &&