Merge branch 'en/fast-export-encoding'
The "git fast-export/import" pair has been taught to better handle commits whose log messages are in an encoding other than UTF-8.

* en/fast-export-encoding:
  fast-export: do automatic reencoding of commit messages only if requested
  fast-export: differentiate between explicitly UTF-8 and implicitly UTF-8
  fast-export: avoid stripping encoding header if we cannot reencode
  fast-import: support 'encoding' commit header
  t9350: fix encoding test to actually test reencoding
This commit is contained in:
commit
66dc7b68e4
@ -129,6 +129,13 @@ marks the same across runs.
|
||||
for intermediary filters (e.g. for rewriting commit messages
|
||||
which refer to older commits, or for stripping blobs by id).
|
||||
|
||||
--reencode=(yes|no|abort)::
|
||||
Specify how to handle `encoding` header in commit objects. When
|
||||
asking to 'abort' (which is the default), this program will die
|
||||
when encountering such a commit object. With 'yes', the commit
|
||||
message will be reencoded into UTF-8. With 'no', the original
|
||||
encoding will be preserved.
|
||||
|
||||
--refspec::
|
||||
Apply the specified refspec to each ref exported. Multiple of them can
|
||||
be specified.
|
||||
|
@ -388,6 +388,7 @@ change to the project.
|
||||
original-oid?
|
||||
('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
|
||||
'committer' (SP <name>)? SP LT <email> GT SP <when> LF
|
||||
('encoding' SP <encoding> LF)?
|
||||
data
|
||||
('from' SP <commit-ish> LF)?
|
||||
('merge' SP <commit-ish> LF)?
|
||||
@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
|
||||
See ``Date Formats'' above for the set of supported formats, and
|
||||
their syntax.
|
||||
|
||||
`encoding`
|
||||
^^^^^^^^^^
|
||||
The optional `encoding` command indicates the encoding of the commit
|
||||
message. Most commits are UTF-8 and the encoding is omitted, but this
|
||||
allows importing commit messages into git without first reencoding them.
|
||||
|
||||
`from`
|
||||
^^^^^^
|
||||
The `from` command is used to specify the commit to initialize
|
||||
|
@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
|
||||
static int progress;
|
||||
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
|
||||
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
|
||||
static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
|
||||
static int fake_missing_tagger;
|
||||
static int use_done_feature;
|
||||
static int no_data;
|
||||
@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_opt_reencode_mode(const struct option *opt,
|
||||
const char *arg, int unset)
|
||||
{
|
||||
if (unset) {
|
||||
reencode_mode = REENCODE_ABORT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
switch (git_parse_maybe_bool(arg)) {
|
||||
case 0:
|
||||
reencode_mode = REENCODE_NO;
|
||||
break;
|
||||
case 1:
|
||||
reencode_mode = REENCODE_YES;
|
||||
break;
|
||||
default:
|
||||
if (!strcasecmp(arg, "abort"))
|
||||
reencode_mode = REENCODE_ABORT;
|
||||
else
|
||||
return error("Unknown reencoding mode: %s", arg);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct decoration idnums;
|
||||
static uint32_t last_idnum;
|
||||
|
||||
@ -453,7 +479,7 @@ static const char *find_encoding(const char *begin, const char *end)
|
||||
bol = memmem(begin, end ? end - begin : strlen(begin),
|
||||
needle, strlen(needle));
|
||||
if (!bol)
|
||||
return git_commit_encoding;
|
||||
return NULL;
|
||||
bol += strlen(needle);
|
||||
eol = strchrnul(bol, '\n');
|
||||
*eol = '\0';
|
||||
@ -633,18 +659,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
|
||||
}
|
||||
|
||||
mark_next_object(&commit->object);
|
||||
if (anonymize)
|
||||
if (anonymize) {
|
||||
reencoded = anonymize_commit_message(message);
|
||||
else if (!is_encoding_utf8(encoding))
|
||||
reencoded = reencode_string(message, "UTF-8", encoding);
|
||||
} else if (encoding) {
|
||||
switch(reencode_mode) {
|
||||
case REENCODE_YES:
|
||||
reencoded = reencode_string(message, "UTF-8", encoding);
|
||||
break;
|
||||
case REENCODE_NO:
|
||||
break;
|
||||
case REENCODE_ABORT:
|
||||
die("Encountered commit-specific encoding %s in commit "
|
||||
"%s; use --reencode=[yes|no] to handle it",
|
||||
encoding, oid_to_hex(&commit->object.oid));
|
||||
}
|
||||
}
|
||||
if (!commit->parents)
|
||||
printf("reset %s\n", refname);
|
||||
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
|
||||
if (show_original_ids)
|
||||
printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
|
||||
printf("%.*s\n%.*s\ndata %u\n%s",
|
||||
printf("%.*s\n%.*s\n",
|
||||
(int)(author_end - author), author,
|
||||
(int)(committer_end - committer), committer,
|
||||
(int)(committer_end - committer), committer);
|
||||
if (!reencoded && encoding)
|
||||
printf("encoding %s\n", encoding);
|
||||
printf("data %u\n%s",
|
||||
(unsigned)(reencoded
|
||||
? strlen(reencoded) : message
|
||||
? strlen(message) : 0),
|
||||
@ -1088,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
|
||||
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
|
||||
N_("select handling of tags that tag filtered objects"),
|
||||
parse_opt_tag_of_filtered_mode),
|
||||
OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
|
||||
N_("select handling of commit messages in an alternate encoding"),
|
||||
parse_opt_reencode_mode),
|
||||
OPT_STRING(0, "export-marks", &export_filename, N_("file"),
|
||||
N_("Dump marks to this file")),
|
||||
OPT_STRING(0, "import-marks", &import_filename, N_("file"),
|
||||
|
@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
|
||||
struct branch *b;
|
||||
char *author = NULL;
|
||||
char *committer = NULL;
|
||||
const char *encoding = NULL;
|
||||
struct hash_list *merge_list = NULL;
|
||||
unsigned int merge_count;
|
||||
unsigned char prev_fanout, new_fanout;
|
||||
@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
|
||||
}
|
||||
if (!committer)
|
||||
die("Expected committer but didn't get one");
|
||||
if (skip_prefix(command_buf.buf, "encoding ", &encoding))
|
||||
read_next_command();
|
||||
parse_data(&msg, 0, NULL);
|
||||
read_next_command();
|
||||
parse_from(b);
|
||||
@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
|
||||
}
|
||||
strbuf_addf(&new_data,
|
||||
"author %s\n"
|
||||
"committer %s\n"
|
||||
"\n",
|
||||
"committer %s\n",
|
||||
author ? author : committer, committer);
|
||||
if (encoding)
|
||||
strbuf_addf(&new_data,
|
||||
"encoding %s\n",
|
||||
encoding);
|
||||
strbuf_addch(&new_data, '\n');
|
||||
strbuf_addbuf(&new_data, &msg);
|
||||
free(author);
|
||||
free(committer);
|
||||
|
@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
|
||||
sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
|
||||
'
|
||||
|
||||
###
|
||||
### series X (other new features)
|
||||
###
|
||||
|
||||
test_expect_success 'X: handling encoding' '
|
||||
test_tick &&
|
||||
cat >input <<-INPUT_END &&
|
||||
commit refs/heads/encoding
|
||||
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
|
||||
encoding iso-8859-7
|
||||
data <<COMMIT
|
||||
INPUT_END
|
||||
|
||||
printf "Pi: \360\nCOMMIT\n" >>input &&
|
||||
|
||||
git fast-import <input &&
|
||||
git cat-file -p encoding | grep $(printf "\360") &&
|
||||
git log -1 --format=%B encoding | grep $(printf "\317\200")
|
||||
'
|
||||
|
||||
test_done
|
||||
|
@ -94,22 +94,83 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
|
||||
test $MUSS = $(git rev-parse --verify refs/tags/muss)
|
||||
'
|
||||
|
||||
test_expect_success 'iso-8859-1' '
|
||||
test_expect_success 'reencoding iso-8859-7' '
|
||||
|
||||
git config i18n.commitencoding ISO8859-1 &&
|
||||
# use author and committer name in ISO-8859-1 to match it.
|
||||
. "$TEST_DIRECTORY"/t3901/8859-1.txt &&
|
||||
test_when_finished "git reset --hard HEAD~1" &&
|
||||
test_config i18n.commitencoding iso-8859-7 &&
|
||||
test_tick &&
|
||||
echo rosten >file &&
|
||||
git commit -s -m den file &&
|
||||
git fast-export wer^..wer >iso8859-1.fi &&
|
||||
sed "s/wer/i18n/" iso8859-1.fi |
|
||||
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
|
||||
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
|
||||
sed "s/wer/i18n/" iso-8859-7.fi |
|
||||
(cd new &&
|
||||
git fast-import &&
|
||||
# The commit object, if not re-encoded, would be 240 bytes.
|
||||
# Removing the "encoding iso-8859-7\n" header drops 20 bytes.
|
||||
# Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
|
||||
# to \xCF\x80 (\317\200) in UTF-8 adds a byte. Check for
|
||||
# the expected size.
|
||||
test 221 -eq "$(git cat-file -s i18n)" &&
|
||||
# ...and for the expected translation of bytes.
|
||||
git cat-file commit i18n >actual &&
|
||||
grep "Áéí óú" actual)
|
||||
|
||||
grep $(printf "\317\200") actual &&
|
||||
# Also make sure the commit does not have the "encoding" header
|
||||
! grep ^encoding actual)
|
||||
'
|
||||
|
||||
test_expect_success 'aborting on iso-8859-7' '
|
||||
|
||||
test_when_finished "git reset --hard HEAD~1" &&
|
||||
test_config i18n.commitencoding iso-8859-7 &&
|
||||
echo rosten >file &&
|
||||
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
|
||||
test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
|
||||
'
|
||||
|
||||
test_expect_success 'preserving iso-8859-7' '
|
||||
|
||||
test_when_finished "git reset --hard HEAD~1" &&
|
||||
test_config i18n.commitencoding iso-8859-7 &&
|
||||
echo rosten >file &&
|
||||
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
|
||||
git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
|
||||
sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
|
||||
(cd new &&
|
||||
git fast-import &&
|
||||
# The commit object, if not re-encoded, is 240 bytes.
|
||||
# Removing the "encoding iso-8859-7\n" header would drop 20
|
||||
# bytes. Re-encoding the Pi character from \xF0 (\360) in
|
||||
# iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte.
|
||||
# Check for the expected size...
|
||||
test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
|
||||
# ...as well as the expected byte.
|
||||
git cat-file commit i18n-no-recoding >actual &&
|
||||
grep $(printf "\360") actual &&
|
||||
# Also make sure the commit has the "encoding" header
|
||||
grep ^encoding actual)
|
||||
'
|
||||
|
||||
test_expect_success 'encoding preserved if reencoding fails' '
|
||||
|
||||
test_when_finished "git reset --hard HEAD~1" &&
|
||||
test_config i18n.commitencoding iso-8859-7 &&
|
||||
echo rosten >file &&
|
||||
git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
|
||||
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
|
||||
sed "s/wer/i18n-invalid/" iso-8859-7.fi |
|
||||
(cd new &&
|
||||
git fast-import &&
|
||||
git cat-file commit i18n-invalid >actual &&
|
||||
# Make sure the commit still has the encoding header
|
||||
grep ^encoding actual &&
|
||||
# Verify that the commit has the expected size; i.e.
|
||||
# that no bytes were re-encoded to a different encoding.
|
||||
test 252 -eq "$(git cat-file -s i18n-invalid)" &&
|
||||
# ...and check for the original special bytes
|
||||
grep $(printf "\360") actual &&
|
||||
grep $(printf "\377") actual)
|
||||
'
|
||||
|
||||
test_expect_success 'import/export-marks' '
|
||||
|
||||
git checkout -b marks master &&
|
||||
@ -224,7 +285,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME
|
||||
|
||||
test_expect_success 'setup copies' '
|
||||
|
||||
git config --unset i18n.commitencoding &&
|
||||
git checkout -b copy rein &&
|
||||
git mv file file3 &&
|
||||
git commit -m move1 &&
|
||||
|
1
t/t9350/broken-iso-8859-7-commit-message.txt
Normal file
1
t/t9350/broken-iso-8859-7-commit-message.txt
Normal file
@ -0,0 +1 @@
|
||||
Pi: ð; Invalid: ÿ
|
1
t/t9350/simple-iso-8859-7-commit-message.txt
Normal file
1
t/t9350/simple-iso-8859-7-commit-message.txt
Normal file
@ -0,0 +1 @@
|
||||
Pi: ð
|
Loading…
Reference in New Issue
Block a user