Merge branch 'lt/commit-tree-guess-utf-8'
Teach "git commit" and "git commit-tree" the heuristic that "git mailinfo" already uses: "we are told to use utf-8 in the log message, but this does not look like utf-8 --- attempt to pass it through convert-from-latin1-to-utf8 and see if it makes sense".

* lt/commit-tree-guess-utf-8:
  commit/commit-tree: correct latin1 to utf-8
This commit is contained in:
commit
ae80b5a892
@ -481,36 +481,12 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* When there is no known charset, guess.
|
|
||||||
*
|
|
||||||
* Right now we assume that if the target is UTF-8 (the default),
|
|
||||||
* and it already looks like UTF-8 (which includes US-ASCII as its
|
|
||||||
* subset, of course) then that is what it is and there is nothing
|
|
||||||
* to do.
|
|
||||||
*
|
|
||||||
* Otherwise, we default to assuming it is Latin1 for historical
|
|
||||||
* reasons.
|
|
||||||
*/
|
|
||||||
static const char *guess_charset(const struct strbuf *line, const char *target_charset)
|
|
||||||
{
|
|
||||||
if (is_encoding_utf8(target_charset)) {
|
|
||||||
if (is_utf8(line->buf))
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return "ISO8859-1";
|
|
||||||
}
|
|
||||||
|
|
||||||
static void convert_to_utf8(struct strbuf *line, const char *charset)
|
static void convert_to_utf8(struct strbuf *line, const char *charset)
|
||||||
{
|
{
|
||||||
char *out;
|
char *out;
|
||||||
|
|
||||||
if (!charset || !*charset) {
|
if (!charset || !*charset)
|
||||||
charset = guess_charset(line, metainfo_charset);
|
return;
|
||||||
if (!charset)
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!strcasecmp(metainfo_charset, charset))
|
if (!strcasecmp(metainfo_charset, charset))
|
||||||
return;
|
return;
|
||||||
out = reencode_string(line->buf, metainfo_charset, charset);
|
out = reencode_string(line->buf, metainfo_charset, charset);
|
||||||
|
88
commit.c
88
commit.c
@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Scan the first `len` bytes of `buf` for a sequence that is not
 * well-formed UTF-8.
 *
 * Returns the offset, relative to `buf`, of the first byte of the
 * first offending sequence, or -1 when the whole range is acceptable
 * (including when `len` is zero or negative).
 *
 * A sequence is rejected when:
 *  - its first byte is a stray continuation byte or an impossible
 *    lead byte,
 *  - it is cut short by the end of the buffer,
 *  - one of its trailing bytes is not a continuation byte,
 *  - it is an overlong encoding (the value would fit in a shorter
 *    sequence),
 *  - it encodes a UTF-16 surrogate (U+D800..U+DFFF), or
 *  - it encodes a value above U+10FFFF (this also rules out the
 *    obsolete 5- and 6-byte forms).
 */
static int find_invalid_utf8(const char *buf, int len)
{
	/* Largest code point expressible with N continuation bytes. */
	static const unsigned int max_codepoint[] = {
		0x7f, 0x7ff, 0xffff, 0x10ffff
	};
	int offset = 0;

	while (len > 0) {
		unsigned char c = *buf++;
		int bytes, bad_offset;
		unsigned int codepoint, min_val, max_val;

		len--;
		offset++;

		/* Simple US-ASCII? No worries. */
		if (c < 0x80)
			continue;

		bad_offset = offset - 1;

		/*
		 * Count how many more high bits are set: that's how
		 * many more bytes this sequence should have.  Shifting
		 * the unsigned char also pushes the length marker out,
		 * leaving the payload bits just below the top bit.
		 */
		bytes = 0;
		while (c & 0x40) {
			c <<= 1;
			bytes++;
		}

		/*
		 * Zero means a stray continuation byte.  More than three
		 * would encode a value beyond U+10FFFF.
		 */
		if (bytes < 1 || bytes > 3)
			return bad_offset;

		/* Do we *have* that many bytes? */
		if (len < bytes)
			return bad_offset;

		/* Payload bits of the lead byte, moved back to the bottom. */
		codepoint = (c & 0x7f) >> bytes;
		min_val = max_codepoint[bytes - 1] + 1;
		max_val = max_codepoint[bytes];

		offset += bytes;
		len -= bytes;

		/* Verify the continuation bytes and accumulate the value. */
		do {
			unsigned char cont = *buf++;

			if ((cont & 0xc0) != 0x80)
				return bad_offset;
			codepoint = (codepoint << 6) | (cont & 0x3f);
		} while (--bytes);

		/* Overlong form, or beyond the Unicode range. */
		if (codepoint < min_val || codepoint > max_val)
			return bad_offset;

		/* Surrogates belong to UTF-16 and must not appear in UTF-8. */
		if (codepoint >= 0xd800 && codepoint <= 0xdfff)
			return bad_offset;
	}
	return -1;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This verifies that the buffer is in proper utf8 format.
|
||||||
|
*
|
||||||
|
* If it isn't, it assumes any non-utf8 characters are Latin1,
|
||||||
|
* and does the conversion.
|
||||||
|
*
|
||||||
|
* Fixme: we should probably also disallow overlong forms and
|
||||||
|
* invalid characters. But we don't do that currently.
|
||||||
|
*/
|
||||||
|
static int verify_utf8(struct strbuf *buf)
|
||||||
|
{
|
||||||
|
int ok = 1;
|
||||||
|
long pos = 0;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
int bad;
|
||||||
|
unsigned char c;
|
||||||
|
unsigned char replace[2];
|
||||||
|
|
||||||
|
bad = find_invalid_utf8(buf->buf + pos, buf->len - pos);
|
||||||
|
if (bad < 0)
|
||||||
|
return ok;
|
||||||
|
pos += bad;
|
||||||
|
ok = 0;
|
||||||
|
c = buf->buf[pos];
|
||||||
|
strbuf_remove(buf, pos, 1);
|
||||||
|
|
||||||
|
/* We know 'c' must be in the range 128-255 */
|
||||||
|
replace[0] = 0xc0 + (c >> 6);
|
||||||
|
replace[1] = 0x80 + (c & 0x3f);
|
||||||
|
strbuf_insert(buf, pos, replace, 2);
|
||||||
|
pos += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static const char commit_utf8_warn[] =
|
static const char commit_utf8_warn[] =
|
||||||
"Warning: commit message does not conform to UTF-8.\n"
|
"Warning: commit message did not conform to UTF-8.\n"
|
||||||
"You may want to amend it after fixing the message, or set the config\n"
|
"You may want to amend it after fixing the message, or set the config\n"
|
||||||
"variable i18n.commitencoding to the encoding your project uses.\n";
|
"variable i18n.commitencoding to the encoding your project uses.\n";
|
||||||
|
|
||||||
@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
|
|||||||
strbuf_addbuf(&buffer, msg);
|
strbuf_addbuf(&buffer, msg);
|
||||||
|
|
||||||
/* And check the encoding */
|
/* And check the encoding */
|
||||||
if (encoding_is_utf8 && !is_utf8(buffer.buf))
|
if (encoding_is_utf8 && !verify_utf8(&buffer))
|
||||||
fprintf(stderr, commit_utf8_warn);
|
fprintf(stderr, commit_utf8_warn);
|
||||||
|
|
||||||
if (sign_commit && do_sign_commit(&buffer, sign_commit))
|
if (sign_commit && do_sign_commit(&buffer, sign_commit))
|
||||||
|
Loading…
Reference in New Issue
Block a user