Merge branch 'lt/commit-tree-guess-utf-8'

Teach "git commit" and "git commit-tree" the "we are told to use
utf-8 in log message, but this does not look like utf-8---attempt to
pass it through convert-from-latin1-to-utf8 and see if it makes
sense" heuristics "git mailinfo" already uses.

* lt/commit-tree-guess-utf-8:
  commit/commit-tree: correct latin1 to utf-8
This commit is contained in:
Junio C Hamano 2012-09-07 11:08:38 -07:00
commit ae80b5a892
2 changed files with 88 additions and 28 deletions

View File

@ -481,36 +481,12 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
return out; return out;
} }
/*
 * Pick a source charset for a line whose charset was not declared.
 *
 * If the conversion target is UTF-8 and is_utf8() already accepts the
 * line (plain US-ASCII passes that test too), there is nothing to
 * convert and NULL is returned.
 *
 * In every other case the line is treated as Latin1, for historical
 * reasons, and "ISO8859-1" is returned.
 */
static const char *guess_charset(const struct strbuf *line, const char *target_charset)
{
	int already_utf8 = is_encoding_utf8(target_charset) && is_utf8(line->buf);

	return already_utf8 ? NULL : "ISO8859-1";
}
static void convert_to_utf8(struct strbuf *line, const char *charset) static void convert_to_utf8(struct strbuf *line, const char *charset)
{ {
char *out; char *out;
if (!charset || !*charset) { if (!charset || !*charset)
charset = guess_charset(line, metainfo_charset); return;
if (!charset)
return;
}
if (!strcasecmp(metainfo_charset, charset)) if (!strcasecmp(metainfo_charset, charset))
return; return;
out = reencode_string(line->buf, metainfo_charset, charset); out = reencode_string(line->buf, metainfo_charset, charset);

View File

@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
return result; return result;
} }
/*
 * Scan 'len' bytes starting at 'buf' and locate the first byte that
 * does not begin a well-formed UTF-8 sequence.
 *
 * Returns the zero-based offset of the offending lead byte, or -1 if
 * the whole range is well-formed.  A reported byte is always >= 0x80.
 *
 * A sequence is rejected when:
 *  - the lead byte is a stray continuation byte (10xxxxxx),
 *  - the lead byte announces more than 3 continuation bytes (such
 *    sequences can only encode values beyond U+10FFFF),
 *  - the buffer ends before the sequence is complete,
 *  - a continuation byte is not of the form 10xxxxxx,
 *  - the encoding is overlong (the value fits in a shorter sequence),
 *  - the value is a UTF-16 surrogate (U+D800..U+DFFF),
 *  - the value is greater than U+10FFFF.
 */
static int find_invalid_utf8(const char *buf, int len)
{
	/* Largest code point representable in a sequence of (index + 1) bytes. */
	static const unsigned int max_codepoint[] = {
		0x7f, 0x7ff, 0xffff, 0x10ffff
	};
	int offset = 0;

	/* "> 0" rather than "!= 0": a negative length must not run off the buffer. */
	while (len > 0) {
		unsigned char c = *buf++;
		int bytes, bad_offset;
		unsigned int codepoint, min_val, max_val;

		len--;
		offset++;

		/* Simple US-ASCII? No worries. */
		if (c < 0x80)
			continue;

		bad_offset = offset - 1;

		/*
		 * Count how many more high bits are set: that's how
		 * many more bytes this sequence should have.  The shift
		 * is truncated to 8 bits, so the loop always terminates.
		 */
		bytes = 0;
		while (c & 0x40) {
			c <<= 1;
			bytes++;
		}

		/*
		 * Must be between 1 and 3 more bytes.  Zero means a stray
		 * continuation byte; more than 3 would encode a value
		 * beyond U+10FFFF.
		 */
		if (bytes < 1 || bytes > 3)
			return bad_offset;

		/* Do we *have* that many bytes? */
		if (len < bytes)
			return bad_offset;

		/*
		 * After the shifts above, the payload bits of the lead byte
		 * sit 'bytes' positions too high, below the marker bit 0x80.
		 * Drop the marker and move them to the bottom.
		 */
		codepoint = (c & 0x7fu) >> bytes;
		min_val = max_codepoint[bytes - 1] + 1;
		max_val = max_codepoint[bytes];

		offset += bytes;
		len -= bytes;

		/* Verify the continuation bytes and accumulate their payload */
		do {
			if ((*buf & 0xc0) != 0x80)
				return bad_offset;
			codepoint = (codepoint << 6) | (unsigned int)(*buf & 0x3f);
			buf++;
		} while (--bytes);

		/* Overlong form, or a value beyond U+10FFFF */
		if (codepoint < min_val || codepoint > max_val)
			return bad_offset;

		/* Surrogates belong to UTF-16 and must not appear in UTF-8 */
		if (codepoint >= 0xd800 && codepoint <= 0xdfff)
			return bad_offset;
	}
	return -1;
}
/*
 * Make sure the buffer holds UTF-8, repairing it in place if needed.
 *
 * Each byte that find_invalid_utf8() reports is assumed to be a Latin1
 * character and is replaced by its two-byte UTF-8 encoding; scanning
 * then resumes right after the replacement.
 *
 * What counts as invalid is decided entirely by find_invalid_utf8();
 * no additional checks are made here.
 *
 * Returns 1 when the buffer was left untouched, 0 when at least one
 * byte had to be rewritten.
 */
static int verify_utf8(struct strbuf *buf)
{
	int clean = 1;
	long cursor = 0;
	int bad_at;

	while ((bad_at = find_invalid_utf8(buf->buf + cursor, buf->len - cursor)) >= 0) {
		unsigned char latin1;
		unsigned char utf8[2];

		clean = 0;
		cursor += bad_at;

		/* A reported byte is always in the range 128-255 */
		latin1 = buf->buf[cursor];
		utf8[0] = 0xc0 + (latin1 >> 6);
		utf8[1] = 0x80 + (latin1 & 0x3f);

		/* Swap the single offending byte for the two-byte sequence */
		strbuf_remove(buf, cursor, 1);
		strbuf_insert(buf, cursor, utf8, 2);
		cursor += 2;
	}
	return clean;
}
static const char commit_utf8_warn[] = static const char commit_utf8_warn[] =
"Warning: commit message does not conform to UTF-8.\n" "Warning: commit message did not conform to UTF-8.\n"
"You may want to amend it after fixing the message, or set the config\n" "You may want to amend it after fixing the message, or set the config\n"
"variable i18n.commitencoding to the encoding your project uses.\n"; "variable i18n.commitencoding to the encoding your project uses.\n";
@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
strbuf_addbuf(&buffer, msg); strbuf_addbuf(&buffer, msg);
/* And check the encoding */ /* And check the encoding */
if (encoding_is_utf8 && !is_utf8(buffer.buf)) if (encoding_is_utf8 && !verify_utf8(&buffer))
fprintf(stderr, commit_utf8_warn); fprintf(stderr, commit_utf8_warn);
if (sign_commit && do_sign_commit(&buffer, sign_commit)) if (sign_commit && do_sign_commit(&buffer, sign_commit))