utf8: add function to detect a missing UTF-16/32 BOM
If the endianness is not defined in the encoding name, then let's be strict and require a BOM to avoid any encoding confusion. The is_missing_required_utf_bom() function returns true if a required BOM is missing. The Unicode standard instructs to assume big-endian if there is no BOM for UTF-16/32 [1][2]. However, the W3C/WHATWG encoding standard used in HTML5 recommends to assume little-endian to "deal with deployed content" [3]. Strictly requiring a BOM seems to be the safest option for content in Git. This function is used in a subsequent commit. [1] http://unicode.org/faq/utf_bom.html#gen6 [2] http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf Section 3.10, D98, page 132 [3] https://encoding.spec.whatwg.org/#utf-16le Signed-off-by: Lars Schneider <larsxschneider@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
parent
10ecb82e4f
commit
c6e48652f6
13
utf8.c
13
utf8.c
@ -586,6 +586,19 @@ int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
|
||||
);
|
||||
}
|
||||
|
||||
int is_missing_required_utf_bom(const char *enc, const char *data, size_t len)
|
||||
{
|
||||
return (
|
||||
(same_utf_encoding(enc, "UTF-16")) &&
|
||||
!(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) ||
|
||||
has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom)))
|
||||
) || (
|
||||
(same_utf_encoding(enc, "UTF-32")) &&
|
||||
!(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) ||
|
||||
has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom)))
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns first character length in bytes for multi-byte `text` according to
|
||||
* `encoding`.
|
||||
|
19
utf8.h
19
utf8.h
@ -79,4 +79,23 @@ void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int wid
|
||||
*/
|
||||
int has_prohibited_utf_bom(const char *enc, const char *data, size_t len);
|
||||
|
||||
/*
|
||||
* If the endianness is not defined in the encoding name, then we
|
||||
* require a BOM. The function returns true if a required BOM is missing.
|
||||
*
|
||||
* The Unicode standard instructs to assume big-endian if there is no
|
||||
* BOM for UTF-16/32 [1][2]. However, the W3C/WHATWG encoding standard
|
||||
* used in HTML5 recommends to assume little-endian to "deal with
|
||||
* deployed content" [3].
|
||||
*
|
||||
* Therefore, strictly requiring a BOM seems to be the safest option for
|
||||
* content in Git.
|
||||
*
|
||||
* [1] http://unicode.org/faq/utf_bom.html#gen6
|
||||
* [2] http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
|
||||
* Section 3.10, D98, page 132
|
||||
* [3] https://encoding.spec.whatwg.org/#utf-16le
|
||||
*/
|
||||
int is_missing_required_utf_bom(const char *enc, const char *data, size_t len);
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user