Merge branch 'bc/utf16-portability-fix'
The code and tests assume that the system-supplied iconv() always writes a BOM in its output when asked to encode to UTF-16 (or UTF-32), but apparently some implementations output big-endian without a BOM. A compile-time knob has been added to help such systems (e.g. NonStop) add a BOM to the output to increase portability. * bc/utf16-portability-fix: utf8: handle systems that don't write BOM for UTF-16
This commit is contained in:
commit
18f9fb687f
7
Makefile
7
Makefile
@ -259,6 +259,10 @@ all::
|
||||
# Define OLD_ICONV if your library has an old iconv(), where the second
|
||||
# (input buffer pointer) parameter is declared with type (const char **).
|
||||
#
|
||||
# Define ICONV_OMITS_BOM if your iconv implementation does not write a
|
||||
# byte-order mark (BOM) when writing UTF-16 or UTF-32 and always writes in
|
||||
# big-endian format.
|
||||
#
|
||||
# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
|
||||
#
|
||||
# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
|
||||
@ -1417,6 +1421,9 @@ ifndef NO_ICONV
|
||||
EXTLIBS += $(ICONV_LINK) -liconv
|
||||
endif
|
||||
endif
|
||||
# When the builder defines ICONV_OMITS_BOM, pass it on to the C compiler as a
# preprocessor define so that code guarded by "#ifdef ICONV_OMITS_BOM" is built.
ifdef ICONV_OMITS_BOM
|
||||
BASIC_CFLAGS += -DICONV_OMITS_BOM
|
||||
endif
|
||||
ifdef NEEDS_LIBGEN
|
||||
EXTLIBS += -lgen
|
||||
endif
|
||||
|
@ -6,6 +6,30 @@ test_description='working-tree-encoding conversion via gitattributes'
|
||||
|
||||
GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING
|
||||
|
||||
# Prerequisite NO_UTF16_BOM holds when the iconv command emits exactly 6 bytes
# for "abc" in UTF-16, i.e. three 2-byte code units with no 2-byte BOM in front.
# The command substitution is left unquoted so that any padding printed by
# "wc -c" is removed by word splitting before the comparison.
test_lazy_prereq NO_UTF16_BOM '
|
||||
test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
|
||||
'
|
||||
|
||||
# Prerequisite NO_UTF32_BOM holds when the iconv command emits exactly 12 bytes
# for "abc" in UTF-32, i.e. three 4-byte code units with no 4-byte BOM in front.
# The command substitution is left unquoted so that any padding printed by
# "wc -c" is removed by word splitting before the comparison.
test_lazy_prereq NO_UTF32_BOM '
|
||||
test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
|
||||
'
|
||||
|
||||
# Convert UTF-8 on stdin to UTF-16 on stdout. When the NO_UTF16_BOM
# prerequisite holds (the iconv command does not emit a BOM itself), a
# big-endian BOM (bytes FE FF) is written in front of the iconv output.
write_utf16 () {
	if test_have_prereq NO_UTF16_BOM
	then
		# Octal escapes on purpose: POSIX printf only defines "\ddd";
		# "\x" hex escapes are printed literally by some shells (e.g. dash).
		printf '\376\377'
	fi &&
	iconv -f UTF-8 -t UTF-16
}
|
||||
|
||||
# Convert UTF-8 on stdin to UTF-32 on stdout. When the NO_UTF32_BOM
# prerequisite holds (the iconv command does not emit a BOM itself), a
# big-endian BOM (bytes 00 00 FE FF) is written in front of the iconv output.
write_utf32 () {
	if test_have_prereq NO_UTF32_BOM
	then
		# Octal escapes on purpose: POSIX printf only defines "\ddd";
		# "\x" hex escapes are printed literally by some shells (e.g. dash).
		printf '\0\0\376\377'
	fi &&
	iconv -f UTF-8 -t UTF-32
}
|
||||
|
||||
test_expect_success 'setup test files' '
|
||||
git config core.eol lf &&
|
||||
|
||||
@ -13,8 +37,8 @@ test_expect_success 'setup test files' '
|
||||
echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
|
||||
echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
|
||||
printf "$text" >test.utf8.raw &&
|
||||
printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
|
||||
printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
|
||||
printf "$text" | write_utf16 >test.utf16.raw &&
|
||||
printf "$text" | write_utf32 >test.utf32.raw &&
|
||||
printf "\377\376" >test.utf16lebom.raw &&
|
||||
# The BOM written just above (FF FE) is the UTF-16LE one, so the payload
# appended to this fixture has to be UTF-16LE as well, not UTF-32LE.
printf "$text" | iconv -f UTF-8 -t UTF-16LE >>test.utf16lebom.raw &&
|
||||
|
||||
@ -124,8 +148,8 @@ do
|
||||
test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
|
||||
test_when_finished "git reset --hard HEAD^" &&
|
||||
|
||||
cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw &&
|
||||
cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw &&
|
||||
cat lf.utf8.raw | write_utf${i} >lf.utf${i}.raw &&
|
||||
cat crlf.utf8.raw | write_utf${i} >crlf.utf${i}.raw &&
|
||||
cp crlf.utf${i}.raw eol.utf${i} &&
|
||||
|
||||
cat >expectIndexLF <<-EOF &&
|
||||
@ -223,7 +247,7 @@ test_expect_success ICONV_SHIFT_JIS 'check roundtrip encoding' '
|
||||
|
||||
text="hallo there!\nroundtrip test here!" &&
|
||||
printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
|
||||
printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
|
||||
printf "$text" | write_utf16 >roundtrip.utf16 &&
|
||||
echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
|
||||
|
||||
# SHIFT-JIS encoded files are round-trip checked by default...
|
||||
|
14
utf8.c
14
utf8.c
@ -559,6 +559,10 @@ char *reencode_string_len(const char *in, size_t insz,
|
||||
/*
|
||||
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM".
|
||||
* Some users under Windows want the little-endian version.
|
||||
*
|
||||
* We handle UTF-16 and UTF-32 ourselves only if the platform does not
|
||||
* provide a BOM (which we require), since we want to match the behavior
|
||||
* of the system tools and libc as much as possible.
|
||||
*/
|
||||
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
|
||||
bom_str = utf16_le_bom;
|
||||
@ -568,6 +572,16 @@ char *reencode_string_len(const char *in, size_t insz,
|
||||
bom_str = utf16_be_bom;
|
||||
bom_len = sizeof(utf16_be_bom);
|
||||
out_encoding = "UTF-16BE";
|
||||
#ifdef ICONV_OMITS_BOM
|
||||
} else if (same_utf_encoding("UTF-16", out_encoding)) {
|
||||
bom_str = utf16_be_bom;
|
||||
bom_len = sizeof(utf16_be_bom);
|
||||
out_encoding = "UTF-16BE";
|
||||
} else if (same_utf_encoding("UTF-32", out_encoding)) {
|
||||
bom_str = utf32_be_bom;
|
||||
bom_len = sizeof(utf32_be_bom);
|
||||
out_encoding = "UTF-32BE";
|
||||
#endif
|
||||
}
|
||||
|
||||
conv = iconv_open(out_encoding, in_encoding);
|
||||
|
Loading…
Reference in New Issue
Block a user