Win32: add Unicode conversion functions
Add Unicode conversion functions to convert between the Windows-native UTF-16LE encoding and UTF-8. To support repositories with legacy-encoded file names, the UTF-8 to UTF-16 conversion function tries to create valid, unique file names even for invalid UTF-8 byte sequences, so that these repositories can be checked out without error. The current implementation leaves invalid UTF-8 bytes in range 0xa0 - 0xff as is (producing printable Unicode chars \u00a0 - \u00ff, equivalent to ISO-8859-1), and converts 0x80 - 0x9f to hex-code (\u0080 - \u009f are control chars). The Windows MultiByteToWideChar API was not used as it either drops invalid UTF-8 sequences (on Win2k/XP; producing non-unique or even empty file names) or converts them to the replacement char \ufffd (Vista/7; causing ERROR_INVALID_NAME in subsequent calls to file system APIs). Signed-off-by: Karsten Blees <blees@dcon.de> Signed-off-by: Stepan Kasal <kasal@ucw.cz> Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
parent
1edeb9abf5
commit
1c950a594c
@ -1848,6 +1848,91 @@ int mingw_offset_1st_component(const char *path)
|
||||
return offset + is_dir_sep(path[offset]);
|
||||
}
|
||||
|
||||
/*
 * Converts the UTF-8 string utfs to UTF-16, storing the result (always
 * \0-terminated) in wcs.
 *
 * Bytes that do not form a well-formed UTF-8 sequence are not rejected:
 * bytes 0xa0 - 0xff are copied 1:1 (\u00a0 - \u00ff), bytes 0x80 - 0x9f are
 * written as two lower-case hex digits.
 *
 * Parameters:
 *  wcs: wide char target buffer
 *  utfs: string to convert
 *  wcslen: size of target buffer in wide chars, including the final \0
 *  utflen: number of bytes to convert, or negative if utfs is \0-terminated
 *
 * Returns:
 *  number of wide chars written (excluding the final \0), or -1 on failure
 *
 * Errors:
 *  EINVAL: wcs or utfs is NULL, or wcslen is 0
 *  ERANGE: the output buffer is too small; wcs then holds the \0-terminated
 *          part of the result that was produced before running out of space
 */
int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen)
{
	static const char hex[] = "0123456789abcdef";
	const unsigned char *utf = (const unsigned char *) utfs;
	int upos = 0, wpos = 0;

	if (!utf || !wcs || wcslen < 1) {
		errno = EINVAL;
		return -1;
	}
	/* reserve space for \0 */
	wcslen--;
	/* INT_MAX doubles as the marker for "stop at the first \0 byte" */
	if (utflen < 0)
		utflen = INT_MAX;

	while (upos < utflen) {
		int c = utf[upos++] & 0xff;
		if (utflen == INT_MAX && c == 0)
			break;

		if ((size_t) wpos >= wcslen) {
			wcs[wpos] = 0;
			errno = ERANGE;
			return -1;
		}

		if (c < 0x80) {
			/* ASCII */
			wcs[wpos++] = c;
		} else if (c >= 0xc2 && c < 0xe0 && upos < utflen &&
				(utf[upos] & 0xc0) == 0x80) {
			/* 2-byte utf-8 (lead bytes 0xc0 / 0xc1 would be over-long) */
			c = ((c & 0x1f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xe0 && c < 0xf0 && upos + 1 < utflen &&
				!(c == 0xe0 && utf[upos] < 0xa0) && /* over-long encoding */
				(utf[upos] & 0xc0) == 0x80 &&
				(utf[upos + 1] & 0xc0) == 0x80) {
			/* 3-byte utf-8 */
			c = ((c & 0x0f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xf0 && c < 0xf5 && upos + 2 < utflen &&
				(size_t) wpos + 1 < wcslen &&
				!(c == 0xf0 && utf[upos] < 0x90) && /* over-long encoding */
				!(c == 0xf4 && utf[upos] >= 0x90) && /* > \u10ffff */
				(utf[upos] & 0xc0) == 0x80 &&
				(utf[upos + 1] & 0xc0) == 0x80 &&
				(utf[upos + 2] & 0xc0) == 0x80) {
			/* 4-byte utf-8: convert to \ud8xx \udcxx surrogate pair */
			c = ((c & 0x07) << 18);
			c |= ((utf[upos++] & 0x3f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			c -= 0x10000;
			wcs[wpos++] = 0xd800 | (c >> 10);
			wcs[wpos++] = 0xdc00 | (c & 0x3ff);
		} else if (c >= 0xa0) {
			/* invalid utf-8 byte, printable unicode char: convert 1:1 */
			wcs[wpos++] = c;
		} else {
			/*
			 * invalid utf-8 byte, non-printable unicode: convert to hex.
			 * Both digits must fit; emitting only the first one would
			 * silently yield a truncated, ambiguous name.
			 */
			if ((size_t) wpos + 1 >= wcslen) {
				wcs[wpos] = 0;
				errno = ERANGE;
				return -1;
			}
			wcs[wpos++] = hex[c >> 4];
			wcs[wpos++] = hex[c & 0x0f];
		}
	}
	wcs[wpos] = 0;
	return wpos;
}
|
||||
|
||||
int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen)
|
||||
{
|
||||
if (!wcs || !utf || utflen < 1) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
utflen = WideCharToMultiByte(CP_UTF8, 0, wcs, -1, utf, utflen, NULL, NULL);
|
||||
if (utflen)
|
||||
return utflen - 1;
|
||||
errno = ERANGE;
|
||||
return -1;
|
||||
}
|
||||
|
||||
void mingw_startup()
|
||||
{
|
||||
/* copy executable name to argv[0] */
|
||||
|
104
compat/mingw.h
104
compat/mingw.h
@ -357,6 +357,110 @@ void mingw_open_html(const char *path);
|
||||
char **make_augmented_environ(const char *const *vars);
|
||||
void free_environ(char **env);
|
||||
|
||||
/**
|
||||
* Converts UTF-8 encoded string to UTF-16LE.
|
||||
*
|
||||
* To support repositories with legacy-encoded file names, invalid UTF-8 bytes
|
||||
* 0xa0 - 0xff are converted to corresponding printable Unicode chars \u00a0 -
|
||||
* \u00ff, and invalid UTF-8 bytes 0x80 - 0x9f (which would make non-printable
|
||||
* Unicode) are converted to hex-code.
|
||||
*
|
||||
* Lead-bytes not followed by an appropriate number of trail-bytes, over-long
|
||||
* encodings and 4-byte encodings > \u10ffff are detected as invalid UTF-8.
|
||||
*
|
||||
* Maximum space requirement for the target buffer is two wide chars per UTF-8
|
||||
* char (((strlen(utf) * 2) + 1) [* sizeof(wchar_t)]).
|
||||
*
|
||||
* The maximum space is needed only if the entire input string consists of
|
||||
* invalid UTF-8 bytes in range 0x80-0x9f, as per the following table:
|
||||
*
|
||||
* | | UTF-8 | UTF-16 |
|
||||
* Code point | UTF-8 sequence | bytes | words | ratio
|
||||
* --------------+-------------------+-------+--------+-------
|
||||
* 000000-00007f | 0-7f | 1 | 1 | 1
|
||||
* 000080-0007ff | c2-df + 80-bf | 2 | 1 | 0.5
|
||||
* 000800-00ffff | e0-ef + 2 * 80-bf | 3 | 1 | 0.33
|
||||
* 010000-10ffff | f0-f4 + 3 * 80-bf | 4 | 2 (a) | 0.5
|
||||
* invalid | 80-9f | 1 | 2 (b) | 2
|
||||
* invalid | a0-ff | 1 | 1 | 1
|
||||
*
|
||||
* (a) encoded as UTF-16 surrogate pair
|
||||
* (b) encoded as two hex digits
|
||||
*
|
||||
* Note that, while the UTF-8 encoding scheme can be extended to 5-byte, 6-byte
|
||||
* or even indefinite-byte sequences, the largest valid code point \u10ffff
|
||||
* encodes as only 4 UTF-8 bytes.
|
||||
*
|
||||
* Parameters:
|
||||
* wcs: wide char target buffer
|
||||
* utf: string to convert
|
||||
* wcslen: size of target buffer (in wchar_t's)
|
||||
* utflen: size of string to convert, or -1 if 0-terminated
|
||||
*
|
||||
* Returns:
|
||||
* length of converted string (_wcslen(wcs)), or -1 on failure
|
||||
*
|
||||
* Errors:
|
||||
* EINVAL: one of the input parameters is invalid (e.g. NULL)
|
||||
* ERANGE: the output buffer is too small
|
||||
*/
|
||||
int xutftowcsn(wchar_t *wcs, const char *utf, size_t wcslen, int utflen);
|
||||
|
||||
/**
 * Convenience wrapper around xutftowcsn for \0-terminated input strings.
 *
 * Parameters:
 *  wcs: wide char target buffer
 *  utf: \0-terminated string to convert
 *  wcslen: size of target buffer (in wchar_t's)
 *
 * Returns whatever xutftowcsn returns for these arguments.
 */
static inline int xutftowcs(wchar_t *wcs, const char *utf, size_t wcslen)
{
	/* a length of -1 makes xutftowcsn treat the input as \0-terminated */
	const int until_nul = -1;

	return xutftowcsn(wcs, utf, wcslen, until_nul);
}
|
||||
|
||||
/**
|
||||
* Simplified file system specific variant of xutftowcsn, assumes output
|
||||
* buffer size is MAX_PATH wide chars and input string is \0-terminated,
|
||||
* fails with ENAMETOOLONG if input string is too long.
|
||||
*/
|
||||
static inline int xutftowcs_path(wchar_t *wcs, const char *utf)
|
||||
{
|
||||
int result = xutftowcsn(wcs, utf, MAX_PATH, -1);
|
||||
if (result < 0 && errno == ERANGE)
|
||||
errno = ENAMETOOLONG;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts UTF-16LE encoded string to UTF-8.
|
||||
*
|
||||
* Maximum space requirement for the target buffer is three UTF-8 chars per
|
||||
* wide char ((_wcslen(wcs) * 3) + 1).
|
||||
*
|
||||
* The maximum space is needed only if the entire input string consists of
|
||||
* UTF-16 words in range 0x0800-0xd7ff or 0xe000-0xffff (i.e. \u0800-\uffff
|
||||
* modulo surrogate pairs), as per the following table:
|
||||
*
|
||||
* | | UTF-16 | UTF-8 |
|
||||
* Code point | UTF-16 sequence | words | bytes | ratio
|
||||
* --------------+-----------------------+--------+-------+-------
|
||||
* 000000-00007f | 0000-007f | 1 | 1 | 1
|
||||
* 000080-0007ff | 0080-07ff | 1 | 2 | 2
|
||||
* 000800-00ffff | 0800-d7ff / e000-ffff | 1 | 3 | 3
|
||||
* 010000-10ffff | d800-dbff + dc00-dfff | 2 | 4 | 2
|
||||
*
|
||||
* Note that invalid code points > 10ffff cannot be represented in UTF-16.
|
||||
*
|
||||
* Parameters:
|
||||
* utf: target buffer
|
||||
* wcs: wide string to convert
|
||||
* utflen: size of target buffer
|
||||
*
|
||||
* Returns:
|
||||
* length of converted string, or -1 on failure
|
||||
*
|
||||
* Errors:
|
||||
* EINVAL: one of the input parameters is invalid (e.g. NULL)
|
||||
* ERANGE: the output buffer is too small
|
||||
*/
|
||||
int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen);
|
||||
|
||||
/*
|
||||
* A critical section used in the implementation of the spawn
|
||||
* functions (mingw_spawnv[p]e()) and waitpid(). Initialised in
|
||||
|
Loading…
Reference in New Issue
Block a user