From: HAT Date: Wed, 10 Aug 2011 14:39:51 +0000 (+0900) Subject: case-conversion of surrogate pair X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?p=netatalk.git;a=commitdiff_plain;h=a920a0e006be3662242e4128090f91c427484f10 case-conversion of surrogate pair --- diff --git a/NEWS b/NEWS index c76b347b..081a34d3 100644 --- a/NEWS +++ b/NEWS @@ -28,6 +28,7 @@ Changes in 2.2.1 * FIX: suse: initscript return better status * FIX: Sourcecode distribution: add missing headers * FIX: Solaris 10: missing dirfd replacement function +* FIX: case-conversion of surrogate pair Changes in 2.2 ============== diff --git a/libatalk/unicode/util_unistr.c b/libatalk/unicode/util_unistr.c index 070ca93b..1e9ba7d4 100644 --- a/libatalk/unicode/util_unistr.c +++ b/libatalk/unicode/util_unistr.c @@ -1,3 +1,13 @@ +/******************************************************************* + NOTE: + Early netatalk 2.x was based on UCS-2. + UCS-2 doesn't support chars above U+FFFF. + Recent netatalk is based on UTF-16. + UTF-16 can support chars above U+FFFF, using Surrogate Pairs. + However, Surrogate Pairs are complex, dirty, filthy and disagreeable. + There might still be latent bugs... +********************************************************************/ + #ifdef HAVE_CONFIG_H #include "config.h" #endif /* HAVE_CONFIG_H */ @@ -20,14 +30,30 @@ Convert a string to lower case. 
return True if any char is converted ********************************************************************/ +/* surrogate pair support: a hi surrogate (0xD800-0xDBFF) followed by a lo surrogate (0xDC00-0xDFFF) is lower-cased as one 32-bit value (hi << 16 | lo) via tolower_sp(); a hi surrogate without a following lo surrogate is left unchanged; every unit that is not a hi surrogate goes through tolower_w() (including the lo half of a pair that tolower_sp() left unchanged) */ + int strlower_w(ucs2_t *s) { int ret = 0; + while (*s) { - ucs2_t v = tolower_w(*s); - if (v != *s) { - *s = v; - ret = 1; + if ((0xD800 <= *s) && (*s < 0xDC00)) { + if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) { + u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1]; + u_int32_t v_sp = tolower_sp(s_sp); + if (v_sp != s_sp) { + *s = v_sp >> 16; + s++; + *s = v_sp & 0xFFFF; + ret = 1; + } + } + } else { + ucs2_t v = tolower_w(*s); + if (v != *s) { + *s = v; + ret = 1; + } } s++; } @@ -38,41 +64,74 @@ int strlower_w(ucs2_t *s) Convert a string to upper case. return True if any char is converted ********************************************************************/ +/* surrogate pair support: a hi surrogate (0xD800-0xDBFF) followed by a lo surrogate (0xDC00-0xDFFF) is upper-cased as one 32-bit value (hi << 16 | lo) via toupper_sp(); a hi surrogate without a following lo surrogate is left unchanged; every unit that is not a hi surrogate goes through toupper_w() (including the lo half of a pair that toupper_sp() left unchanged) */ + int strupper_w(ucs2_t *s) { int ret = 0; + while (*s) { - ucs2_t v = toupper_w(*s); - if (v != *s) { - *s = v; - ret = 1; + if ((0xD800 <= *s) && (*s < 0xDC00)) { + if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) { + u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1]; + u_int32_t v_sp = toupper_sp(s_sp); + if (v_sp != s_sp) { + *s = v_sp >> 16; + s++; + *s = v_sp & 0xFFFF; + ret = 1; + } + } + } else { + ucs2_t v = toupper_w(*s); + if (v != *s) { + *s = v; + ret = 1; + } } s++; } return ret; } - /******************************************************************* +wide & sp islower() determine if a character is lowercase ********************************************************************/ +/* These functions are not used. NOTE: they report true for any value that tolower leaves unchanged, caseless characters included. */ + int islower_w(ucs2_t c) { return ( c == tolower_w(c)); } +int islower_sp(u_int32_t c_sp) +{ + return ( c_sp == tolower_sp(c_sp)); +} + /******************************************************************* +wide & sp isupper() determine if a character is uppercase ********************************************************************/ +/* These functions are not used. NOTE: they report true for any value that toupper leaves unchanged, caseless characters included. 
*/ + int isupper_w(ucs2_t c) { return ( c == toupper_w(c)); } +int isupper_sp(u_int32_t c_sp) +{ + return ( c_sp == toupper_sp(c_sp)); +} /******************************************************************* - Count the number of characters in a ucs2_t string. +wide strlen() + Count the number of ucs2_t units in a UTF-16 string. ********************************************************************/ +/* NOTE: one surrogate pair counts as two. */ + size_t strlen_w(const ucs2_t *src) { size_t len; @@ -83,8 +142,11 @@ size_t strlen_w(const ucs2_t *src) } /******************************************************************* - Count up to max number of characters in a ucs2_t string. +wide strnlen() + Count up to max number of ucs2_t units in a UTF-16 string. ********************************************************************/ +/* NOTE: one surrogate pair counts as two. */ + size_t strnlen_w(const ucs2_t *src, size_t max) { size_t len; @@ -97,6 +159,8 @@ size_t strnlen_w(const ucs2_t *src, size_t max) /******************************************************************* wide strchr() ********************************************************************/ +/* NOTE: the hi and lo halves of a surrogate pair are processed as separate units. 
*/ + ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c) { while (*s != 0) { @@ -108,6 +172,11 @@ ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c) return NULL; } +/******************************************************************* +wide & sp strcasechr() +********************************************************************/ +/* NOTE: BMP characters and surrogate pairs are handled by separate functions: strcasechr_w() takes a single ucs2_t unit; strcasechr_sp() takes a 32-bit (hi << 16 | lo) value, compares it via toupper_sp() against each adjacent unit pair (s[0], s[1]) and returns a pointer to the first unit of the match, or NULL */ + ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c) { while (*s != 0) { @@ -120,6 +189,21 @@ ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c) return NULL; } +ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp) +{ + if (*s == 0) return NULL; + while (s[1] != 0) { + if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s; + s++; + } + + return NULL; +} + +/******************************************************************* +wide strcmp() +********************************************************************/ +/* surrogate pairs need no special handling here */ int strcmp_w(const ucs2_t *a, const ucs2_t *b) { @@ -130,6 +214,11 @@ int strcmp_w(const ucs2_t *a, const ucs2_t *b) string is longer */ } +/******************************************************************* +wide strncmp() +********************************************************************/ +/* surrogate pairs need no special handling here */ + int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) { size_t n = 0; @@ -140,6 +229,8 @@ int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) /******************************************************************* wide strstr() ********************************************************************/ +/* surrogate pairs need no special handling here */ + ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins) { ucs2_t *r; @@ -156,6 +247,11 @@ ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins) return NULL; } +/******************************************************************* +wide strcasestr() +********************************************************************/ +/* NOTE(review): no surrogate pair remark was written for strcasestr_w(); how it treats pairs is not visible in this hunk, TODO confirm */ + ucs2_t *strcasestr_w(const ucs2_t *s, 
const ucs2_t *ins) { ucs2_t *r; @@ -172,32 +268,66 @@ ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins) return NULL; } - - - /******************************************************************* +wide strcasecmp() case insensitive string comparison ********************************************************************/ +/* surrogate pair support: when *a is a hi surrogate (0xD800-0xDBFF), a[0..1] and b[0..1] are each combined as (hi << 16 | lo) and compared via tolower_sp(); otherwise single units are compared via tolower_w(). NOTE(review): only *a is range-checked, *b and a[1] are combined unchecked; ret is an int receiving the difference of two tolower_sp() results (presumably u_int32_t), so its sign may not reflect ordering, TODO confirm */ + int strcasecmp_w(const ucs2_t *a, const ucs2_t *b) { - while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; } + int ret; + + while (*a && *b) { + if ((0xD800 <= *a) && (*a < 0xDC00)) { + if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret; + a++; + b++; + if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* pair cut short by a terminator: stop here to avoid a buffer overrun */ + } else { + if (ret = tolower_w(*a) - tolower_w(*b)) return ret; + } + a++; + b++; + } return (tolower_w(*a) - tolower_w(*b)); } /******************************************************************* -case insensitive string comparison, lenght limited +wide strncasecmp() +case insensitive string comparison, length limited ********************************************************************/ +/* NOTE: compares up to 'len+1' units if 'len' would split a surrogate pair. NOTE(review): in that case the early return compares the two lo surrogates via tolower_w(), which looks like it can report a mismatch for pairs that differ only in case; as in strcasecmp_w() only *a is range-checked and ret is an int, TODO confirm */ + int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) { size_t n = 0; - while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; } + int ret; + + while ((n < len) && *a && *b) { + if ((0xD800 <= *a) && (*a < 0xDC00)) { + if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret; + a++; + b++; + n++; + if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b)); + } else { + if (ret = tolower_w(*a) - tolower_w(*b)) return ret; + } + a++; + b++; + n++; + } return (len - n)?(tolower_w(*a) - tolower_w(*b)):0; } /******************************************************************* +wide strndup() duplicate string 
********************************************************************/ +/* NOTE: does not check whether truncation at 'len' isolates half of a surrogate pair */ /* if len == 0 then duplicate the whole string */ + ucs2_t *strndup_w(const ucs2_t *src, size_t len) { ucs2_t *dest; @@ -215,6 +345,12 @@ ucs2_t *strndup_w(const ucs2_t *src, size_t len) return dest; } +/******************************************************************* +wide strdup() +duplicate string +********************************************************************/ +/* no surrogate pair problem: len == 0 asks strndup_w() for the whole string */ + ucs2_t *strdup_w(const ucs2_t *src) { return strndup_w(src, 0);