X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=libatalk%2Funicode%2Futil_unistr.c;h=c163d6f6e3f6366d75d7503ba64787dd3356e255;hb=654f86c6e05414423af719716ad028f7af7a65f5;hp=9b018baa97217b0eddd15f0947c0d40691051a45;hpb=80fe6e63eb8ec9a396b4383cb8210fd5ded6ad35;p=netatalk.git diff --git a/libatalk/unicode/util_unistr.c b/libatalk/unicode/util_unistr.c index 9b018baa..c163d6f6 100644 --- a/libatalk/unicode/util_unistr.c +++ b/libatalk/unicode/util_unistr.c @@ -1,3 +1,13 @@ +/******************************************************************* + NOTE: + The early netatalk 2.x was based on UCS-2. + UCS-2 don't support chars above U+10000. + Recent netatalk is based on UTF-16. + UTF-16 can support chars above U+10000, using Surrogate Pair. + However, Surrogate Pair is complex, dirty, filthy and disagreeable. + There might still be latent bugs... +********************************************************************/ + #ifdef HAVE_CONFIG_H #include "config.h" #endif /* HAVE_CONFIG_H */ @@ -12,21 +22,38 @@ #include #include +#include + #include "precompose.h" -#include "byteorder.h" /******************************************************************* Convert a string to lower case. return True if any char is converted ********************************************************************/ +/* surrogate pair support */ + int strlower_w(ucs2_t *s) { int ret = 0; + while (*s) { - ucs2_t v = tolower_w(*s); - if (v != *s) { - *s = v; - ret = 1; + if ((0xD800 <= *s) && (*s < 0xDC00)) { + if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) { + uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1]; + uint32_t v_sp = tolower_sp(s_sp); + if (v_sp != s_sp) { + *s = v_sp >> 16; + s++; + *s = v_sp & 0xFFFF; + ret = 1; + } + } + } else { + ucs2_t v = tolower_w(*s); + if (v != *s) { + *s = v; + ret = 1; + } } s++; } @@ -37,41 +64,74 @@ int strlower_w(ucs2_t *s) Convert a string to upper case. return True if any char is converted ********************************************************************/ +/* surrogate pair support */ + int strupper_w(ucs2_t *s) { int ret = 0; + while (*s) { - ucs2_t v = toupper_w(*s); - if (v != *s) { - *s = v; - ret = 1; + if ((0xD800 <= *s) && (*s < 0xDC00)) { + if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) { + uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1]; + uint32_t v_sp = toupper_sp(s_sp); + if (v_sp != s_sp) { + *s = v_sp >> 16; + s++; + *s = v_sp & 0xFFFF; + ret = 1; + } + } + } else { + ucs2_t v = toupper_w(*s); + if (v != *s) { + *s = v; + ret = 1; + } } s++; } return ret; } - /******************************************************************* +wide & sp islower() determine if a character is lowercase ********************************************************************/ +/* These functions are not used. */ + int islower_w(ucs2_t c) { return ( c == tolower_w(c)); } +int islower_sp(uint32_t c_sp) +{ + return ( c_sp == tolower_sp(c_sp)); +} + /******************************************************************* +wide & sp isupper() determine if a character is uppercase ********************************************************************/ +/* These functions are not used. */ + int isupper_w(ucs2_t c) { return ( c == toupper_w(c)); } +int isupper_sp(uint32_t c_sp) +{ + return ( c_sp == toupper_sp(c_sp)); +} /******************************************************************* - Count the number of characters in a ucs2_t string. +wide strlen() + Count the number of characters in a UTF-16 string. ********************************************************************/ +/* NOTE: one surrogate pair is two characters. */ + size_t strlen_w(const ucs2_t *src) { size_t len; @@ -82,8 +142,11 @@ size_t strlen_w(const ucs2_t *src) } /******************************************************************* - Count up to max number of characters in a ucs2_t string. +wide strnlen() + Count up to max number of characters in a UTF-16 string. ********************************************************************/ +/* NOTE: one surrogate pair is two characters. */ + size_t strnlen_w(const ucs2_t *src, size_t max) { size_t len; @@ -96,6 +159,8 @@ size_t strnlen_w(const ucs2_t *src, size_t max) /******************************************************************* wide strchr() ********************************************************************/ +/* NOTE: hi and lo of surrogate pair are separately processed. */ + ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c) { while (*s != 0) { @@ -107,11 +172,15 @@ ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c) return NULL; } +/******************************************************************* +wide & sp strcasechr() +********************************************************************/ +/* NOTE: separately process BMP and surrogate pair */ + ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c) { while (*s != 0) { -/* LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/ - if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s; + if (tolower_w(c) == tolower_w(*s)) return (ucs2_t *)s; s++; } if (c == *s) return (ucs2_t *)s; @@ -119,6 +188,21 @@ ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c) return NULL; } +ucs2_t *strcasechr_sp(const ucs2_t *s, uint32_t c_sp) +{ + if (*s == 0) return NULL; + while (s[1] != 0) { + if (tolower_sp(c_sp) == tolower_sp((uint32_t)*s << 16 | (uint32_t)s[1])) return (ucs2_t *)s; + s++; + } + + return NULL; +} + +/******************************************************************* +wide strcmp() +********************************************************************/ +/* no problem of surrogate pair */ int strcmp_w(const ucs2_t *a, const ucs2_t *b) { @@ -129,6 +213,11 @@ int strcmp_w(const ucs2_t *a, const ucs2_t *b) string is longer */ } +/******************************************************************* +wide strncmp() +********************************************************************/ +/* no problem of surrogate pair */ + int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) { size_t n = 0; @@ -139,6 +228,8 @@ int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) /******************************************************************* wide strstr() ********************************************************************/ +/* no problem of surrogate pair */ + ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins) { ucs2_t *r; @@ -155,6 +246,11 @@ ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins) return NULL; } +/******************************************************************* +wide strcasestr() +********************************************************************/ +/* surrogate pair support */ + ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins) { ucs2_t *r; @@ -164,39 +260,86 @@ ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins) slen = strlen_w(s); inslen = strlen_w(ins); r = (ucs2_t *)s; - while ((r = strcasechr_w(r, *ins))) { - if (strncasecmp_w(r, ins, inslen) == 0) return r; - r++; + + if ((0xD800 <= *ins) && (*ins < 0xDC00)) { + if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) { + uint32_t ins_sp = (uint32_t)*ins << 16 | (uint32_t)ins[1]; + while ((r = strcasechr_sp(r, ins_sp))) { + if (strncasecmp_w(r, ins, inslen) == 0) return r; + r++; + } + } else { + return NULL; /* illegal sequence */ + } + } else { + while ((r = strcasechr_w(r, *ins))) { + if (strncasecmp_w(r, ins, inslen) == 0) return r; + r++; + } } return NULL; } - - - /******************************************************************* +wide strcasecmp() case insensitive string comparison ********************************************************************/ +/* surrogate pair support */ + int strcasecmp_w(const ucs2_t *a, const ucs2_t *b) { - while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; } + int ret; + + while (*a && *b) { + if ((0xD800 <= *a) && (*a < 0xDC00)) { + if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret; + a++; + b++; + if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */ + } else { + if (ret = tolower_w(*a) - tolower_w(*b)) return ret; + } + a++; + b++; + } return (tolower_w(*a) - tolower_w(*b)); } /******************************************************************* -case insensitive string comparison, lenght limited +wide strncasecmp() +case insensitive string comparison, length limited ********************************************************************/ +/* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair */ + int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) { size_t n = 0; - while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; } + int ret; + + while ((n < len) && *a && *b) { + if ((0xD800 <= *a) && (*a < 0xDC00)) { + if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret; + a++; + b++; + n++; + if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b)); + } else { + if (ret = tolower_w(*a) - tolower_w(*b)) return ret; + } + a++; + b++; + n++; + } return (len - n)?(tolower_w(*a) - tolower_w(*b)):0; } /******************************************************************* +wide strndup() duplicate string ********************************************************************/ +/* NOTE: not check isolation of surrogate pair */ /* if len == 0 then duplicate the whole string */ + ucs2_t *strndup_w(const ucs2_t *src, size_t len) { ucs2_t *dest; @@ -214,6 +357,12 @@ ucs2_t *strndup_w(const ucs2_t *src, size_t len) return dest; } +/******************************************************************* +wide strdup() +duplicate string +********************************************************************/ +/* no problem of surrogate pair */ + ucs2_t *strdup_w(const ucs2_t *src) { return strndup_w(src, 0); @@ -222,6 +371,8 @@ ucs2_t *strdup_w(const ucs2_t *src) /******************************************************************* copy a string with max len ********************************************************************/ +/* This function is not used. */ +/* NOTE: not check isolation of surrogate pair */ ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max) { @@ -241,7 +392,9 @@ ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max) /******************************************************************* append a string of len bytes and add a terminator ********************************************************************/ +/* These functions are not used. */ +/* NOTE: not check isolation of surrogate pair */ ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max) { size_t start; @@ -258,7 +411,7 @@ ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max) return dest; } - +/* no problem of surrogate pair */ ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src) { size_t start; @@ -416,8 +569,6 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) base = *in; while (*outlen > 2) { i += 2; - in++; - if (i == inplen) { *out = base; out++; @@ -425,7 +576,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) *outlen -= 2; return o_len - *outlen; } - + in++; comb = *in; result = 0; @@ -453,7 +604,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) /* Binary Search for Surrogate Pair */ else if ((0xD800 <= base) && (base < 0xDC00)) { - if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) { + if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) { base_sp = ((uint32_t)base << 16) | (uint32_t)comb; do { comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2]; @@ -462,7 +613,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) i += 4; in +=2; } - } while ((i + 4 <= inplen) && result_sp) ; + } while ((i + 6 <= inplen) && result_sp) ; *out = base_sp >> 16; out++; @@ -478,6 +629,11 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) *outlen -= 2; i += 2; + if (i == inplen) { + out++; + *out = 0; + return o_len - *outlen; + } in++; base = *in;