X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=libatalk%2Funicode%2Futil_unistr.c;h=e5ea4ba0fcea2cd8def574b835432305a0024c50;hb=7be2a68aa70fa5641e944ad0aed6f699736d3de9;hp=070ca93b1a99aedb24dc5fc510d302c26fc40fe3;hpb=06b7f576daa4a127964febf210869f4729235dc3;p=netatalk.git diff --git a/libatalk/unicode/util_unistr.c b/libatalk/unicode/util_unistr.c index 070ca93b..e5ea4ba0 100644 --- a/libatalk/unicode/util_unistr.c +++ b/libatalk/unicode/util_unistr.c @@ -1,3 +1,13 @@ +/******************************************************************* + NOTE: + The early netatalk 2.x was based on UCS-2. + UCS-2 don't support chars above U+10000. + Recent netatalk is based on UTF-16. + UTF-16 can support chars above U+10000, using Surrogate Pair. + However, Surrogate Pair is complex, dirty, filthy and disagreeable. + There might still be latent bugs... +********************************************************************/ + #ifdef HAVE_CONFIG_H #include "config.h" #endif /* HAVE_CONFIG_H */ @@ -9,8 +19,7 @@ #include #include #include - -#include +#include #include #include "precompose.h" @@ -20,14 +29,30 @@ Convert a string to lower case. return True if any char is converted ********************************************************************/ +/* surrogate pair support */ + int strlower_w(ucs2_t *s) { int ret = 0; + while (*s) { - ucs2_t v = tolower_w(*s); - if (v != *s) { - *s = v; - ret = 1; + if ((0xD800 <= *s) && (*s < 0xDC00)) { + if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) { + uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1]; + uint32_t v_sp = tolower_sp(s_sp); + if (v_sp != s_sp) { + *s = v_sp >> 16; + s++; + *s = v_sp & 0xFFFF; + ret = 1; + } + } + } else { + ucs2_t v = tolower_w(*s); + if (v != *s) { + *s = v; + ret = 1; + } } s++; } @@ -38,41 +63,74 @@ int strlower_w(ucs2_t *s) Convert a string to upper case. return True if any char is converted ********************************************************************/ +/* surrogate pair support */ + int strupper_w(ucs2_t *s) { int ret = 0; + while (*s) { - ucs2_t v = toupper_w(*s); - if (v != *s) { - *s = v; - ret = 1; + if ((0xD800 <= *s) && (*s < 0xDC00)) { + if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) { + uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1]; + uint32_t v_sp = toupper_sp(s_sp); + if (v_sp != s_sp) { + *s = v_sp >> 16; + s++; + *s = v_sp & 0xFFFF; + ret = 1; + } + } + } else { + ucs2_t v = toupper_w(*s); + if (v != *s) { + *s = v; + ret = 1; + } } s++; } return ret; } - /******************************************************************* +wide & sp islower() determine if a character is lowercase ********************************************************************/ +/* These functions are not used. */ + int islower_w(ucs2_t c) { return ( c == tolower_w(c)); } +int islower_sp(uint32_t c_sp) +{ + return ( c_sp == tolower_sp(c_sp)); +} + /******************************************************************* +wide & sp isupper() determine if a character is uppercase ********************************************************************/ +/* These functions are not used. */ + int isupper_w(ucs2_t c) { return ( c == toupper_w(c)); } +int isupper_sp(uint32_t c_sp) +{ + return ( c_sp == toupper_sp(c_sp)); +} /******************************************************************* - Count the number of characters in a ucs2_t string. +wide strlen() + Count the number of characters in a UTF-16 string. ********************************************************************/ +/* NOTE: one surrogate pair is two characters. */ + size_t strlen_w(const ucs2_t *src) { size_t len; @@ -83,8 +141,11 @@ size_t strlen_w(const ucs2_t *src) } /******************************************************************* - Count up to max number of characters in a ucs2_t string. +wide strnlen() + Count up to max number of characters in a UTF-16 string. ********************************************************************/ +/* NOTE: one surrogate pair is two characters. */ + size_t strnlen_w(const ucs2_t *src, size_t max) { size_t len; @@ -97,6 +158,8 @@ size_t strnlen_w(const ucs2_t *src, size_t max) /******************************************************************* wide strchr() ********************************************************************/ +/* NOTE: hi and lo of surrogate pair are separately processed. */ + ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c) { while (*s != 0) { @@ -108,11 +171,15 @@ ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c) return NULL; } +/******************************************************************* +wide & sp strcasechr() +********************************************************************/ +/* NOTE: separately process BMP and surrogate pair */ + ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c) { while (*s != 0) { -/* LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/ - if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s; + if (tolower_w(c) == tolower_w(*s)) return (ucs2_t *)s; s++; } if (c == *s) return (ucs2_t *)s; @@ -120,6 +187,21 @@ ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c) return NULL; } +ucs2_t *strcasechr_sp(const ucs2_t *s, uint32_t c_sp) +{ + if (*s == 0) return NULL; + while (s[1] != 0) { + if (tolower_sp(c_sp) == tolower_sp((uint32_t)*s << 16 | (uint32_t)s[1])) return (ucs2_t *)s; + s++; + } + + return NULL; +} + +/******************************************************************* +wide strcmp() +********************************************************************/ +/* no problem of surrogate pair */ int strcmp_w(const ucs2_t *a, const ucs2_t *b) { @@ -130,6 +212,11 @@ int strcmp_w(const ucs2_t *a, const ucs2_t *b) string is longer */ } +/******************************************************************* +wide strncmp() +********************************************************************/ +/* no problem of surrogate pair */ + int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) { size_t n = 0; @@ -140,6 +227,8 @@ int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) /******************************************************************* wide strstr() ********************************************************************/ +/* no problem of surrogate pair */ + ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins) { ucs2_t *r; @@ -156,6 +245,11 @@ ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins) return NULL; } +/******************************************************************* +wide strcasestr() +********************************************************************/ +/* surrogate pair support */ + ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins) { ucs2_t *r; @@ -165,39 +259,86 @@ ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins) slen = strlen_w(s); inslen = strlen_w(ins); r = (ucs2_t *)s; - while ((r = strcasechr_w(r, *ins))) { - if (strncasecmp_w(r, ins, inslen) == 0) return r; - r++; + + if ((0xD800 <= *ins) && (*ins < 0xDC00)) { + if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) { + u_int32_t ins_sp = (u_int32_t)*ins << 16 | (u_int32_t)ins[1]; + while ((r = strcasechr_sp(r, ins_sp))) { + if (strncasecmp_w(r, ins, inslen) == 0) return r; + r++; + } + } else { + return NULL; /* illegal sequence */ + } + } else { + while ((r = strcasechr_w(r, *ins))) { + if (strncasecmp_w(r, ins, inslen) == 0) return r; + r++; + } } return NULL; } - - - /******************************************************************* +wide strcasecmp() case insensitive string comparison ********************************************************************/ +/* surrogate pair support */ + int strcasecmp_w(const ucs2_t *a, const ucs2_t *b) { - while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; } + int ret; + + while (*a && *b) { + if ((0xD800 <= *a) && (*a < 0xDC00)) { + if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret; + a++; + b++; + if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */ + } else { + if (ret = tolower_w(*a) - tolower_w(*b)) return ret; + } + a++; + b++; + } return (tolower_w(*a) - tolower_w(*b)); } /******************************************************************* -case insensitive string comparison, lenght limited +wide strncasecmp() +case insensitive string comparison, length limited ********************************************************************/ +/* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair */ + int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len) { size_t n = 0; - while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; } + int ret; + + while ((n < len) && *a && *b) { + if ((0xD800 <= *a) && (*a < 0xDC00)) { + if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret; + a++; + b++; + n++; + if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b)); + } else { + if (ret = tolower_w(*a) - tolower_w(*b)) return ret; + } + a++; + b++; + n++; + } return (len - n)?(tolower_w(*a) - tolower_w(*b)):0; } /******************************************************************* +wide strndup() duplicate string ********************************************************************/ +/* NOTE: not check isolation of surrogate pair */ /* if len == 0 then duplicate the whole string */ + ucs2_t *strndup_w(const ucs2_t *src, size_t len) { ucs2_t *dest; @@ -215,6 +356,12 @@ ucs2_t *strndup_w(const ucs2_t *src, size_t len) return dest; } +/******************************************************************* +wide strdup() +duplicate string +********************************************************************/ +/* no problem of surrogate pair */ + ucs2_t *strdup_w(const ucs2_t *src) { return strndup_w(src, 0); @@ -223,6 +370,8 @@ ucs2_t *strdup_w(const ucs2_t *src) /******************************************************************* copy a string with max len ********************************************************************/ +/* This function is not used. */ +/* NOTE: not check isolation of surrogate pair */ ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max) { @@ -242,7 +391,9 @@ ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max) /******************************************************************* append a string of len bytes and add a terminator ********************************************************************/ +/* These functions are not used. */ +/* NOTE: not check isolation of surrogate pair */ ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max) { size_t start; @@ -259,7 +410,7 @@ ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max) return dest; } - +/* no problem of surrogate pair */ ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src) { size_t start; @@ -286,7 +437,7 @@ static ucs2_t do_precomposition(unsigned int base, unsigned int comb) int min = 0; int max = PRECOMP_COUNT - 1; int mid; - u_int32_t sought = (base << 16) | comb, that; + uint32_t sought = (base << 16) | comb, that; /* binary search */ while (max >= min) { @@ -305,17 +456,17 @@ static ucs2_t do_precomposition(unsigned int base, unsigned int comb) } /* ------------------------ */ -static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) +static uint32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) { int min = 0; int max = PRECOMP_SP_COUNT - 1; int mid; - u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp; + uint64_t sought_sp = ((uint64_t)base_sp << 32) | (uint64_t)comb_sp, that_sp; /* binary search */ while (max >= min) { mid = (min + max) / 2; - that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp); + that_sp = ((uint64_t)precompositions_sp[mid].base_sp << 32) | ((uint64_t)precompositions_sp[mid].comb_sp); if (that_sp < sought_sp) { min = mid + 1; } else if (that_sp > sought_sp) { @@ -329,13 +480,13 @@ static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp } /* -------------------------- */ -static u_int32_t do_decomposition(ucs2_t base) +static uint32_t do_decomposition(ucs2_t base) { int min = 0; int max = DECOMP_COUNT - 1; int mid; - u_int32_t sought = base; - u_int32_t result, that; + uint32_t sought = base; + uint32_t result, that; /* binary search */ while (max >= min) { @@ -355,14 +506,14 @@ static u_int32_t do_decomposition(ucs2_t base) } /* -------------------------- */ -static u_int64_t do_decomposition_sp(unsigned int base_sp) +static uint64_t do_decomposition_sp(unsigned int base_sp) { int min = 0; int max = DECOMP_SP_COUNT - 1; int mid; - u_int32_t sought_sp = base_sp; - u_int32_t that_sp; - u_int64_t result_sp; + uint32_t sought_sp = base_sp; + uint32_t that_sp; + uint64_t result_sp; /* binary search */ while (max >= min) { @@ -373,7 +524,7 @@ static u_int64_t do_decomposition_sp(unsigned int base_sp) } else if (that_sp > sought_sp) { max = mid - 1; } else { - result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp); + result_sp = ((uint64_t)decompositions_sp[mid].base_sp << 32) | ((uint64_t)decompositions_sp[mid].comb_sp); return result_sp; } } @@ -400,11 +551,11 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) { size_t i; ucs2_t base, comb; - u_int32_t base_sp, comb_sp; + uint32_t base_sp, comb_sp; ucs2_t *in, *out; ucs2_t lindex, vindex; ucs2_t result; - u_int32_t result_sp; + uint32_t result_sp; size_t o_len = *outlen; if (!inplen || (inplen & 1) || inplen > o_len) @@ -417,8 +568,6 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) base = *in; while (*outlen > 2) { i += 2; - in++; - if (i == inplen) { *out = base; out++; @@ -426,7 +575,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) *outlen -= 2; return o_len - *outlen; } - + in++; comb = *in; result = 0; @@ -454,16 +603,16 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) /* Binary Search for Surrogate Pair */ else if ((0xD800 <= base) && (base < 0xDC00)) { - if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) { - base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb; + if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) { + base_sp = ((uint32_t)base << 16) | (uint32_t)comb; do { - comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2]; + comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2]; if (result_sp = do_precomposition_sp(base_sp, comb_sp)) { base_sp = result_sp; i += 4; in +=2; } - } while ((i + 4 <= inplen) && result_sp) ; + } while ((i + 6 <= inplen) && result_sp) ; *out = base_sp >> 16; out++; @@ -479,6 +628,11 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) *outlen -= 2; i += 2; + if (i == inplen) { + out++; + *out = 0; + return o_len - *outlen; + } in++; base = *in; @@ -509,11 +663,11 @@ size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) size_t i; size_t comblen; ucs2_t base, comb[COMBBUFLEN]; - u_int32_t base_sp; + uint32_t base_sp; ucs2_t sindex, tjamo; ucs2_t *in, *out; unsigned int result; - u_int64_t result_sp; + uint64_t result_sp; size_t o_len = *outlen; if (!inplen || (inplen & 1)) @@ -551,7 +705,7 @@ size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) /* Binary Search for Surrogate Pair */ else if ((0xD800 <= base) && (base < 0xDC00)) { if (i + 2 < inplen) { - base_sp = ((u_int32_t)base << 16) | (u_int32_t)in[1]; + base_sp = ((uint32_t)base << 16) | (uint32_t)in[1]; do { if ( !(result_sp = do_decomposition_sp(base_sp))) break; comblen += 2;