From: HAT Date: Wed, 22 Dec 2010 11:20:36 +0000 (+0900) Subject: composition of surrogate pair X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?p=netatalk.git;a=commitdiff_plain;h=f3ffd27311eb88d53f6b07a39ecb96d4811e3977;hp=381bdbb4d06e6a9050050c7351264656f6a5a8df composition of surrogate pair --- diff --git a/contrib/misc/make-precompose.h.pl b/contrib/misc/make-precompose.h.pl old mode 100644 new mode 100755 index 9e85815c..558a537b --- a/contrib/misc/make-precompose.h.pl +++ b/contrib/misc/make-precompose.h.pl @@ -12,8 +12,12 @@ # table for binary search -------------------------------------------------- open(UNICODEDATA, "<$ARGV[0]"); -open(PRECOMPOSETEMP, ">precompose.TEMP"); -open( DECOMPOSETEMP, ">decompose.TEMP"); + +open(PRECOMPOSE_TEMP, ">precompose.TEMP"); +open( DECOMPOSE_TEMP, ">decompose.TEMP"); + +open(PRECOMPOSE_SP_TEMP, ">precompose_sp.TEMP"); +open( DECOMPOSE_SP_TEMP, ">decompose_sp.TEMP"); while (){ chop; @@ -41,11 +45,6 @@ while (){ $leftbracket = " { "; $rightbracket =" }, "; - if (hex($code0) > 0xFFFF) { # DELETE THIS LINE IF INTERNAL CODE IS UCS4 - $leftbracket = "\/\*{ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4 - $rightbracket =" },\*\/ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4 - } # DELETE THIS LINE IF INTERNAL CODE IS UCS4 - # AFP 3.x Spec if ( ((0x2000 <= hex($code0)) && (hex($code0) <= 0x2FFF)) || ((0xFE30 <= hex($code0)) && (hex($code0) <= 0xFE4F)) @@ -54,9 +53,28 @@ while (){ $rightbracket =" },\*\/ "; } - printf(PRECOMPOSETEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1); + if (hex($code0) > 0xFFFF) { # DELETE THIS LINE IF INTERNAL CODE IS UCS4 + + $code0_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($code0) >> 10); + $code0_sp_lo = 0xDC00 + (hex($code0) & 0x3FF); + + $base_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($base) >> 10); + $base_sp_lo = 0xDC00 + (hex($base) & 0x3FF); - printf( DECOMPOSETEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1); + $comb_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($comb) >> 10); + $comb_sp_lo = 0xDC00 + (hex($comb) & 0x3FF); + + printf(PRECOMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n", + $leftbracket, $code0_sp_hi ,$code0_sp_lo, $base_sp_hi, $base_sp_lo, $comb_sp_hi, $comb_sp_lo, $rightbracket, $Name1); + printf(DECOMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n", + $leftbracket, $code0_sp_hi ,$code0_sp_lo, $base_sp_hi, $base_sp_lo, $comb_sp_hi, $comb_sp_lo, $rightbracket, $Name1); + + $leftbracket = "\/\*{ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4 + $rightbracket =" },\*\/ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4 + } # DELETE THIS LINE IF INTERNAL CODE IS UCS4 + + printf(PRECOMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1); + printf( DECOMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1); } } @@ -66,6 +84,9 @@ while (){ system("sort -k 3 precompose.TEMP \> precompose.SORT"); system("sort -k 2 decompose.TEMP \> decompose.SORT"); +system("sort -k 3 precompose_sp.TEMP \> precompose_sp.SORT"); +system("sort -k 2 decompose_sp.TEMP \> decompose_sp.SORT"); + # print ------------------------------------------------------------------- printf ("\/\* This file is generated by contrib/misc/make-precompose.h.pl %s \*\/\n", $ARGV[0]); @@ -97,6 +118,30 @@ system("cat decompose.SORT"); print ("\}\;\n"); print ("\n"); + + +print ("static const struct \{\n"); +print (" unsigned int replacement\;\n"); +print (" unsigned int base\;\n"); +print (" unsigned int comb\;\n"); +print ("\} precompositions_sp\[\] \= \{\n"); + +system("cat precompose_sp.SORT"); + +print ("\}\;\n"); +print ("\n"); + +print ("static const struct \{\n"); +print (" unsigned int replacement\;\n"); +print (" unsigned int base\;\n"); +print (" unsigned int comb\;\n"); +print ("\} decompositions_sp\[\] \= \{\n"); + +system("cat decompose_sp.SORT"); + +print ("\}\;\n"); +print ("\n"); + print ("\/\* EOF \*\/\n"); # EOF diff --git a/libatalk/unicode/precompose.h b/libatalk/unicode/precompose.h index 9eba0692..2d3a505c 100644 --- a/libatalk/unicode/precompose.h +++ b/libatalk/unicode/precompose.h @@ -1009,6 +1009,9 @@ static const struct { { 0x000030FE, 0x000030FD, 0x00003099 }, /* KATAKANA VOICED ITERATION MARK */ { 0x0000FB2C, 0x0000FB49, 0x000005C1 }, /* HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT */ { 0x0000FB2D, 0x0000FB49, 0x000005C2 }, /* HEBREW LETTER SHIN WITH DAGESH AND SIN DOT */ +/*{ 0x0001109A, 0x00011099, 0x000110BA },*/ /* KAITHI LETTER DDDHA */ +/*{ 0x0001109C, 0x0001109B, 0x000110BA },*/ /* KAITHI LETTER RHA */ +/*{ 0x000110AB, 0x000110A5, 0x000110BA },*/ /* KAITHI LETTER VA */ /*{ 0x0001D15E, 0x0001D157, 0x0001D165 },*/ /* MUSICAL SYMBOL HALF NOTE */ /*{ 0x0001D15F, 0x0001D158, 0x0001D165 },*/ /* MUSICAL SYMBOL QUARTER NOTE */ /*{ 0x0001D160, 0x0001D15F, 0x0001D16E },*/ /* MUSICAL SYMBOL EIGHTH NOTE */ @@ -2029,6 +2032,9 @@ static const struct { { 0x0000FB4C, 0x000005D1, 0x000005BF }, /* HEBREW LETTER BET WITH RAFE */ { 0x0000FB4D, 0x000005DB, 0x000005BF }, /* HEBREW LETTER KAF WITH RAFE */ { 0x0000FB4E, 0x000005E4, 0x000005BF }, /* HEBREW LETTER PE WITH RAFE */ +/*{ 0x0001109A, 0x00011099, 0x000110BA },*/ /* KAITHI LETTER DDDHA */ +/*{ 0x0001109C, 0x0001109B, 0x000110BA },*/ /* KAITHI LETTER RHA */ +/*{ 0x000110AB, 0x000110A5, 0x000110BA },*/ /* KAITHI LETTER VA */ /*{ 0x0001D15E, 0x0001D157, 0x0001D165 },*/ /* MUSICAL SYMBOL HALF NOTE */ /*{ 0x0001D15F, 0x0001D158, 0x0001D165 },*/ /* MUSICAL SYMBOL QUARTER NOTE */ /*{ 0x0001D160, 0x0001D15F, 0x0001D16E },*/ /* MUSICAL SYMBOL EIGHTH NOTE */ @@ -2044,4 +2050,50 @@ static const struct { /*{ 0x0001D1C0, 0x0001D1BC, 0x0001D16F },*/ /* MUSICAL SYMBOL FUSA BLACK */ }; +static const struct { + unsigned int replacement; + unsigned int base; + unsigned int comb; +} precompositions_sp[] = { + { 0xD804DC9A, 0xD804DC99, 0xD804DCBA }, /* KAITHI LETTER DDDHA */ + { 0xD804DC9C, 0xD804DC9B, 0xD804DCBA }, /* KAITHI LETTER RHA */ + { 0xD804DCAB, 0xD804DCA5, 0xD804DCBA }, /* KAITHI LETTER VA */ + { 0xD834DD5E, 0xD834DD57, 0xD834DD65 }, /* MUSICAL SYMBOL HALF NOTE */ + { 0xD834DD5F, 0xD834DD58, 0xD834DD65 }, /* MUSICAL SYMBOL QUARTER NOTE */ + { 0xD834DD60, 0xD834DD5F, 0xD834DD6E }, /* MUSICAL SYMBOL EIGHTH NOTE */ + { 0xD834DD61, 0xD834DD5F, 0xD834DD6F }, /* MUSICAL SYMBOL SIXTEENTH NOTE */ + { 0xD834DD62, 0xD834DD5F, 0xD834DD70 }, /* MUSICAL SYMBOL THIRTY-SECOND NOTE */ + { 0xD834DD63, 0xD834DD5F, 0xD834DD71 }, /* MUSICAL SYMBOL SIXTY-FOURTH NOTE */ + { 0xD834DD64, 0xD834DD5F, 0xD834DD72 }, /* MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE */ + { 0xD834DDBB, 0xD834DDB9, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA */ + { 0xD834DDBC, 0xD834DDBA, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA BLACK */ + { 0xD834DDBD, 0xD834DDBB, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA WHITE */ + { 0xD834DDBF, 0xD834DDBB, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA WHITE */ + { 0xD834DDBE, 0xD834DDBC, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA BLACK */ + { 0xD834DDC0, 0xD834DDBC, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA BLACK */ +}; + +static const struct { + unsigned int replacement; + unsigned int base; + unsigned int comb; +} decompositions_sp[] = { + { 0xD804DC9A, 0xD804DC99, 0xD804DCBA }, /* KAITHI LETTER DDDHA */ + { 0xD804DC9C, 0xD804DC9B, 0xD804DCBA }, /* KAITHI LETTER RHA */ + { 0xD804DCAB, 0xD804DCA5, 0xD804DCBA }, /* KAITHI LETTER VA */ + { 0xD834DD5E, 0xD834DD57, 0xD834DD65 }, /* MUSICAL SYMBOL HALF NOTE */ + { 0xD834DD5F, 0xD834DD58, 0xD834DD65 }, /* MUSICAL SYMBOL QUARTER NOTE */ + { 0xD834DD60, 0xD834DD5F, 0xD834DD6E }, /* MUSICAL SYMBOL EIGHTH NOTE */ + { 0xD834DD61, 0xD834DD5F, 0xD834DD6F }, /* MUSICAL SYMBOL SIXTEENTH NOTE */ + { 0xD834DD62, 0xD834DD5F, 0xD834DD70 }, /* MUSICAL SYMBOL THIRTY-SECOND NOTE */ + { 0xD834DD63, 0xD834DD5F, 0xD834DD71 }, /* MUSICAL SYMBOL SIXTY-FOURTH NOTE */ + { 0xD834DD64, 0xD834DD5F, 0xD834DD72 }, /* MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE */ + { 0xD834DDBB, 0xD834DDB9, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA */ + { 0xD834DDBC, 0xD834DDBA, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA BLACK */ + { 0xD834DDBD, 0xD834DDBB, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA WHITE */ + { 0xD834DDBE, 0xD834DDBC, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA BLACK */ + { 0xD834DDBF, 0xD834DDBB, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA WHITE */ + { 0xD834DDC0, 0xD834DDBC, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA BLACK */ +}; + /* EOF */ diff --git a/libatalk/unicode/util_unistr.c b/libatalk/unicode/util_unistr.c index d08f8626..93b4a287 100644 --- a/libatalk/unicode/util_unistr.c +++ b/libatalk/unicode/util_unistr.c @@ -28,7 +28,12 @@ #define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) /* 11172 */ #define MAXCOMBLEN 3 +#define MAXCOMBSPLEN 2 +#define COMBBUFLEN 4 /* max(MAXCOMBLEN, MAXCOMBSPLEN*2) */ +/******************************************************************* + Convert a wide character to upper/lower case. +********************************************************************/ ucs2_t toupper_w(ucs2_t val) { if ( val >= 0x0040 && val <= 0x007F) @@ -333,7 +338,10 @@ ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src) } -/* ------------------------ */ +/******************************************************************* +binary search for pre|decomposition +********************************************************************/ + static ucs2_t do_precomposition(unsigned int base, unsigned int comb) { int min = 0; @@ -357,6 +365,30 @@ static ucs2_t do_precomposition(unsigned int base, unsigned int comb) return 0; } +/* ------------------------ */ +static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) +{ + int min = 0; + int max = sizeof(precompositions_sp) / sizeof(precompositions_sp[0]) - 1; + int mid; + u_int64_t sought = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that; + + /* binary search */ + while (max >= min) { + mid = (min + max) / 2; + that = ((u_int64_t)precompositions_sp[mid].base << 32) | ((u_int64_t)precompositions_sp[mid].comb); + if (that < sought) { + min = mid + 1; + } else if (that > sought) { + max = mid - 1; + } else { + return precompositions_sp[mid].replacement; + } + } + /* no match */ + return 0; +} + /* -------------------------- */ static u_int32_t do_decomposition(ucs2_t base) { @@ -383,36 +415,70 @@ static u_int32_t do_decomposition(ucs2_t base) return 0; } -/* we can't use static, this stuff needs to be reentrant */ -/* static char comp[MAXPATHLEN +1]; */ +/* -------------------------- */ +static u_int64_t do_decomposition_sp(unsigned int base) +{ + int min = 0; + int max = sizeof(decompositions_sp) / sizeof(decompositions_sp[0]) - 1; + int mid; + u_int32_t sought = base; + u_int32_t that; + u_int64_t result; + + /* binary search */ + while (max >= min) { + mid = (min + max) / 2; + that = decompositions_sp[mid].replacement; + if (that < sought) { + min = mid + 1; + } else if (that > sought) { + max = mid - 1; + } else { + result = ((u_int64_t)decompositions_sp[mid].base << 32) | ((u_int64_t)decompositions_sp[mid].comb); + return result; + } + } + /* no match */ + return 0; +} + +/******************************************************************* +pre|decomposition + + we can't use static, this stuff needs to be reentrant + static char comp[MAXPATHLEN +1]; + + exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges + in decompositions[] from decomposition according to AFP 3.x spec + + We don't implement Singleton and Canonical Ordering + because they cause the problem of the roundtrip + such as Dancing Icon +********************************************************************/ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) { size_t i; ucs2_t base, comb; + u_int32_t base_sp, comb_sp; ucs2_t *in, *out; ucs2_t hangul_lindex, hangul_vindex; ucs2_t result; + u_int32_t result_sp; size_t o_len = *outlen; - + if (!inplen || (inplen & 1) || inplen > o_len) return (size_t)-1; - /* Actually, */ - /* Decomposition and Canonical Ordering are necessary here. */ - /* */ - /* Ex. in = CanonicalOrdering(decompose_w(name)) */ - /* */ - /* A new mapping table is needed for CanonicalOrdering. */ - i = 0; in = name; out = comp; - + base = *in; while (*outlen > 2) { i += 2; in++; + if (i == inplen) { *out = base; out++; @@ -420,9 +486,10 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) *outlen -= 2; return o_len - *outlen; } + comb = *in; result = 0; - + /* Non-Combination Character */ if (comb < 0x300) ; @@ -445,8 +512,42 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) } } - /* Combining Sequence */ - else if ((result = do_precomposition(base, comb))) { + /* Binary Search for Surrogate Pair */ + else if ((0xD800 <= base) && (base < 0xDC00)) { + if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) { + base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb; + do { + comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2]; + if (result_sp = do_precomposition_sp(base_sp, comb_sp)) { + base_sp = result_sp; + i += 4; + in +=2; + } + } while ((i + 4 <= inplen) && result_sp) ; + + *out = base_sp >> 16; + out++; + *outlen -= 2; + + if (*outlen <= 2) { + errno = E2BIG; + return (size_t)-1; + } + + *out = base_sp & 0xFFFF; + out++; + *outlen -= 2; + + i += 2; + in++; + base = *in; + + result = 1; + } + } + + /* Binary Search for BMP */ + else if (result = do_precomposition(base, comb)) { base = result; } @@ -457,25 +558,22 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) base = comb; } } - + errno = E2BIG; return (size_t)-1; } /* --------------- */ - -/* Singleton Decomposition is unsupported. */ -/* A new mapping table is needed for implementation. */ - size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) { size_t i; size_t comblen; - ucs2_t base; - ucs2_t comb[MAXCOMBLEN]; + ucs2_t base, comb[COMBBUFLEN]; + u_int32_t base_sp; ucs2_t hangul_sindex, tjamo; ucs2_t *in, *out; unsigned int result; + u_int64_t result_sp; size_t o_len = *outlen; if (!inplen || (inplen & 1)) @@ -495,31 +593,57 @@ size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) else if ((HANGUL_SBASE <= base) && (base < HANGUL_SBASE + HANGUL_SCOUNT)) { hangul_sindex = base - HANGUL_SBASE; base = HANGUL_LBASE + hangul_sindex / HANGUL_NCOUNT; - comb[MAXCOMBLEN-2] = HANGUL_VBASE + (hangul_sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT; + comb[COMBBUFLEN-2] = HANGUL_VBASE + (hangul_sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT; /* */ if ((tjamo = HANGUL_TBASE + hangul_sindex % HANGUL_TCOUNT) == HANGUL_TBASE) { - comb[MAXCOMBLEN-1] = comb[MAXCOMBLEN-2]; + comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2]; comblen = 1; } /* */ else { - comb[MAXCOMBLEN-1] = tjamo; + comb[COMBBUFLEN-1] = tjamo; comblen = 2; } } - /* Combining Sequence */ - /* exclude U2000-U2FFF and UFE30-UFE4F ranges in decompositions[] */ - /* from decomposition according to AFP 3.1 spec */ + /* Binary Search for Surrogate Pair */ + else if ((0xD800 <= base) && (base < 0xDC00)) { + if (i + 2 < inplen) { + base_sp = ((u_int32_t)base << 16) | (u_int32_t)in[1]; + do { + if ( !(result_sp = do_decomposition_sp(base_sp))) break; + comblen += 2; + base_sp = result_sp >> 32; + comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF; /* hi */ + comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF; /* lo */ + } while (comblen < (MAXCOMBSPLEN<<1)); + + if (*outlen < (comblen + 1) << 1) { + errno = E2BIG; + return (size_t)-1; + } + + *out = base_sp >> 16; /* hi */ + out++; + *outlen -= 2; + + base = base_sp & 0xFFFF; /* lo */ + + i += 2; + in++; + } + } + + /* Binary Search for BMP */ else { do { - if ((comblen >= MAXCOMBLEN) || !(result = do_decomposition(base))) break; + if ( !(result = do_decomposition(base))) break; comblen++; base = result >> 16; - comb[MAXCOMBLEN-comblen] = result & 0xffff; - } while (0x007f < base) ; + comb[COMBBUFLEN-comblen] = result & 0xFFFF; + } while ((0x007f < base) && (comblen < MAXCOMBLEN)); } if (*outlen < (comblen + 1) << 1) { @@ -532,7 +656,7 @@ size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) *outlen -= 2; while ( comblen > 0 ) { - *out = comb[MAXCOMBLEN-comblen]; + *out = comb[COMBBUFLEN-comblen]; out++; *outlen -= 2; comblen--; @@ -541,13 +665,15 @@ size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen) i += 2; in++; } - - /* Is Canonical Ordering necessary here? */ *out = 0; return o_len-*outlen; } +/******************************************************************* +length of UTF-8 character and string +********************************************************************/ + size_t utf8_charlen ( char* utf8 ) { unsigned char *p;