# table for binary search --------------------------------------------------
open(UNICODEDATA, "<$ARGV[0]");
-open(PRECOMPOSETEMP, ">precompose.TEMP");
-open( DECOMPOSETEMP, ">decompose.TEMP");
+
+open(PRECOMPOSE_TEMP, ">precompose.TEMP");
+open( DECOMPOSE_TEMP, ">decompose.TEMP");
+
+open(PRECOMPOSE_SP_TEMP, ">precompose_sp.TEMP");
+open( DECOMPOSE_SP_TEMP, ">decompose_sp.TEMP");
while (<UNICODEDATA>){
chop;
$leftbracket = " { ";
$rightbracket =" }, ";
- if (hex($code0) > 0xFFFF) { # DELETE THIS LINE IF INTERNAL CODE IS UCS4
- $leftbracket = "\/\*{ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4
- $rightbracket =" },\*\/ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4
- } # DELETE THIS LINE IF INTERNAL CODE IS UCS4
-
# AFP 3.x Spec
if ( ((0x2000 <= hex($code0)) && (hex($code0) <= 0x2FFF))
|| ((0xFE30 <= hex($code0)) && (hex($code0) <= 0xFE4F))
$rightbracket =" },\*\/ ";
}
- printf(PRECOMPOSETEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
+ if (hex($code0) > 0xFFFF) { # DELETE THIS LINE IF INTERNAL CODE IS UCS4
+
+ $code0_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($code0) >> 10);
+ $code0_sp_lo = 0xDC00 + (hex($code0) & 0x3FF);
+
+ $base_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($base) >> 10);
+ $base_sp_lo = 0xDC00 + (hex($base) & 0x3FF);
- printf( DECOMPOSETEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
+ $comb_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($comb) >> 10);
+ $comb_sp_lo = 0xDC00 + (hex($comb) & 0x3FF);
+
+ printf(PRECOMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n",
+ $leftbracket, $code0_sp_hi ,$code0_sp_lo, $base_sp_hi, $base_sp_lo, $comb_sp_hi, $comb_sp_lo, $rightbracket, $Name1);
+ printf(DECOMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n",
+ $leftbracket, $code0_sp_hi ,$code0_sp_lo, $base_sp_hi, $base_sp_lo, $comb_sp_hi, $comb_sp_lo, $rightbracket, $Name1);
+
+ $leftbracket = "\/\*{ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4
+ $rightbracket =" },\*\/ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4
+ } # DELETE THIS LINE IF INTERNAL CODE IS UCS4
+
+ printf(PRECOMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
+ printf( DECOMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
}
}
system("sort -k 3 precompose.TEMP \> precompose.SORT");
system("sort -k 2 decompose.TEMP \> decompose.SORT");
+system("sort -k 3 precompose_sp.TEMP \> precompose_sp.SORT");
+system("sort -k 2 decompose_sp.TEMP \> decompose_sp.SORT");
+
# print -------------------------------------------------------------------
printf ("\/\* This file is generated by contrib/misc/make-precompose.h.pl %s \*\/\n", $ARGV[0]);
print ("\}\;\n");
print ("\n");
+
+
+print ("static const struct \{\n");
+print (" unsigned int replacement\;\n");
+print (" unsigned int base\;\n");
+print (" unsigned int comb\;\n");
+print ("\} precompositions_sp\[\] \= \{\n");
+
+system("cat precompose_sp.SORT");
+
+print ("\}\;\n");
+print ("\n");
+
+print ("static const struct \{\n");
+print (" unsigned int replacement\;\n");
+print (" unsigned int base\;\n");
+print (" unsigned int comb\;\n");
+print ("\} decompositions_sp\[\] \= \{\n");
+
+system("cat decompose_sp.SORT");
+
+print ("\}\;\n");
+print ("\n");
+
print ("\/\* EOF \*\/\n");
# EOF
{ 0x000030FE, 0x000030FD, 0x00003099 }, /* KATAKANA VOICED ITERATION MARK */
{ 0x0000FB2C, 0x0000FB49, 0x000005C1 }, /* HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT */
{ 0x0000FB2D, 0x0000FB49, 0x000005C2 }, /* HEBREW LETTER SHIN WITH DAGESH AND SIN DOT */
+/*{ 0x0001109A, 0x00011099, 0x000110BA },*/ /* KAITHI LETTER DDDHA */
+/*{ 0x0001109C, 0x0001109B, 0x000110BA },*/ /* KAITHI LETTER RHA */
+/*{ 0x000110AB, 0x000110A5, 0x000110BA },*/ /* KAITHI LETTER VA */
/*{ 0x0001D15E, 0x0001D157, 0x0001D165 },*/ /* MUSICAL SYMBOL HALF NOTE */
/*{ 0x0001D15F, 0x0001D158, 0x0001D165 },*/ /* MUSICAL SYMBOL QUARTER NOTE */
/*{ 0x0001D160, 0x0001D15F, 0x0001D16E },*/ /* MUSICAL SYMBOL EIGHTH NOTE */
{ 0x0000FB4C, 0x000005D1, 0x000005BF }, /* HEBREW LETTER BET WITH RAFE */
{ 0x0000FB4D, 0x000005DB, 0x000005BF }, /* HEBREW LETTER KAF WITH RAFE */
{ 0x0000FB4E, 0x000005E4, 0x000005BF }, /* HEBREW LETTER PE WITH RAFE */
+/*{ 0x0001109A, 0x00011099, 0x000110BA },*/ /* KAITHI LETTER DDDHA */
+/*{ 0x0001109C, 0x0001109B, 0x000110BA },*/ /* KAITHI LETTER RHA */
+/*{ 0x000110AB, 0x000110A5, 0x000110BA },*/ /* KAITHI LETTER VA */
/*{ 0x0001D15E, 0x0001D157, 0x0001D165 },*/ /* MUSICAL SYMBOL HALF NOTE */
/*{ 0x0001D15F, 0x0001D158, 0x0001D165 },*/ /* MUSICAL SYMBOL QUARTER NOTE */
/*{ 0x0001D160, 0x0001D15F, 0x0001D16E },*/ /* MUSICAL SYMBOL EIGHTH NOTE */
/*{ 0x0001D1C0, 0x0001D1BC, 0x0001D16F },*/ /* MUSICAL SYMBOL FUSA BLACK */
};
+static const struct { /* one row per composition above U+FFFF; every field packs a UTF-16 surrogate pair as (high << 16) | low */
+ unsigned int replacement; /* the composed character (named in the row's trailing comment) */
+ unsigned int base; /* first character of the decomposed pair */
+ unsigned int comb; /* second character of the decomposed pair */
+} precompositions_sp[] = { /* rows are ordered by base, then comb */
+ { 0xD804DC9A, 0xD804DC99, 0xD804DCBA }, /* KAITHI LETTER DDDHA */
+ { 0xD804DC9C, 0xD804DC9B, 0xD804DCBA }, /* KAITHI LETTER RHA */
+ { 0xD804DCAB, 0xD804DCA5, 0xD804DCBA }, /* KAITHI LETTER VA */
+ { 0xD834DD5E, 0xD834DD57, 0xD834DD65 }, /* MUSICAL SYMBOL HALF NOTE */
+ { 0xD834DD5F, 0xD834DD58, 0xD834DD65 }, /* MUSICAL SYMBOL QUARTER NOTE */
+ { 0xD834DD60, 0xD834DD5F, 0xD834DD6E }, /* MUSICAL SYMBOL EIGHTH NOTE */
+ { 0xD834DD61, 0xD834DD5F, 0xD834DD6F }, /* MUSICAL SYMBOL SIXTEENTH NOTE */
+ { 0xD834DD62, 0xD834DD5F, 0xD834DD70 }, /* MUSICAL SYMBOL THIRTY-SECOND NOTE */
+ { 0xD834DD63, 0xD834DD5F, 0xD834DD71 }, /* MUSICAL SYMBOL SIXTY-FOURTH NOTE */
+ { 0xD834DD64, 0xD834DD5F, 0xD834DD72 }, /* MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE */
+ { 0xD834DDBB, 0xD834DDB9, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA */
+ { 0xD834DDBC, 0xD834DDBA, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA BLACK */
+ { 0xD834DDBD, 0xD834DDBB, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA WHITE */
+ { 0xD834DDBF, 0xD834DDBB, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA WHITE */
+ { 0xD834DDBE, 0xD834DDBC, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA BLACK */
+ { 0xD834DDC0, 0xD834DDBC, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA BLACK */
+};
+
+static const struct { /* one row per decomposition above U+FFFF; every field packs a UTF-16 surrogate pair as (high << 16) | low */
+ unsigned int replacement; /* the composed character (named in the row's trailing comment) */
+ unsigned int base; /* first character of the decomposed pair */
+ unsigned int comb; /* second character of the decomposed pair */
+} decompositions_sp[] = { /* rows are ordered by replacement */
+ { 0xD804DC9A, 0xD804DC99, 0xD804DCBA }, /* KAITHI LETTER DDDHA */
+ { 0xD804DC9C, 0xD804DC9B, 0xD804DCBA }, /* KAITHI LETTER RHA */
+ { 0xD804DCAB, 0xD804DCA5, 0xD804DCBA }, /* KAITHI LETTER VA */
+ { 0xD834DD5E, 0xD834DD57, 0xD834DD65 }, /* MUSICAL SYMBOL HALF NOTE */
+ { 0xD834DD5F, 0xD834DD58, 0xD834DD65 }, /* MUSICAL SYMBOL QUARTER NOTE */
+ { 0xD834DD60, 0xD834DD5F, 0xD834DD6E }, /* MUSICAL SYMBOL EIGHTH NOTE */
+ { 0xD834DD61, 0xD834DD5F, 0xD834DD6F }, /* MUSICAL SYMBOL SIXTEENTH NOTE */
+ { 0xD834DD62, 0xD834DD5F, 0xD834DD70 }, /* MUSICAL SYMBOL THIRTY-SECOND NOTE */
+ { 0xD834DD63, 0xD834DD5F, 0xD834DD71 }, /* MUSICAL SYMBOL SIXTY-FOURTH NOTE */
+ { 0xD834DD64, 0xD834DD5F, 0xD834DD72 }, /* MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE */
+ { 0xD834DDBB, 0xD834DDB9, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA */
+ { 0xD834DDBC, 0xD834DDBA, 0xD834DD65 }, /* MUSICAL SYMBOL MINIMA BLACK */
+ { 0xD834DDBD, 0xD834DDBB, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA WHITE */
+ { 0xD834DDBE, 0xD834DDBC, 0xD834DD6E }, /* MUSICAL SYMBOL SEMIMINIMA BLACK */
+ { 0xD834DDBF, 0xD834DDBB, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA WHITE */
+ { 0xD834DDC0, 0xD834DDBC, 0xD834DD6F }, /* MUSICAL SYMBOL FUSA BLACK */
+};
+
/* EOF */
#define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) /* 11172 */
#define MAXCOMBLEN 3
+#define MAXCOMBSPLEN 2
+#define COMBBUFLEN 4 /* max(MAXCOMBLEN, MAXCOMBSPLEN*2) */
+/*******************************************************************
+ Convert a wide character to upper/lower case.
+********************************************************************/
ucs2_t toupper_w(ucs2_t val)
{
if ( val >= 0x0040 && val <= 0x007F)
}
-/* ------------------------ */
+/*******************************************************************
+binary search for pre|decomposition
+********************************************************************/
+
static ucs2_t do_precomposition(unsigned int base, unsigned int comb)
{
int min = 0;
return 0;
}
+/* ------------------------
+ * Look up the composed form of a (base, combining) pair in
+ * precompositions_sp[].
+ *
+ * base_sp and comb_sp are compared against the table's base and comb
+ * fields.  The caller in precompose_w builds each of them from two
+ * consecutive ucs2_t units as (first << 16) | second.
+ *
+ * Returns the matching row's replacement, or 0 when the pair has no row.
+ * ------------------------ */
+static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp)
+{
+ int min = 0;
+ int max = sizeof(precompositions_sp) / sizeof(precompositions_sp[0]) - 1; /* index of the last row */
+ int mid;
+ u_int64_t sought = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that; /* single 64-bit key: base in the upper half, comb in the lower half */
+
+ /* binary search on the combined (base, comb) key; correct only while the
+ * table rows stay ordered by base and then by comb */
+ while (max >= min) {
+ mid = (min + max) / 2;
+ that = ((u_int64_t)precompositions_sp[mid].base << 32) | ((u_int64_t)precompositions_sp[mid].comb); /* key of the probed row, built the same way as sought */
+ if (that < sought) {
+ min = mid + 1;
+ } else if (that > sought) {
+ max = mid - 1;
+ } else {
+ return precompositions_sp[mid].replacement;
+ }
+ }
+ /* no match: 0 is the "not found" value the caller tests for */
+ return 0;
+}
+
/* -------------------------- */
static u_int32_t do_decomposition(ucs2_t base)
{
return 0;
}
-/* we can't use static, this stuff needs to be reentrant */
-/* static char comp[MAXPATHLEN +1]; */
+/* --------------------------
+ * Look up a composed character in decompositions_sp[] by its
+ * replacement field.
+ *
+ * base is compared against the table's replacement field.  The caller in
+ * decompose_w builds it from two consecutive ucs2_t units as
+ * (first << 16) | second.
+ *
+ * Returns a 64-bit value holding the row's base in the upper 32 bits and
+ * its comb in the lower 32 bits, or 0 when base has no row.
+ * -------------------------- */
+static u_int64_t do_decomposition_sp(unsigned int base)
+{
+ int min = 0;
+ int max = sizeof(decompositions_sp) / sizeof(decompositions_sp[0]) - 1; /* index of the last row */
+ int mid;
+ u_int32_t sought = base;
+ u_int32_t that;
+ u_int64_t result;
+
+ /* binary search on the replacement field; correct only while the table
+ * rows stay ordered by replacement */
+ while (max >= min) {
+ mid = (min + max) / 2;
+ that = decompositions_sp[mid].replacement;
+ if (that < sought) {
+ min = mid + 1;
+ } else if (that > sought) {
+ max = mid - 1;
+ } else {
+ result = ((u_int64_t)decompositions_sp[mid].base << 32) | ((u_int64_t)decompositions_sp[mid].comb); /* pack both halves of the decomposition into one return value */
+ return result;
+ }
+ }
+ /* no match: 0 is the "not found" value the caller tests for */
+ return 0;
+}
+
+/*******************************************************************
+pre|decomposition
+
+ we can't use static, this stuff needs to be reentrant
+ static char comp[MAXPATHLEN +1];
+
+ Entries of decompositions[] in the ranges U+2000-U+2FFF, U+FE30-U+FE4F
+ and U+2F800-U+2FA1F are excluded from decomposition, per the AFP 3.x spec.
+
+ Singleton decomposition and Canonical Ordering are deliberately not
+ implemented, because they cause round-trip problems such as the
+ "Dancing Icon".
+********************************************************************/
size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
{
size_t i;
ucs2_t base, comb;
+ u_int32_t base_sp, comb_sp;
ucs2_t *in, *out;
ucs2_t hangul_lindex, hangul_vindex;
ucs2_t result;
+ u_int32_t result_sp;
size_t o_len = *outlen;
-
+
if (!inplen || (inplen & 1) || inplen > o_len)
return (size_t)-1;
- /* Actually, */
- /* Decomposition and Canonical Ordering are necessary here. */
- /* */
- /* Ex. in = CanonicalOrdering(decompose_w(name)) */
- /* */
- /* A new mapping table is needed for CanonicalOrdering. */
-
i = 0;
in = name;
out = comp;
-
+
base = *in;
while (*outlen > 2) {
i += 2;
in++;
+
if (i == inplen) {
*out = base;
out++;
*outlen -= 2;
return o_len - *outlen;
}
+
comb = *in;
result = 0;
-
+
/* Non-Combination Character */
if (comb < 0x300) ;
}
}
- /* Combining Sequence */
- else if ((result = do_precomposition(base, comb))) {
+ /* Binary Search for Surrogate Pair */
+ else if ((0xD800 <= base) && (base < 0xDC00)) {
+ if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
+ base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
+ do {
+ comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
+ if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
+ base_sp = result_sp;
+ i += 4;
+ in +=2;
+ }
+ } while ((i + 4 <= inplen) && result_sp) ;
+
+ *out = base_sp >> 16;
+ out++;
+ *outlen -= 2;
+
+ if (*outlen <= 2) {
+ errno = E2BIG;
+ return (size_t)-1;
+ }
+
+ *out = base_sp & 0xFFFF;
+ out++;
+ *outlen -= 2;
+
+ i += 2;
+ in++;
+ base = *in;
+
+ result = 1;
+ }
+ }
+
+ /* Binary Search for BMP */
+ else if (result = do_precomposition(base, comb)) {
base = result;
}
base = comb;
}
}
-
+
errno = E2BIG;
return (size_t)-1;
}
/* --------------- */
-
-/* Singleton Decomposition is unsupported. */
-/* A new mapping table is needed for implementation. */
-
size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
{
size_t i;
size_t comblen;
- ucs2_t base;
- ucs2_t comb[MAXCOMBLEN];
+ ucs2_t base, comb[COMBBUFLEN];
+ u_int32_t base_sp;
ucs2_t hangul_sindex, tjamo;
ucs2_t *in, *out;
unsigned int result;
+ u_int64_t result_sp;
size_t o_len = *outlen;
if (!inplen || (inplen & 1))
else if ((HANGUL_SBASE <= base) && (base < HANGUL_SBASE + HANGUL_SCOUNT)) {
hangul_sindex = base - HANGUL_SBASE;
base = HANGUL_LBASE + hangul_sindex / HANGUL_NCOUNT;
- comb[MAXCOMBLEN-2] = HANGUL_VBASE + (hangul_sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT;
+ comb[COMBBUFLEN-2] = HANGUL_VBASE + (hangul_sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT;
/* <L,V> */
if ((tjamo = HANGUL_TBASE + hangul_sindex % HANGUL_TCOUNT) == HANGUL_TBASE) {
- comb[MAXCOMBLEN-1] = comb[MAXCOMBLEN-2];
+ comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
comblen = 1;
}
/* <L,V,T> */
else {
- comb[MAXCOMBLEN-1] = tjamo;
+ comb[COMBBUFLEN-1] = tjamo;
comblen = 2;
}
}
- /* Combining Sequence */
- /* exclude U2000-U2FFF and UFE30-UFE4F ranges in decompositions[] */
- /* from decomposition according to AFP 3.1 spec */
+ /* Binary Search for Surrogate Pair */
+ else if ((0xD800 <= base) && (base < 0xDC00)) {
+ if (i + 2 < inplen) {
+ base_sp = ((u_int32_t)base << 16) | (u_int32_t)in[1];
+ do {
+ if ( !(result_sp = do_decomposition_sp(base_sp))) break;
+ comblen += 2;
+ base_sp = result_sp >> 32;
+ comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF; /* hi */
+ comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF; /* lo */
+ } while (comblen < (MAXCOMBSPLEN<<1));
+
+ if (*outlen < (comblen + 1) << 1) {
+ errno = E2BIG;
+ return (size_t)-1;
+ }
+
+ *out = base_sp >> 16; /* hi */
+ out++;
+ *outlen -= 2;
+
+ base = base_sp & 0xFFFF; /* lo */
+
+ i += 2;
+ in++;
+ }
+ }
+
+ /* Binary Search for BMP */
else {
do {
- if ((comblen >= MAXCOMBLEN) || !(result = do_decomposition(base))) break;
+ if ( !(result = do_decomposition(base))) break;
comblen++;
base = result >> 16;
- comb[MAXCOMBLEN-comblen] = result & 0xffff;
- } while (0x007f < base) ;
+ comb[COMBBUFLEN-comblen] = result & 0xFFFF;
+ } while ((0x007f < base) && (comblen < MAXCOMBLEN));
}
if (*outlen < (comblen + 1) << 1) {
*outlen -= 2;
while ( comblen > 0 ) {
- *out = comb[MAXCOMBLEN-comblen];
+ *out = comb[COMBBUFLEN-comblen];
out++;
*outlen -= 2;
comblen--;
i += 2;
in++;
}
-
- /* Is Canonical Ordering necessary here? */
*out = 0;
return o_len-*outlen;
}
+/*******************************************************************
+length of UTF-8 character and string
+********************************************************************/
+
size_t utf8_charlen ( char* utf8 )
{
unsigned char *p;