#!/usr/bin/perl
-
+#
# usage: make-precompose.h.pl UnicodeData.txt > precompose.h
+#
+# (c) 2008-2010 by HAT <hat@fa2.so-net.ne.jp>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
# See
# http://www.unicode.org/Public/UNIDATA/UCD.html
# http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
-# table for binary search --------------------------------------------------
+# temp files for binary search (compose.TEMP, compose_sp.TEMP) -------------
open(UNICODEDATA, "<$ARGV[0]");
-open(PRECOMPOSE_TEMP, ">precompose.TEMP");
-open( DECOMPOSE_TEMP, ">decompose.TEMP");
-
-open(PRECOMPOSE_SP_TEMP, ">precompose_sp.TEMP");
-open( DECOMPOSE_SP_TEMP, ">decompose_sp.TEMP");
+open(COMPOSE_TEMP, ">compose.TEMP");
+open(COMPOSE_SP_TEMP, ">compose_sp.TEMP");
-while (<UNICODEDATA>){
+while (<UNICODEDATA>) {
chop;
(
$code0,
$Simple_Uppercase_Mapping12,
$Simple_Lowercase_Mapping13,
$Simple_Titlecase_Mapping14
- ) = split(/\;/);
+ ) = split(/\;/);
if (($Decomposition_Mapping5 ne "") && ($Decomposition_Mapping5 !~ /\</) && ($Decomposition_Mapping5 =~ / /)) {
($base, $comb) = split(/ /,$Decomposition_Mapping5);
-
+
$leftbracket = " { ";
$rightbracket =" }, ";
# AFP 3.x Spec
if ( ((0x2000 <= hex($code0)) && (hex($code0) <= 0x2FFF))
- || ((0xFE30 <= hex($code0)) && (hex($code0) <= 0xFE4F))
- || ((0x2F800 <= hex($code0)) && (hex($code0) <= 0x2FA1F))) {
+ || ((0xFE30 <= hex($code0)) && (hex($code0) <= 0xFE4F))
+ || ((0x2F800 <= hex($code0)) && (hex($code0) <= 0x2FA1F))) {
$leftbracket = "\/\*{ ";
$rightbracket =" },\*\/ ";
}
-
- if (hex($code0) > 0xFFFF) { # DELETE THIS LINE IF INTERNAL CODE IS UCS4
-
+
+ if (hex($code0) > 0xFFFF) {
+
$code0_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($code0) >> 10);
$code0_sp_lo = 0xDC00 + (hex($code0) & 0x3FF);
- $base_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($base) >> 10);
- $base_sp_lo = 0xDC00 + (hex($base) & 0x3FF);
+ $base_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($base) >> 10);
+ $base_sp_lo = 0xDC00 + (hex($base) & 0x3FF);
- $comb_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($comb) >> 10);
- $comb_sp_lo = 0xDC00 + (hex($comb) & 0x3FF);
+ $comb_sp_hi = 0xD800 - (0x10000 >> 10) + (hex($comb) >> 10);
+ $comb_sp_lo = 0xDC00 + (hex($comb) & 0x3FF);
- printf(PRECOMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n",
+ printf(COMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n",
$leftbracket, $code0_sp_hi ,$code0_sp_lo, $base_sp_hi, $base_sp_lo, $comb_sp_hi, $comb_sp_lo, $rightbracket, $Name1);
- printf(DECOMPOSE_SP_TEMP "%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s\/\* %s \*\/\n",
- $leftbracket, $code0_sp_hi ,$code0_sp_lo, $base_sp_hi, $base_sp_lo, $comb_sp_hi, $comb_sp_lo, $rightbracket, $Name1);
-
- $leftbracket = "\/\*{ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4
- $rightbracket =" },\*\/ "; # DELETE THIS LINE IF INTERNAL CODE IS UCS4
- } # DELETE THIS LINE IF INTERNAL CODE IS UCS4
-
- printf(PRECOMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
- printf( DECOMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
-
+
+ $leftbracket = "\/\*{ ";
+ $rightbracket =" },\*\/ ";
+ }
+
+ printf(COMPOSE_TEMP "%s0x%08X, 0x%08X, 0x%08X%s\/\* %s \*\/\n", $leftbracket, hex($code0), hex($base), hex($comb), $rightbracket, $Name1);
+
}
}
+close(UNICODEDATA);
+
+close(COMPOSE_TEMP);
+close(COMPOSE_SP_TEMP);
+
+# macros for BMP (PRECOMP_COUNT, DECOMP_COUNT, MAXCOMBLEN) ----------------
+
+open(COMPOSE_TEMP, "<compose.TEMP");
+
+@comp_table = ();
+$comp_count = 0;
+
+while (<COMPOSE_TEMP>) {
+ if (m/^\/\*/) {
+ next;
+ }
+ $comp_table[$comp_count][0] = substr($_, 4, 10);
+ $comp_table[$comp_count][1] = substr($_, 16, 10);
+ $comp_count++;
+}
+
+$maxcomblen = 2; # Hangul's maxcomblen is already 2. That is, VT.
+
+for ($i = 0 ; $i < $comp_count ; $i++) {
+ $base = $comp_table[$i][1];
+ $comblen = 1;
+ $j = 0;
+ while ($j < $comp_count) {
+ if ($base ne $comp_table[$j][0]) {
+ $j++;
+ next;
+ } else {
+ $comblen++;
+ $base = $comp_table[$j][1];
+ $j = 0;
+ }
+ }
+ $maxcomblen = ($maxcomblen > $comblen) ? $maxcomblen : $comblen;
+}
+
+close(COMPOSE_TEMP);
+
+# macros for SP (PRECOMP_SP_COUNT, DECOMP_SP_COUNT, MAXCOMBSPLEN) ----------
+
+open(COMPOSE_SP_TEMP, "<compose_sp.TEMP");
+
+@comp_sp_table = ();
+$comp_sp_count = 0;
+
+while (<COMPOSE_SP_TEMP>) {
+ if (m/^\/\*/) {
+ next;
+ }
+ $comp_sp_table[$comp_sp_count][0] = substr($_, 4, 10);
+ $comp_sp_table[$comp_sp_count][1] = substr($_, 16, 10);
+ $comp_sp_count++;
+}
+
+$maxcombsplen = 2; # one SP char occupies 2 16-bit units (a surrogate pair), like D8xx DCxx.
+
+for ($i = 0 ; $i < $comp_sp_count ; $i++) {
+ $base_sp = $comp_sp_table[$i][1];
+ $comblen = 2;
+ $j = 0;
+ while ($j < $comp_sp_count) {
+ if ($base_sp ne $comp_sp_table[$j][0]) {
+ $j++;
+ next;
+ } else {
+ $comblen += 2;
+ $base_sp = $comp_sp_table[$j][1];
+ $j = 0;
+ }
+ }
+ $maxcombsplen = ($maxcombsplen > $comblen) ? $maxcombsplen : $comblen;
+}
+
+close(COMPOSE_SP_TEMP);
+
+# macro for buffer length (COMBBUFLEN) -------------------------------------
+
+$combbuflen = ($maxcomblen > $maxcombsplen) ? $maxcomblen : $maxcombsplen;
+
# sort ---------------------------------------------------------------------
-system("sort -k 3 precompose.TEMP \> precompose.SORT");
-system("sort -k 2 decompose.TEMP \> decompose.SORT");
+system("sort -k 3 compose.TEMP \> precompose.SORT");
+system("sort -k 2 compose.TEMP \> decompose.SORT");
-system("sort -k 3 precompose_sp.TEMP \> precompose_sp.SORT");
-system("sort -k 2 decompose_sp.TEMP \> decompose_sp.SORT");
+system("sort -k 3 compose_sp.TEMP \> precompose_sp.SORT");
+system("sort -k 2 compose_sp.TEMP \> decompose_sp.SORT");
# print -------------------------------------------------------------------
-printf ("\/\* This file is generated by contrib/misc/make-precompose.h.pl %s \*\/\n", $ARGV[0]);
print ("\/\* DO NOT EDIT BY HAND\!\!\! \*\/\n");
+print ("\/\* This file is generated by \*\/\n");
+printf ("\/\* contrib/misc/make-precompose.h.pl %s \*\/\n", $ARGV[0]);
print ("\n");
printf ("\/\* %s is got from \*\/\n", $ARGV[0]);
print ("\/\* http\:\/\/www.unicode.org\/Public\/UNIDATA\/UnicodeData.txt \*\/\n");
print ("\n");
+print ("\#define HANGUL_SBASE 0xAC00\n");
+print ("\#define HANGUL_LBASE 0x1100\n");
+print ("\#define HANGUL_VBASE 0x1161\n");
+print ("\#define HANGUL_TBASE 0x11A7\n");
+print ("\#define HANGUL_LCOUNT 19\n");
+print ("\#define HANGUL_VCOUNT 21\n");
+print ("\#define HANGUL_TCOUNT 28\n");
+print ("\#define HANGUL_NCOUNT 588 \/\* (HANGUL_VCOUNT \* HANGUL_TCOUNT) \*\/\n");
+print ("\#define HANGUL_SCOUNT 11172 \/\* (HANGUL_LCOUNT \* HANGUL_NCOUNT) \*\/\n");
+print ("\n");
+
+printf ("\#define PRECOMP_COUNT %d\n", $comp_count);
+printf ("\#define DECOMP_COUNT %d\n", $comp_count);
+printf ("\#define MAXCOMBLEN %d\n", $maxcomblen);
+print ("\n");
+printf ("\#define PRECOMP_SP_COUNT %d\n", $comp_sp_count);
+printf ("\#define DECOMP_SP_COUNT %d\n", $comp_sp_count);
+printf ("\#define MAXCOMBSPLEN %d\n", $maxcombsplen);
+print ("\n");
+printf ("\#define COMBBUFLEN %d \/\* max\(MAXCOMBLEN\,MAXCOMBSPLEN\) \*\/\n", $combbuflen);
+print ("\n");
+
print ("static const struct \{\n");
print (" unsigned int replacement\;\n");
print (" unsigned int base\;\n");
print ("static const struct \{\n");
-print (" unsigned int replacement\;\n");
-print (" unsigned int base\;\n");
-print (" unsigned int comb\;\n");
+print (" unsigned int replacement_sp\;\n");
+print (" unsigned int base_sp\;\n");
+print (" unsigned int comb_sp\;\n");
print ("\} precompositions_sp\[\] \= \{\n");
system("cat precompose_sp.SORT");
print ("\n");
print ("static const struct \{\n");
-print (" unsigned int replacement\;\n");
-print (" unsigned int base\;\n");
-print (" unsigned int comb\;\n");
+print (" unsigned int replacement_sp\;\n");
+print (" unsigned int base_sp\;\n");
+print (" unsigned int comb_sp\;\n");
print ("\} decompositions_sp\[\] \= \{\n");
system("cat decompose_sp.SORT");
-/* This file is generated by contrib/misc/make-precompose.h.pl UnicodeData.txt */
/* DO NOT EDIT BY HAND!!! */
+/* This file is generated by */
+/* contrib/misc/make-precompose.h.pl UnicodeData.txt */
/* UnicodeData.txt is got from */
/* http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
+#define HANGUL_SBASE 0xAC00
+#define HANGUL_LBASE 0x1100
+#define HANGUL_VBASE 0x1161
+#define HANGUL_TBASE 0x11A7
+#define HANGUL_LCOUNT 19
+#define HANGUL_VCOUNT 21
+#define HANGUL_TCOUNT 28
+#define HANGUL_NCOUNT 588 /* (HANGUL_VCOUNT * HANGUL_TCOUNT) */
+#define HANGUL_SCOUNT 11172 /* (HANGUL_LCOUNT * HANGUL_NCOUNT) */
+
+#define PRECOMP_COUNT 955
+#define DECOMP_COUNT 955
+#define MAXCOMBLEN 3
+
+#define PRECOMP_SP_COUNT 16
+#define DECOMP_SP_COUNT 16
+#define MAXCOMBSPLEN 4
+
+#define COMBBUFLEN 4 /* max(MAXCOMBLEN,MAXCOMBSPLEN) */
+
static const struct {
unsigned int replacement;
unsigned int base;
};
static const struct {
- unsigned int replacement;
- unsigned int base;
- unsigned int comb;
+ unsigned int replacement_sp;
+ unsigned int base_sp;
+ unsigned int comb_sp;
} precompositions_sp[] = {
{ 0xD804DC9A, 0xD804DC99, 0xD804DCBA }, /* KAITHI LETTER DDDHA */
{ 0xD804DC9C, 0xD804DC9B, 0xD804DCBA }, /* KAITHI LETTER RHA */
};
static const struct {
- unsigned int replacement;
- unsigned int base;
- unsigned int comb;
+ unsigned int replacement_sp;
+ unsigned int base_sp;
+ unsigned int comb_sp;
} decompositions_sp[] = {
{ 0xD804DC9A, 0xD804DC99, 0xD804DCBA }, /* KAITHI LETTER DDDHA */
{ 0xD804DC9C, 0xD804DC9B, 0xD804DCBA }, /* KAITHI LETTER RHA */
#include "precompose.h"
#include "byteorder.h"
-#define HANGUL_SBASE 0xAC00
-#define HANGUL_LBASE 0x1100
-#define HANGUL_VBASE 0x1161
-#define HANGUL_TBASE 0x11A7
-#define HANGUL_LCOUNT 19
-#define HANGUL_VCOUNT 21
-#define HANGUL_TCOUNT 28
-#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) /* 588 */
-#define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) /* 11172 */
-
-#define MAXCOMBLEN 3
-#define MAXCOMBSPLEN 2
-#define COMBBUFLEN 4 /* max(MAXCOMBLEN, MAXCOMBSPLEN*2) */
-
/*******************************************************************
Convert a wide character to upper/lower case.
********************************************************************/
static ucs2_t do_precomposition(unsigned int base, unsigned int comb)
{
int min = 0;
- int max = sizeof(precompositions) / sizeof(precompositions[0]) - 1;
+ int max = PRECOMP_COUNT - 1;
int mid;
u_int32_t sought = (base << 16) | comb, that;
static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp)
{
int min = 0;
- int max = sizeof(precompositions_sp) / sizeof(precompositions_sp[0]) - 1;
+ int max = PRECOMP_SP_COUNT - 1;
int mid;
- u_int64_t sought = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that;
+ u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
/* binary search */
while (max >= min) {
mid = (min + max) / 2;
- that = ((u_int64_t)precompositions_sp[mid].base << 32) | ((u_int64_t)precompositions_sp[mid].comb);
- if (that < sought) {
+ that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
+ if (that_sp < sought_sp) {
min = mid + 1;
- } else if (that > sought) {
+ } else if (that_sp > sought_sp) {
max = mid - 1;
} else {
- return precompositions_sp[mid].replacement;
+ return precompositions_sp[mid].replacement_sp;
}
}
/* no match */
static u_int32_t do_decomposition(ucs2_t base)
{
int min = 0;
- int max = sizeof(decompositions) / sizeof(decompositions[0]) - 1;
+ int max = DECOMP_COUNT - 1;
int mid;
u_int32_t sought = base;
u_int32_t result, that;
}
/* -------------------------- */
-static u_int64_t do_decomposition_sp(unsigned int base)
+static u_int64_t do_decomposition_sp(unsigned int base_sp)
{
int min = 0;
- int max = sizeof(decompositions_sp) / sizeof(decompositions_sp[0]) - 1;
+ int max = DECOMP_SP_COUNT - 1;
int mid;
- u_int32_t sought = base;
- u_int32_t that;
- u_int64_t result;
+ u_int32_t sought_sp = base_sp;
+ u_int32_t that_sp;
+ u_int64_t result_sp;
/* binary search */
while (max >= min) {
mid = (min + max) / 2;
- that = decompositions_sp[mid].replacement;
- if (that < sought) {
+ that_sp = decompositions_sp[mid].replacement_sp;
+ if (that_sp < sought_sp) {
min = mid + 1;
- } else if (that > sought) {
+ } else if (that_sp > sought_sp) {
max = mid - 1;
} else {
- result = ((u_int64_t)decompositions_sp[mid].base << 32) | ((u_int64_t)decompositions_sp[mid].comb);
- return result;
+ result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
+ return result_sp;
}
}
/* no match */
we can't use static, this stuff needs to be reentrant
static char comp[MAXPATHLEN +1];
- exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
- in decompositions[] from decomposition according to AFP 3.x spec
-
- We don't implement Singleton and Canonical Ordering
+ We don't implement Singleton and Canonical Ordering.
+ We ignore CompositionExclusions.txt.
because they cause the problem of the roundtrip
- such as Dancing Icon
+ such as Dancing Icon.
+
+ Exclude the U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
+ in precompose.h from composition, according to the AFP 3.x spec.
********************************************************************/
size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
base_sp = result_sp >> 32;
comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF; /* hi */
comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF; /* lo */
- } while (comblen < (MAXCOMBSPLEN<<1));
+ } while (comblen < MAXCOMBSPLEN);
if (*outlen < (comblen + 1) << 1) {
errno = E2BIG;