]> arthur.barton.de Git - netatalk.git/blob - contrib/shell_utils/make-casetable.pl
make-casetable.pl and make-precompose.h.pl moved.
[netatalk.git] / contrib / shell_utils / make-casetable.pl
1 #!/usr/bin/perl
2 #
3 # usage: make-casetable.pl <infile> <outfile1> <outfile2>
4 #        make-casetable.pl UnicodeData.txt utf16_casetable.h utf16_case.c
5 #
6 # (c) 2011 by HAT <hat@fa2.so-net.ne.jp>
7 #
8 #  This program is free software; you can redistribute it and/or modify
9 #  it under the terms of the GNU General Public License as published by
10 #  the Free Software Foundation; either version 2 of the License, or
11 #  (at your option) any later version.
12 #
13 #  This program is distributed in the hope that it will be useful,
14 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 #  GNU General Public License for more details.
17 #
18
19 # See
20 # http://www.unicode.org/reports/tr44/
21 # http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
22
23 # One block has 64 chars.
24 #
25 # BMP
26 # block    0 = dummy
27 # block    1 = U+0000 - U+003F
28 # block    2 = U+0040 - U+007F
29 # .....
30 # block 1024 = U+FFC0 - U+FFFF
31 # block 1025 = dummy
32 #
33 # Surrogate Pair
34 # block  1024 = dummy
35 # block  1025 = U+010000 - U+01003F
36 # block  1026 = U+010040 - U+01007F
37 # .....
38 # block 17408 = U+10FFC0 - U+10FFFF
39 # block 17409 = dummy
40 #
41 # Dummy block is for edge detection.
42 # If a block includes upper/lower chars, block_enable[]=1.
43
44 use strict;
45 use warnings;
46
# Package variables named after the columns of one UnicodeData.txt record;
# the numeric suffix is the column index (0-based).
our (
    $code0,
    $Name1,
    $General_Category2,
    $Canonical_Combining_Class3,
    $Bidi_Class4,
    $Decomposition_Mapping5,
    $Numeric_Value6,
    $Numeric_Value7,
    $Numeric_Value8,
    $Bidi_Mirrored9,
    $Unicode_1_Name10,
    $ISO_Comment11,
    $Simple_Uppercase_Mapping12,
    $Simple_Lowercase_Mapping13,
    $Simple_Titlecase_Mapping14,
);

# Numeric forms of the code point and of the selected case mapping.
our ($hex_code0, $Mapping, $hex_Mapping);

# Loop cursors: current code point, packed surrogate pair, 64-char block.
our ($char, $sp, $block);

# Per-character tables for the BMP and for the supplementary planes.
our (@table, @table_sp);

# Per-block flags marking blocks that must appear in the generated arrays.
our (@block_enable, @block_enable_sp);

# Bookkeeping for the run of blocks currently being written out.
our ($table_no, $block_start, $block_end, $char_start, $char_end);
82
# Script entry point.
#
# Arguments: <infile> <outfile1> <outfile2>
#   infile   - UnicodeData.txt
#   outfile1 - generated C header holding the mapping tables
#   outfile2 - generated C source holding the lookup functions
#
# CHEADER and CSOURCE stay package-level bareword handles because
# make_array() prints through them by name.

if (@ARGV < 3) {
    die "usage: make-casetable.pl <infile> <outfile1> <outfile2>\n";
}

# Three-argument open keeps the file names from being parsed as open modes.
open(CHEADER, '>', $ARGV[1])
    or die "make-casetable.pl: cannot open $ARGV[1] for writing: $!\n";
open(CSOURCE, '>', $ARGV[2])
    or die "make-casetable.pl: cannot open $ARGV[2] for writing: $!\n";

# Both generated files start with the same banner comment.
for my $out (\*CHEADER, \*CSOURCE) {
    print  {$out} "/*\n";
    print  {$out} "DO NOT EDIT BY HAND!!!\n";
    print  {$out} "\n";
    print  {$out} "This file is generated by\n";
    printf {$out} " contrib/shell_utils/make-casetable.pl %s %s %s\n",
        $ARGV[0], $ARGV[1], $ARGV[2];
    print  {$out} "\n";
    printf {$out} "%s is got from\n", $ARGV[0];
    print  {$out} "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    print  {$out} "*/\n";
    print  {$out} "\n";
}

# Only the C source needs includes; it pulls in the generated header.
print  CSOURCE "#include <stdint.h>\n";
print  CSOURCE "#include <atalk/unicode.h>\n";
printf CSOURCE "#include \"%s\"\n", $ARGV[1];
print  CSOURCE "\n";

make_array("upper");
make_array("lower");

print CHEADER "/* EOF */\n";
print CSOURCE "/* EOF */\n";

# Buffered write errors only surface at close time, so check it.
close(CHEADER)
    or die "make-casetable.pl: error writing $ARGV[1]: $!\n";
close(CSOURCE)
    or die "make-casetable.pl: error writing $ARGV[2]: $!\n";
120
121
122 ###########################################################################
# make_array($case)
#
# Reads the UnicodeData.txt file named by $ARGV[0] and writes the tables and
# lookup functions for one case direction.
#
#   $case - "upper" (uses Simple_Uppercase_Mapping, column 12) or
#           "lower" (uses Simple_Lowercase_Mapping, column 13)
#
# Output goes to the already opened package handles:
#   CHEADER - "static const" tables, one per run of enabled 64-char blocks
#   CSOURCE - to<case>_w() for the BMP and to<case>_sp() for surrogate pairs
#
# Dies if the input file cannot be opened; exits with status 1 on an
# unknown $case.
sub make_array {
    my ($case) = @_;

    # UnicodeData.txt column that holds the requested simple mapping.
    my %mapping_column = (upper => 12, lower => 13);
    if (!defined $case || !exists $mapping_column{$case}) {
        print STDERR "make-casetable.pl: make_array: case must be \"upper\" or \"lower\"\n";
        exit(1);
    }
    my $column = $mapping_column{$case};

    # init table -----------------------------------------------------

    # Only characters that have a mapping are stored:
    #   code point => [ mapped code point, character name ]
    # Every other character maps to itself and has an empty name.
    my %bmp;
    my %sp;

    # Block n covers code points (n-1)*64 .. n*64-1.  One dummy block on each
    # side of the used range lets the edge detection look at n-1 and n+1.
    my @block_enable    = (0) x 1026;       # used: 1 .. 1024
    my @block_enable_sp = (0) x 17410;      # used: 1025 .. 17408

    $block_enable[1] = 1;           # ASCII block is forcibly included
    $block_enable[2] = 1;           # in the array for Speed-Up.

    # write data to table --------------------------------------------

    open(my $unicode_data, '<', $ARGV[0])
        or die "make-casetable.pl: cannot open $ARGV[0]: $!\n";

    while (my $line = <$unicode_data>) {
        $line =~ s/\r?\n\z//;

        # Limit -1 keeps trailing empty fields, so the mapping columns are
        # always defined on a well-formed record.
        my @field = split(/;/, $line, -1);

        my $mapping = $field[$column];
        next if !defined $mapping || $mapping eq "";

        my $code   = hex($field[0]);
        my $mapped = hex($mapping);
        my $block  = ($code >> 6) + 1;

        if ($code <= 0xFFFF) {
            $bmp{$code} = [ $mapped, $field[1] ];
            $block_enable[$block] = 1;
        } else {
            $sp{$code} = [ $mapped, $field[1] ];
            $block_enable_sp[$block] = 1;
        }
    }

    close($unicode_data);

    # array for BMP --------------------------------------------------

    printf(CSOURCE "\/*******************************************************************\n");
    printf(CSOURCE " Convert a wide character to %s case.\n", $case);
    printf(CSOURCE "*******************************************************************\/\n");
    printf(CSOURCE "ucs2_t to%s_w(ucs2_t val)\n", $case);
    printf(CSOURCE "{\n");

    my $table_no = 1;
    my $block_start;

    for my $block (1 .. 1024) {

        # rising edge detection
        if ($block_enable[$block - 1] == 0 && $block_enable[$block] == 1) {
            $block_start = $block;
        }

        # falling edge detection
        next unless $block_enable[$block] == 1 && $block_enable[$block + 1] == 0;

        my $char_start = ($block_start - 1) * 64;
        my $char_end   = ($block * 64) - 1;

        printf(CHEADER "static const u_int16_t %s_table_%d[%d] = {\n",
               $case, $table_no, $char_end - $char_start + 1);

        for my $char ($char_start .. $char_end) {
            my ($mapped, $name) = exists $bmp{$char} ? @{ $bmp{$char} } : ($char, "");
            printf(CHEADER "  0x%04X, /*U+%04X*/ /*%s*/\n", $mapped, $char, $name);
        }
        printf(CHEADER "};\n");
        printf(CHEADER "\n");

        if ($char_start == 0x0000) {
            # val is unsigned, so the lower bound needs no test.
            printf(CSOURCE "    if ( val <= 0x%04X)\n", $char_end);
            printf(CSOURCE "        return %s_table_%d[val];\n",
                   $case, $table_no);
        } else {
            printf(CSOURCE "    if ( val >= 0x%04X && val <= 0x%04X)\n",
                   $char_start, $char_end);
            printf(CSOURCE "        return %s_table_%d[val-0x%04X];\n",
                   $case, $table_no, $char_start);
        }
        printf(CSOURCE "\n");

        $table_no++;
    }

    printf(CSOURCE "\treturn (val);\n");
    printf(CSOURCE "}\n");
    printf(CSOURCE "\n");

    # array for Surrogate Pair ---------------------------------------

    printf(CSOURCE "\/*******************************************************************\n");
    printf(CSOURCE " Convert a surrogate pair to %s case.\n", $case);
    printf(CSOURCE "*******************************************************************\/\n");
    printf(CSOURCE "uint32_t to%s_sp(uint32_t val)\n", $case);
    printf(CSOURCE "{\n");

    $table_no = 1;
    my $block_start_sp;

    for my $block (1025 .. 17408) {

        # 16 blocks (1024 code points) share one high surrogate.  The packed
        # pair values are contiguous only inside such a group, so a table
        # must never cross a group boundary.
        my $pos_in_group = ($block - 1) & 0xF;

        # rising edge detection
        if (($block_enable_sp[$block - 1] == 0 || $pos_in_group == 0)
                && $block_enable_sp[$block] == 1) {
            $block_start_sp = $block;
        }

        # falling edge detection
        # (the group test is computed on its own: "==" binds tighter than "&")
        next unless $block_enable_sp[$block] == 1
            && ($pos_in_group == 0xF || $block_enable_sp[$block + 1] == 0);

        my $char_start = ($block_start_sp - 1) * 64;
        my $char_end   = ($block * 64) - 1;

        printf(CHEADER "static const u_int32_t %s_table_sp_%d[%d] = {\n",
               $case, $table_no, $char_end - $char_start + 1);

        for my $char ($char_start .. $char_end) {
            my ($mapped, $name) = exists $sp{$char} ? @{ $sp{$char} } : ($char, "");
            printf(CHEADER "  0x%08X, /*0x%08X*/ /*U+%06X*/ /*U+%06X*/ /*%s*/\n",
                   _surrogate_pair($mapped),
                   _surrogate_pair($char),
                   $mapped,
                   $char,
                   $name
               );
        }
        printf(CHEADER "};\n");
        printf(CHEADER "\n");

        my $pair_start = _surrogate_pair($char_start);
        my $pair_end   = _surrogate_pair($char_end);

        printf(CSOURCE "    if ( val >= 0x%08X && val <= 0x%08X)\n",
               $pair_start, $pair_end);
        printf(CSOURCE "        return %s_table_sp_%d[val-0x%08X];\n",
               $case, $table_no, $pair_start);
        printf(CSOURCE "\n");

        $table_no++;
    }

    printf(CSOURCE "\treturn (val);\n");
    printf(CSOURCE "}\n");
    printf(CSOURCE "\n");

    return;
}

# _surrogate_pair($code_point)
#
# Packs the UTF-16 surrogate pair for a code point at or above U+10000 into
# one integer: high surrogate in bits 31-16, low surrogate in bits 15-0.
sub _surrogate_pair {
    my ($code_point) = @_;

    my $high = 0xD800 - (0x10000 >> 10) + ($code_point >> 10);
    my $low  = 0xDC00 + ($code_point & 0x3FF);

    return ($high << 16) + $low;
}
323
324 # EOF