]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
Merge master
[netatalk.git] / libatalk / unicode / util_unistr.c
1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif /* HAVE_CONFIG_H */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <sys/param.h>
9 #include <sys/stat.h>
10 #include <atalk/logger.h>
11 #include <errno.h>
12 #include <arpa/inet.h>
13
14 #include <atalk/unicode.h>
15 #include "ucs2_casetable.h"
16 #include "precompose.h"
17 #include "byteorder.h"
18
19 /*******************************************************************
20  Convert a wide character to upper/lower case.
21 ********************************************************************/
22 ucs2_t toupper_w(ucs2_t val)
23 {
24         if ( val >= 0x0040 && val <= 0x007F)
25                 return upcase_table_1[val-0x0040];
26         if ( val >= 0x00C0 && val <= 0x02BF)
27                 return upcase_table_2[val-0x00C0];
28         if ( val >= 0x0380 && val <= 0x04FF)
29                 return upcase_table_3[val-0x0380];
30         if ( val >= 0x0540 && val <= 0x05BF)
31                 return upcase_table_4[val-0x0540];
32         if ( val >= 0x1E00 && val <= 0x1FFF)
33                 return upcase_table_5[val-0x1E00];
34         if ( val >= 0x2140 && val <= 0x217F)
35                 return upcase_table_6[val-0x2140];
36         if ( val >= 0x24C0 && val <= 0x24FF)
37                 return upcase_table_7[val-0x24C0];
38         if ( val >= 0xFF40 && val <= 0xFF7F)
39                 return upcase_table_8[val-0xFF40];
40
41         return (val);
42 }
43
44
45 ucs2_t tolower_w(ucs2_t val)
46 {
47         if ( val >= 0x0040 && val <= 0x007F)
48                 return lowcase_table_1[val-0x0040];
49         if ( val >= 0x00C0 && val <= 0x023F)
50                 return lowcase_table_2[val-0x00C0];
51         if ( val >= 0x0380 && val <= 0x057F)
52                 return lowcase_table_3[val-0x0380];
53         if ( val >= 0x1E00 && val <= 0x1FFF)
54                 return lowcase_table_4[val-0x1E00];
55         if ( val >= 0x2140 && val <= 0x217F)
56                 return lowcase_table_5[val-0x2140];
57         if ( val >= 0x2480 && val <= 0x24FF)
58                 return lowcase_table_6[val-0x2480];
59         if ( val >= 0xFF00 && val <= 0xFF3F)
60                 return lowcase_table_7[val-0xFF00];
61
62         return (val);
63 }
64
65 /*******************************************************************
66  Convert a string to lower case.
67  return True if any char is converted
68 ********************************************************************/
69 int strlower_w(ucs2_t *s)
70 {
71         int ret = 0;
72         while (*s) {
73                 ucs2_t v = tolower_w(*s);
74                 if (v != *s) {
75                         *s = v;
76                         ret = 1;
77                 }
78                 s++;
79         }
80         return ret;
81 }
82
83 /*******************************************************************
84  Convert a string to upper case.
85  return True if any char is converted
86 ********************************************************************/
87 int strupper_w(ucs2_t *s)
88 {
89         int ret = 0;
90         while (*s) {
91                 ucs2_t v = toupper_w(*s);
92                 if (v != *s) {
93                         *s = v;
94                         ret = 1;
95                 }
96                 s++;
97         }
98         return ret;
99 }
100
101
102 /*******************************************************************
103 determine if a character is lowercase
104 ********************************************************************/
105 int islower_w(ucs2_t c)
106 {
107         return ( c == tolower_w(c));
108 }
109
110 /*******************************************************************
111 determine if a character is uppercase
112 ********************************************************************/
113 int isupper_w(ucs2_t c)
114 {
115         return ( c == toupper_w(c));
116 }
117
118
119 /*******************************************************************
120  Count the number of characters in a ucs2_t string.
121 ********************************************************************/
122 size_t strlen_w(const ucs2_t *src)
123 {
124         size_t len;
125
126         for(len = 0; *src++; len++) ;
127
128         return len;
129 }
130
131 /*******************************************************************
132  Count up to max number of characters in a ucs2_t string.
133 ********************************************************************/
134 size_t strnlen_w(const ucs2_t *src, size_t max)
135 {
136         size_t len;
137
138         for(len = 0; *src++ && (len < max); len++) ;
139
140         return len;
141 }
142
143 /*******************************************************************
144 wide strchr()
145 ********************************************************************/
146 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
147 {
148         while (*s != 0) {
149                 if (c == *s) return (ucs2_t *)s;
150                 s++;
151         }
152         if (c == *s) return (ucs2_t *)s;
153
154         return NULL;
155 }
156
157 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
158 {
159         while (*s != 0) {
160 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
161                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
162                 s++;
163         }
164         if (c == *s) return (ucs2_t *)s;
165
166         return NULL;
167 }
168
169
170 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
171 {
172         while (*b && *a == *b) { a++; b++; }
173         return (*a - *b);
174         /* warning: if *a != *b and both are not 0 we retrun a random
175            greater or lesser than 0 number not realted to which
176            string is longer */
177 }
178
179 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
180 {
181         size_t n = 0;
182         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
183         return (len - n)?(*a - *b):0;
184 }
185
186 /*******************************************************************
187 wide strstr()
188 ********************************************************************/
189 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
190 {
191         ucs2_t *r;
192         size_t slen, inslen;
193
194         if (!s || !*s || !ins || !*ins) return NULL;
195         slen = strlen_w(s);
196         inslen = strlen_w(ins);
197         r = (ucs2_t *)s;
198         while ((r = strchr_w(r, *ins))) {
199                 if (strncmp_w(r, ins, inslen) == 0) return r;
200                 r++;
201         }
202         return NULL;
203 }
204
205 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
206 {
207         ucs2_t *r;
208         size_t slen, inslen;
209
210         if (!s || !*s || !ins || !*ins) return NULL;
211         slen = strlen_w(s);
212         inslen = strlen_w(ins);
213         r = (ucs2_t *)s;
214         while ((r = strcasechr_w(r, *ins))) {
215                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
216                 r++;
217         }
218         return NULL;
219 }
220
221
222
223
224 /*******************************************************************
225 case insensitive string comparison
226 ********************************************************************/
227 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
228 {
229         while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; }
230         return (tolower_w(*a) - tolower_w(*b));
231 }
232
233 /*******************************************************************
234 case insensitive string comparison, lenght limited
235 ********************************************************************/
236 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
237 {
238         size_t n = 0;
239         while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; }
240         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
241 }
242
243 /*******************************************************************
244 duplicate string
245 ********************************************************************/
246 /* if len == 0 then duplicate the whole string */
247 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
248 {
249         ucs2_t *dest;
250
251         if (!len) len = strlen_w(src);
252         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
253         if (!dest) {
254                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
255                 return NULL;
256         }
257
258         memcpy(dest, src, len * sizeof(ucs2_t));
259         dest[len] = 0;
260
261         return dest;
262 }
263
264 ucs2_t *strdup_w(const ucs2_t *src)
265 {
266         return strndup_w(src, 0);
267 }
268
269 /*******************************************************************
270 copy a string with max len
271 ********************************************************************/
272
273 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
274 {
275         size_t len;
276
277         if (!dest || !src) return NULL;
278
279         for (len = 0; (src[len] != 0) && (len < max); len++)
280                 dest[len] = src[len];
281         while (len < max)
282                 dest[len++] = 0;
283
284         return dest;
285 }
286
287
288 /*******************************************************************
289 append a string of len bytes and add a terminator
290 ********************************************************************/
291
292 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
293 {
294         size_t start;
295         size_t len;
296
297         if (!dest || !src) return NULL;
298
299         start = strlen_w(dest);
300         len = strnlen_w(src, max);
301
302         memcpy(&dest[start], src, len*sizeof(ucs2_t));
303         dest[start+len] = 0;
304
305         return dest;
306 }
307
308
309 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
310 {
311         size_t start;
312         size_t len;
313
314         if (!dest || !src) return NULL;
315
316         start = strlen_w(dest);
317         len = strlen_w(src);
318
319         memcpy(&dest[start], src, len*sizeof(ucs2_t));
320         dest[start+len] = 0;
321
322         return dest;
323 }
324
325
326 /*******************************************************************
327 binary search for pre|decomposition
328 ********************************************************************/
329
330 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
331 {
332         int min = 0;
333         int max = PRECOMP_COUNT - 1;
334         int mid;
335         u_int32_t sought = (base << 16) | comb, that;
336
337         /* binary search */
338         while (max >= min) {
339                 mid = (min + max) / 2;
340                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
341                 if (that < sought) {
342                         min = mid + 1;
343                 } else if (that > sought) {
344                         max = mid - 1;
345                 } else {
346                         return precompositions[mid].replacement;
347                 }
348         }
349         /* no match */
350         return 0;
351 }
352
353 /* ------------------------ */
354 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
355 {
356         int min = 0;
357         int max = PRECOMP_SP_COUNT - 1;
358         int mid;
359         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
360
361         /* binary search */
362         while (max >= min) {
363                 mid = (min + max) / 2;
364                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
365                 if (that_sp < sought_sp) {
366                         min = mid + 1;
367                 } else if (that_sp > sought_sp) {
368                         max = mid - 1;
369                 } else {
370                         return precompositions_sp[mid].replacement_sp;
371                 }
372         }
373         /* no match */
374         return 0;
375 }
376
377 /* -------------------------- */
378 static u_int32_t do_decomposition(ucs2_t base) 
379 {
380         int min = 0;
381         int max = DECOMP_COUNT - 1;
382         int mid;
383         u_int32_t sought = base;
384         u_int32_t result, that;
385
386         /* binary search */
387         while (max >= min) {
388                 mid = (min + max) / 2;
389                 that = decompositions[mid].replacement;
390                 if (that < sought) {
391                         min = mid + 1;
392                 } else if (that > sought) {
393                         max = mid - 1;
394                 } else {
395                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
396                         return result;
397                 }
398         }
399         /* no match */
400         return 0;
401 }
402
403 /* -------------------------- */
404 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
405 {
406         int min = 0;
407         int max = DECOMP_SP_COUNT - 1;
408         int mid;
409         u_int32_t sought_sp = base_sp;
410         u_int32_t that_sp;
411         u_int64_t result_sp;
412
413         /* binary search */
414         while (max >= min) {
415                 mid = (min + max) / 2;
416                 that_sp = decompositions_sp[mid].replacement_sp;
417                 if (that_sp < sought_sp) {
418                         min = mid + 1;
419                 } else if (that_sp > sought_sp) {
420                         max = mid - 1;
421                 } else {
422                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
423                         return result_sp;
424                 }
425         }
426         /* no match */
427         return 0;
428 }
429
430 /*******************************************************************
431 pre|decomposition
432
433    we can't use static, this stuff needs to be reentrant
434    static char comp[MAXPATHLEN +1];
435
436    We don't implement Singleton and Canonical Ordering.
437    We ignore CompositionExclusions.txt.
438    because they cause the problem of the roundtrip
439    such as Dancing Icon.
440
441    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
442    in precompose.h from composition according to AFP 3.x spec
443 ********************************************************************/
444
445 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
446 {
447         size_t i;
448         ucs2_t base, comb;
449         u_int32_t base_sp, comb_sp;
450         ucs2_t *in, *out;
451         ucs2_t lindex, vindex;
452         ucs2_t result;
453         u_int32_t result_sp;
454         size_t o_len = *outlen;
455         
456         if (!inplen || (inplen & 1) || inplen > o_len)
457                 return (size_t)-1;
458         
459         i = 0;
460         in  = name;
461         out = comp;
462         
463         base = *in;
464         while (*outlen > 2) {
465                 i += 2;
466                 in++;
467
468                 if (i == inplen) {
469                         *out = base;
470                         out++;
471                         *out = 0;
472                         *outlen -= 2;
473                         return o_len - *outlen;
474                 }
475
476                 comb = *in;
477                 result = 0;
478
479                 /* Non-Combination Character */
480                 if (comb < 0x300) ;
481                 
482                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
483                 /* Step 1 <L,V> */
484                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
485                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
486                                 result = 1;
487                                 lindex = base - LBASE;
488                                 vindex = comb - VBASE;
489                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
490                         }
491                 }
492                 
493                 /* Step 2 <LV,T> */
494                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
495                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
496                                 result = 1;
497                                 base += comb - TBASE;
498                         }
499                 }
500                 
501                 /* Binary Search for Surrogate Pair */
502                 else if ((0xD800 <= base) && (base < 0xDC00)) {
503                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
504                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
505                                 do {
506                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
507                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
508                                                 base_sp = result_sp;
509                                                 i += 4;
510                                                 in +=2;
511                                         }
512                                 } while ((i + 4 <= inplen) && result_sp) ;
513
514                                 *out = base_sp >> 16;
515                                 out++;
516                                 *outlen -= 2;
517
518                                 if (*outlen <= 2) {
519                                         errno = E2BIG;
520                                         return (size_t)-1;
521                                 }
522
523                                 *out = base_sp & 0xFFFF;
524                                 out++;
525                                 *outlen -= 2;
526
527                                 i += 2;
528                                 in++;
529                                 base = *in;
530
531                                 result = 1;
532                         }
533                 }
534
535                 /* Binary Search for BMP */
536                 else if (result = do_precomposition(base, comb)) {
537                         base = result;
538                 }
539                 
540                 if (!result) {
541                         *out = base;
542                         out++;
543                         *outlen -= 2;
544                         base = comb;
545                 }
546         }
547
548         errno = E2BIG;
549         return (size_t)-1;
550 }
551
552 /* --------------- */
553 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
554 {
555         size_t i;
556         size_t comblen;
557         ucs2_t base, comb[COMBBUFLEN];
558         u_int32_t base_sp;
559         ucs2_t sindex, tjamo;
560         ucs2_t *in, *out;
561         unsigned int result;
562         u_int64_t result_sp;
563         size_t o_len = *outlen;
564
565         if (!inplen || (inplen & 1))
566                 return (size_t)-1;
567         i = 0;
568         in  = name;
569         out = comp;
570
571         while (i < inplen) {
572                 base = *in;
573                 comblen = 0;
574                 
575                 /* check ASCII first. this is frequent. */
576                 if (base <= 0x007f) ;
577                 
578                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
579                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
580                         sindex = base - SBASE;
581                         base = LBASE + sindex / NCOUNT;
582                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
583                         
584                         /* <L,V> */
585                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
586                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
587                                 comblen = 1;
588                         }
589                         
590                         /* <L,V,T> */
591                         else {
592                                 comb[COMBBUFLEN-1] = tjamo;
593                                 comblen = 2;
594                         }
595                 }
596                 
597                 /* Binary Search for Surrogate Pair */
598                 else if ((0xD800 <= base) && (base < 0xDC00)) {
599                         if (i + 2 < inplen) {
600                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
601                                 do {
602                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
603                                         comblen += 2;
604                                         base_sp = result_sp >> 32;
605                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
606                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
607                                 } while (comblen < MAXCOMBSPLEN);
608
609                                 if (*outlen < (comblen + 1) << 1) {
610                                         errno = E2BIG;
611                                         return (size_t)-1;
612                                 }
613
614                                 *out = base_sp >> 16;   /* hi */
615                                 out++;
616                                 *outlen -= 2;
617                                 
618                                 base = base_sp & 0xFFFF; /* lo */
619                                 
620                                 i += 2;
621                                 in++;
622                         }
623                 }
624                         
625                 /* Binary Search for BMP */
626                 else {
627                         do {
628                                 if ( !(result = do_decomposition(base))) break;
629                                 comblen++;
630                                 base = result  >> 16;
631                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
632                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
633                 }
634                 
635                 if (*outlen < (comblen + 1) << 1) {
636                         errno = E2BIG;
637                         return (size_t)-1;
638                 }
639                 
640                 *out = base;
641                 out++;
642                 *outlen -= 2;
643                 
644                 while ( comblen > 0 ) {
645                         *out = comb[COMBBUFLEN-comblen];
646                         out++;
647                         *outlen -= 2;
648                         comblen--;
649                 }
650                 
651                 i += 2;
652                 in++;
653         }
654         
655         *out = 0;
656         return o_len-*outlen;
657 }
658
659 /*******************************************************************
660 length of UTF-8 character and string
661 ********************************************************************/
662
/*
 * Length in bytes of the UTF-8 sequence that starts at utf8.
 *
 * Returns 1 to 4 for an accepted sequence and (size_t)-1 otherwise.
 * A 0 byte counts as a one-byte sequence.  Bytes are examined left to
 * right and checking stops at the first byte that does not fit, so a
 * sequence cut short by the string terminator is rejected without
 * reading past the terminator.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *b = (const unsigned char *) utf8;
        const unsigned char lead = b[0];
        unsigned char min2 = 0x80;      /* allowed range of the 2nd byte */
        unsigned char max2 = 0xbf;
        size_t width;
        size_t k;

        if (lead < 0x80)
                return (1);

        if (lead < 0xc2) {
                /* stray continuation byte or overlong 2-byte lead */
                return ((size_t) -1);
        } else if (lead < 0xe0) {
                width = 2;
        } else if (lead < 0xf0) {
                width = 3;
                if (lead == 0xe0)
                        min2 = 0xa0;    /* excludes overlong forms */
        } else if (lead < 0xf5) {
                width = 4;
                if (lead == 0xf0)
                        min2 = 0x90;    /* excludes overlong forms */
                else if (lead == 0xf4)
                        max2 = 0x8f;    /* nothing above U+10FFFF */
        } else {
                return ((size_t) -1);
        }

        if (b[1] < min2 || b[1] > max2)
                return ((size_t) -1);

        /* remaining bytes must be plain continuation bytes */
        for (k = 2; k < width; k++) {
                if (b[k] < 0x80 || b[k] > 0xbf)
                        return ((size_t) -1);
        }

        return (width);
}
686
687
/*
 * Count the characters of a 0-terminated UTF-8 string while validating
 * it.
 *
 * Returns the number of accepted UTF-8 sequences, or (size_t)-1 as
 * soon as a byte sequence is not accepted.
 * See http://www.unicode.org/unicode/reports/tr27/ for an explanation
 * of the byte ranges.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *cur = (const unsigned char *) utf8;
        size_t count = 0;

        while (*cur != '\0') {
                const unsigned char lead = *cur;
                unsigned char min2 = 0x80;      /* allowed range of the 2nd byte */
                unsigned char max2 = 0xbf;
                size_t width;
                size_t k;

                if (lead < 0x80) {
                        width = 1;
                } else if (lead < 0xc2) {
                        /* stray continuation byte or overlong 2-byte lead */
                        return ((size_t) -1);
                } else if (lead < 0xe0) {
                        width = 2;
                } else if (lead < 0xf0) {
                        width = 3;
                        if (lead == 0xe0)
                                min2 = 0xa0;    /* excludes overlong forms */
                } else if (lead < 0xf5) {
                        width = 4;
                        if (lead == 0xf0)
                                min2 = 0x90;    /* excludes overlong forms */
                        else if (lead == 0xf4)
                                max2 = 0x8f;    /* nothing above U+10FFFF */
                } else {
                        return ((size_t) -1);
                }

                if (width > 1 && (cur[1] < min2 || cur[1] > max2))
                        return ((size_t) -1);

                /* remaining bytes must be plain continuation bytes */
                for (k = 2; k < width; k++) {
                        if (cur[k] < 0x80 || cur[k] > 0xbf)
                                return ((size_t) -1);
                }

                cur += width;
                count++;
        }

        return (count);
}