]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
Wrong path from 2-1
[netatalk.git] / libatalk / unicode / util_unistr.c
1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif /* HAVE_CONFIG_H */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <sys/param.h>
9 #include <sys/stat.h>
10 #include <atalk/logger.h>
11 #include <errno.h>
12
13 #include <netatalk/endian.h>
14
15 #include <atalk/unicode.h>
16 #include "precompose.h"
17 #include "byteorder.h"
18
19 /*******************************************************************
20  Convert a string to lower case.
21  return True if any char is converted
22 ********************************************************************/
23 int strlower_w(ucs2_t *s)
24 {
25         int ret = 0;
26         while (*s) {
27                 ucs2_t v = tolower_w(*s);
28                 if (v != *s) {
29                         *s = v;
30                         ret = 1;
31                 }
32                 s++;
33         }
34         return ret;
35 }
36
37 /*******************************************************************
38  Convert a string to upper case.
39  return True if any char is converted
40 ********************************************************************/
41 int strupper_w(ucs2_t *s)
42 {
43         int ret = 0;
44         while (*s) {
45                 ucs2_t v = toupper_w(*s);
46                 if (v != *s) {
47                         *s = v;
48                         ret = 1;
49                 }
50                 s++;
51         }
52         return ret;
53 }
54
55
56 /*******************************************************************
57 determine if a character is lowercase
58 ********************************************************************/
59 int islower_w(ucs2_t c)
60 {
61         return ( c == tolower_w(c));
62 }
63
64 /*******************************************************************
65 determine if a character is uppercase
66 ********************************************************************/
67 int isupper_w(ucs2_t c)
68 {
69         return ( c == toupper_w(c));
70 }
71
72
73 /*******************************************************************
74  Count the number of characters in a ucs2_t string.
75 ********************************************************************/
76 size_t strlen_w(const ucs2_t *src)
77 {
78         size_t len;
79
80         for(len = 0; *src++; len++) ;
81
82         return len;
83 }
84
85 /*******************************************************************
86  Count up to max number of characters in a ucs2_t string.
87 ********************************************************************/
88 size_t strnlen_w(const ucs2_t *src, size_t max)
89 {
90         size_t len;
91
92         for(len = 0; *src++ && (len < max); len++) ;
93
94         return len;
95 }
96
97 /*******************************************************************
98 wide strchr()
99 ********************************************************************/
100 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
101 {
102         while (*s != 0) {
103                 if (c == *s) return (ucs2_t *)s;
104                 s++;
105         }
106         if (c == *s) return (ucs2_t *)s;
107
108         return NULL;
109 }
110
111 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
112 {
113         while (*s != 0) {
114 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
115                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
116                 s++;
117         }
118         if (c == *s) return (ucs2_t *)s;
119
120         return NULL;
121 }
122
123
124 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
125 {
126         while (*b && *a == *b) { a++; b++; }
127         return (*a - *b);
128         /* warning: if *a != *b and both are not 0 we retrun a random
129            greater or lesser than 0 number not realted to which
130            string is longer */
131 }
132
133 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
134 {
135         size_t n = 0;
136         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
137         return (len - n)?(*a - *b):0;
138 }
139
140 /*******************************************************************
141 wide strstr()
142 ********************************************************************/
143 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
144 {
145         ucs2_t *r;
146         size_t slen, inslen;
147
148         if (!s || !*s || !ins || !*ins) return NULL;
149         slen = strlen_w(s);
150         inslen = strlen_w(ins);
151         r = (ucs2_t *)s;
152         while ((r = strchr_w(r, *ins))) {
153                 if (strncmp_w(r, ins, inslen) == 0) return r;
154                 r++;
155         }
156         return NULL;
157 }
158
159 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
160 {
161         ucs2_t *r;
162         size_t slen, inslen;
163
164         if (!s || !*s || !ins || !*ins) return NULL;
165         slen = strlen_w(s);
166         inslen = strlen_w(ins);
167         r = (ucs2_t *)s;
168         while ((r = strcasechr_w(r, *ins))) {
169                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
170                 r++;
171         }
172         return NULL;
173 }
174
175
176
177
178 /*******************************************************************
179 case insensitive string comparison
180 ********************************************************************/
181 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
182 {
183         while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; }
184         return (tolower_w(*a) - tolower_w(*b));
185 }
186
187 /*******************************************************************
188 case insensitive string comparison, lenght limited
189 ********************************************************************/
190 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
191 {
192         size_t n = 0;
193         while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; }
194         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
195 }
196
197 /*******************************************************************
198 duplicate string
199 ********************************************************************/
200 /* if len == 0 then duplicate the whole string */
201 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
202 {
203         ucs2_t *dest;
204
205         if (!len) len = strlen_w(src);
206         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
207         if (!dest) {
208                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
209                 return NULL;
210         }
211
212         memcpy(dest, src, len * sizeof(ucs2_t));
213         dest[len] = 0;
214
215         return dest;
216 }
217
218 ucs2_t *strdup_w(const ucs2_t *src)
219 {
220         return strndup_w(src, 0);
221 }
222
223 /*******************************************************************
224 copy a string with max len
225 ********************************************************************/
226
227 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
228 {
229         size_t len;
230
231         if (!dest || !src) return NULL;
232
233         for (len = 0; (src[len] != 0) && (len < max); len++)
234                 dest[len] = src[len];
235         while (len < max)
236                 dest[len++] = 0;
237
238         return dest;
239 }
240
241
242 /*******************************************************************
243 append a string of len bytes and add a terminator
244 ********************************************************************/
245
246 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
247 {
248         size_t start;
249         size_t len;
250
251         if (!dest || !src) return NULL;
252
253         start = strlen_w(dest);
254         len = strnlen_w(src, max);
255
256         memcpy(&dest[start], src, len*sizeof(ucs2_t));
257         dest[start+len] = 0;
258
259         return dest;
260 }
261
262
263 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
264 {
265         size_t start;
266         size_t len;
267
268         if (!dest || !src) return NULL;
269
270         start = strlen_w(dest);
271         len = strlen_w(src);
272
273         memcpy(&dest[start], src, len*sizeof(ucs2_t));
274         dest[start+len] = 0;
275
276         return dest;
277 }
278
279
280 /*******************************************************************
281 binary search for pre|decomposition
282 ********************************************************************/
283
284 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
285 {
286         int min = 0;
287         int max = PRECOMP_COUNT - 1;
288         int mid;
289         u_int32_t sought = (base << 16) | comb, that;
290
291         /* binary search */
292         while (max >= min) {
293                 mid = (min + max) / 2;
294                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
295                 if (that < sought) {
296                         min = mid + 1;
297                 } else if (that > sought) {
298                         max = mid - 1;
299                 } else {
300                         return precompositions[mid].replacement;
301                 }
302         }
303         /* no match */
304         return 0;
305 }
306
307 /* ------------------------ */
308 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
309 {
310         int min = 0;
311         int max = PRECOMP_SP_COUNT - 1;
312         int mid;
313         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
314
315         /* binary search */
316         while (max >= min) {
317                 mid = (min + max) / 2;
318                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
319                 if (that_sp < sought_sp) {
320                         min = mid + 1;
321                 } else if (that_sp > sought_sp) {
322                         max = mid - 1;
323                 } else {
324                         return precompositions_sp[mid].replacement_sp;
325                 }
326         }
327         /* no match */
328         return 0;
329 }
330
331 /* -------------------------- */
332 static u_int32_t do_decomposition(ucs2_t base) 
333 {
334         int min = 0;
335         int max = DECOMP_COUNT - 1;
336         int mid;
337         u_int32_t sought = base;
338         u_int32_t result, that;
339
340         /* binary search */
341         while (max >= min) {
342                 mid = (min + max) / 2;
343                 that = decompositions[mid].replacement;
344                 if (that < sought) {
345                         min = mid + 1;
346                 } else if (that > sought) {
347                         max = mid - 1;
348                 } else {
349                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
350                         return result;
351                 }
352         }
353         /* no match */
354         return 0;
355 }
356
357 /* -------------------------- */
358 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
359 {
360         int min = 0;
361         int max = DECOMP_SP_COUNT - 1;
362         int mid;
363         u_int32_t sought_sp = base_sp;
364         u_int32_t that_sp;
365         u_int64_t result_sp;
366
367         /* binary search */
368         while (max >= min) {
369                 mid = (min + max) / 2;
370                 that_sp = decompositions_sp[mid].replacement_sp;
371                 if (that_sp < sought_sp) {
372                         min = mid + 1;
373                 } else if (that_sp > sought_sp) {
374                         max = mid - 1;
375                 } else {
376                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
377                         return result_sp;
378                 }
379         }
380         /* no match */
381         return 0;
382 }
383
384 /*******************************************************************
385 pre|decomposition
386
387    we can't use static, this stuff needs to be reentrant
388    static char comp[MAXPATHLEN +1];
389
390    We don't implement Singleton and Canonical Ordering.
391    We ignore CompositionExclusions.txt.
392    because they cause the problem of the roundtrip
393    such as Dancing Icon.
394
395    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
396    in precompose.h from composition according to AFP 3.x spec
397 ********************************************************************/
398
399 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
400 {
401         size_t i;
402         ucs2_t base, comb;
403         u_int32_t base_sp, comb_sp;
404         ucs2_t *in, *out;
405         ucs2_t lindex, vindex;
406         ucs2_t result;
407         u_int32_t result_sp;
408         size_t o_len = *outlen;
409         
410         if (!inplen || (inplen & 1) || inplen > o_len)
411                 return (size_t)-1;
412         
413         i = 0;
414         in  = name;
415         out = comp;
416         
417         base = *in;
418         while (*outlen > 2) {
419                 i += 2;
420                 in++;
421
422                 if (i == inplen) {
423                         *out = base;
424                         out++;
425                         *out = 0;
426                         *outlen -= 2;
427                         return o_len - *outlen;
428                 }
429
430                 comb = *in;
431                 result = 0;
432
433                 /* Non-Combination Character */
434                 if (comb < 0x300) ;
435                 
436                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
437                 /* Step 1 <L,V> */
438                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
439                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
440                                 result = 1;
441                                 lindex = base - LBASE;
442                                 vindex = comb - VBASE;
443                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
444                         }
445                 }
446                 
447                 /* Step 2 <LV,T> */
448                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
449                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
450                                 result = 1;
451                                 base += comb - TBASE;
452                         }
453                 }
454                 
455                 /* Binary Search for Surrogate Pair */
456                 else if ((0xD800 <= base) && (base < 0xDC00)) {
457                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
458                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
459                                 do {
460                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
461                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
462                                                 base_sp = result_sp;
463                                                 i += 4;
464                                                 in +=2;
465                                         }
466                                 } while ((i + 4 <= inplen) && result_sp) ;
467
468                                 *out = base_sp >> 16;
469                                 out++;
470                                 *outlen -= 2;
471
472                                 if (*outlen <= 2) {
473                                         errno = E2BIG;
474                                         return (size_t)-1;
475                                 }
476
477                                 *out = base_sp & 0xFFFF;
478                                 out++;
479                                 *outlen -= 2;
480
481                                 i += 2;
482                                 in++;
483                                 base = *in;
484
485                                 result = 1;
486                         }
487                 }
488
489                 /* Binary Search for BMP */
490                 else if (result = do_precomposition(base, comb)) {
491                         base = result;
492                 }
493                 
494                 if (!result) {
495                         *out = base;
496                         out++;
497                         *outlen -= 2;
498                         base = comb;
499                 }
500         }
501
502         errno = E2BIG;
503         return (size_t)-1;
504 }
505
506 /* --------------- */
507 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
508 {
509         size_t i;
510         size_t comblen;
511         ucs2_t base, comb[COMBBUFLEN];
512         u_int32_t base_sp;
513         ucs2_t sindex, tjamo;
514         ucs2_t *in, *out;
515         unsigned int result;
516         u_int64_t result_sp;
517         size_t o_len = *outlen;
518
519         if (!inplen || (inplen & 1))
520                 return (size_t)-1;
521         i = 0;
522         in  = name;
523         out = comp;
524
525         while (i < inplen) {
526                 base = *in;
527                 comblen = 0;
528                 
529                 /* check ASCII first. this is frequent. */
530                 if (base <= 0x007f) ;
531                 
532                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
533                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
534                         sindex = base - SBASE;
535                         base = LBASE + sindex / NCOUNT;
536                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
537                         
538                         /* <L,V> */
539                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
540                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
541                                 comblen = 1;
542                         }
543                         
544                         /* <L,V,T> */
545                         else {
546                                 comb[COMBBUFLEN-1] = tjamo;
547                                 comblen = 2;
548                         }
549                 }
550                 
551                 /* Binary Search for Surrogate Pair */
552                 else if ((0xD800 <= base) && (base < 0xDC00)) {
553                         if (i + 2 < inplen) {
554                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
555                                 do {
556                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
557                                         comblen += 2;
558                                         base_sp = result_sp >> 32;
559                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
560                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
561                                 } while (comblen < MAXCOMBSPLEN);
562
563                                 if (*outlen < (comblen + 1) << 1) {
564                                         errno = E2BIG;
565                                         return (size_t)-1;
566                                 }
567
568                                 *out = base_sp >> 16;   /* hi */
569                                 out++;
570                                 *outlen -= 2;
571                                 
572                                 base = base_sp & 0xFFFF; /* lo */
573                                 
574                                 i += 2;
575                                 in++;
576                         }
577                 }
578                         
579                 /* Binary Search for BMP */
580                 else {
581                         do {
582                                 if ( !(result = do_decomposition(base))) break;
583                                 comblen++;
584                                 base = result  >> 16;
585                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
586                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
587                 }
588                 
589                 if (*outlen < (comblen + 1) << 1) {
590                         errno = E2BIG;
591                         return (size_t)-1;
592                 }
593                 
594                 *out = base;
595                 out++;
596                 *outlen -= 2;
597                 
598                 while ( comblen > 0 ) {
599                         *out = comb[COMBBUFLEN-comblen];
600                         out++;
601                         *outlen -= 2;
602                         comblen--;
603                 }
604                 
605                 i += 2;
606                 in++;
607         }
608         
609         *out = 0;
610         return o_len-*outlen;
611 }
612
613 /*******************************************************************
614 length of UTF-8 character and string
615 ********************************************************************/
616
/*
 * Length in bytes of the UTF-8 sequence starting at utf8.
 *
 * Returns 1 to 4 for an accepted sequence (a 0 byte counts as a one-byte
 * character), or (size_t)-1 if the bytes do not form one. Accepted lead
 * bytes are 0x00-0x7f, 0xc2-0xdf, 0xe0-0xef and 0xf0-0xf4; the second
 * byte is range-restricted after 0xe0, 0xf0 and 0xf4. Bytes are examined
 * one at a time and the scan stops at the first one that does not fit.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *bytes = (const unsigned char *) utf8;
        const unsigned char lead = bytes[0];
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xbf;
        size_t width;
        size_t k;

        if (lead < 0x80)
                return (1);

        /* the lead byte fixes the sequence length and, for a few values,
           narrows the range allowed for the second byte */
        if (lead >= 0xc2 && lead <= 0xdf) {
                width = 2;
        } else if (lead >= 0xe0 && lead <= 0xef) {
                width = 3;
                if (lead == 0xe0)
                        second_min = 0xa0;
        } else if (lead >= 0xf0 && lead <= 0xf4) {
                width = 4;
                if (lead == 0xf0)
                        second_min = 0x90;
                else if (lead == 0xf4)
                        second_max = 0x8f;
        } else {
                return ((size_t) -1);
        }

        if (bytes[1] < second_min || bytes[1] > second_max)
                return ((size_t) -1);

        /* remaining bytes must be plain continuation bytes */
        for (k = 2; k < width; k++) {
                if (bytes[k] < 0x80 || bytes[k] > 0xbf)
                        return ((size_t) -1);
        }

        return (width);
}
640
641
/*
 * Count the characters in a 0-terminated UTF-8 string while validating it.
 *
 * Returns the number of characters, or (size_t)-1 as soon as a byte
 * sequence is not accepted. Accepted lead bytes are 0x01-0x7f, 0xc2-0xdf,
 * 0xe0-0xef and 0xf0-0xf4; the second byte is range-restricted after
 * 0xe0, 0xf0 and 0xf4.
 * See http://www.unicode.org/unicode/reports/tr27/ for an explanation.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *cur = (const unsigned char *) utf8;
        size_t count;

        for (count = 0; *cur != '\0'; count++) {
                const unsigned char lead = *cur;
                unsigned char second_min = 0x80;
                unsigned char second_max = 0xbf;
                size_t width;
                size_t k;

                if (lead < 0x80) {
                        cur++;
                        continue;
                }

                /* the lead byte fixes the sequence length and, for a few
                   values, narrows the range allowed for the second byte */
                if (lead >= 0xc2 && lead <= 0xdf) {
                        width = 2;
                } else if (lead >= 0xe0 && lead <= 0xef) {
                        width = 3;
                        if (lead == 0xe0)
                                second_min = 0xa0;
                } else if (lead >= 0xf0 && lead <= 0xf4) {
                        width = 4;
                        if (lead == 0xf0)
                                second_min = 0x90;
                        else if (lead == 0xf4)
                                second_max = 0x8f;
                } else {
                        return ((size_t) -1);
                }

                if (cur[1] < second_min || cur[1] > second_max)
                        return ((size_t) -1);

                /* remaining bytes must be plain continuation bytes */
                for (k = 2; k < width; k++) {
                        if (cur[k] < 0x80 || cur[k] > 0xbf)
                                return ((size_t) -1);
                }

                cur += width;
        }

        return (count);
}