]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
Merge master
[netatalk.git] / libatalk / unicode / util_unistr.c
1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif /* HAVE_CONFIG_H */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <sys/param.h>
9 #include <sys/stat.h>
10 #include <atalk/logger.h>
11 #include <errno.h>
12 #include <arpa/inet.h>
13
14 #include <atalk/unicode.h>
15 #include "precompose.h"
16 #include "byteorder.h"
17
18 /*******************************************************************
19  Convert a string to lower case.
20  return True if any char is converted
21 ********************************************************************/
22 int strlower_w(ucs2_t *s)
23 {
24         int ret = 0;
25         while (*s) {
26                 ucs2_t v = tolower_w(*s);
27                 if (v != *s) {
28                         *s = v;
29                         ret = 1;
30                 }
31                 s++;
32         }
33         return ret;
34 }
35
36 /*******************************************************************
37  Convert a string to upper case.
38  return True if any char is converted
39 ********************************************************************/
40 int strupper_w(ucs2_t *s)
41 {
42         int ret = 0;
43         while (*s) {
44                 ucs2_t v = toupper_w(*s);
45                 if (v != *s) {
46                         *s = v;
47                         ret = 1;
48                 }
49                 s++;
50         }
51         return ret;
52 }
53
54
55 /*******************************************************************
56 determine if a character is lowercase
57 ********************************************************************/
58 int islower_w(ucs2_t c)
59 {
60         return ( c == tolower_w(c));
61 }
62
63 /*******************************************************************
64 determine if a character is uppercase
65 ********************************************************************/
66 int isupper_w(ucs2_t c)
67 {
68         return ( c == toupper_w(c));
69 }
70
71
72 /*******************************************************************
73  Count the number of characters in a ucs2_t string.
74 ********************************************************************/
75 size_t strlen_w(const ucs2_t *src)
76 {
77         size_t len;
78
79         for(len = 0; *src++; len++) ;
80
81         return len;
82 }
83
84 /*******************************************************************
85  Count up to max number of characters in a ucs2_t string.
86 ********************************************************************/
87 size_t strnlen_w(const ucs2_t *src, size_t max)
88 {
89         size_t len;
90
91         for(len = 0; *src++ && (len < max); len++) ;
92
93         return len;
94 }
95
96 /*******************************************************************
97 wide strchr()
98 ********************************************************************/
99 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
100 {
101         while (*s != 0) {
102                 if (c == *s) return (ucs2_t *)s;
103                 s++;
104         }
105         if (c == *s) return (ucs2_t *)s;
106
107         return NULL;
108 }
109
110 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
111 {
112         while (*s != 0) {
113 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
114                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
115                 s++;
116         }
117         if (c == *s) return (ucs2_t *)s;
118
119         return NULL;
120 }
121
122
123 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
124 {
125         while (*b && *a == *b) { a++; b++; }
126         return (*a - *b);
127         /* warning: if *a != *b and both are not 0 we retrun a random
128            greater or lesser than 0 number not realted to which
129            string is longer */
130 }
131
132 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
133 {
134         size_t n = 0;
135         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
136         return (len - n)?(*a - *b):0;
137 }
138
139 /*******************************************************************
140 wide strstr()
141 ********************************************************************/
142 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
143 {
144         ucs2_t *r;
145         size_t slen, inslen;
146
147         if (!s || !*s || !ins || !*ins) return NULL;
148         slen = strlen_w(s);
149         inslen = strlen_w(ins);
150         r = (ucs2_t *)s;
151         while ((r = strchr_w(r, *ins))) {
152                 if (strncmp_w(r, ins, inslen) == 0) return r;
153                 r++;
154         }
155         return NULL;
156 }
157
158 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
159 {
160         ucs2_t *r;
161         size_t slen, inslen;
162
163         if (!s || !*s || !ins || !*ins) return NULL;
164         slen = strlen_w(s);
165         inslen = strlen_w(ins);
166         r = (ucs2_t *)s;
167         while ((r = strcasechr_w(r, *ins))) {
168                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
169                 r++;
170         }
171         return NULL;
172 }
173
174
175
176
177 /*******************************************************************
178 case insensitive string comparison
179 ********************************************************************/
180 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
181 {
182         while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; }
183         return (tolower_w(*a) - tolower_w(*b));
184 }
185
186 /*******************************************************************
187 case insensitive string comparison, lenght limited
188 ********************************************************************/
189 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
190 {
191         size_t n = 0;
192         while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; }
193         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
194 }
195
196 /*******************************************************************
197 duplicate string
198 ********************************************************************/
199 /* if len == 0 then duplicate the whole string */
200 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
201 {
202         ucs2_t *dest;
203
204         if (!len) len = strlen_w(src);
205         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
206         if (!dest) {
207                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
208                 return NULL;
209         }
210
211         memcpy(dest, src, len * sizeof(ucs2_t));
212         dest[len] = 0;
213
214         return dest;
215 }
216
217 ucs2_t *strdup_w(const ucs2_t *src)
218 {
219         return strndup_w(src, 0);
220 }
221
222 /*******************************************************************
223 copy a string with max len
224 ********************************************************************/
225
226 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
227 {
228         size_t len;
229
230         if (!dest || !src) return NULL;
231
232         for (len = 0; (src[len] != 0) && (len < max); len++)
233                 dest[len] = src[len];
234         while (len < max)
235                 dest[len++] = 0;
236
237         return dest;
238 }
239
240
241 /*******************************************************************
242 append a string of len bytes and add a terminator
243 ********************************************************************/
244
245 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
246 {
247         size_t start;
248         size_t len;
249
250         if (!dest || !src) return NULL;
251
252         start = strlen_w(dest);
253         len = strnlen_w(src, max);
254
255         memcpy(&dest[start], src, len*sizeof(ucs2_t));
256         dest[start+len] = 0;
257
258         return dest;
259 }
260
261
262 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
263 {
264         size_t start;
265         size_t len;
266
267         if (!dest || !src) return NULL;
268
269         start = strlen_w(dest);
270         len = strlen_w(src);
271
272         memcpy(&dest[start], src, len*sizeof(ucs2_t));
273         dest[start+len] = 0;
274
275         return dest;
276 }
277
278
279 /*******************************************************************
280 binary search for pre|decomposition
281 ********************************************************************/
282
283 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
284 {
285         int min = 0;
286         int max = PRECOMP_COUNT - 1;
287         int mid;
288         u_int32_t sought = (base << 16) | comb, that;
289
290         /* binary search */
291         while (max >= min) {
292                 mid = (min + max) / 2;
293                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
294                 if (that < sought) {
295                         min = mid + 1;
296                 } else if (that > sought) {
297                         max = mid - 1;
298                 } else {
299                         return precompositions[mid].replacement;
300                 }
301         }
302         /* no match */
303         return 0;
304 }
305
306 /* ------------------------ */
307 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
308 {
309         int min = 0;
310         int max = PRECOMP_SP_COUNT - 1;
311         int mid;
312         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
313
314         /* binary search */
315         while (max >= min) {
316                 mid = (min + max) / 2;
317                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
318                 if (that_sp < sought_sp) {
319                         min = mid + 1;
320                 } else if (that_sp > sought_sp) {
321                         max = mid - 1;
322                 } else {
323                         return precompositions_sp[mid].replacement_sp;
324                 }
325         }
326         /* no match */
327         return 0;
328 }
329
330 /* -------------------------- */
331 static u_int32_t do_decomposition(ucs2_t base) 
332 {
333         int min = 0;
334         int max = DECOMP_COUNT - 1;
335         int mid;
336         u_int32_t sought = base;
337         u_int32_t result, that;
338
339         /* binary search */
340         while (max >= min) {
341                 mid = (min + max) / 2;
342                 that = decompositions[mid].replacement;
343                 if (that < sought) {
344                         min = mid + 1;
345                 } else if (that > sought) {
346                         max = mid - 1;
347                 } else {
348                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
349                         return result;
350                 }
351         }
352         /* no match */
353         return 0;
354 }
355
356 /* -------------------------- */
357 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
358 {
359         int min = 0;
360         int max = DECOMP_SP_COUNT - 1;
361         int mid;
362         u_int32_t sought_sp = base_sp;
363         u_int32_t that_sp;
364         u_int64_t result_sp;
365
366         /* binary search */
367         while (max >= min) {
368                 mid = (min + max) / 2;
369                 that_sp = decompositions_sp[mid].replacement_sp;
370                 if (that_sp < sought_sp) {
371                         min = mid + 1;
372                 } else if (that_sp > sought_sp) {
373                         max = mid - 1;
374                 } else {
375                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
376                         return result_sp;
377                 }
378         }
379         /* no match */
380         return 0;
381 }
382
383 /*******************************************************************
384 pre|decomposition
385
386    we can't use static, this stuff needs to be reentrant
387    static char comp[MAXPATHLEN +1];
388
389    We don't implement Singleton and Canonical Ordering.
390    We ignore CompositionExclusions.txt.
391    because they cause the problem of the roundtrip
392    such as Dancing Icon.
393
394    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
395    in precompose.h from composition according to AFP 3.x spec
396 ********************************************************************/
397
398 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
399 {
400         size_t i;
401         ucs2_t base, comb;
402         u_int32_t base_sp, comb_sp;
403         ucs2_t *in, *out;
404         ucs2_t lindex, vindex;
405         ucs2_t result;
406         u_int32_t result_sp;
407         size_t o_len = *outlen;
408         
409         if (!inplen || (inplen & 1) || inplen > o_len)
410                 return (size_t)-1;
411         
412         i = 0;
413         in  = name;
414         out = comp;
415         
416         base = *in;
417         while (*outlen > 2) {
418                 i += 2;
419                 in++;
420
421                 if (i == inplen) {
422                         *out = base;
423                         out++;
424                         *out = 0;
425                         *outlen -= 2;
426                         return o_len - *outlen;
427                 }
428
429                 comb = *in;
430                 result = 0;
431
432                 /* Non-Combination Character */
433                 if (comb < 0x300) ;
434                 
435                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
436                 /* Step 1 <L,V> */
437                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
438                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
439                                 result = 1;
440                                 lindex = base - LBASE;
441                                 vindex = comb - VBASE;
442                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
443                         }
444                 }
445                 
446                 /* Step 2 <LV,T> */
447                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
448                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
449                                 result = 1;
450                                 base += comb - TBASE;
451                         }
452                 }
453                 
454                 /* Binary Search for Surrogate Pair */
455                 else if ((0xD800 <= base) && (base < 0xDC00)) {
456                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
457                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
458                                 do {
459                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
460                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
461                                                 base_sp = result_sp;
462                                                 i += 4;
463                                                 in +=2;
464                                         }
465                                 } while ((i + 4 <= inplen) && result_sp) ;
466
467                                 *out = base_sp >> 16;
468                                 out++;
469                                 *outlen -= 2;
470
471                                 if (*outlen <= 2) {
472                                         errno = E2BIG;
473                                         return (size_t)-1;
474                                 }
475
476                                 *out = base_sp & 0xFFFF;
477                                 out++;
478                                 *outlen -= 2;
479
480                                 i += 2;
481                                 in++;
482                                 base = *in;
483
484                                 result = 1;
485                         }
486                 }
487
488                 /* Binary Search for BMP */
489                 else if (result = do_precomposition(base, comb)) {
490                         base = result;
491                 }
492                 
493                 if (!result) {
494                         *out = base;
495                         out++;
496                         *outlen -= 2;
497                         base = comb;
498                 }
499         }
500
501         errno = E2BIG;
502         return (size_t)-1;
503 }
504
505 /* --------------- */
506 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
507 {
508         size_t i;
509         size_t comblen;
510         ucs2_t base, comb[COMBBUFLEN];
511         u_int32_t base_sp;
512         ucs2_t sindex, tjamo;
513         ucs2_t *in, *out;
514         unsigned int result;
515         u_int64_t result_sp;
516         size_t o_len = *outlen;
517
518         if (!inplen || (inplen & 1))
519                 return (size_t)-1;
520         i = 0;
521         in  = name;
522         out = comp;
523
524         while (i < inplen) {
525                 base = *in;
526                 comblen = 0;
527                 
528                 /* check ASCII first. this is frequent. */
529                 if (base <= 0x007f) ;
530                 
531                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
532                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
533                         sindex = base - SBASE;
534                         base = LBASE + sindex / NCOUNT;
535                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
536                         
537                         /* <L,V> */
538                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
539                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
540                                 comblen = 1;
541                         }
542                         
543                         /* <L,V,T> */
544                         else {
545                                 comb[COMBBUFLEN-1] = tjamo;
546                                 comblen = 2;
547                         }
548                 }
549                 
550                 /* Binary Search for Surrogate Pair */
551                 else if ((0xD800 <= base) && (base < 0xDC00)) {
552                         if (i + 2 < inplen) {
553                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
554                                 do {
555                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
556                                         comblen += 2;
557                                         base_sp = result_sp >> 32;
558                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
559                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
560                                 } while (comblen < MAXCOMBSPLEN);
561
562                                 if (*outlen < (comblen + 1) << 1) {
563                                         errno = E2BIG;
564                                         return (size_t)-1;
565                                 }
566
567                                 *out = base_sp >> 16;   /* hi */
568                                 out++;
569                                 *outlen -= 2;
570                                 
571                                 base = base_sp & 0xFFFF; /* lo */
572                                 
573                                 i += 2;
574                                 in++;
575                         }
576                 }
577                         
578                 /* Binary Search for BMP */
579                 else {
580                         do {
581                                 if ( !(result = do_decomposition(base))) break;
582                                 comblen++;
583                                 base = result  >> 16;
584                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
585                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
586                 }
587                 
588                 if (*outlen < (comblen + 1) << 1) {
589                         errno = E2BIG;
590                         return (size_t)-1;
591                 }
592                 
593                 *out = base;
594                 out++;
595                 *outlen -= 2;
596                 
597                 while ( comblen > 0 ) {
598                         *out = comb[COMBBUFLEN-comblen];
599                         out++;
600                         *outlen -= 2;
601                         comblen--;
602                 }
603                 
604                 i += 2;
605                 in++;
606         }
607         
608         *out = 0;
609         return o_len-*outlen;
610 }
611
612 /*******************************************************************
613 length of UTF-8 character and string
614 ********************************************************************/
615
/*
 * Length in bytes (1 to 4) of the UTF-8 sequence starting at utf8,
 * or (size_t)-1 if the bytes there are not a well-formed sequence.
 *
 * Rejected are: stray continuation bytes, the lead bytes 0xC0, 0xC1
 * and 0xF5..0xFF, overlong forms (E0 80..9F, F0 80..8F), encoded
 * surrogates U+D800..U+DFFF (ED A0..BF), values above U+10FFFF
 * (F4 90..BF) and truncated sequences.
 *
 * utf8 must be 0-terminated. No byte behind the terminator is read,
 * because a 0 byte is never a valid continuation byte and checking
 * stops at the first invalid byte. A 0 byte at utf8 itself counts
 * as a one byte character.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *p = (const unsigned char*) utf8;
        /* valid range of the second byte, narrowed for some lead bytes */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        size_t len;
        size_t k;

        if ( p[0] < 0x80 )
                return (1);

        if ( p[0] >= 0xC2 && p[0] <= 0xDF ) {
                len = 2;
        } else if ( p[0] >= 0xE0 && p[0] <= 0xEF ) {
                len = 3;
                if ( p[0] == 0xE0 )
                        lo = 0xA0;      /* below would be an overlong form */
                else if ( p[0] == 0xED )
                        hi = 0x9F;      /* above would be a surrogate */
        } else if ( p[0] >= 0xF0 && p[0] <= 0xF4 ) {
                len = 4;
                if ( p[0] == 0xF0 )
                        lo = 0x90;      /* below would be an overlong form */
                else if ( p[0] == 0xF4 )
                        hi = 0x8F;      /* above would exceed U+10FFFF */
        } else {
                return ((size_t) -1);
        }

        if ( p[1] < lo || p[1] > hi )
                return ((size_t) -1);

        for (k = 2; k < len; k++) {
                if ( p[k] < 0x80 || p[k] > 0xBF )
                        return ((size_t) -1);
        }

        return (len);
}
639
640
/*
 * Number of characters (not bytes) in the 0-terminated UTF-8 string
 * utf8, or (size_t)-1 if the string is not well-formed UTF-8.
 *
 * Rejected are: stray continuation bytes, the lead bytes 0xC0, 0xC1
 * and 0xF5..0xFF, overlong forms (E0 80..9F, F0 80..8F), encoded
 * surrogates U+D800..U+DFFF (ED A0..BF), values above U+10FFFF
 * (F4 90..BF) and truncated sequences.
 *
 * see http://www.unicode.org/unicode/reports/tr27/ for an explanation
 *
 * No byte behind the terminator is read, because a 0 byte is never
 * a valid continuation byte and checking stops at the first invalid
 * byte.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *p = (const unsigned char*) utf8;
        size_t len = 0;

        while ( *p != '\0')
        {
                /* valid range of the second byte, narrowed for some lead bytes */
                unsigned char lo = 0x80;
                unsigned char hi = 0xBF;
                size_t need;
                size_t k;

                if ( *p < 0x80 ) {
                        p++;
                        len++;
                        continue;
                }

                if ( *p >= 0xC2 && *p <= 0xDF ) {
                        need = 2;
                } else if ( *p >= 0xE0 && *p <= 0xEF ) {
                        need = 3;
                        if ( *p == 0xE0 )
                                lo = 0xA0;      /* below would be an overlong form */
                        else if ( *p == 0xED )
                                hi = 0x9F;      /* above would be a surrogate */
                } else if ( *p >= 0xF0 && *p <= 0xF4 ) {
                        need = 4;
                        if ( *p == 0xF0 )
                                lo = 0x90;      /* below would be an overlong form */
                        else if ( *p == 0xF4 )
                                hi = 0x8F;      /* above would exceed U+10FFFF */
                } else {
                        return ((size_t) -1);
                }

                if ( p[1] < lo || p[1] > hi )
                        return ((size_t) -1);

                for (k = 2; k < need; k++) {
                        if ( p[k] < 0x80 || p[k] > 0xBF )
                                return ((size_t) -1);
                }

                p += need;
                len++;
        }

        return (len);
}