1 /*******************************************************************
3 The early netatalk 2.x was based on UCS-2.
4 UCS-2 doesn't support chars at or above U+10000.
5 Recent netatalk is based on UTF-16.
6 UTF-16 can support chars at or above U+10000, using surrogate pairs.
7 However, surrogate pairs are complex, dirty, filthy and disagreeable.
8 There might still be latent bugs...
9 ********************************************************************/
13 #endif /* HAVE_CONFIG_H */
18 #include <sys/param.h>
20 #include <atalk/logger.h>
22 #include <arpa/inet.h>
24 #include <atalk/unicode.h>
25 #include "precompose.h"
26 #include "byteorder.h"
28 /*******************************************************************
29 Convert a string to lower case.
30 return True if any char is converted
31 ********************************************************************/
32 /* surrogate pair support */
/*
 * Lower-case the UTF-16 string s (the header comment says it returns true
 * if any char was converted).
 * NOTE(review): only part of the body is visible in this excerpt; the loop,
 * the write-back of the converted value and the return are not shown.
 */
34 int strlower_w(ucs2_t *s)
/* *s in [0xD800, 0xDC00): high (leading) surrogate */
39 if ((0xD800 <= *s) && (*s < 0xDC00)) {
/* s[1] in [0xDC00, 0xE000): low (trailing) surrogate, so this is a pair */
40 if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
/* pack the pair into 32 bits: high surrogate in the upper half, low in the lower */
41 uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1];
42 uint32_t v_sp = tolower_sp(s_sp);
/* single-unit mapping; presumably the non-surrogate branch (enclosing else not visible) */
51 ucs2_t v = tolower_w(*s);
62 /*******************************************************************
63 Convert a string to upper case.
64 return True if any char is converted
65 ********************************************************************/
66 /* surrogate pair support */
/*
 * Upper-case the UTF-16 string s (the header comment says it returns true
 * if any char was converted).
 * NOTE(review): only part of the body is visible in this excerpt; the loop,
 * the write-back of the converted value and the return are not shown.
 */
68 int strupper_w(ucs2_t *s)
/* *s in [0xD800, 0xDC00): high (leading) surrogate */
73 if ((0xD800 <= *s) && (*s < 0xDC00)) {
/* s[1] in [0xDC00, 0xE000): low (trailing) surrogate, so this is a pair */
74 if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
/* pack the pair into 32 bits: high surrogate in the upper half, low in the lower */
75 uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1];
76 uint32_t v_sp = toupper_sp(s_sp);
/* single-unit mapping; presumably the non-surrogate branch (enclosing else not visible) */
85 ucs2_t v = toupper_w(*s);
96 /*******************************************************************
98 determine if a character is lowercase
99 ********************************************************************/
100 /* These functions are not used. */
/*
 * Returns non-zero when c is left unchanged by tolower_w().
 * NOTE(review): that is also true for any unit tolower_w() does not map at
 * all, so this is really a "has no lower-case mapping" test, not a strict
 * "is a lower-case letter" test.
 */
102 int islower_w(ucs2_t c)
104 return ( c == tolower_w(c));
/*
 * Same test as islower_w() for a 32-bit value handled by tolower_sp():
 * returns non-zero when tolower_sp() leaves c_sp unchanged.
 * NOTE(review): elsewhere in this file the *_sp helpers are fed a packed
 * surrogate pair (high << 16 | low); presumably c_sp uses that layout too.
 */
107 int islower_sp(uint32_t c_sp)
109 return ( c_sp == tolower_sp(c_sp));
112 /*******************************************************************
114 determine if a character is uppercase
115 ********************************************************************/
116 /* These functions are not used. */
/*
 * Returns non-zero when c is left unchanged by toupper_w().
 * NOTE(review): that is also true for any unit toupper_w() does not map at
 * all, so this is really a "has no upper-case mapping" test, not a strict
 * "is an upper-case letter" test.
 */
118 int isupper_w(ucs2_t c)
120 return ( c == toupper_w(c));
/*
 * Same test as isupper_w() for a 32-bit value handled by toupper_sp():
 * returns non-zero when toupper_sp() leaves c_sp unchanged.
 * NOTE(review): elsewhere in this file the *_sp helpers are fed a packed
 * surrogate pair (high << 16 | low); presumably c_sp uses that layout too.
 */
123 int isupper_sp(uint32_t c_sp)
125 return ( c_sp == toupper_sp(c_sp));
128 /*******************************************************************
130 Count the number of characters in a UTF-16 string.
131 ********************************************************************/
132 /* NOTE: one surrogate pair is two characters. */
/*
 * Count the ucs2_t units in src before the 0 terminator. Units are counted
 * individually, so a surrogate pair contributes two.
 * NOTE(review): the declaration of len and the return are not visible here.
 */
134 size_t strlen_w(const ucs2_t *src)
/* empty loop body: the condition both tests the current unit and advances */
138 for(len = 0; *src++; len++) ;
143 /*******************************************************************
145 Count up to max number of characters in a UTF-16 string.
146 ********************************************************************/
147 /* NOTE: one surrogate pair is two characters. */
/*
 * Count the ucs2_t units in src before the 0 terminator, stopping once the
 * count reaches max.
 * NOTE(review): the loop condition dereferences *src BEFORE testing
 * len < max, so when no terminator occurs in the first max units, src[max]
 * is still read. Swapping the two operands of && would avoid that read.
 * The declaration of len and the return are not visible in this excerpt.
 */
149 size_t strnlen_w(const ucs2_t *src, size_t max)
153 for(len = 0; *src++ && (len < max); len++) ;
158 /*******************************************************************
160 ********************************************************************/
161 /* NOTE: hi and lo of surrogate pair are separately processed. */
/*
 * Locate the ucs2_t unit c in s and return a pointer to it; the const
 * qualifier of s is cast away on return.
 * NOTE(review): the loop around the first test and the final return are not
 * visible. Presumably the first test runs for every unit before the
 * terminator and the second one handles a search for the terminator itself.
 */
163 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
166 if (c == *s) return (ucs2_t *)s;
169 if (c == *s) return (ucs2_t *)s;
174 /*******************************************************************
175 wide & sp strcasechr()
176 ********************************************************************/
177 /* NOTE: separately process BMP and surrogate pair */
/*
 * Case-insensitive search for the single ucs2_t unit c in s; returns a
 * pointer to the matching unit with const cast away.
 * NOTE(review): the loop around the first test and the final return are not
 * visible in this excerpt.
 */
179 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
/* both sides are folded with tolower_w() before comparing */
182 if (tolower_w(c) == tolower_w(*s)) return (ucs2_t *)s;
/* exact comparison, no case folding; presumably the terminator check after the loop */
185 if (c == *s) return (ucs2_t *)s;
/*
 * Case-insensitive search in s for the 32-bit value c_sp, which is compared
 * against two adjacent units of s packed as (s[0] << 16 | s[1]).
 * Returns a pointer to the first unit of the match, with const cast away.
 * NOTE(review): the loop, the advance of s and the not-found return are not
 * visible in this excerpt.
 */
190 ucs2_t *strcasechr_sp(const ucs2_t *s, uint32_t c_sp)
/* empty string: nothing to find, and s[1] must not be read */
192 if (*s == 0) return NULL;
/* pack the two units at s the same way c_sp is packed, then compare lower-cased */
194 if (tolower_sp(c_sp) == tolower_sp((uint32_t)*s << 16 | (uint32_t)s[1])) return (ucs2_t *)s;
201 /*******************************************************************
203 ********************************************************************/
204 /* no problem of surrogate pair */
/*
 * Compare two 0-terminated ucs2_t strings unit by unit.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
206 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
/* skip the common prefix; stops at the end of b or at the first differing unit */
208 while (*b && *a == *b) { a++; b++; }
210 /* warning: if *a != *b and both are not 0 we return a random
211 greater or lesser than 0 number not related to which
215 /*******************************************************************
217 ********************************************************************/
218 /* no problem of surrogate pair */
/*
 * Compare at most len ucs2_t units of a and b.
 * Returns 0 when the first len units are equal, otherwise the difference of
 * the units at the position where the scan stopped.
 * NOTE(review): the declaration/initialisation of n is not visible here.
 */
220 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
/* advance while within len, b has not ended and the units match */
223 while ((n < len) && *b && *a == *b) { a++; b++; n++;}
/* n == len means everything requested matched; only then is 0 forced */
224 return (len - n)?(*a - *b):0;
227 /*******************************************************************
229 ********************************************************************/
230 /* no problem of surrogate pair */
/*
 * Search s for the ucs2_t string ins (case-sensitive, unit by unit) and
 * return a pointer to the match.
 * Returns NULL when either argument is NULL or empty, so an empty needle
 * yields NULL (unlike C strstr()).
 * NOTE(review): the declaration/initialisation of r, the step that moves
 * past a failed candidate and the final return are not visible here.
 */
232 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
237 if (!s || !*s || !ins || !*ins) return NULL;
/* needle length in units, computed once before the search loop */
239 inslen = strlen_w(ins);
/* find the next position holding the needle's first unit ... */
241 while ((r = strchr_w(r, *ins))) {
/* ... and accept it when inslen units match from there */
242 if (strncmp_w(r, ins, inslen) == 0) return r;
248 /*******************************************************************
250 ********************************************************************/
251 /* surrogate pair support */
/*
 * Case-insensitive search of s for the ucs2_t string ins; returns a pointer
 * to the match.
 * Returns NULL when either argument is NULL or empty, and also when ins
 * starts with a high surrogate that is not followed by a low surrogate.
 * NOTE(review): the declaration/initialisation of r, the steps that move
 * past a failed candidate and the final return are not visible here.
 */
253 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
258 if (!s || !*s || !ins || !*ins) return NULL;
/* needle length in units, computed once before the search loops */
260 inslen = strlen_w(ins);
/* needle starts with a high surrogate: locate candidates by whole pair */
263 if ((0xD800 <= *ins) && (*ins < 0xDC00)) {
264 if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) {
/* pack the leading pair as (high << 16 | low) for strcasechr_sp() */
265 uint32_t ins_sp = (uint32_t)*ins << 16 | (uint32_t)ins[1];
266 while ((r = strcasechr_sp(r, ins_sp))) {
267 if (strncasecmp_w(r, ins, inslen) == 0) return r;
271 return NULL; /* illegal sequence */
/* needle starts with an ordinary unit: locate candidates by single unit */
274 while ((r = strcasechr_w(r, *ins))) {
275 if (strncasecmp_w(r, ins, inslen) == 0) return r;
282 /*******************************************************************
284 case insensitive string comparison
285 ********************************************************************/
286 /* surrogate pair support */
/*
 * Case-insensitive comparison of two 0-terminated UTF-16 strings. Surrogate
 * pairs are folded with tolower_sp(), other units with tolower_w().
 * Returns the difference of the first lower-cased values that differ.
 * NOTE(review): the loop header, the pointer advances and the declaration
 * of ret are not visible in this excerpt.
 */
288 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
/* only a is tested for a high surrogate; b's two units are packed unconditionally */
293 if ((0xD800 <= *a) && (*a < 0xDC00)) {
/* assignment inside the condition is intentional: return on the first non-zero difference */
294 if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret;
297 if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
/* ordinary unit: compare the lower-cased units directly */
299 if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
/* result for the position where the scan stopped */
304 return (tolower_w(*a) - tolower_w(*b));
307 /*******************************************************************
309 case insensitive string comparison, length limited
310 ********************************************************************/
311 /* NOTE: compares up to 'len+1' units if 'len' would split a surrogate pair */
/*
 * Case-insensitive comparison of at most len units of two UTF-16 strings.
 * Surrogate pairs are folded with tolower_sp(), other units with
 * tolower_w(). Returns 0 when the scan consumed len units without finding a
 * difference.
 * NOTE(review): the declarations of n and ret and the pointer/counter
 * advances are not visible in this excerpt.
 */
313 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
318 while ((n < len) && *a && *b) {
/* only a is tested for a high surrogate; b's two units are packed unconditionally */
319 if ((0xD800 <= *a) && (*a < 0xDC00)) {
/* assignment inside the condition is intentional: return on the first non-zero difference */
320 if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret;
/* re-check the loop condition inside the pair branch and return early if it no longer holds */
324 if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
/* ordinary unit: compare the lower-cased units directly */
326 if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
/* n == len: everything requested matched, so 0 is forced */
332 return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
335 /*******************************************************************
338 ********************************************************************/
339 /* NOTE: not check isolation of surrogate pair */
340 /* if len == 0 then duplicate the whole string */
/*
 * Duplicate len ucs2_t units of src into freshly malloc()ed memory.
 * A len of 0 means "duplicate the whole string".
 * NOTE(review): the NULL check around the LOG call, the terminator write and
 * the return are not visible here. Presumably the caller owns the result and
 * must free() it.
 */
342 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
346 if (!len) len = strlen_w(src);
/* one extra unit is allocated beyond len (presumably for the terminator) */
347 dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
/* NOTE(review): the message names strdup_w although this is strndup_w */
349 LOG (log_error, logtype_default, "strdup_w: out of memory!");
/* NOTE(review): copies exactly len units without looking for a terminator,
   so a len larger than the length of src reads past the end of src */
353 memcpy(dest, src, len * sizeof(ucs2_t));
359 /*******************************************************************
362 ********************************************************************/
363 /* no problem of surrogate pair */
/*
 * Duplicate the whole ucs2_t string src.
 * Delegates to strndup_w() with len 0, which that function treats as
 * "use strlen_w(src)".
 */
365 ucs2_t *strdup_w(const ucs2_t *src)
367 return strndup_w(src, 0);
370 /*******************************************************************
371 copy a string with max len
372 ********************************************************************/
373 /* This function is not used. */
374 /* NOTE: not check isolation of surrogate pair */
/*
 * Copy at most max ucs2_t units from src to dest, stopping at src's
 * terminator. Returns NULL when dest or src is NULL.
 * NOTE(review): the declaration of len, whatever follows the loop
 * (terminator/padding) and the normal return are not visible here.
 */
376 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
380 if (!dest || !src) return NULL;
/* src[len] is tested before len < max, so src[max] is read when no terminator occurs earlier */
382 for (len = 0; (src[len] != 0) && (len < max); len++)
383 dest[len] = src[len];
391 /*******************************************************************
392 append a string of len bytes and add a terminator
393 ********************************************************************/
394 /* These functions are not used. */
396 /* NOTE: not check isolation of surrogate pair */
/*
 * Append at most max ucs2_t units of src to the end of dest.
 * max is a count of units, not bytes: it bounds strnlen_w() and is scaled by
 * sizeof(ucs2_t) for the copy. Returns NULL when dest or src is NULL.
 * NOTE(review): the terminator write and the normal return are not visible
 * here; dest must already have room for the appended units.
 */
397 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
402 if (!dest || !src) return NULL;
/* index of dest's current terminator, i.e. where the appended data starts */
404 start = strlen_w(dest);
405 len = strnlen_w(src, max);
407 memcpy(&dest[start], src, len*sizeof(ucs2_t));
413 /* no problem of surrogate pair */
/*
 * Append the ucs2_t string src to the end of dest.
 * Returns NULL when dest or src is NULL.
 * NOTE(review): the computation of len, the terminator write and the normal
 * return are not visible here; dest must already have room for the data.
 */
414 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
419 if (!dest || !src) return NULL;
/* index of dest's current terminator, i.e. where the appended data starts */
421 start = strlen_w(dest);
424 memcpy(&dest[start], src, len*sizeof(ucs2_t));
431 /*******************************************************************
432 binary search for pre|decomposition
433 ********************************************************************/
/*
 * Look up the precomposed character for the pair (base, comb) by binary
 * search in precompositions[], and return its .replacement field.
 * The search key is (base << 16) | comb, so the table has to be ordered by
 * that key for the search to work.
 * NOTE(review): the loop header, the min/max updates and the not-found
 * return are not visible here; the caller in precompose_w() treats a result
 * of 0 as "no composition".
 */
435 static ucs2_t do_precomposition(unsigned int base, unsigned int comb)
438 int max = PRECOMP_COUNT - 1;
/* 32-bit key: base in the upper 16 bits, combining char in the lower 16 */
440 uint32_t sought = (base << 16) | comb, that;
444 mid = (min + max) / 2;
/* build the same key from the table entry under test */
445 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
448 } else if (that > sought) {
451 return precompositions[mid].replacement;
458 /* ------------------------ */
/*
 * Surrogate-pair variant of do_precomposition(): binary search in
 * precompositions_sp[] for the pair (base_sp, comb_sp), returning the
 * entry's .replacement_sp field.
 * The search key is 64 bits wide, base_sp in the upper half and comb_sp in
 * the lower half, so the table has to be ordered by that key.
 * NOTE(review): the loop header, the min/max updates and the not-found
 * return are not visible here; the caller in precompose_w() treats a result
 * of 0 as "no composition".
 */
459 static uint32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp)
462 int max = PRECOMP_SP_COUNT - 1;
/* widen before shifting so the 32-bit base is not lost */
464 uint64_t sought_sp = ((uint64_t)base_sp << 32) | (uint64_t)comb_sp, that_sp;
468 mid = (min + max) / 2;
469 that_sp = ((uint64_t)precompositions_sp[mid].base_sp << 32) | ((uint64_t)precompositions_sp[mid].comb_sp);
470 if (that_sp < sought_sp) {
472 } else if (that_sp > sought_sp) {
475 return precompositions_sp[mid].replacement_sp;
482 /* -------------------------- */
/*
 * Look up the decomposition of the character base by binary search in
 * decompositions[], keyed on each entry's .replacement field.
 * On a hit the result packs the entry as (base << 16) | comb.
 * NOTE(review): the loop header, the min/max updates and the returns are not
 * visible here; the caller in decompose_w() treats a result of 0 as
 * "no decomposition".
 */
483 static uint32_t do_decomposition(ucs2_t base)
486 int max = DECOMP_COUNT - 1;
488 uint32_t sought = base;
489 uint32_t result, that;
493 mid = (min + max) / 2;
/* the composed form is the search key, so the table must be ordered by .replacement */
494 that = decompositions[mid].replacement;
497 } else if (that > sought) {
/* base char in the upper 16 bits, combining char in the lower 16 */
500 result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
508 /* -------------------------- */
/*
 * Surrogate-pair variant of do_decomposition(): binary search in
 * decompositions_sp[], keyed on each entry's .replacement_sp field.
 * On a hit the result packs the entry as (base_sp << 32) | comb_sp.
 * NOTE(review): the loop header, the min/max updates, the declarations of
 * that_sp/result_sp and the returns are not visible here; the caller in
 * decompose_w() treats a result of 0 as "no decomposition".
 */
509 static uint64_t do_decomposition_sp(unsigned int base_sp)
512 int max = DECOMP_SP_COUNT - 1;
514 uint32_t sought_sp = base_sp;
520 mid = (min + max) / 2;
/* the composed form is the search key, so the table must be ordered by .replacement_sp */
521 that_sp = decompositions_sp[mid].replacement_sp;
522 if (that_sp < sought_sp) {
524 } else if (that_sp > sought_sp) {
/* widen before shifting: base pair in the upper 32 bits, combining pair in the lower 32 */
527 result_sp = ((uint64_t)decompositions_sp[mid].base_sp << 32) | ((uint64_t)decompositions_sp[mid].comb_sp);
535 /*******************************************************************
538 we can't use static, this stuff needs to be reentrant
539 static char comp[MAXPATHLEN +1];
541 We don't implement Singleton and Canonical Ordering.
542 We ignore CompositionExclusions.txt,
543 because they cause round-trip problems
544 such as the Dancing Icon.
546 exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
547 in precompose.h from composition according to AFP 3.x spec
548 ********************************************************************/
/*
 * Compose the UTF-16 string name into comp: Hangul jamo are combined
 * arithmetically, surrogate pairs via do_precomposition_sp() and other
 * characters via do_precomposition().
 * inplen appears to be a byte count (odd values are rejected). *outlen is
 * the space available in comp on entry, and the visible returns yield
 * o_len - *outlen, i.e. the amount of output produced.
 * NOTE(review): large parts of the body (loop bookkeeping, pointer advances,
 * error returns) are not visible in this excerpt; the comments below cover
 * only the visible lines.
 */
550 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
554 uint32_t base_sp, comb_sp;
556 ucs2_t lindex, vindex;
/* remember the initial output space so the amount written can be computed */
559 size_t o_len = *outlen;
/* reject empty input, an odd length, or input longer than the output space */
561 if (!inplen || (inplen & 1) || inplen > o_len)
569 while (*outlen > 2) {
576 return o_len - *outlen;
582 /* Non-Combination Character */
585 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
/* comb is a vowel jamo.
   NOTE(review): the upper bound uses <= while the neighbouring range tests
   use <; this also admits comb == VBASE + VCOUNT and looks like an
   off-by-one - confirm against UAX #15. */
587 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
/* base is a leading consonant jamo */
588 if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
590 lindex = base - LBASE;
591 vindex = comb - VBASE;
/* LV syllable: each (L, V) combination owns a run of TCOUNT code points */
592 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
/* comb is a trailing consonant jamo; TBASE itself is excluded by the strict < */
597 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
/* base must be a syllable at a multiple of TCOUNT from SBASE, i.e. without a trailing consonant yet */
598 if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
/* LV + T: add the trailing consonant's offset */
600 base += comb - TBASE;
604 /* Binary Search for Surrogate Pair */
/* base is a high surrogate */
605 else if ((0xD800 <= base) && (base < 0xDC00)) {
/* comb must be the matching low surrogate, and enough input must remain */
606 if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
/* pack the base pair as (high << 16 | low) */
607 base_sp = ((uint32_t)base << 16) | (uint32_t)comb;
/* the following two units are packed the same way as the combining pair */
609 comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2];
/* assignment inside the condition is intentional: non-zero means a composition exists */
610 if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
/* keep composing while input remains and the last lookup succeeded */
615 } while ((i + 6 <= inplen) && result_sp) ;
/* emit the pair as two units: high surrogate first ... */
617 *out = base_sp >> 16;
/* ... then the low surrogate */
626 *out = base_sp & 0xFFFF;
634 return o_len - *outlen;
643 /* Binary Search for BMP */
/* assignment inside the condition is intentional: non-zero means a composition exists */
644 else if (result = do_precomposition(base, comb)) {
660 /* --------------- */
/*
 * Decompose the UTF-16 string name into comp: Hangul syllables are split
 * arithmetically, surrogate pairs via do_decomposition_sp() and other
 * non-ASCII characters via do_decomposition().
 * inplen appears to be a byte count (odd values are rejected). *outlen is
 * the space available in comp on entry, and the visible return yields
 * o_len - *outlen, i.e. the amount of output produced.
 * Combining characters are collected in comb[] from the END of the buffer
 * towards the front and emitted afterwards starting at
 * comb[COMBBUFLEN - comblen].
 * NOTE(review): large parts of the body (loop bookkeeping, comblen updates,
 * pointer advances, error returns) are not visible in this excerpt; the
 * comments below cover only the visible lines.
 */
661 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
665 ucs2_t base, comb[COMBBUFLEN];
667 ucs2_t sindex, tjamo;
/* remember the initial output space so the amount written can be computed */
671 size_t o_len = *outlen;
/* reject empty input or an odd length */
673 if (!inplen || (inplen & 1))
683 /* check ASCII first. this is frequent. */
/* empty statement: ASCII needs no decomposition, so skip every other branch */
684 if (base <= 0x007f) ;
686 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
687 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
688 sindex = base - SBASE;
/* leading consonant */
689 base = LBASE + sindex / NCOUNT;
/* vowel, stored in the second-to-last slot for now */
690 comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
/* tjamo == TBASE means the syllable has no trailing consonant */
693 if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
/* no trailing consonant: move the vowel into the last slot */
694 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
/* trailing consonant goes into the last slot, after the vowel */
700 comb[COMBBUFLEN-1] = tjamo;
705 /* Binary Search for Surrogate Pair */
/* base is a high surrogate */
706 else if ((0xD800 <= base) && (base < 0xDC00)) {
/* the low surrogate in[1] is only read when enough input remains */
707 if (i + 2 < inplen) {
/* pack the pair as (high << 16 | low) */
708 base_sp = ((uint32_t)base << 16) | (uint32_t)in[1];
/* a result of 0 means no further decomposition */
710 if ( !(result_sp = do_decomposition_sp(base_sp))) break;
/* upper 32 bits: the new base pair, which is decomposed again on the next pass */
712 base_sp = result_sp >> 32;
/* lower 32 bits: the combining pair, stored as two units */
713 comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF; /* hi */
714 comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF; /* lo */
715 } while (comblen < MAXCOMBSPLEN);
/* output space check: (comblen + 1) << 1 doubles the unit count comblen + 1 */
717 if (*outlen < (comblen + 1) << 1) {
722 *out = base_sp >> 16; /* hi */
726 base = base_sp & 0xFFFF; /* lo */
733 /* Binary Search for BMP */
/* a result of 0 means no further decomposition */
736 if ( !(result = do_decomposition(base))) break;
/* low 16 bits of the result are the combining char */
739 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
/* keep decomposing until the base becomes ASCII or the buffer limit is reached */
740 } while ((0x007f < base) && (comblen < MAXCOMBLEN));
/* output space check: (comblen + 1) << 1 doubles the unit count comblen + 1 */
743 if (*outlen < (comblen + 1) << 1) {
/* flush the collected combining units, indexing from comb[COMBBUFLEN - comblen] */
752 while ( comblen > 0 ) {
753 *out = comb[COMBBUFLEN-comblen];
764 return o_len-*outlen;
767 /*******************************************************************
768 length of UTF-8 character and string
769 ********************************************************************/
/*
 * Classify the UTF-8 sequence starting at utf8 by its lead byte and
 * continuation bytes; returns (size_t)-1 when no branch matches.
 * Every continuation test requires a value > 0x7f, and && short-circuits, so
 * a NUL terminator stops the chain before any later byte is read.
 * NOTE(review): the ASCII branch and the value returned by each successful
 * branch are not visible in this excerpt (presumably the sequence length).
 */
771 size_t utf8_charlen ( char* utf8 )
/* work on unsigned bytes so the range comparisons behave for values >= 0x80 */
775 p = (unsigned char*) utf8;
/* lead 0xC2..0xDF + 1 continuation byte (0xC0/0xC1 would be overlong) */
779 else if ( *p > 0xC1 && *p < 0xe0 && *(p+1) > 0x7f && *(p+1) < 0xC0)
/* lead 0xE0: second byte restricted to 0xA0..0xBF to exclude overlong forms */
781 else if ( *p == 0xe0 && *(p+1) > 0x9f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead 0xE1..0xEF + 2 continuation bytes.
   NOTE(review): this also accepts 0xED 0xA0..0xBF, the encoding of
   U+D800..U+DFFF (surrogates) - confirm whether that is intended. */
783 else if ( *p > 0xe0 && *p < 0xf0 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead 0xF0: second byte restricted to 0x90..0xBF to exclude overlong forms */
785 else if ( *p == 0xf0 && *(p+1) > 0x8f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead 0xF1..0xF3 + 3 continuation bytes */
787 else if ( *p > 0xf0 && *p < 0xf4 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead 0xF4: second byte restricted to 0x80..0x8F, capping the range at U+10FFFF */
789 else if ( *p == 0xf4 && *(p+1) > 0x7f && *(p+1) < 0x90 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* anything else is not a valid sequence */
792 return ((size_t) -1);
/*
 * Walk the UTF-8 string utf8, applying the same lead/continuation byte rules
 * as utf8_charlen(); returns (size_t)-1 when a sequence matches no branch.
 * Every continuation test requires a value > 0x7f, and && short-circuits, so
 * a NUL terminator stops the chain before any later byte is read.
 * NOTE(review): the loop, the ASCII branch, the per-branch pointer/length
 * updates and the success return are not visible in this excerpt.
 */
796 size_t utf8_strlen_validate ( char * utf8 )
/* work on unsigned bytes so the range comparisons behave for values >= 0x80 */
801 p = (unsigned char*) utf8;
804 /* see http://www.unicode.org/unicode/reports/tr27/ for an explanation */
/* lead 0xC2..0xDF + 1 continuation byte (0xC0/0xC1 would be overlong) */
811 else if ( *p > 0xC1 && *p < 0xe0 && *(p+1) > 0x7f && *(p+1) < 0xC0)
/* lead 0xE0: second byte restricted to 0xA0..0xBF to exclude overlong forms */
814 else if ( *p == 0xe0 && *(p+1) > 0x9f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead 0xE1..0xEF + 2 continuation bytes.
   NOTE(review): this also accepts 0xED 0xA0..0xBF, the encoding of
   U+D800..U+DFFF (surrogates) - confirm whether that is intended. */
817 else if ( *p > 0xe0 && *p < 0xf0 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead 0xF0: second byte restricted to 0x90..0xBF to exclude overlong forms */
820 else if ( *p == 0xf0 && *(p+1) > 0x8f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead 0xF1..0xF3 + 3 continuation bytes */
823 else if ( *p > 0xf0 && *p < 0xf4 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead 0xF4: second byte restricted to 0x80..0x8F, capping the range at U+10FFFF */
826 else if ( *p == 0xf4 && *(p+1) > 0x7f && *(p+1) < 0x90 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* anything else is not a valid sequence */
830 return ((size_t) -1);