1 /*******************************************************************
3 The early netatalk 2.x was based on UCS-2.
4 UCS-2 doesn't support chars at or above U+10000.
5 Recent netatalk is based on UTF-16.
6 UTF-16 can support chars at or above U+10000, using surrogate pairs.
7 However, surrogate pairs are complex, dirty, filthy and disagreeable.
8 There might still be latent bugs...
9 ********************************************************************/
13 #endif /* HAVE_CONFIG_H */
18 #include <sys/param.h>
20 #include <atalk/logger.h>
23 #include <netatalk/endian.h>
25 #include <atalk/unicode.h>
26 #include "precompose.h"
27 #include "byteorder.h"
29 /*******************************************************************
30 Convert a string to lower case.
31 return True if any char is converted
   s is taken as a writable ucs2_t pointer.
   NOTE(review): the loop, the stores and the return statement are not
   visible in this excerpt; the return contract above comes from the
   original comment and should be confirmed against the full body.
32 ********************************************************************/
33 /* surrogate pair support */
35 int strlower_w(ucs2_t *s)
/* *s in 0xD800..0xDBFF: first (high) unit of a surrogate pair */
40 if ((0xD800 <= *s) && (*s < 0xDC00)) {
/* s[1] in 0xDC00..0xDFFF: second (low) unit, so the pair is well formed */
41 if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
/* pack the pair into 32 bits: high unit in bits 31..16, low unit in bits 15..0 */
42 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
/* fold the packed pair with the surrogate-pair variant */
43 u_int32_t v_sp = tolower_sp(s_sp);
/* single-unit case: fold with tolower_w() (presumably the non-surrogate branch) */
52 ucs2_t v = tolower_w(*s);
63 /*******************************************************************
64 Convert a string to upper case.
65 return True if any char is converted
   s is taken as a writable ucs2_t pointer.
   NOTE(review): the loop, the stores and the return statement are not
   visible in this excerpt; the return contract above comes from the
   original comment and should be confirmed against the full body.
66 ********************************************************************/
67 /* surrogate pair support */
69 int strupper_w(ucs2_t *s)
/* *s in 0xD800..0xDBFF: first (high) unit of a surrogate pair */
74 if ((0xD800 <= *s) && (*s < 0xDC00)) {
/* s[1] in 0xDC00..0xDFFF: second (low) unit, so the pair is well formed */
75 if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
/* pack the pair into 32 bits: high unit in bits 31..16, low unit in bits 15..0 */
76 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
/* fold the packed pair with the surrogate-pair variant */
77 u_int32_t v_sp = toupper_sp(s_sp);
/* single-unit case: fold with toupper_w() (presumably the non-surrogate branch) */
86 ucs2_t v = toupper_w(*s);
97 /*******************************************************************
99 determine if a character is lowercase
   Returns nonzero when tolower_w() maps c to itself.
   NOTE(review): any code unit that tolower_w() leaves unchanged is
   reported as lowercase, which presumably includes caseless characters;
   confirm against tolower_w().
100 ********************************************************************/
101 /* These functions are not used. */
103 int islower_w(ucs2_t c)
105 return ( c == tolower_w(c));
/* Surrogate-pair variant: returns nonzero when tolower_sp() maps the
   32-bit value c_sp to itself.  Elsewhere in this file such values hold
   the high unit in bits 31..16 and the low unit in bits 15..0. */
108 int islower_sp(u_int32_t c_sp)
110 return ( c_sp == tolower_sp(c_sp));
113 /*******************************************************************
115 determine if a character is uppercase
   Returns nonzero when toupper_w() maps c to itself.
   NOTE(review): any code unit that toupper_w() leaves unchanged is
   reported as uppercase, which presumably includes caseless characters;
   confirm against toupper_w().
116 ********************************************************************/
117 /* These functions are not used. */
119 int isupper_w(ucs2_t c)
121 return ( c == toupper_w(c));
/* Surrogate-pair variant: returns nonzero when toupper_sp() maps the
   32-bit value c_sp to itself.  Elsewhere in this file such values hold
   the high unit in bits 31..16 and the low unit in bits 15..0. */
124 int isupper_sp(u_int32_t c_sp)
126 return ( c_sp == toupper_sp(c_sp));
129 /*******************************************************************
131 Count the number of characters in a UTF-16 string.
   The count is in ucs2_t units, up to but not including the first
   zero unit.
   NOTE(review): the declaration of len and the return statement are not
   visible in this excerpt; presumably len is returned.
132 ********************************************************************/
133 /* NOTE: one surrogate pair is two characters. */
135 size_t strlen_w(const ucs2_t *src)
/* empty loop body: advance src and bump len until a zero unit is read */
139 for(len = 0; *src++; len++) ;
144 /*******************************************************************
146 Count up to max number of characters in a UTF-16 string.
   The count is in ucs2_t units and stops at the first zero unit or
   once len reaches max.
   NOTE(review): the declaration of len and the return statement are not
   visible in this excerpt; presumably len is returned.
147 ********************************************************************/
148 /* NOTE: one surrogate pair is two characters. */
150 size_t strnlen_w(const ucs2_t *src, size_t max)
/* NOTE(review): *src++ is evaluated before (len < max), so when no zero
   unit occurs in the first max units the element src[max] is still read.
   Testing (len < max) first would avoid touching memory beyond the
   caller-supplied bound; confirm and fix in the full source. */
154 for(len = 0; *src++ && (len < max); len++) ;
159 /*******************************************************************
   Find a ucs2_t unit c in the string s and return a pointer to the
   matching position.  The const qualifier of s is cast away in the
   returned pointer.
   NOTE(review): the loop and the not-found return are not visible in
   this excerpt.
161 ********************************************************************/
162 /* NOTE: hi and lo of surrogate pair are separately processed. */
164 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
/* match at the current position (presumably inside the scan loop) */
167 if (c == *s) return (ucs2_t *)s;
/* second identical test; presumably runs after the loop so that a search
   for the terminating zero unit also succeeds - TODO confirm */
170 if (c == *s) return (ucs2_t *)s;
175 /*******************************************************************
176 wide & sp strcasechr()
   strcasechr_w() looks for a single ucs2_t unit c in s, comparing
   through toupper_w(), and returns a pointer to the matching position
   with the const qualifier of s cast away.
   NOTE(review): the loop and the not-found return are not visible in
   this excerpt.
177 ********************************************************************/
178 /* NOTE: separately process BMP and surrogate pair */
180 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
183 /* LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
/* case-insensitive match: both sides are folded with toupper_w() */
184 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
/* exact match without folding; presumably the post-loop test that lets a
   search for the terminating zero unit succeed - TODO confirm */
187 if (c == *s) return (ucs2_t *)s;
/* Surrogate-pair variant of strcasechr_w(): c_sp is a 32-bit value that
   is compared, after toupper_sp() folding, against the two units at s
   packed as (s[0] << 16) | s[1].  Returns a pointer to the match with
   the const qualifier cast away.
   NOTE(review): the loop around these tests and the final return are not
   visible in this excerpt. */
192 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
/* empty string: nothing to find */
194 if (*s == 0) return NULL;
/* s[1] is read here; the preceding test shows *s is nonzero at least on
   entry, so s[1] is at worst the terminating zero unit */
196 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
203 /*******************************************************************
   Compare two strings of ucs2_t units, unit by unit, without any case
   folding.
   NOTE(review): the return statement is not visible in this excerpt.
205 ********************************************************************/
206 /* no problem of surrogate pair */
208 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
/* advance both pointers while the units are equal and b has not ended */
210 while (*b && *a == *b) { a++; b++; }
212 /* warning: if *a != *b and both are not 0 we return a random
213 greater or lesser than 0 number not related to which
217 /*******************************************************************
   Compare at most len ucs2_t units of a and b, without case folding.
   Returns 0 when len units were stepped over as equal (n reached len);
   otherwise returns the difference *a - *b at the position where
   scanning stopped.
   NOTE(review): the declaration and initialisation of n are not visible
   in this excerpt.
219 ********************************************************************/
220 /* no problem of surrogate pair */
222 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
/* stop at the length limit, at the end of b, or at the first difference */
225 while ((n < len) && *b && *a == *b) { a++; b++; n++;}
/* (len - n) is zero only when the full len units matched */
226 return (len - n)?(*a - *b):0;
229 /*******************************************************************
   Find the first occurrence of the string ins inside s, comparing
   ucs2_t units exactly.  Returns a pointer to the match.
   Returns NULL when either pointer is NULL or either string is empty.
   NOTE(review): the initialisation of r, its advance after a failed
   comparison and the final return are not visible in this excerpt.
231 ********************************************************************/
232 /* no problem of surrogate pair */
234 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
/* reject NULL pointers and empty strings up front */
239 if (!s || !*s || !ins || !*ins) return NULL;
/* length of the needle in ucs2_t units, computed once */
241 inslen = strlen_w(ins);
/* jump to the next candidate position: the next unit equal to the first
   unit of the needle */
243 while ((r = strchr_w(r, *ins))) {
/* candidate found: compare the whole needle at that position */
244 if (strncmp_w(r, ins, inslen) == 0) return r;
250 /*******************************************************************
   Case-insensitive search for the string ins inside s.  Returns a
   pointer to the match.
   Returns NULL when either pointer is NULL or either string is empty,
   and also on the path marked as an illegal sequence below.
   NOTE(review): the initialisation of r, its advance after a failed
   comparison and the final return are not visible in this excerpt.
252 ********************************************************************/
253 /* surrogate pair support */
255 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
/* reject NULL pointers and empty strings up front */
260 if (!s || !*s || !ins || !*ins) return NULL;
/* length of the needle in ucs2_t units, computed once */
262 inslen = strlen_w(ins);
/* needle starts with a high surrogate unit (0xD800..0xDBFF) */
265 if ((0xD800 <= *ins) && (*ins < 0xDC00)) {
/* followed by a low surrogate unit (0xDC00..0xDFFF): a well-formed pair */
266 if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) {
/* pack the leading pair: high unit in bits 31..16, low unit in bits 15..0 */
267 u_int32_t ins_sp = (u_int32_t)*ins << 16 | (u_int32_t)ins[1];
/* candidates are positions whose pair matches ins_sp case-insensitively */
268 while ((r = strcasechr_sp(r, ins_sp))) {
269 if (strncasecmp_w(r, ins, inslen) == 0) return r;
/* presumably reached when the high surrogate is not followed by a low
   surrogate - TODO confirm against the full body */
273 return NULL; /* illegal sequence */
/* needle starts with a single-unit character: candidates come from the
   single-unit case-insensitive search */
276 while ((r = strcasechr_w(r, *ins))) {
277 if (strncasecmp_w(r, ins, inslen) == 0) return r;
284 /*******************************************************************
286 case insensitive string comparison
   Units are folded with tolower_w(); when *a is a high surrogate the
   pair at a and the two units at b are packed into 32 bits and folded
   with tolower_sp().  A nonzero difference is returned as soon as one is
   found; the last visible statement returns the difference of the folded
   units at the final position.
   NOTE(review): the loop header, the pointer advances and the
   declaration of ret are not visible in this excerpt.
287 ********************************************************************/
288 /* surrogate pair support */
290 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
/* *a in 0xD800..0xDBFF: high surrogate, compare as a packed pair */
295 if ((0xD800 <= *a) && (*a < 0xDC00)) {
/* assignment inside the condition is intentional: ret keeps the
   difference and is returned when nonzero.  The two units at b are
   packed the same way without a separate surrogate test on *b. */
296 if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
/* stop if either string has ended before the next unit is examined */
299 if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
/* single-unit comparison after folding; return on the first difference */
301 if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
306 return (tolower_w(*a) - tolower_w(*b));
309 /*******************************************************************
311 case insensitive string comparison, length limited
   Compares while n is below len and neither string has ended.  Units
   are folded with tolower_w(); when *a is a high surrogate the pair at
   a and the two units at b are packed into 32 bits and folded with
   tolower_sp().  Returns 0 when n reached len, otherwise the difference
   of the folded units at the stopping position.
   NOTE(review): the pointer and counter advances and the declarations
   of n and ret are not visible in this excerpt.
312 ********************************************************************/
313 /* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair */
315 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
320 while ((n < len) && *a && *b) {
/* *a in 0xD800..0xDBFF: high surrogate, compare as a packed pair */
321 if ((0xD800 <= *a) && (*a < 0xDC00)) {
/* assignment inside the condition is intentional: ret keeps the
   difference and is returned when nonzero */
322 if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
/* re-check the loop condition before the next unit is examined */
326 if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
/* single-unit comparison after folding; return on the first difference */
328 if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
/* (len - n) is zero only when the length limit was reached */
334 return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
337 /*******************************************************************
   Duplicate len ucs2_t units of src into a buffer obtained from
   malloc(); the buffer has room for len + 1 units.
   NOTE(review): the allocation-failure test, the store of the
   terminating unit and the return statement are not visible in this
   excerpt; presumably the caller owns and frees the result.
340 ********************************************************************/
341 /* NOTE: not check isolation of surrogate pair */
342 /* if len == 0 then duplicate the whole string */
344 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
/* len == 0 means: measure src and copy all of it */
348 if (!len) len = strlen_w(src);
/* one extra unit beyond the len units copied below */
349 dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
/* NOTE(review): the message names strdup_w although this is strndup_w */
351 LOG (log_error, logtype_default, "strdup_w: out of memory!");
/* NOTE(review): a nonzero len is used as given; no visible line clamps
   it to the length of src, so a len larger than the source would read
   past its terminator - confirm callers */
355 memcpy(dest, src, len * sizeof(ucs2_t));
361 /*******************************************************************
   Duplicate a whole string: forwards to strndup_w() with len 0, which
   strndup_w() treats as a request to copy the entire string.
364 ********************************************************************/
365 /* no problem of surrogate pair */
367 ucs2_t *strdup_w(const ucs2_t *src)
369 return strndup_w(src, 0);
372 /*******************************************************************
373 copy a string with max len
   Copies ucs2_t units from src to dest until a zero unit is found in
   src or max units have been copied.  Returns NULL when dest or src is
   NULL.
   NOTE(review): whatever follows the copy loop (termination of dest and
   the return value) is not visible in this excerpt.
374 ********************************************************************/
375 /* This function is not used. */
376 /* NOTE: not check isolation of surrogate pair */
378 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
382 if (!dest || !src) return NULL;
/* NOTE(review): src[len] is read before (len < max) is tested, so when
   src has no zero unit in its first max units the element src[max] is
   still read; swapping the two operands would avoid that */
384 for (len = 0; (src[len] != 0) && (len < max); len++)
385 dest[len] = src[len];
393 /*******************************************************************
394 append a string of at most max ucs2_t units and add a terminator
   (the amount is counted in ucs2_t units, not bytes: the copy below
   moves len * sizeof(ucs2_t) bytes).  Returns NULL when dest or src is
   NULL.
   NOTE(review): the store of the terminator and the return statement
   are not visible in this excerpt, and no size of dest is passed in, so
   the caller must provide enough room.
395 ********************************************************************/
396 /* These functions are not used. */
398 /* NOTE: not check isolation of surrogate pair */
399 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
404 if (!dest || !src) return NULL;
/* append position: current length of dest in units */
406 start = strlen_w(dest);
/* number of units to append, limited to max */
407 len = strnlen_w(src, max);
409 memcpy(&dest[start], src, len*sizeof(ucs2_t));
/* Append src to the end of dest.  Returns NULL when dest or src is NULL.
   NOTE(review): the computation of len, the store of the terminator and
   the return statement are not visible in this excerpt, and no size of
   dest is passed in, so the caller must provide enough room. */
415 /* no problem of surrogate pair */
416 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
421 if (!dest || !src) return NULL;
/* append position: current length of dest in units */
423 start = strlen_w(dest);
426 memcpy(&dest[start], src, len*sizeof(ucs2_t));
433 /*******************************************************************
434 binary search for pre|decomposition
   do_precomposition() looks up the pair (base, comb) in the
   precompositions table and returns the replacement field of the
   matching entry.
   NOTE(review): the loop header, the range updates and the value
   returned when nothing matches are not visible in this excerpt; a
   caller in this file tests the result for nonzero, so the miss value
   is presumably 0.
435 ********************************************************************/
437 static ucs2_t do_precomposition(unsigned int base, unsigned int comb)
/* upper bound of the search range: last table index */
440 int max = PRECOMP_COUNT - 1;
/* search key: base in bits 31..16, comb in bits 15..0 */
442 u_int32_t sought = (base << 16) | comb, that;
446 mid = (min + max) / 2;
/* key of the probed entry, built the same way as sought */
447 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
450 } else if (that > sought) {
/* neither smaller nor larger: the entry matches */
453 return precompositions[mid].replacement;
460 /* ------------------------ */
/* Surrogate-pair variant of the precomposition lookup: searches the
   precompositions_sp table for the pair (base_sp, comb_sp) and returns
   the replacement_sp field of the matching entry.
   NOTE(review): the loop header, the range updates and the value
   returned when nothing matches are not visible in this excerpt. */
461 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp)
/* upper bound of the search range: last table index */
464 int max = PRECOMP_SP_COUNT - 1;
/* 64-bit search key: base_sp in bits 63..32, comb_sp in bits 31..0 */
466 u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
470 mid = (min + max) / 2;
/* key of the probed entry, built the same way as sought_sp */
471 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
472 if (that_sp < sought_sp) {
474 } else if (that_sp > sought_sp) {
/* neither smaller nor larger: the entry matches */
477 return precompositions_sp[mid].replacement_sp;
484 /* -------------------------- */
/* Decomposition lookup: searches the decompositions table for an entry
   whose replacement field equals base and builds a 32-bit result from
   that entry, with its base field in bits 31..16 and its comb field in
   bits 15..0.
   NOTE(review): the loop header, the range updates and the return
   statements are not visible in this excerpt; a caller in this file
   tests the result for zero, so the miss value is presumably 0. */
485 static u_int32_t do_decomposition(ucs2_t base)
/* upper bound of the search range: last table index */
488 int max = DECOMP_COUNT - 1;
/* the table is searched by the composed character */
490 u_int32_t sought = base;
491 u_int32_t result, that;
495 mid = (min + max) / 2;
496 that = decompositions[mid].replacement;
499 } else if (that > sought) {
/* match: pack the decomposed base and combining character */
502 result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
510 /* -------------------------- */
/* Surrogate-pair variant of the decomposition lookup: searches the
   decompositions_sp table for an entry whose replacement_sp field equals
   base_sp and builds a 64-bit result from that entry, with its base_sp
   field in bits 63..32 and its comb_sp field in bits 31..0.
   NOTE(review): the loop header, the range updates, the declarations of
   that_sp and result_sp and the return statements are not visible in
   this excerpt. */
511 static u_int64_t do_decomposition_sp(unsigned int base_sp)
/* upper bound of the search range: last table index */
514 int max = DECOMP_SP_COUNT - 1;
/* the table is searched by the composed character */
516 u_int32_t sought_sp = base_sp;
522 mid = (min + max) / 2;
523 that_sp = decompositions_sp[mid].replacement_sp;
524 if (that_sp < sought_sp) {
526 } else if (that_sp > sought_sp) {
/* match: pack the decomposed base pair and combining pair */
529 result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
537 /*******************************************************************
   precompose_w: combine base characters with following combining
   characters, reading from name and writing to comp.

540 we can't use static, this stuff needs to be reentrant
541 static char comp[MAXPATHLEN +1];
543 We don't implement Singleton and Canonical Ordering.
544 We ignore CompositionExclusions.txt.
545 because they cause the problem of the roundtrip
546 such as Dancing Icon.
548 exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
549 in precompose.h from composition according to AFP 3.x spec

   inplen must be nonzero, even and not larger than *outlen on entry.
   The visible return statements yield o_len - *outlen, i.e. how much of
   the initial *outlen has been used up.
   NOTE(review): inplen and *outlen look like byte counts (odd values are
   rejected) - confirm.  Large parts of the body, including the
   declarations of i, in, out, base, comb, result and result_sp and the
   value returned on bad arguments, are not visible in this excerpt.
550 ********************************************************************/
552 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
/* packed surrogate pairs: high unit in bits 31..16, low unit in bits 15..0 */
556 u_int32_t base_sp, comb_sp;
/* Hangul leading-consonant and vowel indices */
558 ucs2_t lindex, vindex;
/* remember the initial output budget so the amount used can be returned */
561 size_t o_len = *outlen;
/* argument check: empty input, odd length, or input larger than output */
563 if (!inplen || (inplen & 1) || inplen > o_len)
571 while (*outlen > 2) {
578 return o_len - *outlen;
584 /* Non-Combination Character */
587 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
/* comb is a Hangul vowel.
   NOTE(review): the upper bound is inclusive (<=) here while every other
   range test in this function is exclusive (<); this admits
   comb == VBASE + VCOUNT, which looks like an off-by-one against the
   UAX #15 composition algorithm - confirm */
589 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
/* base is a Hangul leading consonant */
590 if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
592 lindex = base - LBASE;
593 vindex = comb - VBASE;
/* compose the leading consonant and vowel into an LV syllable */
594 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
/* comb is a Hangul trailing consonant; TBASE itself is excluded */
599 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
/* base is a syllable whose index is a multiple of TCOUNT, i.e. one
   without a trailing consonant yet */
600 if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
/* add the trailing consonant index to form the final syllable */
602 base += comb - TBASE;
606 /* Binary Search for Surrogate Pair */
/* base is a high surrogate unit */
607 else if ((0xD800 <= base) && (base < 0xDC00)) {
/* comb is the matching low surrogate unit and the input position check
   against inplen passes.
   NOTE(review): i appears to be an offset compared against the byte
   count inplen - confirm its unit */
608 if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
609 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
/* the next two input units form the candidate combining pair */
611 comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
/* assignment inside the condition is intentional: nonzero means a
   precomposed pair was found */
612 if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
/* keep composing while input remains and the last lookup succeeded */
617 } while ((i + 6 <= inplen) && result_sp) ;
/* emit the high unit of the resulting pair */
619 *out = base_sp >> 16;
/* emit the low unit of the resulting pair */
628 *out = base_sp & 0xFFFF;
636 return o_len - *outlen;
645 /* Binary Search for BMP */
/* assignment inside the condition is intentional: nonzero means a
   precomposed character was found */
646 else if (result = do_precomposition(base, comb)) {
662 /* --------------- */
/* decompose_w: split precomposed characters read from name into a base
   character followed by combining characters, written to comp.
   inplen must be nonzero and even.  The visible return statement yields
   o_len - *outlen, i.e. how much of the initial *outlen has been used.
   Combining units are collected at the END of the comb buffer: the
   newest unit is stored at comb[COMBBUFLEN - comblen], and the buffer is
   later emitted from that index onward.
   NOTE(review): inplen and *outlen look like byte counts (odd values are
   rejected and the space checks shift the unit count left by one) -
   confirm.  Large parts of the body, including the declarations of i,
   in, out, comblen, result, base_sp and result_sp and the value returned
   on bad arguments or insufficient space, are not visible in this
   excerpt. */
663 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
667 ucs2_t base, comb[COMBBUFLEN];
/* Hangul syllable index and trailing consonant */
669 ucs2_t sindex, tjamo;
/* remember the initial output budget so the amount used can be returned */
673 size_t o_len = *outlen;
/* argument check: empty input or odd length */
675 if (!inplen || (inplen & 1))
685 /* check ASCII first. this is frequent. */
/* empty statement on purpose: nothing to decompose for base <= 0x7f */
686 if (base <= 0x007f) ;
688 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
689 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
690 sindex = base - SBASE;
/* leading consonant becomes the new base */
691 base = LBASE + sindex / NCOUNT;
/* vowel goes into the second-to-last slot */
692 comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
/* tjamo == TBASE means the syllable has no trailing consonant */
695 if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
/* no trailing consonant: move the vowel into the last slot */
696 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
/* trailing consonant present: it takes the last slot */
702 comb[COMBBUFLEN-1] = tjamo;
707 /* Binary Search for Surrogate Pair */
/* base is a high surrogate unit */
708 else if ((0xD800 <= base) && (base < 0xDC00)) {
/* make sure a following unit exists in the input.
   NOTE(review): i appears to be an offset compared against the byte
   count inplen - confirm its unit */
709 if (i + 2 < inplen) {
/* pack the pair: high unit in bits 31..16, low unit in bits 15..0 */
710 base_sp = ((u_int32_t)base << 16) | (u_int32_t)in[1];
/* zero result: no (further) decomposition, leave the loop */
712 if ( !(result_sp = do_decomposition_sp(base_sp))) break;
/* upper 32 bits: the decomposed base pair, looked up again next round */
714 base_sp = result_sp >> 32;
715 comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF; /* hi */
716 comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF; /* lo */
/* decompose repeatedly, bounded by MAXCOMBSPLEN */
717 } while (comblen < MAXCOMBSPLEN);
/* output space check: (comblen + 1) units, shifted left by one */
719 if (*outlen < (comblen + 1) << 1) {
724 *out = base_sp >> 16; /* hi */
728 base = base_sp & 0xFFFF; /* lo */
735 /* Binary Search for BMP */
/* zero result: no (further) decomposition, leave the loop */
738 if ( !(result = do_decomposition(base))) break;
/* low 16 bits of the lookup result: the combining character */
741 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
/* decompose repeatedly until base is ASCII or MAXCOMBLEN is reached */
742 } while ((0x007f < base) && (comblen < MAXCOMBLEN));
/* output space check: (comblen + 1) units, shifted left by one */
745 if (*outlen < (comblen + 1) << 1) {
/* emit the collected combining units */
754 while ( comblen > 0 ) {
755 *out = comb[COMBBUFLEN-comblen];
766 return o_len-*outlen;
769 /*******************************************************************
770 length of UTF-8 character and string
   utf8_charlen() classifies the byte sequence starting at utf8 by its
   lead byte and checks the following bytes; when no pattern matches it
   returns (size_t) -1.
   NOTE(review): the test for a single-byte character and the value
   returned by each matching branch are not visible in this excerpt;
   presumably each branch returns the sequence length in bytes.
771 ********************************************************************/
773 size_t utf8_charlen ( char* utf8 )
/* work on unsigned bytes so the range comparisons behave as intended */
777 p = (unsigned char*) utf8;
/* lead byte C2..DF followed by one continuation byte 80..BF */
781 else if ( *p > 0xC1 && *p < 0xe0 && *(p+1) > 0x7f && *(p+1) < 0xC0)
/* lead byte E0: second byte restricted to A0..BF, then 80..BF */
783 else if ( *p == 0xe0 && *(p+1) > 0x9f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead byte E1..EF followed by two continuation bytes 80..BF.
   NOTE(review): this also accepts ED followed by A0..BF, which encodes
   the surrogate range U+D800..U+DFFF and is ill-formed UTF-8 according
   to the Unicode Standard - confirm whether that is intended */
785 else if ( *p > 0xe0 && *p < 0xf0 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead byte F0: second byte restricted to 90..BF, then two of 80..BF */
787 else if ( *p == 0xf0 && *(p+1) > 0x8f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead byte F1..F3 followed by three continuation bytes 80..BF */
789 else if ( *p > 0xf0 && *p < 0xf4 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead byte F4: second byte restricted to 80..8F, then two of 80..BF */
791 else if ( *p == 0xf4 && *(p+1) > 0x7f && *(p+1) < 0x90 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* no pattern matched: report an invalid sequence */
794 return ((size_t) -1);
/* utf8_strlen_validate: walk the byte string utf8 and check each
   sequence against the lead-byte patterns below; when a sequence matches
   no pattern the function returns (size_t) -1.
   NOTE(review): the loop header, the pointer advances, the counter and
   the final return are not visible in this excerpt; going by the name,
   the function presumably returns the number of characters - confirm. */
798 size_t utf8_strlen_validate ( char * utf8 )
/* work on unsigned bytes so the range comparisons behave as intended */
803 p = (unsigned char*) utf8;
806 /* see http://www.unicode.org/unicode/reports/tr27/ for an explanation */
/* lead byte C2..DF followed by one continuation byte 80..BF */
813 else if ( *p > 0xC1 && *p < 0xe0 && *(p+1) > 0x7f && *(p+1) < 0xC0)
/* lead byte E0: second byte restricted to A0..BF, then 80..BF */
816 else if ( *p == 0xe0 && *(p+1) > 0x9f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead byte E1..EF followed by two continuation bytes 80..BF.
   NOTE(review): this also accepts ED followed by A0..BF, which encodes
   the surrogate range U+D800..U+DFFF and is ill-formed UTF-8 according
   to the Unicode Standard - confirm whether that is intended */
819 else if ( *p > 0xe0 && *p < 0xf0 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
/* lead byte F0: second byte restricted to 90..BF, then two of 80..BF */
822 else if ( *p == 0xf0 && *(p+1) > 0x8f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead byte F1..F3 followed by three continuation bytes 80..BF */
825 else if ( *p > 0xf0 && *p < 0xf4 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* lead byte F4: second byte restricted to 80..8F, then two of 80..BF */
828 else if ( *p == 0xf4 && *(p+1) > 0x7f && *(p+1) < 0x90 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
/* no pattern matched: report an invalid sequence */
832 return ((size_t) -1);