]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
case-conversion of surrogate pair
[netatalk.git] / libatalk / unicode / util_unistr.c
/*******************************************************************
  NOTE:
  Early netatalk 2.x was based on UCS-2.
  UCS-2 doesn't support characters at or above U+10000.
  Recent netatalk is based on UTF-16.
  UTF-16 supports characters at or above U+10000 by using surrogate pairs.
  However, surrogate pairs are complex, dirty, filthy and disagreeable.
  There might still be latent bugs...
********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22
23 #include <netatalk/endian.h>
24
25 #include <atalk/unicode.h>
26 #include "precompose.h"
27 #include "byteorder.h"
28
29 /*******************************************************************
30  Convert a string to lower case.
31  return True if any char is converted
32 ********************************************************************/
33 /* surrogate pair support */
34
35 int strlower_w(ucs2_t *s)
36 {
37         int ret = 0;
38
39         while (*s) {
40                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
41                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
42                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
43                                 u_int32_t v_sp = tolower_sp(s_sp);
44                                 if (v_sp != s_sp) {
45                                         *s = v_sp >> 16;
46                                         s++;
47                                         *s = v_sp & 0xFFFF;
48                                         ret = 1;
49                                 }
50                         }
51                 } else {
52                         ucs2_t v = tolower_w(*s);
53                         if (v != *s) {
54                                 *s = v;
55                                 ret = 1;
56                         }
57                 }
58                 s++;
59         }
60         return ret;
61 }
62
63 /*******************************************************************
64  Convert a string to upper case.
65  return True if any char is converted
66 ********************************************************************/
67 /* surrogate pair support */
68
69 int strupper_w(ucs2_t *s)
70 {
71         int ret = 0;
72
73         while (*s) {
74                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
75                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
76                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
77                                 u_int32_t v_sp = toupper_sp(s_sp);
78                                 if (v_sp != s_sp) {
79                                         *s = v_sp >> 16;
80                                         s++;
81                                         *s = v_sp & 0xFFFF;
82                                         ret = 1;
83                                 }
84                         }
85                 } else {
86                         ucs2_t v = toupper_w(*s);
87                         if (v != *s) {
88                                 *s = v;
89                                 ret = 1;
90                         }
91                 }
92                 s++;
93         }
94         return ret;
95 }
96
97 /*******************************************************************
98 wide & sp islower()
99 determine if a character is lowercase
100 ********************************************************************/
101 /* These functions are not used. */
102
103 int islower_w(ucs2_t c)
104 {
105         return ( c == tolower_w(c));
106 }
107
/* Surrogate-pair variant: 1 if tolower_sp() leaves c_sp unchanged, else 0. */
int islower_sp(u_int32_t c_sp)
{
        if (tolower_sp(c_sp) != c_sp)
                return 0;
        return 1;
}
112
113 /*******************************************************************
114 wide & sp isupper()
115 determine if a character is uppercase
116 ********************************************************************/
117 /* These functions are not used. */
118
119 int isupper_w(ucs2_t c)
120 {
121         return ( c == toupper_w(c));
122 }
123
/* Surrogate-pair variant: 1 if toupper_sp() leaves c_sp unchanged, else 0. */
int isupper_sp(u_int32_t c_sp)
{
        if (toupper_sp(c_sp) != c_sp)
                return 0;
        return 1;
}
128
129 /*******************************************************************
130 wide strlen()
131  Count the number of characters in a UTF-16 string.
132 ********************************************************************/
133 /* NOTE: one surrogate pair is two characters. */
134
135 size_t strlen_w(const ucs2_t *src)
136 {
137         size_t len;
138
139         for(len = 0; *src++; len++) ;
140
141         return len;
142 }
143
144 /*******************************************************************
145 wide strnlen()
146  Count up to max number of characters in a UTF-16 string.
147 ********************************************************************/
148 /* NOTE: one surrogate pair is two characters. */
149
150 size_t strnlen_w(const ucs2_t *src, size_t max)
151 {
152         size_t len;
153
154         for(len = 0; *src++ && (len < max); len++) ;
155
156         return len;
157 }
158
159 /*******************************************************************
160 wide strchr()
161 ********************************************************************/
162 /* NOTE: hi and lo of surrogate pair are separately processed. */
163
164 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
165 {
166         while (*s != 0) {
167                 if (c == *s) return (ucs2_t *)s;
168                 s++;
169         }
170         if (c == *s) return (ucs2_t *)s;
171
172         return NULL;
173 }
174
175 /*******************************************************************
176 wide & sp strcasechr()
177 ********************************************************************/
178 /* NOTE: separately process BMP and surrogate pair */
179
180 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
181 {
182         while (*s != 0) {
183 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
184                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
185                 s++;
186         }
187         if (c == *s) return (ucs2_t *)s;
188
189         return NULL;
190 }
191
192 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
193 {
194         if (*s == 0) return NULL;
195         while (s[1] != 0) {
196                 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
197                 s++;
198         }
199
200         return NULL;
201 }
202
203 /*******************************************************************
204 wide strcmp()
205 ********************************************************************/
206 /* no problem of surrogate pair */
207
208 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
209 {
210         while (*b && *a == *b) { a++; b++; }
211         return (*a - *b);
212         /* warning: if *a != *b and both are not 0 we retrun a random
213            greater or lesser than 0 number not realted to which
214            string is longer */
215 }
216
217 /*******************************************************************
218 wide strncmp()
219 ********************************************************************/
220 /* no problem of surrogate pair */
221
222 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
223 {
224         size_t n = 0;
225         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
226         return (len - n)?(*a - *b):0;
227 }
228
229 /*******************************************************************
230 wide strstr()
231 ********************************************************************/
232 /* no problem of surrogate pair */
233
234 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
235 {
236         ucs2_t *r;
237         size_t slen, inslen;
238
239         if (!s || !*s || !ins || !*ins) return NULL;
240         slen = strlen_w(s);
241         inslen = strlen_w(ins);
242         r = (ucs2_t *)s;
243         while ((r = strchr_w(r, *ins))) {
244                 if (strncmp_w(r, ins, inslen) == 0) return r;
245                 r++;
246         }
247         return NULL;
248 }
249
250 /*******************************************************************
251 wide strcasestr()
252 ********************************************************************/
253 /* */
254
255 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
256 {
257         ucs2_t *r;
258         size_t slen, inslen;
259
260         if (!s || !*s || !ins || !*ins) return NULL;
261         slen = strlen_w(s);
262         inslen = strlen_w(ins);
263         r = (ucs2_t *)s;
264         while ((r = strcasechr_w(r, *ins))) {
265                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
266                 r++;
267         }
268         return NULL;
269 }
270
271 /*******************************************************************
272 wide strcasecmp()
273 case insensitive string comparison
274 ********************************************************************/
275 /* surrogate pair support */
276
277 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
278 {
279         int ret;
280
281         while (*a && *b) {
282                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
283                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
284                         a++;
285                         b++;
286                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
287                 } else {
288                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
289                 }
290                 a++;
291                 b++;
292         }
293         return (tolower_w(*a) - tolower_w(*b));
294 }
295
296 /*******************************************************************
297 wide strncasecmp()
298 case insensitive string comparison, length limited
299 ********************************************************************/
300 /* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair  */
301
302 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
303 {
304         size_t n = 0;
305         int ret;
306
307         while ((n < len) && *a && *b) {
308                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
309                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
310                         a++;
311                         b++;
312                         n++;
313                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
314                 } else {
315                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
316                 }
317                 a++;
318                 b++;
319                 n++;
320         }
321         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
322 }
323
324 /*******************************************************************
325 wide strndup()
326 duplicate string
327 ********************************************************************/
328 /* NOTE: not check isolation of surrogate pair */
329 /* if len == 0 then duplicate the whole string */
330
331 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
332 {
333         ucs2_t *dest;
334
335         if (!len) len = strlen_w(src);
336         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
337         if (!dest) {
338                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
339                 return NULL;
340         }
341
342         memcpy(dest, src, len * sizeof(ucs2_t));
343         dest[len] = 0;
344
345         return dest;
346 }
347
348 /*******************************************************************
349 wide strdup()
350 duplicate string
351 ********************************************************************/
352 /* no problem of surrogate pair */
353
354 ucs2_t *strdup_w(const ucs2_t *src)
355 {
356         return strndup_w(src, 0);
357 }
358
359 /*******************************************************************
360 copy a string with max len
361 ********************************************************************/
362
363 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
364 {
365         size_t len;
366
367         if (!dest || !src) return NULL;
368
369         for (len = 0; (src[len] != 0) && (len < max); len++)
370                 dest[len] = src[len];
371         while (len < max)
372                 dest[len++] = 0;
373
374         return dest;
375 }
376
377
378 /*******************************************************************
379 append a string of len bytes and add a terminator
380 ********************************************************************/
381
382 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
383 {
384         size_t start;
385         size_t len;
386
387         if (!dest || !src) return NULL;
388
389         start = strlen_w(dest);
390         len = strnlen_w(src, max);
391
392         memcpy(&dest[start], src, len*sizeof(ucs2_t));
393         dest[start+len] = 0;
394
395         return dest;
396 }
397
398
399 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
400 {
401         size_t start;
402         size_t len;
403
404         if (!dest || !src) return NULL;
405
406         start = strlen_w(dest);
407         len = strlen_w(src);
408
409         memcpy(&dest[start], src, len*sizeof(ucs2_t));
410         dest[start+len] = 0;
411
412         return dest;
413 }
414
415
416 /*******************************************************************
417 binary search for pre|decomposition
418 ********************************************************************/
419
420 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
421 {
422         int min = 0;
423         int max = PRECOMP_COUNT - 1;
424         int mid;
425         u_int32_t sought = (base << 16) | comb, that;
426
427         /* binary search */
428         while (max >= min) {
429                 mid = (min + max) / 2;
430                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
431                 if (that < sought) {
432                         min = mid + 1;
433                 } else if (that > sought) {
434                         max = mid - 1;
435                 } else {
436                         return precompositions[mid].replacement;
437                 }
438         }
439         /* no match */
440         return 0;
441 }
442
443 /* ------------------------ */
444 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
445 {
446         int min = 0;
447         int max = PRECOMP_SP_COUNT - 1;
448         int mid;
449         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
450
451         /* binary search */
452         while (max >= min) {
453                 mid = (min + max) / 2;
454                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
455                 if (that_sp < sought_sp) {
456                         min = mid + 1;
457                 } else if (that_sp > sought_sp) {
458                         max = mid - 1;
459                 } else {
460                         return precompositions_sp[mid].replacement_sp;
461                 }
462         }
463         /* no match */
464         return 0;
465 }
466
467 /* -------------------------- */
468 static u_int32_t do_decomposition(ucs2_t base) 
469 {
470         int min = 0;
471         int max = DECOMP_COUNT - 1;
472         int mid;
473         u_int32_t sought = base;
474         u_int32_t result, that;
475
476         /* binary search */
477         while (max >= min) {
478                 mid = (min + max) / 2;
479                 that = decompositions[mid].replacement;
480                 if (that < sought) {
481                         min = mid + 1;
482                 } else if (that > sought) {
483                         max = mid - 1;
484                 } else {
485                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
486                         return result;
487                 }
488         }
489         /* no match */
490         return 0;
491 }
492
493 /* -------------------------- */
494 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
495 {
496         int min = 0;
497         int max = DECOMP_SP_COUNT - 1;
498         int mid;
499         u_int32_t sought_sp = base_sp;
500         u_int32_t that_sp;
501         u_int64_t result_sp;
502
503         /* binary search */
504         while (max >= min) {
505                 mid = (min + max) / 2;
506                 that_sp = decompositions_sp[mid].replacement_sp;
507                 if (that_sp < sought_sp) {
508                         min = mid + 1;
509                 } else if (that_sp > sought_sp) {
510                         max = mid - 1;
511                 } else {
512                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
513                         return result_sp;
514                 }
515         }
516         /* no match */
517         return 0;
518 }
519
520 /*******************************************************************
521 pre|decomposition
522
523    we can't use static, this stuff needs to be reentrant
524    static char comp[MAXPATHLEN +1];
525
526    We don't implement Singleton and Canonical Ordering.
527    We ignore CompositionExclusions.txt.
528    because they cause the problem of the roundtrip
529    such as Dancing Icon.
530
531    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
532    in precompose.h from composition according to AFP 3.x spec
533 ********************************************************************/
534
535 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
536 {
537         size_t i;
538         ucs2_t base, comb;
539         u_int32_t base_sp, comb_sp;
540         ucs2_t *in, *out;
541         ucs2_t lindex, vindex;
542         ucs2_t result;
543         u_int32_t result_sp;
544         size_t o_len = *outlen;
545         
546         if (!inplen || (inplen & 1) || inplen > o_len)
547                 return (size_t)-1;
548         
549         i = 0;
550         in  = name;
551         out = comp;
552         
553         base = *in;
554         while (*outlen > 2) {
555                 i += 2;
556                 in++;
557
558                 if (i == inplen) {
559                         *out = base;
560                         out++;
561                         *out = 0;
562                         *outlen -= 2;
563                         return o_len - *outlen;
564                 }
565
566                 comb = *in;
567                 result = 0;
568
569                 /* Non-Combination Character */
570                 if (comb < 0x300) ;
571                 
572                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
573                 /* Step 1 <L,V> */
574                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
575                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
576                                 result = 1;
577                                 lindex = base - LBASE;
578                                 vindex = comb - VBASE;
579                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
580                         }
581                 }
582                 
583                 /* Step 2 <LV,T> */
584                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
585                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
586                                 result = 1;
587                                 base += comb - TBASE;
588                         }
589                 }
590                 
591                 /* Binary Search for Surrogate Pair */
592                 else if ((0xD800 <= base) && (base < 0xDC00)) {
593                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
594                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
595                                 do {
596                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
597                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
598                                                 base_sp = result_sp;
599                                                 i += 4;
600                                                 in +=2;
601                                         }
602                                 } while ((i + 4 <= inplen) && result_sp) ;
603
604                                 *out = base_sp >> 16;
605                                 out++;
606                                 *outlen -= 2;
607
608                                 if (*outlen <= 2) {
609                                         errno = E2BIG;
610                                         return (size_t)-1;
611                                 }
612
613                                 *out = base_sp & 0xFFFF;
614                                 out++;
615                                 *outlen -= 2;
616
617                                 i += 2;
618                                 in++;
619                                 base = *in;
620
621                                 result = 1;
622                         }
623                 }
624
625                 /* Binary Search for BMP */
626                 else if (result = do_precomposition(base, comb)) {
627                         base = result;
628                 }
629                 
630                 if (!result) {
631                         *out = base;
632                         out++;
633                         *outlen -= 2;
634                         base = comb;
635                 }
636         }
637
638         errno = E2BIG;
639         return (size_t)-1;
640 }
641
642 /* --------------- */
643 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
644 {
645         size_t i;
646         size_t comblen;
647         ucs2_t base, comb[COMBBUFLEN];
648         u_int32_t base_sp;
649         ucs2_t sindex, tjamo;
650         ucs2_t *in, *out;
651         unsigned int result;
652         u_int64_t result_sp;
653         size_t o_len = *outlen;
654
655         if (!inplen || (inplen & 1))
656                 return (size_t)-1;
657         i = 0;
658         in  = name;
659         out = comp;
660
661         while (i < inplen) {
662                 base = *in;
663                 comblen = 0;
664                 
665                 /* check ASCII first. this is frequent. */
666                 if (base <= 0x007f) ;
667                 
668                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
669                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
670                         sindex = base - SBASE;
671                         base = LBASE + sindex / NCOUNT;
672                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
673                         
674                         /* <L,V> */
675                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
676                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
677                                 comblen = 1;
678                         }
679                         
680                         /* <L,V,T> */
681                         else {
682                                 comb[COMBBUFLEN-1] = tjamo;
683                                 comblen = 2;
684                         }
685                 }
686                 
687                 /* Binary Search for Surrogate Pair */
688                 else if ((0xD800 <= base) && (base < 0xDC00)) {
689                         if (i + 2 < inplen) {
690                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
691                                 do {
692                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
693                                         comblen += 2;
694                                         base_sp = result_sp >> 32;
695                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
696                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
697                                 } while (comblen < MAXCOMBSPLEN);
698
699                                 if (*outlen < (comblen + 1) << 1) {
700                                         errno = E2BIG;
701                                         return (size_t)-1;
702                                 }
703
704                                 *out = base_sp >> 16;   /* hi */
705                                 out++;
706                                 *outlen -= 2;
707                                 
708                                 base = base_sp & 0xFFFF; /* lo */
709                                 
710                                 i += 2;
711                                 in++;
712                         }
713                 }
714                         
715                 /* Binary Search for BMP */
716                 else {
717                         do {
718                                 if ( !(result = do_decomposition(base))) break;
719                                 comblen++;
720                                 base = result  >> 16;
721                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
722                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
723                 }
724                 
725                 if (*outlen < (comblen + 1) << 1) {
726                         errno = E2BIG;
727                         return (size_t)-1;
728                 }
729                 
730                 *out = base;
731                 out++;
732                 *outlen -= 2;
733                 
734                 while ( comblen > 0 ) {
735                         *out = comb[COMBBUFLEN-comblen];
736                         out++;
737                         *outlen -= 2;
738                         comblen--;
739                 }
740                 
741                 i += 2;
742                 in++;
743         }
744         
745         *out = 0;
746         return o_len-*outlen;
747 }
748
/*******************************************************************
length of UTF-8 character and string
********************************************************************/

/*
 * Return the length in bytes (1 to 4) of the UTF-8 sequence that
 * starts at utf8, or (size_t)-1 if the bytes do not form an accepted
 * sequence. A leading 0 byte counts as a one byte sequence.
 * Trailing bytes are only read after all bytes before them were
 * accepted, so the terminator of a string is never passed.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *b = (const unsigned char *) utf8;
        const unsigned char lead = b[0];
        unsigned char second_min = 0x80;        /* allowed range of the 2nd byte */
        unsigned char second_max = 0xBF;
        size_t need;
        size_t k;

        if (lead < 0x80)
                return (1);

        if (lead >= 0xC2 && lead <= 0xDF) {
                need = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
                need = 3;
                if (lead == 0xE0)
                        second_min = 0xA0;      /* rejects overlong 3 byte forms */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
                need = 4;
                if (lead == 0xF0)
                        second_min = 0x90;      /* rejects overlong 4 byte forms */
                else if (lead == 0xF4)
                        second_max = 0x8F;      /* nothing above U+10FFFF */
        } else {
                /* stray continuation byte, 0xC0/0xC1 or 0xF5 and above */
                return ((size_t) -1);
        }

        if (b[1] < second_min || b[1] > second_max)
                return ((size_t) -1);

        /* every further byte has to be a plain continuation byte */
        for (k = 2; k < need; k++) {
                if (b[k] < 0x80 || b[k] > 0xBF)
                        return ((size_t) -1);
        }

        return (need);
}
776
777
/* 1 if b is a UTF-8 continuation byte (0x80..0xBF), else 0 */
static int utf8v_is_trail ( unsigned char b )
{
        return (b >= 0x80 && b <= 0xBF);
}

/*
 * Count the UTF-8 sequences in a 0-terminated string while checking
 * each of them.
 * Returns the number of sequences, or (size_t)-1 as soon as a byte
 * sequence is not accepted.
 * See http://www.unicode.org/unicode/reports/tr27/ for an explanation
 * of the accepted byte ranges.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *cur = (const unsigned char *) utf8;
        size_t count = 0;

        while ( *cur != '\0')
        {
                const unsigned char lead = *cur;
                size_t width;

                if (lead <= 0x7F) {
                        width = 1;
                } else if (lead >= 0xC2 && lead <= 0xDF) {
                        if (!utf8v_is_trail(cur[1]))
                                return ((size_t) -1);
                        width = 2;
                } else if (lead >= 0xE0 && lead <= 0xEF) {
                        /* 0xE0 needs 0xA0 or above next, otherwise overlong */
                        const unsigned char second_min = (lead == 0xE0) ? 0xA0 : 0x80;

                        if (cur[1] < second_min || cur[1] > 0xBF
                            || !utf8v_is_trail(cur[2]))
                                return ((size_t) -1);
                        width = 3;
                } else if (lead >= 0xF0 && lead <= 0xF4) {
                        /* 0xF0 needs 0x90 or above next (overlong),
                           0xF4 at most 0x8F (beyond U+10FFFF) */
                        const unsigned char second_min = (lead == 0xF0) ? 0x90 : 0x80;
                        const unsigned char second_max = (lead == 0xF4) ? 0x8F : 0xBF;

                        if (cur[1] < second_min || cur[1] > second_max
                            || !utf8v_is_trail(cur[2])
                            || !utf8v_is_trail(cur[3]))
                                return ((size_t) -1);
                        width = 4;
                } else {
                        /* stray continuation byte, 0xC0/0xC1 or 0xF5 and above */
                        return ((size_t) -1);
                }

                cur += width;
                count++;
        }

        return (count);
}