]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
416c38d0b86d5aaf098a5d09fb8891569fc8c324
[netatalk.git] / libatalk / unicode / util_unistr.c
/*******************************************************************
  NOTE:
  Early netatalk 2.x was based on UCS-2.
  UCS-2 cannot represent characters at or above U+10000.
  Recent netatalk is based on UTF-16.
  UTF-16 represents characters at or above U+10000 with surrogate pairs.
  Surrogate pair handling is complex and error-prone,
  so there may still be latent bugs...
********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22 #include <arpa/inet.h>
23
24 #include <atalk/unicode.h>
25 #include "precompose.h"
26 #include "byteorder.h"
27
28 /*******************************************************************
29  Convert a string to lower case.
30  return True if any char is converted
31 ********************************************************************/
32 /* surrogate pair support */
33
34 int strlower_w(ucs2_t *s)
35 {
36         int ret = 0;
37
38         while (*s) {
39                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
40                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
41                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
42                                 u_int32_t v_sp = tolower_sp(s_sp);
43                                 if (v_sp != s_sp) {
44                                         *s = v_sp >> 16;
45                                         s++;
46                                         *s = v_sp & 0xFFFF;
47                                         ret = 1;
48                                 }
49                         }
50                 } else {
51                         ucs2_t v = tolower_w(*s);
52                         if (v != *s) {
53                                 *s = v;
54                                 ret = 1;
55                         }
56                 }
57                 s++;
58         }
59         return ret;
60 }
61
62 /*******************************************************************
63  Convert a string to upper case.
64  return True if any char is converted
65 ********************************************************************/
66 /* surrogate pair support */
67
68 int strupper_w(ucs2_t *s)
69 {
70         int ret = 0;
71
72         while (*s) {
73                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
74                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
75                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
76                                 u_int32_t v_sp = toupper_sp(s_sp);
77                                 if (v_sp != s_sp) {
78                                         *s = v_sp >> 16;
79                                         s++;
80                                         *s = v_sp & 0xFFFF;
81                                         ret = 1;
82                                 }
83                         }
84                 } else {
85                         ucs2_t v = toupper_w(*s);
86                         if (v != *s) {
87                                 *s = v;
88                                 ret = 1;
89                         }
90                 }
91                 s++;
92         }
93         return ret;
94 }
95
96 /*******************************************************************
97 wide & sp islower()
98 determine if a character is lowercase
99 ********************************************************************/
100 /* These functions are not used. */
101
102 int islower_w(ucs2_t c)
103 {
104         return ( c == tolower_w(c));
105 }
106
/*
 * Surrogate-pair variant of islower_w(): c_sp packs the pair as
 * (hi << 16 | lo), the form the other *_sp helpers in this file use.
 * Returns non-zero when tolower_sp() leaves the value unchanged.
 */
int islower_sp(u_int32_t c_sp)
{
        u_int32_t folded = tolower_sp(c_sp);

        return (folded == c_sp);
}
111
112 /*******************************************************************
113 wide & sp isupper()
114 determine if a character is uppercase
115 ********************************************************************/
116 /* These functions are not used. */
117
118 int isupper_w(ucs2_t c)
119 {
120         return ( c == toupper_w(c));
121 }
122
/*
 * Surrogate-pair variant of isupper_w(): c_sp packs the pair as
 * (hi << 16 | lo), the form the other *_sp helpers in this file use.
 * Returns non-zero when toupper_sp() leaves the value unchanged.
 */
int isupper_sp(u_int32_t c_sp)
{
        u_int32_t mapped = toupper_sp(c_sp);

        return (mapped == c_sp);
}
127
128 /*******************************************************************
129 wide strlen()
130  Count the number of characters in a UTF-16 string.
131 ********************************************************************/
132 /* NOTE: one surrogate pair is two characters. */
133
134 size_t strlen_w(const ucs2_t *src)
135 {
136         size_t len;
137
138         for(len = 0; *src++; len++) ;
139
140         return len;
141 }
142
143 /*******************************************************************
144 wide strnlen()
145  Count up to max number of characters in a UTF-16 string.
146 ********************************************************************/
147 /* NOTE: one surrogate pair is two characters. */
148
149 size_t strnlen_w(const ucs2_t *src, size_t max)
150 {
151         size_t len;
152
153         for(len = 0; *src++ && (len < max); len++) ;
154
155         return len;
156 }
157
158 /*******************************************************************
159 wide strchr()
160 ********************************************************************/
161 /* NOTE: hi and lo of surrogate pair are separately processed. */
162
163 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
164 {
165         while (*s != 0) {
166                 if (c == *s) return (ucs2_t *)s;
167                 s++;
168         }
169         if (c == *s) return (ucs2_t *)s;
170
171         return NULL;
172 }
173
174 /*******************************************************************
175 wide & sp strcasechr()
176 ********************************************************************/
177 /* NOTE: separately process BMP and surrogate pair */
178
179 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
180 {
181         while (*s != 0) {
182 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
183                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
184                 s++;
185         }
186         if (c == *s) return (ucs2_t *)s;
187
188         return NULL;
189 }
190
191 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
192 {
193         if (*s == 0) return NULL;
194         while (s[1] != 0) {
195                 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
196                 s++;
197         }
198
199         return NULL;
200 }
201
202 /*******************************************************************
203 wide strcmp()
204 ********************************************************************/
205 /* no problem of surrogate pair */
206
207 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
208 {
209         while (*b && *a == *b) { a++; b++; }
210         return (*a - *b);
211         /* warning: if *a != *b and both are not 0 we retrun a random
212            greater or lesser than 0 number not realted to which
213            string is longer */
214 }
215
216 /*******************************************************************
217 wide strncmp()
218 ********************************************************************/
219 /* no problem of surrogate pair */
220
221 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
222 {
223         size_t n = 0;
224         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
225         return (len - n)?(*a - *b):0;
226 }
227
228 /*******************************************************************
229 wide strstr()
230 ********************************************************************/
231 /* no problem of surrogate pair */
232
233 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
234 {
235         ucs2_t *r;
236         size_t slen, inslen;
237
238         if (!s || !*s || !ins || !*ins) return NULL;
239         slen = strlen_w(s);
240         inslen = strlen_w(ins);
241         r = (ucs2_t *)s;
242         while ((r = strchr_w(r, *ins))) {
243                 if (strncmp_w(r, ins, inslen) == 0) return r;
244                 r++;
245         }
246         return NULL;
247 }
248
249 /*******************************************************************
250 wide strcasestr()
251 ********************************************************************/
252 /* */
253
254 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
255 {
256         ucs2_t *r;
257         size_t slen, inslen;
258
259         if (!s || !*s || !ins || !*ins) return NULL;
260         slen = strlen_w(s);
261         inslen = strlen_w(ins);
262         r = (ucs2_t *)s;
263         while ((r = strcasechr_w(r, *ins))) {
264                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
265                 r++;
266         }
267         return NULL;
268 }
269
270 /*******************************************************************
271 wide strcasecmp()
272 case insensitive string comparison
273 ********************************************************************/
274 /* surrogate pair support */
275
276 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
277 {
278         int ret;
279
280         while (*a && *b) {
281                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
282                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
283                         a++;
284                         b++;
285                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
286                 } else {
287                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
288                 }
289                 a++;
290                 b++;
291         }
292         return (tolower_w(*a) - tolower_w(*b));
293 }
294
295 /*******************************************************************
296 wide strncasecmp()
297 case insensitive string comparison, length limited
298 ********************************************************************/
/* NOTE: if 'len' would split a surrogate pair, the comparison runs one unit past 'len' */
300
301 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
302 {
303         size_t n = 0;
304         int ret;
305
306         while ((n < len) && *a && *b) {
307                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
308                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
309                         a++;
310                         b++;
311                         n++;
312                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
313                 } else {
314                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
315                 }
316                 a++;
317                 b++;
318                 n++;
319         }
320         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
321 }
322
323 /*******************************************************************
324 wide strndup()
325 duplicate string
326 ********************************************************************/
327 /* NOTE: not check isolation of surrogate pair */
328 /* if len == 0 then duplicate the whole string */
329
330 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
331 {
332         ucs2_t *dest;
333
334         if (!len) len = strlen_w(src);
335         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
336         if (!dest) {
337                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
338                 return NULL;
339         }
340
341         memcpy(dest, src, len * sizeof(ucs2_t));
342         dest[len] = 0;
343
344         return dest;
345 }
346
347 /*******************************************************************
348 wide strdup()
349 duplicate string
350 ********************************************************************/
351 /* no problem of surrogate pair */
352
353 ucs2_t *strdup_w(const ucs2_t *src)
354 {
355         return strndup_w(src, 0);
356 }
357
358 /*******************************************************************
359 copy a string with max len
360 ********************************************************************/
361
362 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
363 {
364         size_t len;
365
366         if (!dest || !src) return NULL;
367
368         for (len = 0; (src[len] != 0) && (len < max); len++)
369                 dest[len] = src[len];
370         while (len < max)
371                 dest[len++] = 0;
372
373         return dest;
374 }
375
376
/*******************************************************************
append at most max characters (16-bit code units) of src to dest
and add a terminator
********************************************************************/
380
381 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
382 {
383         size_t start;
384         size_t len;
385
386         if (!dest || !src) return NULL;
387
388         start = strlen_w(dest);
389         len = strnlen_w(src, max);
390
391         memcpy(&dest[start], src, len*sizeof(ucs2_t));
392         dest[start+len] = 0;
393
394         return dest;
395 }
396
397
398 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
399 {
400         size_t start;
401         size_t len;
402
403         if (!dest || !src) return NULL;
404
405         start = strlen_w(dest);
406         len = strlen_w(src);
407
408         memcpy(&dest[start], src, len*sizeof(ucs2_t));
409         dest[start+len] = 0;
410
411         return dest;
412 }
413
414
415 /*******************************************************************
416 binary search for pre|decomposition
417 ********************************************************************/
418
419 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
420 {
421         int min = 0;
422         int max = PRECOMP_COUNT - 1;
423         int mid;
424         uint32_t sought = (base << 16) | comb, that;
425
426         /* binary search */
427         while (max >= min) {
428                 mid = (min + max) / 2;
429                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
430                 if (that < sought) {
431                         min = mid + 1;
432                 } else if (that > sought) {
433                         max = mid - 1;
434                 } else {
435                         return precompositions[mid].replacement;
436                 }
437         }
438         /* no match */
439         return 0;
440 }
441
442 /* ------------------------ */
443 static uint32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
444 {
445         int min = 0;
446         int max = PRECOMP_SP_COUNT - 1;
447         int mid;
448         uint64_t sought_sp = ((uint64_t)base_sp << 32) | (uint64_t)comb_sp, that_sp;
449
450         /* binary search */
451         while (max >= min) {
452                 mid = (min + max) / 2;
453                 that_sp = ((uint64_t)precompositions_sp[mid].base_sp << 32) | ((uint64_t)precompositions_sp[mid].comb_sp);
454                 if (that_sp < sought_sp) {
455                         min = mid + 1;
456                 } else if (that_sp > sought_sp) {
457                         max = mid - 1;
458                 } else {
459                         return precompositions_sp[mid].replacement_sp;
460                 }
461         }
462         /* no match */
463         return 0;
464 }
465
466 /* -------------------------- */
467 static uint32_t do_decomposition(ucs2_t base) 
468 {
469         int min = 0;
470         int max = DECOMP_COUNT - 1;
471         int mid;
472         uint32_t sought = base;
473         uint32_t result, that;
474
475         /* binary search */
476         while (max >= min) {
477                 mid = (min + max) / 2;
478                 that = decompositions[mid].replacement;
479                 if (that < sought) {
480                         min = mid + 1;
481                 } else if (that > sought) {
482                         max = mid - 1;
483                 } else {
484                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
485                         return result;
486                 }
487         }
488         /* no match */
489         return 0;
490 }
491
492 /* -------------------------- */
493 static uint64_t do_decomposition_sp(unsigned int base_sp) 
494 {
495         int min = 0;
496         int max = DECOMP_SP_COUNT - 1;
497         int mid;
498         uint32_t sought_sp = base_sp;
499         uint32_t that_sp;
500         uint64_t result_sp;
501
502         /* binary search */
503         while (max >= min) {
504                 mid = (min + max) / 2;
505                 that_sp = decompositions_sp[mid].replacement_sp;
506                 if (that_sp < sought_sp) {
507                         min = mid + 1;
508                 } else if (that_sp > sought_sp) {
509                         max = mid - 1;
510                 } else {
511                         result_sp = ((uint64_t)decompositions_sp[mid].base_sp << 32) | ((uint64_t)decompositions_sp[mid].comb_sp);
512                         return result_sp;
513                 }
514         }
515         /* no match */
516         return 0;
517 }
518
519 /*******************************************************************
520 pre|decomposition
521
522    we can't use static, this stuff needs to be reentrant
523    static char comp[MAXPATHLEN +1];
524
525    We don't implement Singleton and Canonical Ordering.
526    We ignore CompositionExclusions.txt.
527    because they cause the problem of the roundtrip
528    such as Dancing Icon.
529
530    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
531    in precompose.h from composition according to AFP 3.x spec
532 ********************************************************************/
533
534 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
535 {
536         size_t i;
537         ucs2_t base, comb;
538         uint32_t base_sp, comb_sp;
539         ucs2_t *in, *out;
540         ucs2_t lindex, vindex;
541         ucs2_t result;
542         uint32_t result_sp;
543         size_t o_len = *outlen;
544         
545         if (!inplen || (inplen & 1) || inplen > o_len)
546                 return (size_t)-1;
547         
548         i = 0;
549         in  = name;
550         out = comp;
551         
552         base = *in;
553         while (*outlen > 2) {
554                 i += 2;
555                 in++;
556
557                 if (i == inplen) {
558                         *out = base;
559                         out++;
560                         *out = 0;
561                         *outlen -= 2;
562                         return o_len - *outlen;
563                 }
564
565                 comb = *in;
566                 result = 0;
567
568                 /* Non-Combination Character */
569                 if (comb < 0x300) ;
570                 
571                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
572                 /* Step 1 <L,V> */
573                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
574                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
575                                 result = 1;
576                                 lindex = base - LBASE;
577                                 vindex = comb - VBASE;
578                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
579                         }
580                 }
581                 
582                 /* Step 2 <LV,T> */
583                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
584                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
585                                 result = 1;
586                                 base += comb - TBASE;
587                         }
588                 }
589                 
590                 /* Binary Search for Surrogate Pair */
591                 else if ((0xD800 <= base) && (base < 0xDC00)) {
592                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
593                                 base_sp = ((uint32_t)base << 16) | (uint32_t)comb;
594                                 do {
595                                         comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2];
596                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
597                                                 base_sp = result_sp;
598                                                 i += 4;
599                                                 in +=2;
600                                         }
601                                 } while ((i + 4 <= inplen) && result_sp) ;
602
603                                 *out = base_sp >> 16;
604                                 out++;
605                                 *outlen -= 2;
606
607                                 if (*outlen <= 2) {
608                                         errno = E2BIG;
609                                         return (size_t)-1;
610                                 }
611
612                                 *out = base_sp & 0xFFFF;
613                                 out++;
614                                 *outlen -= 2;
615
616                                 i += 2;
617                                 in++;
618                                 base = *in;
619
620                                 result = 1;
621                         }
622                 }
623
624                 /* Binary Search for BMP */
625                 else if (result = do_precomposition(base, comb)) {
626                         base = result;
627                 }
628                 
629                 if (!result) {
630                         *out = base;
631                         out++;
632                         *outlen -= 2;
633                         base = comb;
634                 }
635         }
636
637         errno = E2BIG;
638         return (size_t)-1;
639 }
640
641 /* --------------- */
642 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
643 {
644         size_t i;
645         size_t comblen;
646         ucs2_t base, comb[COMBBUFLEN];
647         uint32_t base_sp;
648         ucs2_t sindex, tjamo;
649         ucs2_t *in, *out;
650         unsigned int result;
651         uint64_t result_sp;
652         size_t o_len = *outlen;
653
654         if (!inplen || (inplen & 1))
655                 return (size_t)-1;
656         i = 0;
657         in  = name;
658         out = comp;
659
660         while (i < inplen) {
661                 base = *in;
662                 comblen = 0;
663                 
664                 /* check ASCII first. this is frequent. */
665                 if (base <= 0x007f) ;
666                 
667                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
668                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
669                         sindex = base - SBASE;
670                         base = LBASE + sindex / NCOUNT;
671                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
672                         
673                         /* <L,V> */
674                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
675                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
676                                 comblen = 1;
677                         }
678                         
679                         /* <L,V,T> */
680                         else {
681                                 comb[COMBBUFLEN-1] = tjamo;
682                                 comblen = 2;
683                         }
684                 }
685                 
686                 /* Binary Search for Surrogate Pair */
687                 else if ((0xD800 <= base) && (base < 0xDC00)) {
688                         if (i + 2 < inplen) {
689                                 base_sp =  ((uint32_t)base << 16) | (uint32_t)in[1];
690                                 do {
691                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
692                                         comblen += 2;
693                                         base_sp = result_sp >> 32;
694                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
695                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
696                                 } while (comblen < MAXCOMBSPLEN);
697
698                                 if (*outlen < (comblen + 1) << 1) {
699                                         errno = E2BIG;
700                                         return (size_t)-1;
701                                 }
702
703                                 *out = base_sp >> 16;   /* hi */
704                                 out++;
705                                 *outlen -= 2;
706                                 
707                                 base = base_sp & 0xFFFF; /* lo */
708                                 
709                                 i += 2;
710                                 in++;
711                         }
712                 }
713                         
714                 /* Binary Search for BMP */
715                 else {
716                         do {
717                                 if ( !(result = do_decomposition(base))) break;
718                                 comblen++;
719                                 base = result  >> 16;
720                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
721                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
722                 }
723                 
724                 if (*outlen < (comblen + 1) << 1) {
725                         errno = E2BIG;
726                         return (size_t)-1;
727                 }
728                 
729                 *out = base;
730                 out++;
731                 *outlen -= 2;
732                 
733                 while ( comblen > 0 ) {
734                         *out = comb[COMBBUFLEN-comblen];
735                         out++;
736                         *outlen -= 2;
737                         comblen--;
738                 }
739                 
740                 i += 2;
741                 in++;
742         }
743         
744         *out = 0;
745         return o_len-*outlen;
746 }
747
748 /*******************************************************************
749 length of UTF-8 character and string
750 ********************************************************************/
751
/*
 * Return the length in bytes (1-4) of the UTF-8 sequence starting at
 * utf8, or (size_t)-1 when the bytes there do not form an accepted
 * sequence.
 *
 * Accepted lead bytes are < 0x80 and 0xC2-0xF4.  The second byte is
 * restricted to 0xA0-0xBF after 0xE0, 0x90-0xBF after 0xF0 and 0x80-0x8F
 * after 0xF4; all other continuation bytes must be 0x80-0xBF.
 * Bytes are inspected left to right and inspection stops at the first
 * byte that does not fit, so a NUL-terminated string is never overrun.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *b = (const unsigned char *) utf8;
        const unsigned char lead = b[0];
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        size_t need;
        size_t k;

        if (lead < 0x80)
                return (1);

        if (lead >= 0xC2 && lead <= 0xDF) {
                need = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
                need = 3;
                if (lead == 0xE0)
                        second_min = 0xA0;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
                need = 4;
                if (lead == 0xF0)
                        second_min = 0x90;
                else if (lead == 0xF4)
                        second_max = 0x8F;
        } else {
                return ((size_t) -1);
        }

        if (b[1] < second_min || b[1] > second_max)
                return ((size_t) -1);

        for (k = 2; k < need; k++) {
                if (b[k] < 0x80 || b[k] > 0xBF)
                        return ((size_t) -1);
        }

        return (need);
}
775
776
/*
 * Validate a NUL-terminated UTF-8 string and count its characters.
 *
 * Each sequence must start with a byte < 0x80 or in 0xC2-0xF4.  The
 * second byte is restricted to 0xA0-0xBF after 0xE0, 0x90-0xBF after 0xF0
 * and 0x80-0x8F after 0xF4; all other continuation bytes must be
 * 0x80-0xBF.  See http://www.unicode.org/unicode/reports/tr27/ for the
 * background of these ranges.
 *
 * Returns the number of sequences (not bytes), or (size_t)-1 as soon as
 * a sequence is not accepted.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *cur = (const unsigned char *) utf8;
        size_t count = 0;

        for (; *cur != '\0'; count++) {
                const unsigned char lead = cur[0];
                unsigned char second_min = 0x80;
                unsigned char second_max = 0xBF;
                size_t width;
                size_t k;

                if (lead < 0x80) {
                        cur++;
                        continue;
                }

                if (lead >= 0xC2 && lead <= 0xDF) {
                        width = 2;
                } else if (lead >= 0xE0 && lead <= 0xEF) {
                        width = 3;
                        if (lead == 0xE0)
                                second_min = 0xA0;
                } else if (lead >= 0xF0 && lead <= 0xF4) {
                        width = 4;
                        if (lead == 0xF0)
                                second_min = 0x90;
                        else if (lead == 0xF4)
                                second_max = 0x8F;
                } else {
                        return ((size_t) -1);
                }

                /* check left to right so the terminator is never stepped over */
                if (cur[1] < second_min || cur[1] > second_max)
                        return ((size_t) -1);

                for (k = 2; k < width; k++) {
                        if (cur[k] < 0x80 || cur[k] > 0xBF)
                                return ((size_t) -1);
                }

                cur += width;
        }

        return (count);
}