]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
3aa7036bf81358e0ca4267f7edd357b27d24c11e
[netatalk.git] / libatalk / unicode / util_unistr.c
/*******************************************************************
  NOTE:
  Early netatalk 2.x was based on UCS-2.
  UCS-2 doesn't support characters at or above U+10000.
  Recent netatalk is based on UTF-16.
  UTF-16 can represent characters at or above U+10000 by using
  surrogate pairs.
  However, surrogate pairs are complex, dirty, filthy and disagreeable.
  There might still be latent bugs...
********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22
23 #include <netatalk/endian.h>
24
25 #include <atalk/unicode.h>
26 #include "precompose.h"
27 #include "byteorder.h"
28
29 /*******************************************************************
30  Convert a string to lower case.
31  return True if any char is converted
32 ********************************************************************/
33 /* surrogate pair support */
34
35 int strlower_w(ucs2_t *s)
36 {
37         int ret = 0;
38
39         while (*s) {
40                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
41                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
42                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
43                                 u_int32_t v_sp = tolower_sp(s_sp);
44                                 if (v_sp != s_sp) {
45                                         *s = v_sp >> 16;
46                                         s++;
47                                         *s = v_sp & 0xFFFF;
48                                         ret = 1;
49                                 }
50                         }
51                 } else {
52                         ucs2_t v = tolower_w(*s);
53                         if (v != *s) {
54                                 *s = v;
55                                 ret = 1;
56                         }
57                 }
58                 s++;
59         }
60         return ret;
61 }
62
63 /*******************************************************************
64  Convert a string to upper case.
65  return True if any char is converted
66 ********************************************************************/
67 /* surrogate pair support */
68
69 int strupper_w(ucs2_t *s)
70 {
71         int ret = 0;
72
73         while (*s) {
74                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
75                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
76                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
77                                 u_int32_t v_sp = toupper_sp(s_sp);
78                                 if (v_sp != s_sp) {
79                                         *s = v_sp >> 16;
80                                         s++;
81                                         *s = v_sp & 0xFFFF;
82                                         ret = 1;
83                                 }
84                         }
85                 } else {
86                         ucs2_t v = toupper_w(*s);
87                         if (v != *s) {
88                                 *s = v;
89                                 ret = 1;
90                         }
91                 }
92                 s++;
93         }
94         return ret;
95 }
96
97 /*******************************************************************
98 wide & sp islower()
99 determine if a character is lowercase
100 ********************************************************************/
101 /* These functions are not used. */
102
103 int islower_w(ucs2_t c)
104 {
105         return ( c == tolower_w(c));
106 }
107
/* Surrogate-pair variant: returns 1 when tolower_sp() leaves the packed
   pair value c_sp unchanged, 0 otherwise. */
int islower_sp(u_int32_t c_sp)
{
        if (tolower_sp(c_sp) != c_sp)
                return 0;
        return 1;
}
112
113 /*******************************************************************
114 wide & sp isupper()
115 determine if a character is uppercase
116 ********************************************************************/
117 /* These functions are not used. */
118
119 int isupper_w(ucs2_t c)
120 {
121         return ( c == toupper_w(c));
122 }
123
/* Surrogate-pair variant: returns 1 when toupper_sp() leaves the packed
   pair value c_sp unchanged, 0 otherwise. */
int isupper_sp(u_int32_t c_sp)
{
        if (toupper_sp(c_sp) != c_sp)
                return 0;
        return 1;
}
128
129 /*******************************************************************
130 wide strlen()
131  Count the number of characters in a UTF-16 string.
132 ********************************************************************/
133 /* NOTE: one surrogate pair is two characters. */
134
135 size_t strlen_w(const ucs2_t *src)
136 {
137         size_t len;
138
139         for(len = 0; *src++; len++) ;
140
141         return len;
142 }
143
144 /*******************************************************************
145 wide strnlen()
146  Count up to max number of characters in a UTF-16 string.
147 ********************************************************************/
148 /* NOTE: one surrogate pair is two characters. */
149
150 size_t strnlen_w(const ucs2_t *src, size_t max)
151 {
152         size_t len;
153
154         for(len = 0; *src++ && (len < max); len++) ;
155
156         return len;
157 }
158
159 /*******************************************************************
160 wide strchr()
161 ********************************************************************/
162 /* NOTE: hi and lo of surrogate pair are separately processed. */
163
164 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
165 {
166         while (*s != 0) {
167                 if (c == *s) return (ucs2_t *)s;
168                 s++;
169         }
170         if (c == *s) return (ucs2_t *)s;
171
172         return NULL;
173 }
174
175 /*******************************************************************
176 wide & sp strcasechr()
177 ********************************************************************/
178 /* NOTE: separately process BMP and surrogate pair */
179
180 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
181 {
182         while (*s != 0) {
183 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
184                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
185                 s++;
186         }
187         if (c == *s) return (ucs2_t *)s;
188
189         return NULL;
190 }
191
192 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
193 {
194         if (*s == 0) return NULL;
195         while (s[1] != 0) {
196                 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
197                 s++;
198         }
199
200         return NULL;
201 }
202
203 /*******************************************************************
204 wide strcmp()
205 ********************************************************************/
206 /* no problem of surrogate pair */
207
208 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
209 {
210         while (*b && *a == *b) { a++; b++; }
211         return (*a - *b);
212         /* warning: if *a != *b and both are not 0 we retrun a random
213            greater or lesser than 0 number not realted to which
214            string is longer */
215 }
216
217 /*******************************************************************
218 wide strncmp()
219 ********************************************************************/
220 /* no problem of surrogate pair */
221
222 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
223 {
224         size_t n = 0;
225         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
226         return (len - n)?(*a - *b):0;
227 }
228
229 /*******************************************************************
230 wide strstr()
231 ********************************************************************/
232 /* no problem of surrogate pair */
233
234 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
235 {
236         ucs2_t *r;
237         size_t slen, inslen;
238
239         if (!s || !*s || !ins || !*ins) return NULL;
240         slen = strlen_w(s);
241         inslen = strlen_w(ins);
242         r = (ucs2_t *)s;
243         while ((r = strchr_w(r, *ins))) {
244                 if (strncmp_w(r, ins, inslen) == 0) return r;
245                 r++;
246         }
247         return NULL;
248 }
249
250 /*******************************************************************
251 wide strcasestr()
252 ********************************************************************/
253 /* */
254
255 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
256 {
257         ucs2_t *r;
258         size_t slen, inslen;
259
260         if (!s || !*s || !ins || !*ins) return NULL;
261         slen = strlen_w(s);
262         inslen = strlen_w(ins);
263         r = (ucs2_t *)s;
264         while ((r = strcasechr_w(r, *ins))) {
265                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
266                 r++;
267         }
268         return NULL;
269 }
270
271 /*******************************************************************
272 wide strcasecmp()
273 case insensitive string comparison
274 ********************************************************************/
275 /* surrogate pair support */
276
277 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
278 {
279         int ret;
280
281         while (*a && *b) {
282                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
283                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
284                         a++;
285                         b++;
286                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
287                 } else {
288                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
289                 }
290                 a++;
291                 b++;
292         }
293         return (tolower_w(*a) - tolower_w(*b));
294 }
295
296 /*******************************************************************
297 wide strncasecmp()
298 case insensitive string comparison, length limited
299 ********************************************************************/
300 /* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair  */
301
302 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
303 {
304         size_t n = 0;
305         int ret;
306
307         while ((n < len) && *a && *b) {
308                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
309                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
310                         a++;
311                         b++;
312                         n++;
313                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
314                 } else {
315                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
316                 }
317                 a++;
318                 b++;
319                 n++;
320         }
321         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
322 }
323
324 /*******************************************************************
325 wide strndup()
326 duplicate string
327 ********************************************************************/
328 /* NOTE: not check isolation of surrogate pair */
329 /* if len == 0 then duplicate the whole string */
330
331 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
332 {
333         ucs2_t *dest;
334
335         if (!len) len = strlen_w(src);
336         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
337         if (!dest) {
338                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
339                 return NULL;
340         }
341
342         memcpy(dest, src, len * sizeof(ucs2_t));
343         dest[len] = 0;
344
345         return dest;
346 }
347
348 /*******************************************************************
349 wide strdup()
350 duplicate string
351 ********************************************************************/
352 /* no problem of surrogate pair */
353
354 ucs2_t *strdup_w(const ucs2_t *src)
355 {
356         return strndup_w(src, 0);
357 }
358
359 /*******************************************************************
360 copy a string with max len
361 ********************************************************************/
362
363 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
364 {
365         size_t len;
366
367         if (!dest || !src) return NULL;
368
369         for (len = 0; (src[len] != 0) && (len < max); len++)
370                 dest[len] = src[len];
371         while (len < max)
372                 dest[len++] = 0;
373
374         return dest;
375 }
376
377
378 /*******************************************************************
379 append a string of len bytes and add a terminator
380 ********************************************************************/
381
382 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
383 {
384         size_t start;
385         size_t len;
386
387         if (!dest || !src) return NULL;
388
389         start = strlen_w(dest);
390         len = strnlen_w(src, max);
391
392         memcpy(&dest[start], src, len*sizeof(ucs2_t));
393         dest[start+len] = 0;
394
395         return dest;
396 }
397
398
399 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
400 {
401         size_t start;
402         size_t len;
403
404         if (!dest || !src) return NULL;
405
406         start = strlen_w(dest);
407         len = strlen_w(src);
408
409         memcpy(&dest[start], src, len*sizeof(ucs2_t));
410         dest[start+len] = 0;
411
412         return dest;
413 }
414
415
416 /*******************************************************************
417 binary search for pre|decomposition
418 ********************************************************************/
419
420 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
421 {
422         int min = 0;
423         int max = PRECOMP_COUNT - 1;
424         int mid;
425         u_int32_t sought = (base << 16) | comb, that;
426
427         /* binary search */
428         while (max >= min) {
429                 mid = (min + max) / 2;
430                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
431                 if (that < sought) {
432                         min = mid + 1;
433                 } else if (that > sought) {
434                         max = mid - 1;
435                 } else {
436                         return precompositions[mid].replacement;
437                 }
438         }
439         /* no match */
440         return 0;
441 }
442
443 /* ------------------------ */
444 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
445 {
446         int min = 0;
447         int max = PRECOMP_SP_COUNT - 1;
448         int mid;
449         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
450
451         /* binary search */
452         while (max >= min) {
453                 mid = (min + max) / 2;
454                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
455                 if (that_sp < sought_sp) {
456                         min = mid + 1;
457                 } else if (that_sp > sought_sp) {
458                         max = mid - 1;
459                 } else {
460                         return precompositions_sp[mid].replacement_sp;
461                 }
462         }
463         /* no match */
464         return 0;
465 }
466
467 /* -------------------------- */
468 static u_int32_t do_decomposition(ucs2_t base) 
469 {
470         int min = 0;
471         int max = DECOMP_COUNT - 1;
472         int mid;
473         u_int32_t sought = base;
474         u_int32_t result, that;
475
476         /* binary search */
477         while (max >= min) {
478                 mid = (min + max) / 2;
479                 that = decompositions[mid].replacement;
480                 if (that < sought) {
481                         min = mid + 1;
482                 } else if (that > sought) {
483                         max = mid - 1;
484                 } else {
485                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
486                         return result;
487                 }
488         }
489         /* no match */
490         return 0;
491 }
492
493 /* -------------------------- */
494 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
495 {
496         int min = 0;
497         int max = DECOMP_SP_COUNT - 1;
498         int mid;
499         u_int32_t sought_sp = base_sp;
500         u_int32_t that_sp;
501         u_int64_t result_sp;
502
503         /* binary search */
504         while (max >= min) {
505                 mid = (min + max) / 2;
506                 that_sp = decompositions_sp[mid].replacement_sp;
507                 if (that_sp < sought_sp) {
508                         min = mid + 1;
509                 } else if (that_sp > sought_sp) {
510                         max = mid - 1;
511                 } else {
512                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
513                         return result_sp;
514                 }
515         }
516         /* no match */
517         return 0;
518 }
519
520 /*******************************************************************
521 pre|decomposition
522
523    we can't use static, this stuff needs to be reentrant
524    static char comp[MAXPATHLEN +1];
525
526    We don't implement Singleton and Canonical Ordering.
527    We ignore CompositionExclusions.txt.
528    because they cause the problem of the roundtrip
529    such as Dancing Icon.
530
531    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
532    in precompose.h from composition according to AFP 3.x spec
533 ********************************************************************/
534
535 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
536 {
537         size_t i;
538         ucs2_t base, comb;
539         u_int32_t base_sp, comb_sp;
540         ucs2_t *in, *out;
541         ucs2_t lindex, vindex;
542         ucs2_t result;
543         u_int32_t result_sp;
544         size_t o_len = *outlen;
545         
546         if (!inplen || (inplen & 1) || inplen > o_len)
547                 return (size_t)-1;
548         
549         i = 0;
550         in  = name;
551         out = comp;
552         
553         base = *in;
554         while (*outlen > 2) {
555                 i += 2;
556                 if (i == inplen) {
557                         *out = base;
558                         out++;
559                         *out = 0;
560                         *outlen -= 2;
561                         return o_len - *outlen;
562                 }
563                 in++;
564                 comb = *in;
565                 result = 0;
566
567                 /* Non-Combination Character */
568                 if (comb < 0x300) ;
569                 
570                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
571                 /* Step 1 <L,V> */
572                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
573                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
574                                 result = 1;
575                                 lindex = base - LBASE;
576                                 vindex = comb - VBASE;
577                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
578                         }
579                 }
580                 
581                 /* Step 2 <LV,T> */
582                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
583                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
584                                 result = 1;
585                                 base += comb - TBASE;
586                         }
587                 }
588                 
589                 /* Binary Search for Surrogate Pair */
590                 else if ((0xD800 <= base) && (base < 0xDC00)) {
591                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
592                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
593                                 do {
594                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
595                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
596                                                 base_sp = result_sp;
597                                                 i += 4;
598                                                 in +=2;
599                                         }
600                                 } while ((i + 6 <= inplen) && result_sp) ;
601
602                                 *out = base_sp >> 16;
603                                 out++;
604                                 *outlen -= 2;
605
606                                 if (*outlen <= 2) {
607                                         errno = E2BIG;
608                                         return (size_t)-1;
609                                 }
610
611                                 *out = base_sp & 0xFFFF;
612                                 out++;
613                                 *outlen -= 2;
614
615                                 i += 2;
616                                 if (i == inplen) {
617                                         out++;
618                                         *out = 0;
619                                         return o_len - *outlen;
620                                 }
621                                 in++;
622                                 base = *in;
623
624                                 result = 1;
625                         }
626                 }
627
628                 /* Binary Search for BMP */
629                 else if (result = do_precomposition(base, comb)) {
630                         base = result;
631                 }
632                 
633                 if (!result) {
634                         *out = base;
635                         out++;
636                         *outlen -= 2;
637                         base = comb;
638                 }
639         }
640
641         errno = E2BIG;
642         return (size_t)-1;
643 }
644
645 /* --------------- */
646 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
647 {
648         size_t i;
649         size_t comblen;
650         ucs2_t base, comb[COMBBUFLEN];
651         u_int32_t base_sp;
652         ucs2_t sindex, tjamo;
653         ucs2_t *in, *out;
654         unsigned int result;
655         u_int64_t result_sp;
656         size_t o_len = *outlen;
657
658         if (!inplen || (inplen & 1))
659                 return (size_t)-1;
660         i = 0;
661         in  = name;
662         out = comp;
663
664         while (i < inplen) {
665                 base = *in;
666                 comblen = 0;
667                 
668                 /* check ASCII first. this is frequent. */
669                 if (base <= 0x007f) ;
670                 
671                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
672                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
673                         sindex = base - SBASE;
674                         base = LBASE + sindex / NCOUNT;
675                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
676                         
677                         /* <L,V> */
678                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
679                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
680                                 comblen = 1;
681                         }
682                         
683                         /* <L,V,T> */
684                         else {
685                                 comb[COMBBUFLEN-1] = tjamo;
686                                 comblen = 2;
687                         }
688                 }
689                 
690                 /* Binary Search for Surrogate Pair */
691                 else if ((0xD800 <= base) && (base < 0xDC00)) {
692                         if (i + 2 < inplen) {
693                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
694                                 do {
695                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
696                                         comblen += 2;
697                                         base_sp = result_sp >> 32;
698                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
699                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
700                                 } while (comblen < MAXCOMBSPLEN);
701
702                                 if (*outlen < (comblen + 1) << 1) {
703                                         errno = E2BIG;
704                                         return (size_t)-1;
705                                 }
706
707                                 *out = base_sp >> 16;   /* hi */
708                                 out++;
709                                 *outlen -= 2;
710                                 
711                                 base = base_sp & 0xFFFF; /* lo */
712                                 
713                                 i += 2;
714                                 in++;
715                         }
716                 }
717                         
718                 /* Binary Search for BMP */
719                 else {
720                         do {
721                                 if ( !(result = do_decomposition(base))) break;
722                                 comblen++;
723                                 base = result  >> 16;
724                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
725                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
726                 }
727                 
728                 if (*outlen < (comblen + 1) << 1) {
729                         errno = E2BIG;
730                         return (size_t)-1;
731                 }
732                 
733                 *out = base;
734                 out++;
735                 *outlen -= 2;
736                 
737                 while ( comblen > 0 ) {
738                         *out = comb[COMBBUFLEN-comblen];
739                         out++;
740                         *outlen -= 2;
741                         comblen--;
742                 }
743                 
744                 i += 2;
745                 in++;
746         }
747         
748         *out = 0;
749         return o_len-*outlen;
750 }
751
/*******************************************************************
length of UTF-8 character and string
********************************************************************/

/* Length in bytes (1..4) of the UTF-8 sequence starting at utf8, or
   (size_t)-1 when the bytes there do not form an accepted sequence.
   A 0 byte counts as a one-byte character.
   Bytes are inspected strictly left to right and inspection stops at the
   first byte that does not fit, so nothing past a terminating 0 is read. */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *b = (const unsigned char *) utf8;
        unsigned char second_min = 0x80;        /* accepted range of the 2nd byte */
        unsigned char second_max = 0xBF;
        size_t need;
        size_t k;

        if (b[0] < 0x80)
                return (1);

        if (b[0] >= 0xC2 && b[0] <= 0xDF) {
                need = 2;
        } else if (b[0] >= 0xE0 && b[0] <= 0xEF) {
                need = 3;
                if (b[0] == 0xE0)
                        second_min = 0xA0;      /* E0 80..9F is rejected */
        } else if (b[0] >= 0xF0 && b[0] <= 0xF4) {
                need = 4;
                if (b[0] == 0xF0)
                        second_min = 0x90;      /* F0 80..8F is rejected */
                else if (b[0] == 0xF4)
                        second_max = 0x8F;      /* F4 90..BF is rejected */
        } else {
                /* 80..C1 and F5..FF never start a sequence */
                return ((size_t) -1);
        }

        if (b[1] < second_min || b[1] > second_max)
                return ((size_t) -1);

        /* every further byte must be a plain continuation byte 80..BF */
        for (k = 2; k < need; k++) {
                if (b[k] < 0x80 || b[k] > 0xBF)
                        return ((size_t) -1);
        }

        return (need);
}
779
780
/* Count the characters of the 0-terminated UTF-8 string utf8, validating
   every sequence on the way.
   Returns the number of characters (not bytes), or (size_t)-1 as soon as
   a sequence is found that utf8_charlen() does not accept. */
size_t utf8_strlen_validate ( char * utf8 )
{
        size_t len = 0;
        char *p = utf8;

        /* see http://www.unicode.org/unicode/reports/tr27/ for an explanation */

        while ( *p != '\0')
        {
                /* one validator for both functions: utf8_charlen() applies the
                   same per-sequence rules this loop used to spell out itself */
                size_t step = utf8_charlen(p);

                if (step == (size_t) -1)
                        return ((size_t) -1);

                p += step;
                len++;
        }

        return (len);
}