]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
Merge master
[netatalk.git] / libatalk / unicode / util_unistr.c
1 /*******************************************************************
2   NOTE:
3   The early netatalk 2.x was based on UCS-2.
4   UCS-2 doesn't support characters at or above U+10000.
5   Recent netatalk is based on UTF-16.
6   UTF-16 can support characters at or above U+10000 by using surrogate pairs.
7   However, Surrogate Pair is complex, dirty, filthy and disagreeable.
8   There might still be latent bugs...
9 ********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22 #include <arpa/inet.h>
23
24 #include <atalk/unicode.h>
25 #include "precompose.h"
26 #include "byteorder.h"
27
28 /*******************************************************************
29  Convert a string to lower case.
30  return True if any char is converted
31 ********************************************************************/
32 /* surrogate pair support */
33
34 int strlower_w(ucs2_t *s)
35 {
36         int ret = 0;
37
38         while (*s) {
39                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
40                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
41                                 uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1];
42                                 uint32_t v_sp = tolower_sp(s_sp);
43                                 if (v_sp != s_sp) {
44                                         *s = v_sp >> 16;
45                                         s++;
46                                         *s = v_sp & 0xFFFF;
47                                         ret = 1;
48                                 }
49                         }
50                 } else {
51                         ucs2_t v = tolower_w(*s);
52                         if (v != *s) {
53                                 *s = v;
54                                 ret = 1;
55                         }
56                 }
57                 s++;
58         }
59         return ret;
60 }
61
62 /*******************************************************************
63  Convert a string to upper case.
64  return True if any char is converted
65 ********************************************************************/
66 /* surrogate pair support */
67
68 int strupper_w(ucs2_t *s)
69 {
70         int ret = 0;
71
72         while (*s) {
73                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
74                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
75                                 uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1];
76                                 uint32_t v_sp = toupper_sp(s_sp);
77                                 if (v_sp != s_sp) {
78                                         *s = v_sp >> 16;
79                                         s++;
80                                         *s = v_sp & 0xFFFF;
81                                         ret = 1;
82                                 }
83                         }
84                 } else {
85                         ucs2_t v = toupper_w(*s);
86                         if (v != *s) {
87                                 *s = v;
88                                 ret = 1;
89                         }
90                 }
91                 s++;
92         }
93         return ret;
94 }
95
96 /*******************************************************************
97 wide & sp islower()
98 determine if a character is lowercase
99 ********************************************************************/
100 /* These functions are not used. */
101
102 int islower_w(ucs2_t c)
103 {
104         return ( c == tolower_w(c));
105 }
106
/*
 * Surrogate-pair variant: c_sp holds the high surrogate in the upper
 * 16 bits and the low surrogate in the lower 16 bits.
 * Return non-zero when tolower_sp() leaves c_sp unchanged (which is also
 * true for values that have no case).
 */
int islower_sp(uint32_t c_sp)
{
    if (tolower_sp(c_sp) != c_sp)
        return 0;
    return 1;
}
111
112 /*******************************************************************
113 wide & sp isupper()
114 determine if a character is uppercase
115 ********************************************************************/
116 /* These functions are not used. */
117
118 int isupper_w(ucs2_t c)
119 {
120         return ( c == toupper_w(c));
121 }
122
/*
 * Surrogate-pair variant: c_sp holds the high surrogate in the upper
 * 16 bits and the low surrogate in the lower 16 bits.
 * Return non-zero when toupper_sp() leaves c_sp unchanged (which is also
 * true for values that have no case).
 */
int isupper_sp(uint32_t c_sp)
{
    if (toupper_sp(c_sp) != c_sp)
        return 0;
    return 1;
}
127
128 /*******************************************************************
129 wide strlen()
130  Count the number of characters in a UTF-16 string.
131 ********************************************************************/
132 /* NOTE: one surrogate pair is two characters. */
133
134 size_t strlen_w(const ucs2_t *src)
135 {
136         size_t len;
137
138         for(len = 0; *src++; len++) ;
139
140         return len;
141 }
142
143 /*******************************************************************
144 wide strnlen()
145  Count up to max number of characters in a UTF-16 string.
146 ********************************************************************/
147 /* NOTE: one surrogate pair is two characters. */
148
149 size_t strnlen_w(const ucs2_t *src, size_t max)
150 {
151         size_t len;
152
153         for(len = 0; *src++ && (len < max); len++) ;
154
155         return len;
156 }
157
158 /*******************************************************************
159 wide strchr()
160 ********************************************************************/
161 /* NOTE: hi and lo of surrogate pair are separately processed. */
162
163 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
164 {
165         while (*s != 0) {
166                 if (c == *s) return (ucs2_t *)s;
167                 s++;
168         }
169         if (c == *s) return (ucs2_t *)s;
170
171         return NULL;
172 }
173
174 /*******************************************************************
175 wide & sp strcasechr()
176 ********************************************************************/
177 /* NOTE: separately process BMP and surrogate pair */
178
179 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
180 {
181         while (*s != 0) {
182                 if (tolower_w(c) == tolower_w(*s)) return (ucs2_t *)s;
183                 s++;
184         }
185         if (c == *s) return (ucs2_t *)s;
186
187         return NULL;
188 }
189
190 ucs2_t *strcasechr_sp(const ucs2_t *s, uint32_t c_sp)
191 {
192         if (*s == 0) return NULL;
193         while (s[1] != 0) {
194                 if (tolower_sp(c_sp) == tolower_sp((uint32_t)*s << 16 | (uint32_t)s[1])) return (ucs2_t *)s;
195                 s++;
196         }
197
198         return NULL;
199 }
200
201 /*******************************************************************
202 wide strcmp()
203 ********************************************************************/
204 /* no problem of surrogate pair */
205
206 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
207 {
208         while (*b && *a == *b) { a++; b++; }
209         return (*a - *b);
210         /* warning: if *a != *b and both are not 0 we retrun a random
211            greater or lesser than 0 number not realted to which
212            string is longer */
213 }
214
215 /*******************************************************************
216 wide strncmp()
217 ********************************************************************/
218 /* no problem of surrogate pair */
219
220 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
221 {
222         size_t n = 0;
223         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
224         return (len - n)?(*a - *b):0;
225 }
226
227 /*******************************************************************
228 wide strstr()
229 ********************************************************************/
230 /* no problem of surrogate pair */
231
232 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
233 {
234         ucs2_t *r;
235         size_t slen, inslen;
236
237         if (!s || !*s || !ins || !*ins) return NULL;
238         slen = strlen_w(s);
239         inslen = strlen_w(ins);
240         r = (ucs2_t *)s;
241         while ((r = strchr_w(r, *ins))) {
242                 if (strncmp_w(r, ins, inslen) == 0) return r;
243                 r++;
244         }
245         return NULL;
246 }
247
248 /*******************************************************************
249 wide strcasestr()
250 ********************************************************************/
251 /* surrogate pair support */
252
253 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
254 {
255         ucs2_t *r;
256         size_t slen, inslen;
257
258         if (!s || !*s || !ins || !*ins) return NULL;
259         slen = strlen_w(s);
260         inslen = strlen_w(ins);
261         r = (ucs2_t *)s;
262
263         if ((0xD800 <= *ins) && (*ins < 0xDC00)) {
264                 if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) {
265                         u_int32_t ins_sp = (u_int32_t)*ins << 16 | (u_int32_t)ins[1];
266                         while ((r = strcasechr_sp(r, ins_sp))) {
267                                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
268                                 r++;
269                         }
270                 } else {
271                         return NULL; /* illegal sequence */
272                 }
273         } else {
274                 while ((r = strcasechr_w(r, *ins))) {
275                         if (strncasecmp_w(r, ins, inslen) == 0) return r;
276                         r++;
277                 }
278         }
279         return NULL;
280 }
281
282 /*******************************************************************
283 wide strcasecmp()
284 case insensitive string comparison
285 ********************************************************************/
286 /* surrogate pair support */
287
288 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
289 {
290         int ret;
291
292         while (*a && *b) {
293                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
294                         if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret;
295                         a++;
296                         b++;
297                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
298                 } else {
299                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
300                 }
301                 a++;
302                 b++;
303         }
304         return (tolower_w(*a) - tolower_w(*b));
305 }
306
307 /*******************************************************************
308 wide strncasecmp()
309 case insensitive string comparison, length limited
310 ********************************************************************/
311 /* NOTE: if 'len' would split a surrogate pair, the whole pair is compared (up to 'len+1' code units). */
312
313 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
314 {
315         size_t n = 0;
316         int ret;
317
318         while ((n < len) && *a && *b) {
319                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
320                         if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret;
321                         a++;
322                         b++;
323                         n++;
324                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
325                 } else {
326                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
327                 }
328                 a++;
329                 b++;
330                 n++;
331         }
332         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
333 }
334
335 /*******************************************************************
336 wide strndup()
337 duplicate string
338 ********************************************************************/
339 /* NOTE: not check isolation of surrogate pair */
340 /* if len == 0 then duplicate the whole string */
341
342 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
343 {
344         ucs2_t *dest;
345
346         if (!len) len = strlen_w(src);
347         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
348         if (!dest) {
349                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
350                 return NULL;
351         }
352
353         memcpy(dest, src, len * sizeof(ucs2_t));
354         dest[len] = 0;
355
356         return dest;
357 }
358
359 /*******************************************************************
360 wide strdup()
361 duplicate string
362 ********************************************************************/
363 /* no problem of surrogate pair */
364
365 ucs2_t *strdup_w(const ucs2_t *src)
366 {
367         return strndup_w(src, 0);
368 }
369
370 /*******************************************************************
371 copy a string with max len
372 ********************************************************************/
373 /* This function is not used. */
374 /* NOTE: not check isolation of surrogate pair */
375
376 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
377 {
378         size_t len;
379
380         if (!dest || !src) return NULL;
381
382         for (len = 0; (src[len] != 0) && (len < max); len++)
383                 dest[len] = src[len];
384         while (len < max)
385                 dest[len++] = 0;
386
387         return dest;
388 }
389
390
391 /*******************************************************************
392 append a string of len bytes and add a terminator
393 ********************************************************************/
394 /* These functions are not used. */
395
396 /* NOTE: not check isolation of surrogate pair */
397 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
398 {
399         size_t start;
400         size_t len;
401
402         if (!dest || !src) return NULL;
403
404         start = strlen_w(dest);
405         len = strnlen_w(src, max);
406
407         memcpy(&dest[start], src, len*sizeof(ucs2_t));
408         dest[start+len] = 0;
409
410         return dest;
411 }
412
413 /* no problem of surrogate pair */
414 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
415 {
416         size_t start;
417         size_t len;
418
419         if (!dest || !src) return NULL;
420
421         start = strlen_w(dest);
422         len = strlen_w(src);
423
424         memcpy(&dest[start], src, len*sizeof(ucs2_t));
425         dest[start+len] = 0;
426
427         return dest;
428 }
429
430
431 /*******************************************************************
432 binary search for pre|decomposition
433 ********************************************************************/
434
435 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
436 {
437         int min = 0;
438         int max = PRECOMP_COUNT - 1;
439         int mid;
440         uint32_t sought = (base << 16) | comb, that;
441
442         /* binary search */
443         while (max >= min) {
444                 mid = (min + max) / 2;
445                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
446                 if (that < sought) {
447                         min = mid + 1;
448                 } else if (that > sought) {
449                         max = mid - 1;
450                 } else {
451                         return precompositions[mid].replacement;
452                 }
453         }
454         /* no match */
455         return 0;
456 }
457
458 /* ------------------------ */
459 static uint32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
460 {
461         int min = 0;
462         int max = PRECOMP_SP_COUNT - 1;
463         int mid;
464         uint64_t sought_sp = ((uint64_t)base_sp << 32) | (uint64_t)comb_sp, that_sp;
465
466         /* binary search */
467         while (max >= min) {
468                 mid = (min + max) / 2;
469                 that_sp = ((uint64_t)precompositions_sp[mid].base_sp << 32) | ((uint64_t)precompositions_sp[mid].comb_sp);
470                 if (that_sp < sought_sp) {
471                         min = mid + 1;
472                 } else if (that_sp > sought_sp) {
473                         max = mid - 1;
474                 } else {
475                         return precompositions_sp[mid].replacement_sp;
476                 }
477         }
478         /* no match */
479         return 0;
480 }
481
482 /* -------------------------- */
483 static uint32_t do_decomposition(ucs2_t base) 
484 {
485         int min = 0;
486         int max = DECOMP_COUNT - 1;
487         int mid;
488         uint32_t sought = base;
489         uint32_t result, that;
490
491         /* binary search */
492         while (max >= min) {
493                 mid = (min + max) / 2;
494                 that = decompositions[mid].replacement;
495                 if (that < sought) {
496                         min = mid + 1;
497                 } else if (that > sought) {
498                         max = mid - 1;
499                 } else {
500                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
501                         return result;
502                 }
503         }
504         /* no match */
505         return 0;
506 }
507
508 /* -------------------------- */
509 static uint64_t do_decomposition_sp(unsigned int base_sp) 
510 {
511         int min = 0;
512         int max = DECOMP_SP_COUNT - 1;
513         int mid;
514         uint32_t sought_sp = base_sp;
515         uint32_t that_sp;
516         uint64_t result_sp;
517
518         /* binary search */
519         while (max >= min) {
520                 mid = (min + max) / 2;
521                 that_sp = decompositions_sp[mid].replacement_sp;
522                 if (that_sp < sought_sp) {
523                         min = mid + 1;
524                 } else if (that_sp > sought_sp) {
525                         max = mid - 1;
526                 } else {
527                         result_sp = ((uint64_t)decompositions_sp[mid].base_sp << 32) | ((uint64_t)decompositions_sp[mid].comb_sp);
528                         return result_sp;
529                 }
530         }
531         /* no match */
532         return 0;
533 }
534
535 /*******************************************************************
536 pre|decomposition
537
538    we can't use static, this stuff needs to be reentrant
539    static char comp[MAXPATHLEN +1];
540
541    We don't implement Singleton and Canonical Ordering.
542    We ignore CompositionExclusions.txt.
543    because they cause the problem of the roundtrip
544    such as Dancing Icon.
545
546    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
547    in precompose.h from composition according to AFP 3.x spec
548 ********************************************************************/
549
550 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
551 {
552         size_t i;
553         ucs2_t base, comb;
554         uint32_t base_sp, comb_sp;
555         ucs2_t *in, *out;
556         ucs2_t lindex, vindex;
557         ucs2_t result;
558         uint32_t result_sp;
559         size_t o_len = *outlen;
560         
561         if (!inplen || (inplen & 1) || inplen > o_len)
562                 return (size_t)-1;
563         
564         i = 0;
565         in  = name;
566         out = comp;
567         
568         base = *in;
569         while (*outlen > 2) {
570                 i += 2;
571                 if (i == inplen) {
572                         *out = base;
573                         out++;
574                         *out = 0;
575                         *outlen -= 2;
576                         return o_len - *outlen;
577                 }
578                 in++;
579                 comb = *in;
580                 result = 0;
581
582                 /* Non-Combination Character */
583                 if (comb < 0x300) ;
584                 
585                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
586                 /* Step 1 <L,V> */
587                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
588                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
589                                 result = 1;
590                                 lindex = base - LBASE;
591                                 vindex = comb - VBASE;
592                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
593                         }
594                 }
595                 
596                 /* Step 2 <LV,T> */
597                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
598                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
599                                 result = 1;
600                                 base += comb - TBASE;
601                         }
602                 }
603                 
604                 /* Binary Search for Surrogate Pair */
605                 else if ((0xD800 <= base) && (base < 0xDC00)) {
606                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
607                                 base_sp = ((uint32_t)base << 16) | (uint32_t)comb;
608                                 do {
609                                         comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2];
610                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
611                                                 base_sp = result_sp;
612                                                 i += 4;
613                                                 in +=2;
614                                         }
615                                 } while ((i + 6 <= inplen) && result_sp) ;
616
617                                 *out = base_sp >> 16;
618                                 out++;
619                                 *outlen -= 2;
620
621                                 if (*outlen <= 2) {
622                                         errno = E2BIG;
623                                         return (size_t)-1;
624                                 }
625
626                                 *out = base_sp & 0xFFFF;
627                                 out++;
628                                 *outlen -= 2;
629
630                                 i += 2;
631                                 if (i == inplen) {
632                                         out++;
633                                         *out = 0;
634                                         return o_len - *outlen;
635                                 }
636                                 in++;
637                                 base = *in;
638
639                                 result = 1;
640                         }
641                 }
642
643                 /* Binary Search for BMP */
644                 else if (result = do_precomposition(base, comb)) {
645                         base = result;
646                 }
647                 
648                 if (!result) {
649                         *out = base;
650                         out++;
651                         *outlen -= 2;
652                         base = comb;
653                 }
654         }
655
656         errno = E2BIG;
657         return (size_t)-1;
658 }
659
660 /* --------------- */
661 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
662 {
663         size_t i;
664         size_t comblen;
665         ucs2_t base, comb[COMBBUFLEN];
666         uint32_t base_sp;
667         ucs2_t sindex, tjamo;
668         ucs2_t *in, *out;
669         unsigned int result;
670         uint64_t result_sp;
671         size_t o_len = *outlen;
672
673         if (!inplen || (inplen & 1))
674                 return (size_t)-1;
675         i = 0;
676         in  = name;
677         out = comp;
678
679         while (i < inplen) {
680                 base = *in;
681                 comblen = 0;
682                 
683                 /* check ASCII first. this is frequent. */
684                 if (base <= 0x007f) ;
685                 
686                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
687                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
688                         sindex = base - SBASE;
689                         base = LBASE + sindex / NCOUNT;
690                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
691                         
692                         /* <L,V> */
693                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
694                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
695                                 comblen = 1;
696                         }
697                         
698                         /* <L,V,T> */
699                         else {
700                                 comb[COMBBUFLEN-1] = tjamo;
701                                 comblen = 2;
702                         }
703                 }
704                 
705                 /* Binary Search for Surrogate Pair */
706                 else if ((0xD800 <= base) && (base < 0xDC00)) {
707                         if (i + 2 < inplen) {
708                                 base_sp =  ((uint32_t)base << 16) | (uint32_t)in[1];
709                                 do {
710                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
711                                         comblen += 2;
712                                         base_sp = result_sp >> 32;
713                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
714                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
715                                 } while (comblen < MAXCOMBSPLEN);
716
717                                 if (*outlen < (comblen + 1) << 1) {
718                                         errno = E2BIG;
719                                         return (size_t)-1;
720                                 }
721
722                                 *out = base_sp >> 16;   /* hi */
723                                 out++;
724                                 *outlen -= 2;
725                                 
726                                 base = base_sp & 0xFFFF; /* lo */
727                                 
728                                 i += 2;
729                                 in++;
730                         }
731                 }
732                         
733                 /* Binary Search for BMP */
734                 else {
735                         do {
736                                 if ( !(result = do_decomposition(base))) break;
737                                 comblen++;
738                                 base = result  >> 16;
739                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
740                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
741                 }
742                 
743                 if (*outlen < (comblen + 1) << 1) {
744                         errno = E2BIG;
745                         return (size_t)-1;
746                 }
747                 
748                 *out = base;
749                 out++;
750                 *outlen -= 2;
751                 
752                 while ( comblen > 0 ) {
753                         *out = comb[COMBBUFLEN-comblen];
754                         out++;
755                         *outlen -= 2;
756                         comblen--;
757                 }
758                 
759                 i += 2;
760                 in++;
761         }
762         
763         *out = 0;
764         return o_len-*outlen;
765 }
766
767 /*******************************************************************
768 length of UTF-8 character and string
769 ********************************************************************/
770
/*
 * Return the length in bytes (1-4) of the UTF-8 sequence starting at
 * utf8, or (size_t)-1 when the bytes do not form an accepted sequence.
 *
 * A 0 byte counts as a valid one-byte sequence. Overlong forms and values
 * above U+10FFFF are rejected through the restricted second-byte ranges
 * after 0xE0, 0xF0 and 0xF4. Bytes are inspected one at a time and the
 * scan stops at the first unacceptable byte, so a terminating 0 is never
 * stepped over.
 *
 * NOTE(review): 0xED followed by 0xA0..0xBF (an encoded surrogate) is
 * accepted here - confirm this is intended.
 */
size_t utf8_charlen ( char* utf8 )
{
    const unsigned char *b = (const unsigned char *) utf8;
    unsigned char lead = b[0];
    unsigned char second_min = 0x80;    /* allowed range of the 2nd byte */
    unsigned char second_max = 0xBF;
    size_t width;
    size_t k;

    if (lead < 0x80)
        return (1);

    if (lead < 0xC2) {
        /* stray continuation byte or overlong 2-byte lead */
        return ((size_t) -1);
    } else if (lead < 0xE0) {
        width = 2;
    } else if (lead < 0xF0) {
        width = 3;
        if (lead == 0xE0)
            second_min = 0xA0;
    } else if (lead < 0xF5) {
        width = 4;
        if (lead == 0xF0)
            second_min = 0x90;
        else if (lead == 0xF4)
            second_max = 0x8F;
    } else {
        return ((size_t) -1);
    }

    if (b[1] < second_min || b[1] > second_max)
        return ((size_t) -1);

    for (k = 2; k < width; k++) {
        if (b[k] < 0x80 || b[k] > 0xBF)
            return ((size_t) -1);
    }

    return (width);
}
794
795
/* Return non-zero when lo <= byte <= hi. */
static int utf8v_byte_in(unsigned char byte, unsigned char lo, unsigned char hi)
{
    return (byte >= lo) && (byte <= hi);
}

/*
 * Count the characters (not bytes) of the NUL-terminated UTF-8 string
 * utf8, validating every sequence on the way.
 *
 * Returns the character count, or (size_t)-1 as soon as a byte sequence
 * is not accepted. Overlong forms and values above U+10FFFF are rejected
 * through the restricted second-byte ranges after 0xE0, 0xF0 and 0xF4.
 * See http://www.unicode.org/unicode/reports/tr27/ for the byte ranges.
 *
 * NOTE(review): 0xED followed by 0xA0..0xBF (an encoded surrogate) is
 * accepted here - confirm this is intended.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
    const unsigned char *cur = (const unsigned char *) utf8;
    size_t count = 0;

    while (*cur != '\0') {
        unsigned char lead = cur[0];
        unsigned char second_min = 0x80;    /* allowed range of the 2nd byte */
        unsigned char second_max = 0xBF;
        size_t width;
        size_t k;

        if (lead < 0x80) {
            width = 1;
        } else if (utf8v_byte_in(lead, 0xC2, 0xDF)) {
            width = 2;
        } else if (utf8v_byte_in(lead, 0xE0, 0xEF)) {
            width = 3;
            if (lead == 0xE0)
                second_min = 0xA0;
        } else if (utf8v_byte_in(lead, 0xF0, 0xF4)) {
            width = 4;
            if (lead == 0xF0)
                second_min = 0x90;
            else if (lead == 0xF4)
                second_max = 0x8F;
        } else {
            return ((size_t) -1);
        }

        /* Bytes are checked in order, so the scan stops at a terminator. */
        if (width > 1 && !utf8v_byte_in(cur[1], second_min, second_max))
            return ((size_t) -1);

        for (k = 2; k < width; k++) {
            if (!utf8v_byte_in(cur[k], 0x80, 0xBF))
                return ((size_t) -1);
        }

        cur += width;
        count++;
    }

    return (count);
}
836 }