]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
filename should be compared using lower case.
[netatalk.git] / libatalk / unicode / util_unistr.c
1 /*******************************************************************
2   NOTE:
3   The early netatalk 2.x was based on UCS-2.
4   UCS-2 doesn't support chars above U+FFFF.
5   Recent netatalk is based on UTF-16.
6   UTF-16 can support chars above U+FFFF, using surrogate pairs.
7   However, Surrogate Pair is complex, dirty, filthy and disagreeable.
8   There might still be latent bugs...
9 ********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22
23 #include <netatalk/endian.h>
24
25 #include <atalk/unicode.h>
26 #include "precompose.h"
27 #include "byteorder.h"
28
29 /*******************************************************************
30  Convert a string to lower case.
31  return True if any char is converted
32 ********************************************************************/
33 /* surrogate pair support */
34
35 int strlower_w(ucs2_t *s)
36 {
37         int ret = 0;
38
39         while (*s) {
40                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
41                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
42                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
43                                 u_int32_t v_sp = tolower_sp(s_sp);
44                                 if (v_sp != s_sp) {
45                                         *s = v_sp >> 16;
46                                         s++;
47                                         *s = v_sp & 0xFFFF;
48                                         ret = 1;
49                                 }
50                         }
51                 } else {
52                         ucs2_t v = tolower_w(*s);
53                         if (v != *s) {
54                                 *s = v;
55                                 ret = 1;
56                         }
57                 }
58                 s++;
59         }
60         return ret;
61 }
62
63 /*******************************************************************
64  Convert a string to upper case.
65  return True if any char is converted
66 ********************************************************************/
67 /* surrogate pair support */
68
69 int strupper_w(ucs2_t *s)
70 {
71         int ret = 0;
72
73         while (*s) {
74                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
75                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
76                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
77                                 u_int32_t v_sp = toupper_sp(s_sp);
78                                 if (v_sp != s_sp) {
79                                         *s = v_sp >> 16;
80                                         s++;
81                                         *s = v_sp & 0xFFFF;
82                                         ret = 1;
83                                 }
84                         }
85                 } else {
86                         ucs2_t v = toupper_w(*s);
87                         if (v != *s) {
88                                 *s = v;
89                                 ret = 1;
90                         }
91                 }
92                 s++;
93         }
94         return ret;
95 }
96
97 /*******************************************************************
98 wide & sp islower()
99 determine if a character is lowercase
100 ********************************************************************/
101 /* These functions are not used. */
102
103 int islower_w(ucs2_t c)
104 {
105         return ( c == tolower_w(c));
106 }
107
/* Surrogate-pair variant: non-zero when lowercasing the packed
   hi/lo value c_sp leaves it unchanged. */
int islower_sp(u_int32_t c_sp)
{
        return (tolower_sp(c_sp) == c_sp) ? 1 : 0;
}
112
113 /*******************************************************************
114 wide & sp isupper()
115 determine if a character is uppercase
116 ********************************************************************/
117 /* These functions are not used. */
118
119 int isupper_w(ucs2_t c)
120 {
121         return ( c == toupper_w(c));
122 }
123
/* Surrogate-pair variant: non-zero when uppercasing the packed
   hi/lo value c_sp leaves it unchanged. */
int isupper_sp(u_int32_t c_sp)
{
        return (toupper_sp(c_sp) == c_sp) ? 1 : 0;
}
128
129 /*******************************************************************
130 wide strlen()
131  Count the number of characters in a UTF-16 string.
132 ********************************************************************/
133 /* NOTE: one surrogate pair is two characters. */
134
135 size_t strlen_w(const ucs2_t *src)
136 {
137         size_t len;
138
139         for(len = 0; *src++; len++) ;
140
141         return len;
142 }
143
144 /*******************************************************************
145 wide strnlen()
146  Count up to max number of characters in a UTF-16 string.
147 ********************************************************************/
148 /* NOTE: one surrogate pair is two characters. */
149
150 size_t strnlen_w(const ucs2_t *src, size_t max)
151 {
152         size_t len;
153
154         for(len = 0; *src++ && (len < max); len++) ;
155
156         return len;
157 }
158
159 /*******************************************************************
160 wide strchr()
161 ********************************************************************/
162 /* NOTE: hi and lo of surrogate pair are separately processed. */
163
164 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
165 {
166         while (*s != 0) {
167                 if (c == *s) return (ucs2_t *)s;
168                 s++;
169         }
170         if (c == *s) return (ucs2_t *)s;
171
172         return NULL;
173 }
174
175 /*******************************************************************
176 wide & sp strcasechr()
177 ********************************************************************/
178 /* NOTE: separately process BMP and surrogate pair */
179
180 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
181 {
182         while (*s != 0) {
183                 if (tolower_w(c) == tolower_w(*s)) return (ucs2_t *)s;
184                 s++;
185         }
186         if (c == *s) return (ucs2_t *)s;
187
188         return NULL;
189 }
190
191 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
192 {
193         if (*s == 0) return NULL;
194         while (s[1] != 0) {
195                 if (tolower_sp(c_sp) == tolower_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
196                 s++;
197         }
198
199         return NULL;
200 }
201
202 /*******************************************************************
203 wide strcmp()
204 ********************************************************************/
205 /* no problem of surrogate pair */
206
207 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
208 {
209         while (*b && *a == *b) { a++; b++; }
210         return (*a - *b);
211         /* warning: if *a != *b and both are not 0 we retrun a random
212            greater or lesser than 0 number not realted to which
213            string is longer */
214 }
215
216 /*******************************************************************
217 wide strncmp()
218 ********************************************************************/
219 /* no problem of surrogate pair */
220
221 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
222 {
223         size_t n = 0;
224         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
225         return (len - n)?(*a - *b):0;
226 }
227
228 /*******************************************************************
229 wide strstr()
230 ********************************************************************/
231 /* no problem of surrogate pair */
232
233 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
234 {
235         ucs2_t *r;
236         size_t slen, inslen;
237
238         if (!s || !*s || !ins || !*ins) return NULL;
239         slen = strlen_w(s);
240         inslen = strlen_w(ins);
241         r = (ucs2_t *)s;
242         while ((r = strchr_w(r, *ins))) {
243                 if (strncmp_w(r, ins, inslen) == 0) return r;
244                 r++;
245         }
246         return NULL;
247 }
248
249 /*******************************************************************
250 wide strcasestr()
251 ********************************************************************/
252 /* surrogate pair support */
253
254 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
255 {
256         ucs2_t *r;
257         size_t slen, inslen;
258
259         if (!s || !*s || !ins || !*ins) return NULL;
260         slen = strlen_w(s);
261         inslen = strlen_w(ins);
262         r = (ucs2_t *)s;
263
264         if ((0xD800 <= *ins) && (*ins < 0xDC00)) {
265                 if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) {
266                         u_int32_t ins_sp = (u_int32_t)*ins << 16 | (u_int32_t)ins[1];
267                         while ((r = strcasechr_sp(r, ins_sp))) {
268                                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
269                                 r++;
270                         }
271                 } else {
272                         return NULL; /* illegal sequence */
273                 }
274         } else {
275                 while ((r = strcasechr_w(r, *ins))) {
276                         if (strncasecmp_w(r, ins, inslen) == 0) return r;
277                         r++;
278                 }
279         }
280         return NULL;
281 }
282
283 /*******************************************************************
284 wide strcasecmp()
285 case insensitive string comparison
286 ********************************************************************/
287 /* surrogate pair support */
288
289 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
290 {
291         int ret;
292
293         while (*a && *b) {
294                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
295                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
296                         a++;
297                         b++;
298                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
299                 } else {
300                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
301                 }
302                 a++;
303                 b++;
304         }
305         return (tolower_w(*a) - tolower_w(*b));
306 }
307
308 /*******************************************************************
309 wide strncasecmp()
310 case insensitive string comparison, length limited
311 ********************************************************************/
312 /* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair  */
313
314 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
315 {
316         size_t n = 0;
317         int ret;
318
319         while ((n < len) && *a && *b) {
320                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
321                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
322                         a++;
323                         b++;
324                         n++;
325                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
326                 } else {
327                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
328                 }
329                 a++;
330                 b++;
331                 n++;
332         }
333         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
334 }
335
336 /*******************************************************************
337 wide strndup()
338 duplicate string
339 ********************************************************************/
340 /* NOTE: not check isolation of surrogate pair */
341 /* if len == 0 then duplicate the whole string */
342
343 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
344 {
345         ucs2_t *dest;
346
347         if (!len) len = strlen_w(src);
348         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
349         if (!dest) {
350                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
351                 return NULL;
352         }
353
354         memcpy(dest, src, len * sizeof(ucs2_t));
355         dest[len] = 0;
356
357         return dest;
358 }
359
360 /*******************************************************************
361 wide strdup()
362 duplicate string
363 ********************************************************************/
364 /* no problem of surrogate pair */
365
366 ucs2_t *strdup_w(const ucs2_t *src)
367 {
368         return strndup_w(src, 0);
369 }
370
371 /*******************************************************************
372 copy a string with max len
373 ********************************************************************/
374 /* This function is not used. */
375 /* NOTE: not check isolation of surrogate pair */
376
377 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
378 {
379         size_t len;
380
381         if (!dest || !src) return NULL;
382
383         for (len = 0; (src[len] != 0) && (len < max); len++)
384                 dest[len] = src[len];
385         while (len < max)
386                 dest[len++] = 0;
387
388         return dest;
389 }
390
391
392 /*******************************************************************
393 append a string (strncat_w: at most max code units of it) and add a terminator
394 ********************************************************************/
395 /* These functions are not used. */
396
397 /* NOTE: not check isolation of surrogate pair */
398 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
399 {
400         size_t start;
401         size_t len;
402
403         if (!dest || !src) return NULL;
404
405         start = strlen_w(dest);
406         len = strnlen_w(src, max);
407
408         memcpy(&dest[start], src, len*sizeof(ucs2_t));
409         dest[start+len] = 0;
410
411         return dest;
412 }
413
414 /* no problem of surrogate pair */
415 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
416 {
417         size_t start;
418         size_t len;
419
420         if (!dest || !src) return NULL;
421
422         start = strlen_w(dest);
423         len = strlen_w(src);
424
425         memcpy(&dest[start], src, len*sizeof(ucs2_t));
426         dest[start+len] = 0;
427
428         return dest;
429 }
430
431
432 /*******************************************************************
433 binary search for pre|decomposition
434 ********************************************************************/
435
436 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
437 {
438         int min = 0;
439         int max = PRECOMP_COUNT - 1;
440         int mid;
441         u_int32_t sought = (base << 16) | comb, that;
442
443         /* binary search */
444         while (max >= min) {
445                 mid = (min + max) / 2;
446                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
447                 if (that < sought) {
448                         min = mid + 1;
449                 } else if (that > sought) {
450                         max = mid - 1;
451                 } else {
452                         return precompositions[mid].replacement;
453                 }
454         }
455         /* no match */
456         return 0;
457 }
458
459 /* ------------------------ */
460 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
461 {
462         int min = 0;
463         int max = PRECOMP_SP_COUNT - 1;
464         int mid;
465         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
466
467         /* binary search */
468         while (max >= min) {
469                 mid = (min + max) / 2;
470                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
471                 if (that_sp < sought_sp) {
472                         min = mid + 1;
473                 } else if (that_sp > sought_sp) {
474                         max = mid - 1;
475                 } else {
476                         return precompositions_sp[mid].replacement_sp;
477                 }
478         }
479         /* no match */
480         return 0;
481 }
482
483 /* -------------------------- */
484 static u_int32_t do_decomposition(ucs2_t base) 
485 {
486         int min = 0;
487         int max = DECOMP_COUNT - 1;
488         int mid;
489         u_int32_t sought = base;
490         u_int32_t result, that;
491
492         /* binary search */
493         while (max >= min) {
494                 mid = (min + max) / 2;
495                 that = decompositions[mid].replacement;
496                 if (that < sought) {
497                         min = mid + 1;
498                 } else if (that > sought) {
499                         max = mid - 1;
500                 } else {
501                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
502                         return result;
503                 }
504         }
505         /* no match */
506         return 0;
507 }
508
509 /* -------------------------- */
510 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
511 {
512         int min = 0;
513         int max = DECOMP_SP_COUNT - 1;
514         int mid;
515         u_int32_t sought_sp = base_sp;
516         u_int32_t that_sp;
517         u_int64_t result_sp;
518
519         /* binary search */
520         while (max >= min) {
521                 mid = (min + max) / 2;
522                 that_sp = decompositions_sp[mid].replacement_sp;
523                 if (that_sp < sought_sp) {
524                         min = mid + 1;
525                 } else if (that_sp > sought_sp) {
526                         max = mid - 1;
527                 } else {
528                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
529                         return result_sp;
530                 }
531         }
532         /* no match */
533         return 0;
534 }
535
536 /*******************************************************************
537 pre|decomposition
538
539    we can't use static, this stuff needs to be reentrant
540    static char comp[MAXPATHLEN +1];
541
542    We don't implement Singleton and Canonical Ordering.
543    We ignore CompositionExclusions.txt.
544    because they cause the problem of the roundtrip
545    such as Dancing Icon.
546
547    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
548    in precompose.h from composition according to AFP 3.x spec
549 ********************************************************************/
550
551 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
552 {
553         size_t i;
554         ucs2_t base, comb;
555         u_int32_t base_sp, comb_sp;
556         ucs2_t *in, *out;
557         ucs2_t lindex, vindex;
558         ucs2_t result;
559         u_int32_t result_sp;
560         size_t o_len = *outlen;
561         
562         if (!inplen || (inplen & 1) || inplen > o_len)
563                 return (size_t)-1;
564         
565         i = 0;
566         in  = name;
567         out = comp;
568         
569         base = *in;
570         while (*outlen > 2) {
571                 i += 2;
572                 if (i == inplen) {
573                         *out = base;
574                         out++;
575                         *out = 0;
576                         *outlen -= 2;
577                         return o_len - *outlen;
578                 }
579                 in++;
580                 comb = *in;
581                 result = 0;
582
583                 /* Non-Combination Character */
584                 if (comb < 0x300) ;
585                 
586                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
587                 /* Step 1 <L,V> */
588                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
589                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
590                                 result = 1;
591                                 lindex = base - LBASE;
592                                 vindex = comb - VBASE;
593                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
594                         }
595                 }
596                 
597                 /* Step 2 <LV,T> */
598                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
599                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
600                                 result = 1;
601                                 base += comb - TBASE;
602                         }
603                 }
604                 
605                 /* Binary Search for Surrogate Pair */
606                 else if ((0xD800 <= base) && (base < 0xDC00)) {
607                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
608                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
609                                 do {
610                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
611                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
612                                                 base_sp = result_sp;
613                                                 i += 4;
614                                                 in +=2;
615                                         }
616                                 } while ((i + 6 <= inplen) && result_sp) ;
617
618                                 *out = base_sp >> 16;
619                                 out++;
620                                 *outlen -= 2;
621
622                                 if (*outlen <= 2) {
623                                         errno = E2BIG;
624                                         return (size_t)-1;
625                                 }
626
627                                 *out = base_sp & 0xFFFF;
628                                 out++;
629                                 *outlen -= 2;
630
631                                 i += 2;
632                                 if (i == inplen) {
633                                         out++;
634                                         *out = 0;
635                                         return o_len - *outlen;
636                                 }
637                                 in++;
638                                 base = *in;
639
640                                 result = 1;
641                         }
642                 }
643
644                 /* Binary Search for BMP */
645                 else if (result = do_precomposition(base, comb)) {
646                         base = result;
647                 }
648                 
649                 if (!result) {
650                         *out = base;
651                         out++;
652                         *outlen -= 2;
653                         base = comb;
654                 }
655         }
656
657         errno = E2BIG;
658         return (size_t)-1;
659 }
660
661 /* --------------- */
662 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
663 {
664         size_t i;
665         size_t comblen;
666         ucs2_t base, comb[COMBBUFLEN];
667         u_int32_t base_sp;
668         ucs2_t sindex, tjamo;
669         ucs2_t *in, *out;
670         unsigned int result;
671         u_int64_t result_sp;
672         size_t o_len = *outlen;
673
674         if (!inplen || (inplen & 1))
675                 return (size_t)-1;
676         i = 0;
677         in  = name;
678         out = comp;
679
680         while (i < inplen) {
681                 base = *in;
682                 comblen = 0;
683                 
684                 /* check ASCII first. this is frequent. */
685                 if (base <= 0x007f) ;
686                 
687                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
688                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
689                         sindex = base - SBASE;
690                         base = LBASE + sindex / NCOUNT;
691                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
692                         
693                         /* <L,V> */
694                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
695                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
696                                 comblen = 1;
697                         }
698                         
699                         /* <L,V,T> */
700                         else {
701                                 comb[COMBBUFLEN-1] = tjamo;
702                                 comblen = 2;
703                         }
704                 }
705                 
706                 /* Binary Search for Surrogate Pair */
707                 else if ((0xD800 <= base) && (base < 0xDC00)) {
708                         if (i + 2 < inplen) {
709                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
710                                 do {
711                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
712                                         comblen += 2;
713                                         base_sp = result_sp >> 32;
714                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
715                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
716                                 } while (comblen < MAXCOMBSPLEN);
717
718                                 if (*outlen < (comblen + 1) << 1) {
719                                         errno = E2BIG;
720                                         return (size_t)-1;
721                                 }
722
723                                 *out = base_sp >> 16;   /* hi */
724                                 out++;
725                                 *outlen -= 2;
726                                 
727                                 base = base_sp & 0xFFFF; /* lo */
728                                 
729                                 i += 2;
730                                 in++;
731                         }
732                 }
733                         
734                 /* Binary Search for BMP */
735                 else {
736                         do {
737                                 if ( !(result = do_decomposition(base))) break;
738                                 comblen++;
739                                 base = result  >> 16;
740                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
741                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
742                 }
743                 
744                 if (*outlen < (comblen + 1) << 1) {
745                         errno = E2BIG;
746                         return (size_t)-1;
747                 }
748                 
749                 *out = base;
750                 out++;
751                 *outlen -= 2;
752                 
753                 while ( comblen > 0 ) {
754                         *out = comb[COMBBUFLEN-comblen];
755                         out++;
756                         *outlen -= 2;
757                         comblen--;
758                 }
759                 
760                 i += 2;
761                 in++;
762         }
763         
764         *out = 0;
765         return o_len-*outlen;
766 }
767
768 /*******************************************************************
769 length of UTF-8 character and string
770 ********************************************************************/
771
/*
 * Length in bytes (1..4) of the UTF-8 sequence starting at utf8, or
 * (size_t)-1 when the bytes do not form an accepted sequence.
 * A NUL byte is reported as a 1-byte character.
 *
 * Accepted forms:
 *   00..7F
 *   C2..DF 80..BF
 *   E0     A0..BF 80..BF
 *   E1..EF 80..BF 80..BF
 *   F0     90..BF 80..BF 80..BF
 *   F1..F3 80..BF 80..BF 80..BF
 *   F4     80..8F 80..BF 80..BF
 *
 * Bytes are inspected strictly left to right and inspection stops at
 * the first one that does not fit, so a terminating NUL is never
 * stepped over.
 *
 * NOTE(review): ED A0..BF (UTF-8 encoded surrogates) is accepted by
 * the E1..EF row - confirm that this is intended.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *p = (const unsigned char *) utf8;
        const unsigned char lead = p[0];
        unsigned char lo = 0x80;        /* allowed range of the 2nd byte */
        unsigned char hi = 0xBF;
        size_t need;
        size_t k;

        if (lead < 0x80)
                return (1);

        if (lead < 0xC2) {
                /* stray continuation byte or overlong 2-byte lead */
                return ((size_t) -1);
        } else if (lead < 0xE0) {
                need = 2;
        } else if (lead < 0xF0) {
                need = 3;
                if (lead == 0xE0)
                        lo = 0xA0;      /* exclude overlong forms */
        } else if (lead <= 0xF4) {
                need = 4;
                if (lead == 0xF0)
                        lo = 0x90;      /* exclude overlong forms */
                if (lead == 0xF4)
                        hi = 0x8F;      /* stay at or below U+10FFFF */
        } else {
                return ((size_t) -1);
        }

        if (p[1] < lo || p[1] > hi)
                return ((size_t) -1);

        for (k = 2; k < need; k++) {
                if (p[k] < 0x80 || p[k] > 0xBF)
                        return ((size_t) -1);
        }

        return (need);
}


/*
 * Count the characters of a NUL-terminated UTF-8 string, validating
 * every sequence with utf8_charlen().
 * Returns the number of characters, or (size_t)-1 as soon as an
 * unaccepted sequence is met.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        size_t len = 0;
        char *p = utf8;

        /* see http://www.unicode.org/unicode/reports/tr27/ for an explanation */

        while ( *p != '\0')
        {
                size_t clen = utf8_charlen(p);

                if (clen == (size_t) -1)
                        return ((size_t) -1);

                p += clen;
                len++;
        }

        return (len);
}