]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
f878c8c08015e685974b081d4b199e84e3a73d71
[netatalk.git] / libatalk / unicode / util_unistr.c
1 /*******************************************************************
2   NOTE:
3   The early netatalk 2.x was based on UCS-2.
4   UCS-2 doesn't support chars at or above U+10000.
5   Recent netatalk is based on UTF-16.
6   UTF-16 can support chars above U+10000, using Surrogate Pair.
7   However, Surrogate Pair is complex, dirty, filthy and disagreeable.
8   There might still be latent bugs...
9 ********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22
23 #include <netatalk/endian.h>
24
25 #include <atalk/unicode.h>
26 #include "precompose.h"
27 #include "byteorder.h"
28
29 /*******************************************************************
30  Convert a string to lower case.
31  return True if any char is converted
32 ********************************************************************/
33 /* surrogate pair support */
34
35 int strlower_w(ucs2_t *s)
36 {
37         int ret = 0;
38
39         while (*s) {
40                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
41                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
42                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
43                                 u_int32_t v_sp = tolower_sp(s_sp);
44                                 if (v_sp != s_sp) {
45                                         *s = v_sp >> 16;
46                                         s++;
47                                         *s = v_sp & 0xFFFF;
48                                         ret = 1;
49                                 }
50                         }
51                 } else {
52                         ucs2_t v = tolower_w(*s);
53                         if (v != *s) {
54                                 *s = v;
55                                 ret = 1;
56                         }
57                 }
58                 s++;
59         }
60         return ret;
61 }
62
63 /*******************************************************************
64  Convert a string to upper case.
65  return True if any char is converted
66 ********************************************************************/
67 /* surrogate pair support */
68
69 int strupper_w(ucs2_t *s)
70 {
71         int ret = 0;
72
73         while (*s) {
74                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
75                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
76                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
77                                 u_int32_t v_sp = toupper_sp(s_sp);
78                                 if (v_sp != s_sp) {
79                                         *s = v_sp >> 16;
80                                         s++;
81                                         *s = v_sp & 0xFFFF;
82                                         ret = 1;
83                                 }
84                         }
85                 } else {
86                         ucs2_t v = toupper_w(*s);
87                         if (v != *s) {
88                                 *s = v;
89                                 ret = 1;
90                         }
91                 }
92                 s++;
93         }
94         return ret;
95 }
96
97 /*******************************************************************
98 wide & sp islower()
99 determine if a character is lowercase
100 ********************************************************************/
101 /* These functions are not used. */
102
103 int islower_w(ucs2_t c)
104 {
105         return ( c == tolower_w(c));
106 }
107
/* Returns 1 when tolower_sp() leaves the packed pair c_sp unchanged, 0 otherwise. */
int islower_sp(u_int32_t c_sp)
{
        if (c_sp != tolower_sp(c_sp))
                return 0;
        return 1;
}
112
113 /*******************************************************************
114 wide & sp isupper()
115 determine if a character is uppercase
116 ********************************************************************/
117 /* These functions are not used. */
118
119 int isupper_w(ucs2_t c)
120 {
121         return ( c == toupper_w(c));
122 }
123
/* Returns 1 when toupper_sp() leaves the packed pair c_sp unchanged, 0 otherwise. */
int isupper_sp(u_int32_t c_sp)
{
        if (c_sp != toupper_sp(c_sp))
                return 0;
        return 1;
}
128
129 /*******************************************************************
130 wide strlen()
131  Count the number of characters in a UTF-16 string.
132 ********************************************************************/
133 /* NOTE: one surrogate pair is two characters. */
134
135 size_t strlen_w(const ucs2_t *src)
136 {
137         size_t len;
138
139         for(len = 0; *src++; len++) ;
140
141         return len;
142 }
143
144 /*******************************************************************
145 wide strnlen()
146  Count up to max number of characters in a UTF-16 string.
147 ********************************************************************/
148 /* NOTE: one surrogate pair is two characters. */
149
150 size_t strnlen_w(const ucs2_t *src, size_t max)
151 {
152         size_t len;
153
154         for(len = 0; *src++ && (len < max); len++) ;
155
156         return len;
157 }
158
159 /*******************************************************************
160 wide strchr()
161 ********************************************************************/
162 /* NOTE: hi and lo of surrogate pair are separately processed. */
163
164 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
165 {
166         while (*s != 0) {
167                 if (c == *s) return (ucs2_t *)s;
168                 s++;
169         }
170         if (c == *s) return (ucs2_t *)s;
171
172         return NULL;
173 }
174
175 /*******************************************************************
176 wide & sp strcasechr()
177 ********************************************************************/
178 /* NOTE: separately process BMP and surrogate pair */
179
180 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
181 {
182         while (*s != 0) {
183 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
184                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
185                 s++;
186         }
187         if (c == *s) return (ucs2_t *)s;
188
189         return NULL;
190 }
191
192 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
193 {
194         if (*s == 0) return NULL;
195         while (s[1] != 0) {
196                 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
197                 s++;
198         }
199
200         return NULL;
201 }
202
203 /*******************************************************************
204 wide strcmp()
205 ********************************************************************/
206 /* no problem of surrogate pair */
207
208 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
209 {
210         while (*b && *a == *b) { a++; b++; }
211         return (*a - *b);
212         /* warning: if *a != *b and both are not 0 we retrun a random
213            greater or lesser than 0 number not realted to which
214            string is longer */
215 }
216
217 /*******************************************************************
218 wide strncmp()
219 ********************************************************************/
220 /* no problem of surrogate pair */
221
222 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
223 {
224         size_t n = 0;
225         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
226         return (len - n)?(*a - *b):0;
227 }
228
229 /*******************************************************************
230 wide strstr()
231 ********************************************************************/
232 /* no problem of surrogate pair */
233
234 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
235 {
236         ucs2_t *r;
237         size_t slen, inslen;
238
239         if (!s || !*s || !ins || !*ins) return NULL;
240         slen = strlen_w(s);
241         inslen = strlen_w(ins);
242         r = (ucs2_t *)s;
243         while ((r = strchr_w(r, *ins))) {
244                 if (strncmp_w(r, ins, inslen) == 0) return r;
245                 r++;
246         }
247         return NULL;
248 }
249
250 /*******************************************************************
251 wide strcasestr()
252 ********************************************************************/
253 /* surrogate pair support */
254
255 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
256 {
257         ucs2_t *r;
258         size_t slen, inslen;
259
260         if (!s || !*s || !ins || !*ins) return NULL;
261         slen = strlen_w(s);
262         inslen = strlen_w(ins);
263         r = (ucs2_t *)s;
264
265         if ((0xD800 <= *ins) && (*ins < 0xDC00)) {
266                 if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) {
267                         u_int32_t ins_sp = (u_int32_t)*ins << 16 | (u_int32_t)ins[1];
268                         while ((r = strcasechr_sp(r, ins_sp))) {
269                                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
270                                 r++;
271                         }
272                 } else {
273                         return NULL; /* illegal sequence */
274                 }
275         } else {
276                 while ((r = strcasechr_w(r, *ins))) {
277                         if (strncasecmp_w(r, ins, inslen) == 0) return r;
278                         r++;
279                 }
280         }
281         return NULL;
282 }
283
284 /*******************************************************************
285 wide strcasecmp()
286 case insensitive string comparison
287 ********************************************************************/
288 /* surrogate pair support */
289
290 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
291 {
292         int ret;
293
294         while (*a && *b) {
295                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
296                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
297                         a++;
298                         b++;
299                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
300                 } else {
301                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
302                 }
303                 a++;
304                 b++;
305         }
306         return (tolower_w(*a) - tolower_w(*b));
307 }
308
309 /*******************************************************************
310 wide strncasecmp()
311 case insensitive string comparison, length limited
312 ********************************************************************/
313 /* NOTE: compares up to 'len+1' code units if 'len' would split a surrogate pair */
314
315 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
316 {
317         size_t n = 0;
318         int ret;
319
320         while ((n < len) && *a && *b) {
321                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
322                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
323                         a++;
324                         b++;
325                         n++;
326                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
327                 } else {
328                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
329                 }
330                 a++;
331                 b++;
332                 n++;
333         }
334         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
335 }
336
337 /*******************************************************************
338 wide strndup()
339 duplicate string
340 ********************************************************************/
341 /* NOTE: not check isolation of surrogate pair */
342 /* if len == 0 then duplicate the whole string */
343
344 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
345 {
346         ucs2_t *dest;
347
348         if (!len) len = strlen_w(src);
349         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
350         if (!dest) {
351                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
352                 return NULL;
353         }
354
355         memcpy(dest, src, len * sizeof(ucs2_t));
356         dest[len] = 0;
357
358         return dest;
359 }
360
361 /*******************************************************************
362 wide strdup()
363 duplicate string
364 ********************************************************************/
365 /* no problem of surrogate pair */
366
367 ucs2_t *strdup_w(const ucs2_t *src)
368 {
369         return strndup_w(src, 0);
370 }
371
372 /*******************************************************************
373 copy a string with max len
374 ********************************************************************/
375 /* This function is not used. */
376 /* NOTE: not check isolation of surrogate pair */
377
378 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
379 {
380         size_t len;
381
382         if (!dest || !src) return NULL;
383
384         for (len = 0; (src[len] != 0) && (len < max); len++)
385                 dest[len] = src[len];
386         while (len < max)
387                 dest[len++] = 0;
388
389         return dest;
390 }
391
392
393 /*******************************************************************
394 append a string (at most max characters for strncat_w) and add a terminator
395 ********************************************************************/
396 /* These functions are not used. */
397
398 /* NOTE: not check isolation of surrogate pair */
399 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
400 {
401         size_t start;
402         size_t len;
403
404         if (!dest || !src) return NULL;
405
406         start = strlen_w(dest);
407         len = strnlen_w(src, max);
408
409         memcpy(&dest[start], src, len*sizeof(ucs2_t));
410         dest[start+len] = 0;
411
412         return dest;
413 }
414
415 /* no problem of surrogate pair */
416 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
417 {
418         size_t start;
419         size_t len;
420
421         if (!dest || !src) return NULL;
422
423         start = strlen_w(dest);
424         len = strlen_w(src);
425
426         memcpy(&dest[start], src, len*sizeof(ucs2_t));
427         dest[start+len] = 0;
428
429         return dest;
430 }
431
432
433 /*******************************************************************
434 binary search for pre|decomposition
435 ********************************************************************/
436
437 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
438 {
439         int min = 0;
440         int max = PRECOMP_COUNT - 1;
441         int mid;
442         u_int32_t sought = (base << 16) | comb, that;
443
444         /* binary search */
445         while (max >= min) {
446                 mid = (min + max) / 2;
447                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
448                 if (that < sought) {
449                         min = mid + 1;
450                 } else if (that > sought) {
451                         max = mid - 1;
452                 } else {
453                         return precompositions[mid].replacement;
454                 }
455         }
456         /* no match */
457         return 0;
458 }
459
460 /* ------------------------ */
461 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
462 {
463         int min = 0;
464         int max = PRECOMP_SP_COUNT - 1;
465         int mid;
466         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
467
468         /* binary search */
469         while (max >= min) {
470                 mid = (min + max) / 2;
471                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
472                 if (that_sp < sought_sp) {
473                         min = mid + 1;
474                 } else if (that_sp > sought_sp) {
475                         max = mid - 1;
476                 } else {
477                         return precompositions_sp[mid].replacement_sp;
478                 }
479         }
480         /* no match */
481         return 0;
482 }
483
484 /* -------------------------- */
485 static u_int32_t do_decomposition(ucs2_t base) 
486 {
487         int min = 0;
488         int max = DECOMP_COUNT - 1;
489         int mid;
490         u_int32_t sought = base;
491         u_int32_t result, that;
492
493         /* binary search */
494         while (max >= min) {
495                 mid = (min + max) / 2;
496                 that = decompositions[mid].replacement;
497                 if (that < sought) {
498                         min = mid + 1;
499                 } else if (that > sought) {
500                         max = mid - 1;
501                 } else {
502                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
503                         return result;
504                 }
505         }
506         /* no match */
507         return 0;
508 }
509
510 /* -------------------------- */
511 static u_int64_t do_decomposition_sp(unsigned int base_sp) 
512 {
513         int min = 0;
514         int max = DECOMP_SP_COUNT - 1;
515         int mid;
516         u_int32_t sought_sp = base_sp;
517         u_int32_t that_sp;
518         u_int64_t result_sp;
519
520         /* binary search */
521         while (max >= min) {
522                 mid = (min + max) / 2;
523                 that_sp = decompositions_sp[mid].replacement_sp;
524                 if (that_sp < sought_sp) {
525                         min = mid + 1;
526                 } else if (that_sp > sought_sp) {
527                         max = mid - 1;
528                 } else {
529                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
530                         return result_sp;
531                 }
532         }
533         /* no match */
534         return 0;
535 }
536
537 /*******************************************************************
538 pre|decomposition
539
540    we can't use static, this stuff needs to be reentrant
541    static char comp[MAXPATHLEN +1];
542
543    We don't implement Singleton and Canonical Ordering.
544    We ignore CompositionExclusions.txt.
545    because they cause the problem of the roundtrip
546    such as Dancing Icon.
547
548    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
549    in precompose.h from composition according to AFP 3.x spec
550 ********************************************************************/
551
552 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
553 {
554         size_t i;
555         ucs2_t base, comb;
556         u_int32_t base_sp, comb_sp;
557         ucs2_t *in, *out;
558         ucs2_t lindex, vindex;
559         ucs2_t result;
560         u_int32_t result_sp;
561         size_t o_len = *outlen;
562         
563         if (!inplen || (inplen & 1) || inplen > o_len)
564                 return (size_t)-1;
565         
566         i = 0;
567         in  = name;
568         out = comp;
569         
570         base = *in;
571         while (*outlen > 2) {
572                 i += 2;
573                 if (i == inplen) {
574                         *out = base;
575                         out++;
576                         *out = 0;
577                         *outlen -= 2;
578                         return o_len - *outlen;
579                 }
580                 in++;
581                 comb = *in;
582                 result = 0;
583
584                 /* Non-Combination Character */
585                 if (comb < 0x300) ;
586                 
587                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
588                 /* Step 1 <L,V> */
589                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
590                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
591                                 result = 1;
592                                 lindex = base - LBASE;
593                                 vindex = comb - VBASE;
594                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
595                         }
596                 }
597                 
598                 /* Step 2 <LV,T> */
599                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
600                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
601                                 result = 1;
602                                 base += comb - TBASE;
603                         }
604                 }
605                 
606                 /* Binary Search for Surrogate Pair */
607                 else if ((0xD800 <= base) && (base < 0xDC00)) {
608                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
609                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
610                                 do {
611                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
612                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
613                                                 base_sp = result_sp;
614                                                 i += 4;
615                                                 in +=2;
616                                         }
617                                 } while ((i + 6 <= inplen) && result_sp) ;
618
619                                 *out = base_sp >> 16;
620                                 out++;
621                                 *outlen -= 2;
622
623                                 if (*outlen <= 2) {
624                                         errno = E2BIG;
625                                         return (size_t)-1;
626                                 }
627
628                                 *out = base_sp & 0xFFFF;
629                                 out++;
630                                 *outlen -= 2;
631
632                                 i += 2;
633                                 if (i == inplen) {
634                                         out++;
635                                         *out = 0;
636                                         return o_len - *outlen;
637                                 }
638                                 in++;
639                                 base = *in;
640
641                                 result = 1;
642                         }
643                 }
644
645                 /* Binary Search for BMP */
646                 else if (result = do_precomposition(base, comb)) {
647                         base = result;
648                 }
649                 
650                 if (!result) {
651                         *out = base;
652                         out++;
653                         *outlen -= 2;
654                         base = comb;
655                 }
656         }
657
658         errno = E2BIG;
659         return (size_t)-1;
660 }
661
662 /* --------------- */
663 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
664 {
665         size_t i;
666         size_t comblen;
667         ucs2_t base, comb[COMBBUFLEN];
668         u_int32_t base_sp;
669         ucs2_t sindex, tjamo;
670         ucs2_t *in, *out;
671         unsigned int result;
672         u_int64_t result_sp;
673         size_t o_len = *outlen;
674
675         if (!inplen || (inplen & 1))
676                 return (size_t)-1;
677         i = 0;
678         in  = name;
679         out = comp;
680
681         while (i < inplen) {
682                 base = *in;
683                 comblen = 0;
684                 
685                 /* check ASCII first. this is frequent. */
686                 if (base <= 0x007f) ;
687                 
688                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
689                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
690                         sindex = base - SBASE;
691                         base = LBASE + sindex / NCOUNT;
692                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
693                         
694                         /* <L,V> */
695                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
696                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
697                                 comblen = 1;
698                         }
699                         
700                         /* <L,V,T> */
701                         else {
702                                 comb[COMBBUFLEN-1] = tjamo;
703                                 comblen = 2;
704                         }
705                 }
706                 
707                 /* Binary Search for Surrogate Pair */
708                 else if ((0xD800 <= base) && (base < 0xDC00)) {
709                         if (i + 2 < inplen) {
710                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
711                                 do {
712                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
713                                         comblen += 2;
714                                         base_sp = result_sp >> 32;
715                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
716                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
717                                 } while (comblen < MAXCOMBSPLEN);
718
719                                 if (*outlen < (comblen + 1) << 1) {
720                                         errno = E2BIG;
721                                         return (size_t)-1;
722                                 }
723
724                                 *out = base_sp >> 16;   /* hi */
725                                 out++;
726                                 *outlen -= 2;
727                                 
728                                 base = base_sp & 0xFFFF; /* lo */
729                                 
730                                 i += 2;
731                                 in++;
732                         }
733                 }
734                         
735                 /* Binary Search for BMP */
736                 else {
737                         do {
738                                 if ( !(result = do_decomposition(base))) break;
739                                 comblen++;
740                                 base = result  >> 16;
741                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
742                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
743                 }
744                 
745                 if (*outlen < (comblen + 1) << 1) {
746                         errno = E2BIG;
747                         return (size_t)-1;
748                 }
749                 
750                 *out = base;
751                 out++;
752                 *outlen -= 2;
753                 
754                 while ( comblen > 0 ) {
755                         *out = comb[COMBBUFLEN-comblen];
756                         out++;
757                         *outlen -= 2;
758                         comblen--;
759                 }
760                 
761                 i += 2;
762                 in++;
763         }
764         
765         *out = 0;
766         return o_len-*outlen;
767 }
768
769 /*******************************************************************
770 length of UTF-8 character and string
771 ********************************************************************/
772
/*
 * Length in bytes (1-4) of the UTF-8 sequence starting at utf8, or
 * (size_t)-1 if the bytes there do not form a legal sequence.
 *
 * Legal lead bytes are 0x00-0x7F and 0xC2-0xF4. The second byte is
 * normally 0x80-0xBF, narrowed to 0xA0-0xBF after 0xE0, 0x90-0xBF after
 * 0xF0 and 0x80-0x8F after 0xF4; later bytes are always 0x80-0xBF.
 * Bytes are examined left to right and checking stops at the first bad
 * one, so a NUL terminator is never stepped over.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *b = (const unsigned char *) utf8;
        const unsigned char lead = b[0];
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        size_t width;
        size_t k;

        if (lead < 0x80)
                return (1);

        if (lead < 0xC2 || lead > 0xF4)
                return ((size_t) -1);

        if (lead < 0xE0)
                width = 2;
        else if (lead < 0xF0)
                width = 3;
        else
                width = 4;

        /* lead bytes whose second byte has a narrowed range */
        switch (lead) {
        case 0xE0:
                second_min = 0xA0;
                break;
        case 0xF0:
                second_min = 0x90;
                break;
        case 0xF4:
                second_max = 0x8F;
                break;
        default:
                break;
        }

        if (b[1] < second_min || b[1] > second_max)
                return ((size_t) -1);

        for (k = 2; k < width; k++) {
                if (b[k] < 0x80 || b[k] > 0xBF)
                        return ((size_t) -1);
        }

        return (width);
}
796
797
/*
 * Count the characters in a NUL-terminated UTF-8 string while validating it.
 * Returns the number of characters (not bytes), or (size_t)-1 as soon as
 * an illegal sequence is met.
 *
 * Legal lead bytes are 0x00-0x7F and 0xC2-0xF4. The second byte is
 * normally 0x80-0xBF, narrowed to 0xA0-0xBF after 0xE0, 0x90-0xBF after
 * 0xF0 and 0x80-0x8F after 0xF4; later bytes are always 0x80-0xBF.
 * See http://www.unicode.org/unicode/reports/tr27/ for an explanation.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *cur = (const unsigned char *) utf8;
        size_t count;

        for (count = 0; *cur != '\0'; count++) {
                const unsigned char lead = *cur;
                unsigned char second_min = 0x80;
                unsigned char second_max = 0xBF;
                size_t width;
                size_t k;

                if (lead < 0x80) {
                        cur++;
                        continue;
                }

                if (lead < 0xC2 || lead > 0xF4)
                        return ((size_t) -1);

                if (lead < 0xE0)
                        width = 2;
                else if (lead < 0xF0)
                        width = 3;
                else
                        width = 4;

                /* lead bytes whose second byte has a narrowed range */
                switch (lead) {
                case 0xE0:
                        second_min = 0xA0;
                        break;
                case 0xF0:
                        second_min = 0x90;
                        break;
                case 0xF4:
                        second_max = 0x8F;
                        break;
                default:
                        break;
                }

                if (cur[1] < second_min || cur[1] > second_max)
                        return ((size_t) -1);

                for (k = 2; k < width; k++) {
                        if (cur[k] < 0x80 || cur[k] > 0xBF)
                                return ((size_t) -1);
                }

                cur += width;
        }

        return (count);
}
838 }