]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/util_unistr.c
Merge master
[netatalk.git] / libatalk / unicode / util_unistr.c
1 /*******************************************************************
2   NOTE:
3   The early netatalk 2.x was based on UCS-2.
4   UCS-2 doesn't support chars above U+FFFF.
5   Recent netatalk is based on UTF-16.
6   UTF-16 can support chars above U+FFFF, using Surrogate Pairs.
7   However, Surrogate Pair is complex, dirty, filthy and disagreeable.
8   There might still be latent bugs...
9 ********************************************************************/
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif /* HAVE_CONFIG_H */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/param.h>
19 #include <sys/stat.h>
20 #include <atalk/logger.h>
21 #include <errno.h>
22 #include <arpa/inet.h>
23
24 #include <atalk/unicode.h>
25 #include "precompose.h"
26 #include "byteorder.h"
27
28 /*******************************************************************
29  Convert a string to lower case.
30  return True if any char is converted
31 ********************************************************************/
32 /* surrogate pair support */
33
34 int strlower_w(ucs2_t *s)
35 {
36         int ret = 0;
37
38         while (*s) {
39                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
40                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
41                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
42                                 u_int32_t v_sp = tolower_sp(s_sp);
43                                 if (v_sp != s_sp) {
44                                         *s = v_sp >> 16;
45                                         s++;
46                                         *s = v_sp & 0xFFFF;
47                                         ret = 1;
48                                 }
49                         }
50                 } else {
51                         ucs2_t v = tolower_w(*s);
52                         if (v != *s) {
53                                 *s = v;
54                                 ret = 1;
55                         }
56                 }
57                 s++;
58         }
59         return ret;
60 }
61
62 /*******************************************************************
63  Convert a string to upper case.
64  return True if any char is converted
65 ********************************************************************/
66 /* surrogate pair support */
67
68 int strupper_w(ucs2_t *s)
69 {
70         int ret = 0;
71
72         while (*s) {
73                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
74                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
75                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
76                                 u_int32_t v_sp = toupper_sp(s_sp);
77                                 if (v_sp != s_sp) {
78                                         *s = v_sp >> 16;
79                                         s++;
80                                         *s = v_sp & 0xFFFF;
81                                         ret = 1;
82                                 }
83                         }
84                 } else {
85                         ucs2_t v = toupper_w(*s);
86                         if (v != *s) {
87                                 *s = v;
88                                 ret = 1;
89                         }
90                 }
91                 s++;
92         }
93         return ret;
94 }
95
96 /*******************************************************************
97 wide & sp islower()
98 determine if a character is lowercase
99 ********************************************************************/
100 /* These functions are not used. */
101
102 int islower_w(ucs2_t c)
103 {
104         return ( c == tolower_w(c));
105 }
106
107 int islower_sp(u_int32_t c_sp)
108 {
109         return ( c_sp == tolower_sp(c_sp));
110 }
111
112 /*******************************************************************
113 wide & sp isupper()
114 determine if a character is uppercase
115 ********************************************************************/
116 /* These functions are not used. */
117
118 int isupper_w(ucs2_t c)
119 {
120         return ( c == toupper_w(c));
121 }
122
123 int isupper_sp(u_int32_t c_sp)
124 {
125         return ( c_sp == toupper_sp(c_sp));
126 }
127
128 /*******************************************************************
129 wide strlen()
130  Count the number of characters in a UTF-16 string.
131 ********************************************************************/
132 /* NOTE: one surrogate pair is two characters. */
133
134 size_t strlen_w(const ucs2_t *src)
135 {
136         size_t len;
137
138         for(len = 0; *src++; len++) ;
139
140         return len;
141 }
142
143 /*******************************************************************
144 wide strnlen()
145  Count up to max number of characters in a UTF-16 string.
146 ********************************************************************/
147 /* NOTE: one surrogate pair is two characters. */
148
149 size_t strnlen_w(const ucs2_t *src, size_t max)
150 {
151         size_t len;
152
153         for(len = 0; *src++ && (len < max); len++) ;
154
155         return len;
156 }
157
158 /*******************************************************************
159 wide strchr()
160 ********************************************************************/
161 /* NOTE: hi and lo of surrogate pair are separately processed. */
162
163 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
164 {
165         while (*s != 0) {
166                 if (c == *s) return (ucs2_t *)s;
167                 s++;
168         }
169         if (c == *s) return (ucs2_t *)s;
170
171         return NULL;
172 }
173
174 /*******************************************************************
175 wide & sp strcasechr()
176 ********************************************************************/
177 /* NOTE: separately process BMP and surrogate pair */
178
179 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
180 {
181         while (*s != 0) {
182 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
183                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
184                 s++;
185         }
186         if (c == *s) return (ucs2_t *)s;
187
188         return NULL;
189 }
190
191 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
192 {
193         if (*s == 0) return NULL;
194         while (s[1] != 0) {
195                 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
196                 s++;
197         }
198
199         return NULL;
200 }
201
202 /*******************************************************************
203 wide strcmp()
204 ********************************************************************/
205 /* no problem of surrogate pair */
206
207 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
208 {
209         while (*b && *a == *b) { a++; b++; }
210         return (*a - *b);
211         /* warning: if *a != *b and both are not 0 we retrun a random
212            greater or lesser than 0 number not realted to which
213            string is longer */
214 }
215
216 /*******************************************************************
217 wide strncmp()
218 ********************************************************************/
219 /* no problem of surrogate pair */
220
221 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
222 {
223         size_t n = 0;
224         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
225         return (len - n)?(*a - *b):0;
226 }
227
228 /*******************************************************************
229 wide strstr()
230 ********************************************************************/
231 /* no problem of surrogate pair */
232
233 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
234 {
235         ucs2_t *r;
236         size_t slen, inslen;
237
238         if (!s || !*s || !ins || !*ins) return NULL;
239         slen = strlen_w(s);
240         inslen = strlen_w(ins);
241         r = (ucs2_t *)s;
242         while ((r = strchr_w(r, *ins))) {
243                 if (strncmp_w(r, ins, inslen) == 0) return r;
244                 r++;
245         }
246         return NULL;
247 }
248
249 /*******************************************************************
250 wide strcasestr()
251 ********************************************************************/
252 /* */
253
254 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
255 {
256         ucs2_t *r;
257         size_t slen, inslen;
258
259         if (!s || !*s || !ins || !*ins) return NULL;
260         slen = strlen_w(s);
261         inslen = strlen_w(ins);
262         r = (ucs2_t *)s;
263         while ((r = strcasechr_w(r, *ins))) {
264                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
265                 r++;
266         }
267         return NULL;
268 }
269
270 /*******************************************************************
271 wide strcasecmp()
272 case insensitive string comparison
273 ********************************************************************/
274 /* surrogate pair support */
275
276 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
277 {
278         int ret;
279
280         while (*a && *b) {
281                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
282                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
283                         a++;
284                         b++;
285                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
286                 } else {
287                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
288                 }
289                 a++;
290                 b++;
291         }
292         return (tolower_w(*a) - tolower_w(*b));
293 }
294
295 /*******************************************************************
296 wide strncasecmp()
297 case insensitive string comparison, length limited
298 ********************************************************************/
299 /* NOTE: compares up to 'len+1' units if 'len' would split a surrogate pair */
300
301 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
302 {
303         size_t n = 0;
304         int ret;
305
306         while ((n < len) && *a && *b) {
307                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
308                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
309                         a++;
310                         b++;
311                         n++;
312                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
313                 } else {
314                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
315                 }
316                 a++;
317                 b++;
318                 n++;
319         }
320         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
321 }
322
323 /*******************************************************************
324 wide strndup()
325 duplicate string
326 ********************************************************************/
327 /* NOTE: not check isolation of surrogate pair */
328 /* if len == 0 then duplicate the whole string */
329
330 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
331 {
332         ucs2_t *dest;
333
334         if (!len) len = strlen_w(src);
335         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
336         if (!dest) {
337                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
338                 return NULL;
339         }
340
341         memcpy(dest, src, len * sizeof(ucs2_t));
342         dest[len] = 0;
343
344         return dest;
345 }
346
347 /*******************************************************************
348 wide strdup()
349 duplicate string
350 ********************************************************************/
351 /* no problem of surrogate pair */
352
353 ucs2_t *strdup_w(const ucs2_t *src)
354 {
355         return strndup_w(src, 0);
356 }
357
358 /*******************************************************************
359 copy a string with max len
360 ********************************************************************/
361
362 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
363 {
364         size_t len;
365
366         if (!dest || !src) return NULL;
367
368         for (len = 0; (src[len] != 0) && (len < max); len++)
369                 dest[len] = src[len];
370         while (len < max)
371                 dest[len++] = 0;
372
373         return dest;
374 }
375
376
377 /*******************************************************************
378 append at most max UTF-16 code units of src and add a terminator
379 ********************************************************************/
380
381 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
382 {
383         size_t start;
384         size_t len;
385
386         if (!dest || !src) return NULL;
387
388         start = strlen_w(dest);
389         len = strnlen_w(src, max);
390
391         memcpy(&dest[start], src, len*sizeof(ucs2_t));
392         dest[start+len] = 0;
393
394         return dest;
395 }
396
397
398 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
399 {
400         size_t start;
401         size_t len;
402
403         if (!dest || !src) return NULL;
404
405         start = strlen_w(dest);
406         len = strlen_w(src);
407
408         memcpy(&dest[start], src, len*sizeof(ucs2_t));
409         dest[start+len] = 0;
410
411         return dest;
412 }
413
414
415 /*******************************************************************
416 binary search for pre|decomposition
417 ********************************************************************/
418
419 static ucs2_t do_precomposition(unsigned int base, unsigned int comb) 
420 {
421         int min = 0;
422         int max = PRECOMP_COUNT - 1;
423         int mid;
424         uint32_t sought = (base << 16) | comb, that;
425
426         /* binary search */
427         while (max >= min) {
428                 mid = (min + max) / 2;
429                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
430                 if (that < sought) {
431                         min = mid + 1;
432                 } else if (that > sought) {
433                         max = mid - 1;
434                 } else {
435                         return precompositions[mid].replacement;
436                 }
437         }
438         /* no match */
439         return 0;
440 }
441
442 /* ------------------------ */
443 static uint32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp) 
444 {
445         int min = 0;
446         int max = PRECOMP_SP_COUNT - 1;
447         int mid;
448         uint64_t sought_sp = ((uint64_t)base_sp << 32) | (uint64_t)comb_sp, that_sp;
449
450         /* binary search */
451         while (max >= min) {
452                 mid = (min + max) / 2;
453                 that_sp = ((uint64_t)precompositions_sp[mid].base_sp << 32) | ((uint64_t)precompositions_sp[mid].comb_sp);
454                 if (that_sp < sought_sp) {
455                         min = mid + 1;
456                 } else if (that_sp > sought_sp) {
457                         max = mid - 1;
458                 } else {
459                         return precompositions_sp[mid].replacement_sp;
460                 }
461         }
462         /* no match */
463         return 0;
464 }
465
466 /* -------------------------- */
467 static uint32_t do_decomposition(ucs2_t base) 
468 {
469         int min = 0;
470         int max = DECOMP_COUNT - 1;
471         int mid;
472         uint32_t sought = base;
473         uint32_t result, that;
474
475         /* binary search */
476         while (max >= min) {
477                 mid = (min + max) / 2;
478                 that = decompositions[mid].replacement;
479                 if (that < sought) {
480                         min = mid + 1;
481                 } else if (that > sought) {
482                         max = mid - 1;
483                 } else {
484                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
485                         return result;
486                 }
487         }
488         /* no match */
489         return 0;
490 }
491
492 /* -------------------------- */
493 static uint64_t do_decomposition_sp(unsigned int base_sp) 
494 {
495         int min = 0;
496         int max = DECOMP_SP_COUNT - 1;
497         int mid;
498         uint32_t sought_sp = base_sp;
499         uint32_t that_sp;
500         uint64_t result_sp;
501
502         /* binary search */
503         while (max >= min) {
504                 mid = (min + max) / 2;
505                 that_sp = decompositions_sp[mid].replacement_sp;
506                 if (that_sp < sought_sp) {
507                         min = mid + 1;
508                 } else if (that_sp > sought_sp) {
509                         max = mid - 1;
510                 } else {
511                         result_sp = ((uint64_t)decompositions_sp[mid].base_sp << 32) | ((uint64_t)decompositions_sp[mid].comb_sp);
512                         return result_sp;
513                 }
514         }
515         /* no match */
516         return 0;
517 }
518
519 /*******************************************************************
520 pre|decomposition
521
522    we can't use static, this stuff needs to be reentrant
523    static char comp[MAXPATHLEN +1];
524
525    We don't implement Singleton and Canonical Ordering.
526    We ignore CompositionExclusions.txt.
527    because they cause the problem of the roundtrip
528    such as Dancing Icon.
529
530    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
531    in precompose.h from composition according to AFP 3.x spec
532 ********************************************************************/
533
534 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
535 {
536         size_t i;
537         ucs2_t base, comb;
538         uint32_t base_sp, comb_sp;
539         ucs2_t *in, *out;
540         ucs2_t lindex, vindex;
541         ucs2_t result;
542         uint32_t result_sp;
543         size_t o_len = *outlen;
544         
545         if (!inplen || (inplen & 1) || inplen > o_len)
546                 return (size_t)-1;
547         
548         i = 0;
549         in  = name;
550         out = comp;
551         
552         base = *in;
553         while (*outlen > 2) {
554                 i += 2;
555                 if (i == inplen) {
556                         *out = base;
557                         out++;
558                         *out = 0;
559                         *outlen -= 2;
560                         return o_len - *outlen;
561                 }
562                 in++;
563                 comb = *in;
564                 result = 0;
565
566                 /* Non-Combination Character */
567                 if (comb < 0x300) ;
568                 
569                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
570                 /* Step 1 <L,V> */
571                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
572                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
573                                 result = 1;
574                                 lindex = base - LBASE;
575                                 vindex = comb - VBASE;
576                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
577                         }
578                 }
579                 
580                 /* Step 2 <LV,T> */
581                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
582                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
583                                 result = 1;
584                                 base += comb - TBASE;
585                         }
586                 }
587                 
588                 /* Binary Search for Surrogate Pair */
589                 else if ((0xD800 <= base) && (base < 0xDC00)) {
590                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
591                                 base_sp = ((uint32_t)base << 16) | (uint32_t)comb;
592                                 do {
593                                         comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2];
594                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
595                                                 base_sp = result_sp;
596                                                 i += 4;
597                                                 in +=2;
598                                         }
599                                 } while ((i + 6 <= inplen) && result_sp) ;
600
601                                 *out = base_sp >> 16;
602                                 out++;
603                                 *outlen -= 2;
604
605                                 if (*outlen <= 2) {
606                                         errno = E2BIG;
607                                         return (size_t)-1;
608                                 }
609
610                                 *out = base_sp & 0xFFFF;
611                                 out++;
612                                 *outlen -= 2;
613
614                                 i += 2;
615                                 if (i == inplen) {
616                                         out++;
617                                         *out = 0;
618                                         return o_len - *outlen;
619                                 }
620                                 in++;
621                                 base = *in;
622
623                                 result = 1;
624                         }
625                 }
626
627                 /* Binary Search for BMP */
628                 else if (result = do_precomposition(base, comb)) {
629                         base = result;
630                 }
631                 
632                 if (!result) {
633                         *out = base;
634                         out++;
635                         *outlen -= 2;
636                         base = comb;
637                 }
638         }
639
640         errno = E2BIG;
641         return (size_t)-1;
642 }
643
644 /* --------------- */
645 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
646 {
647         size_t i;
648         size_t comblen;
649         ucs2_t base, comb[COMBBUFLEN];
650         uint32_t base_sp;
651         ucs2_t sindex, tjamo;
652         ucs2_t *in, *out;
653         unsigned int result;
654         uint64_t result_sp;
655         size_t o_len = *outlen;
656
657         if (!inplen || (inplen & 1))
658                 return (size_t)-1;
659         i = 0;
660         in  = name;
661         out = comp;
662
663         while (i < inplen) {
664                 base = *in;
665                 comblen = 0;
666                 
667                 /* check ASCII first. this is frequent. */
668                 if (base <= 0x007f) ;
669                 
670                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
671                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
672                         sindex = base - SBASE;
673                         base = LBASE + sindex / NCOUNT;
674                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
675                         
676                         /* <L,V> */
677                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
678                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
679                                 comblen = 1;
680                         }
681                         
682                         /* <L,V,T> */
683                         else {
684                                 comb[COMBBUFLEN-1] = tjamo;
685                                 comblen = 2;
686                         }
687                 }
688                 
689                 /* Binary Search for Surrogate Pair */
690                 else if ((0xD800 <= base) && (base < 0xDC00)) {
691                         if (i + 2 < inplen) {
692                                 base_sp =  ((uint32_t)base << 16) | (uint32_t)in[1];
693                                 do {
694                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
695                                         comblen += 2;
696                                         base_sp = result_sp >> 32;
697                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
698                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
699                                 } while (comblen < MAXCOMBSPLEN);
700
701                                 if (*outlen < (comblen + 1) << 1) {
702                                         errno = E2BIG;
703                                         return (size_t)-1;
704                                 }
705
706                                 *out = base_sp >> 16;   /* hi */
707                                 out++;
708                                 *outlen -= 2;
709                                 
710                                 base = base_sp & 0xFFFF; /* lo */
711                                 
712                                 i += 2;
713                                 in++;
714                         }
715                 }
716                         
717                 /* Binary Search for BMP */
718                 else {
719                         do {
720                                 if ( !(result = do_decomposition(base))) break;
721                                 comblen++;
722                                 base = result  >> 16;
723                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
724                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
725                 }
726                 
727                 if (*outlen < (comblen + 1) << 1) {
728                         errno = E2BIG;
729                         return (size_t)-1;
730                 }
731                 
732                 *out = base;
733                 out++;
734                 *outlen -= 2;
735                 
736                 while ( comblen > 0 ) {
737                         *out = comb[COMBBUFLEN-comblen];
738                         out++;
739                         *outlen -= 2;
740                         comblen--;
741                 }
742                 
743                 i += 2;
744                 in++;
745         }
746         
747         *out = 0;
748         return o_len-*outlen;
749 }
750
751 /*******************************************************************
752 length of UTF-8 character and string
753 ********************************************************************/
754
/*
 * Length in bytes (1..4) of the well-formed UTF-8 sequence that starts
 * at utf8, or (size_t)-1 when the bytes there are not well-formed UTF-8.
 *
 * Rejected: stray continuation bytes, overlong forms (C0/C1 leads,
 * E0 80..9F, F0 80..8F), UTF-8-encoded surrogates (ED A0..BF), values
 * above U+10FFFF (F4 90.., F5..FF leads) and truncated sequences.
 * Bytes are examined in order and checking stops at the first bad one,
 * so a NUL terminator inside a sequence is never read past.
 */
size_t utf8_charlen ( char* utf8 )
{
        const unsigned char *p = (const unsigned char *) utf8;
        unsigned char lead = p[0];
        unsigned char lo = 0x80;        /* allowed range of the 2nd byte */
        unsigned char hi = 0xBF;
        size_t need;
        size_t k;

        if (lead < 0x80)
                return (1);

        if (lead >= 0xC2 && lead <= 0xDF) {
                need = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
                need = 3;
                if (lead == 0xE0)
                        lo = 0xA0;      /* no overlong 3-byte forms */
                else if (lead == 0xED)
                        hi = 0x9F;      /* no surrogates U+D800..U+DFFF */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
                need = 4;
                if (lead == 0xF0)
                        lo = 0x90;      /* no overlong 4-byte forms */
                else if (lead == 0xF4)
                        hi = 0x8F;      /* nothing above U+10FFFF */
        } else {
                return ((size_t) -1);
        }

        if (p[1] < lo || p[1] > hi)
                return ((size_t) -1);

        for (k = 2; k < need; k++) {
                if (p[k] < 0x80 || p[k] > 0xBF)
                        return ((size_t) -1);
        }

        return (need);
}
778
779
/*
 * Count the characters (code points) of a NUL-terminated UTF-8 string
 * while validating it.  Returns the count, or (size_t)-1 as soon as an
 * ill-formed sequence is met.
 *
 * Rejected: stray continuation bytes, overlong forms (C0/C1 leads,
 * E0 80..9F, F0 80..8F), UTF-8-encoded surrogates (ED A0..BF), values
 * above U+10FFFF (F4 90.., F5..FF leads) and truncated sequences.
 * See http://www.unicode.org/unicode/reports/tr27/ for an explanation.
 */
size_t utf8_strlen_validate ( char * utf8 )
{
        const unsigned char *p = (const unsigned char *) utf8;
        size_t count = 0;

        while ( *p != '\0')
        {
                unsigned char lead = *p;
                unsigned char lo = 0x80;        /* allowed range of the 2nd byte */
                unsigned char hi = 0xBF;
                size_t need;
                size_t k;

                if (lead < 0x80) {
                        need = 1;
                } else if (lead >= 0xC2 && lead <= 0xDF) {
                        need = 2;
                } else if (lead >= 0xE0 && lead <= 0xEF) {
                        need = 3;
                        if (lead == 0xE0)
                                lo = 0xA0;      /* no overlong 3-byte forms */
                        else if (lead == 0xED)
                                hi = 0x9F;      /* no surrogates U+D800..U+DFFF */
                } else if (lead >= 0xF0 && lead <= 0xF4) {
                        need = 4;
                        if (lead == 0xF0)
                                lo = 0x90;      /* no overlong 4-byte forms */
                        else if (lead == 0xF4)
                                hi = 0x8F;      /* nothing above U+10FFFF */
                } else {
                        return ((size_t) -1);
                }

                if (need > 1) {
                        /* bytes are checked in order, so a NUL inside the
                           sequence fails before anything past it is read */
                        if (p[1] < lo || p[1] > hi)
                                return ((size_t) -1);
                        for (k = 2; k < need; k++) {
                                if (p[k] < 0x80 || p[k] > 0xBF)
                                        return ((size_t) -1);
                        }
                }

                p += need;
                count++;
        }

        return (count);
}