libatalk/unicode/util_unistr.c

   1 /*******************************************************************
   2   NOTE:
   3   The early netatalk 2.x was based on UCS-2.
   4   UCS-2 doesn't support characters at or above U+10000.
   5   Recent netatalk is based on UTF-16.
   6   UTF-16 can support chars above U+10000, using Surrogate Pair.
   7   However, Surrogate Pair is complex, dirty, filthy and disagreeable.
   8   There might still be latent bugs...
   9 ********************************************************************/
  10
  11 #ifdef HAVE_CONFIG_H
  12 #include "config.h"
  13 #endif /* HAVE_CONFIG_H */
  14
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17 #include <string.h>
  18 #include <sys/param.h>
  19 #include <sys/stat.h>
  20 #include <atalk/logger.h>
  21 #include <errno.h>
  22
  23 #include <netatalk/endian.h>
  24
  25 #include <atalk/unicode.h>
  26 #include "precompose.h"
  27 #include "byteorder.h"
  28
  29 /*******************************************************************
  30  Convert a string to lower case.
  31  return True if any char is converted
  32 ********************************************************************/
  33 /* surrogate pair support */
  34
  35 int strlower_w(ucs2_t *s)
  36 {
  37         int ret = 0;
  38
  39         while (*s) {
  40                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
  41                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
  42                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
  43                                 u_int32_t v_sp = tolower_sp(s_sp);
  44                                 if (v_sp != s_sp) {
  45                                         *s = v_sp >> 16;
  46                                         s++;
  47                                         *s = v_sp & 0xFFFF;
  48                                         ret = 1;
  49                                 }
  50                         }
  51                 } else {
  52                         ucs2_t v = tolower_w(*s);
  53                         if (v != *s) {
  54                                 *s = v;
  55                                 ret = 1;
  56                         }
  57                 }
  58                 s++;
  59         }
  60         return ret;
  61 }
  62
  63 /*******************************************************************
  64  Convert a string to upper case.
  65  return True if any char is converted
  66 ********************************************************************/
  67 /* surrogate pair support */
  68
  69 int strupper_w(ucs2_t *s)
  70 {
  71         int ret = 0;
  72
  73         while (*s) {
  74                 if ((0xD800 <= *s) && (*s < 0xDC00)) {
  75                         if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
  76                                 u_int32_t s_sp = (u_int32_t)*s << 16 | (u_int32_t)s[1];
  77                                 u_int32_t v_sp = toupper_sp(s_sp);
  78                                 if (v_sp != s_sp) {
  79                                         *s = v_sp >> 16;
  80                                         s++;
  81                                         *s = v_sp & 0xFFFF;
  82                                         ret = 1;
  83                                 }
  84                         }
  85                 } else {
  86                         ucs2_t v = toupper_w(*s);
  87                         if (v != *s) {
  88                                 *s = v;
  89                                 ret = 1;
  90                         }
  91                 }
  92                 s++;
  93         }
  94         return ret;
  95 }
  96
  97 /*******************************************************************
  98 wide & sp islower()
  99 determine if a character is lowercase
 100 ********************************************************************/
 101 /* These functions are not used. */
 102
 103 int islower_w(ucs2_t c)
 104 {
 105         return ( c == tolower_w(c));
 106 }
 107
 108 int islower_sp(u_int32_t c_sp)
 109 {
 110         return ( c_sp == tolower_sp(c_sp));
 111 }
 112
 113 /*******************************************************************
 114 wide & sp isupper()
 115 determine if a character is uppercase
 116 ********************************************************************/
 117 /* These functions are not used. */
 118
 119 int isupper_w(ucs2_t c)
 120 {
 121         return ( c == toupper_w(c));
 122 }
 123
 124 int isupper_sp(u_int32_t c_sp)
 125 {
 126         return ( c_sp == toupper_sp(c_sp));
 127 }
 128
 129 /*******************************************************************
 130 wide strlen()
 131  Count the number of characters in a UTF-16 string.
 132 ********************************************************************/
 133 /* NOTE: one surrogate pair is two characters. */
 134
 135 size_t strlen_w(const ucs2_t *src)
 136 {
 137         size_t len;
 138
 139         for(len = 0; *src++; len++) ;
 140
 141         return len;
 142 }
 143
 144 /*******************************************************************
 145 wide strnlen()
 146  Count up to max number of characters in a UTF-16 string.
 147 ********************************************************************/
 148 /* NOTE: one surrogate pair is two characters. */
 149
 150 size_t strnlen_w(const ucs2_t *src, size_t max)
 151 {
 152         size_t len;
 153
 154         for(len = 0; *src++ && (len < max); len++) ;
 155
 156         return len;
 157 }
 158
 159 /*******************************************************************
 160 wide strchr()
 161 ********************************************************************/
 162 /* NOTE: hi and lo of surrogate pair are separately processed. */
 163
 164 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
 165 {
 166         while (*s != 0) {
 167                 if (c == *s) return (ucs2_t *)s;
 168                 s++;
 169         }
 170         if (c == *s) return (ucs2_t *)s;
 171
 172         return NULL;
 173 }
 174
 175 /*******************************************************************
 176 wide & sp strcasechr()
 177 ********************************************************************/
 178 /* NOTE: separately process BMP and surrogate pair */
 179
 180 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
 181 {
 182         while (*s != 0) {
 183 /*              LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
 184                 if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
 185                 s++;
 186         }
 187         if (c == *s) return (ucs2_t *)s;
 188
 189         return NULL;
 190 }
 191
 192 ucs2_t *strcasechr_sp(const ucs2_t *s, u_int32_t c_sp)
 193 {
 194         if (*s == 0) return NULL;
 195         while (s[1] != 0) {
 196                 if (toupper_sp(c_sp) == toupper_sp((u_int32_t)*s << 16 | (u_int32_t)s[1])) return (ucs2_t *)s;
 197                 s++;
 198         }
 199
 200         return NULL;
 201 }
 202
 203 /*******************************************************************
 204 wide strcmp()
 205 ********************************************************************/
 206 /* no problem of surrogate pair */
 207
 208 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
 209 {
 210         while (*b && *a == *b) { a++; b++; }
 211         return (*a - *b);
 212         /* warning: if *a != *b and both are not 0 we retrun a random
 213            greater or lesser than 0 number not realted to which
 214            string is longer */
 215 }
 216
 217 /*******************************************************************
 218 wide strncmp()
 219 ********************************************************************/
 220 /* no problem of surrogate pair */
 221
 222 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
 223 {
 224         size_t n = 0;
 225         while ((n < len) && *b && *a == *b) { a++; b++; n++;}
 226         return (len - n)?(*a - *b):0;
 227 }
 228
 229 /*******************************************************************
 230 wide strstr()
 231 ********************************************************************/
 232 /* no problem of surrogate pair */
 233
 234 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
 235 {
 236         ucs2_t *r;
 237         size_t slen, inslen;
 238
 239         if (!s || !*s || !ins || !*ins) return NULL;
 240         slen = strlen_w(s);
 241         inslen = strlen_w(ins);
 242         r = (ucs2_t *)s;
 243         while ((r = strchr_w(r, *ins))) {
 244                 if (strncmp_w(r, ins, inslen) == 0) return r;
 245                 r++;
 246         }
 247         return NULL;
 248 }
 249
 250 /*******************************************************************
 251 wide strcasestr()
 252 ********************************************************************/
 253 /* */
 254
 255 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
 256 {
 257         ucs2_t *r;
 258         size_t slen, inslen;
 259
 260         if (!s || !*s || !ins || !*ins) return NULL;
 261         slen = strlen_w(s);
 262         inslen = strlen_w(ins);
 263         r = (ucs2_t *)s;
 264         while ((r = strcasechr_w(r, *ins))) {
 265                 if (strncasecmp_w(r, ins, inslen) == 0) return r;
 266                 r++;
 267         }
 268         return NULL;
 269 }
 270
 271 /*******************************************************************
 272 wide strcasecmp()
 273 case insensitive string comparison
 274 ********************************************************************/
 275 /* surrogate pair support */
 276
 277 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
 278 {
 279         int ret;
 280
 281         while (*a && *b) {
 282                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
 283                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
 284                         a++;
 285                         b++;
 286                         if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
 287                 } else {
 288                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
 289                 }
 290                 a++;
 291                 b++;
 292         }
 293         return (tolower_w(*a) - tolower_w(*b));
 294 }
 295
 296 /*******************************************************************
 297 wide strncasecmp()
 298 case insensitive string comparison, length limited
 299 ********************************************************************/
  300 /* NOTE: if 'len' falls between the two halves of a surrogate pair, the whole pair is compared (up to 'len+1' units) */
 301
 302 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
 303 {
 304         size_t n = 0;
 305         int ret;
 306
 307         while ((n < len) && *a && *b) {
 308                 if ((0xD800 <= *a) && (*a < 0xDC00)) {
 309                         if (ret = tolower_sp((u_int32_t)*a << 16 | (u_int32_t)a[1]) - tolower_sp((u_int32_t)*b << 16 | (u_int32_t)b[1])) return ret;
 310                         a++;
 311                         b++;
 312                         n++;
 313                         if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
 314                 } else {
 315                         if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
 316                 }
 317                 a++;
 318                 b++;
 319                 n++;
 320         }
 321         return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
 322 }
 323
 324 /*******************************************************************
 325 wide strndup()
 326 duplicate string
 327 ********************************************************************/
 328 /* NOTE: not check isolation of surrogate pair */
 329 /* if len == 0 then duplicate the whole string */
 330
 331 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
 332 {
 333         ucs2_t *dest;
 334
 335         if (!len) len = strlen_w(src);
 336         dest = (ucs2_t *)malloc((len + 1) * sizeof(ucs2_t));
 337         if (!dest) {
 338                 LOG (log_error, logtype_default, "strdup_w: out of memory!");
 339                 return NULL;
 340         }
 341
 342         memcpy(dest, src, len * sizeof(ucs2_t));
 343         dest[len] = 0;
 344
 345         return dest;
 346 }
 347
 348 /*******************************************************************
 349 wide strdup()
 350 duplicate string
 351 ********************************************************************/
 352 /* no problem of surrogate pair */
 353
 354 ucs2_t *strdup_w(const ucs2_t *src)
 355 {
 356         return strndup_w(src, 0);
 357 }
 358
 359 /*******************************************************************
 360 copy a string with max len
 361 ********************************************************************/
 362
 363 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
 364 {
 365         size_t len;
 366
 367         if (!dest || !src) return NULL;
 368
 369         for (len = 0; (src[len] != 0) && (len < max); len++)
 370                 dest[len] = src[len];
 371         while (len < max)
 372                 dest[len++] = 0;
 373
 374         return dest;
 375 }
 376
 377
 378 /*******************************************************************
  379 append at most max characters of src and add a terminator
 380 ********************************************************************/
 381
 382 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
 383 {
 384         size_t start;
 385         size_t len;
 386
 387         if (!dest || !src) return NULL;
 388
 389         start = strlen_w(dest);
 390         len = strnlen_w(src, max);
 391
 392         memcpy(&dest[start], src, len*sizeof(ucs2_t));
 393         dest[start+len] = 0;
 394
 395         return dest;
 396 }
 397
 398
 399 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
 400 {
 401         size_t start;
 402         size_t len;
 403
 404         if (!dest || !src) return NULL;
 405
 406         start = strlen_w(dest);
 407         len = strlen_w(src);
 408
 409         memcpy(&dest[start], src, len*sizeof(ucs2_t));
 410         dest[start+len] = 0;
 411
 412         return dest;
 413 }
 414
 415
 416 /*******************************************************************
 417 binary search for pre|decomposition
 418 ********************************************************************/
 419
 420 static ucs2_t do_precomposition(unsigned int base, unsigned int comb)
 421 {
 422         int min = 0;
 423         int max = PRECOMP_COUNT - 1;
 424         int mid;
 425         u_int32_t sought = (base << 16) | comb, that;
 426
 427         /* binary search */
 428         while (max >= min) {
 429                 mid = (min + max) / 2;
 430                 that = (precompositions[mid].base << 16) | (precompositions[mid].comb);
 431                 if (that < sought) {
 432                         min = mid + 1;
 433                 } else if (that > sought) {
 434                         max = mid - 1;
 435                 } else {
 436                         return precompositions[mid].replacement;
 437                 }
 438         }
 439         /* no match */
 440         return 0;
 441 }
 442
 443 /* ------------------------ */
 444 static u_int32_t do_precomposition_sp(unsigned int base_sp, unsigned int comb_sp)
 445 {
 446         int min = 0;
 447         int max = PRECOMP_SP_COUNT - 1;
 448         int mid;
 449         u_int64_t sought_sp = ((u_int64_t)base_sp << 32) | (u_int64_t)comb_sp, that_sp;
 450
 451         /* binary search */
 452         while (max >= min) {
 453                 mid = (min + max) / 2;
 454                 that_sp = ((u_int64_t)precompositions_sp[mid].base_sp << 32) | ((u_int64_t)precompositions_sp[mid].comb_sp);
 455                 if (that_sp < sought_sp) {
 456                         min = mid + 1;
 457                 } else if (that_sp > sought_sp) {
 458                         max = mid - 1;
 459                 } else {
 460                         return precompositions_sp[mid].replacement_sp;
 461                 }
 462         }
 463         /* no match */
 464         return 0;
 465 }
 466
 467 /* -------------------------- */
 468 static u_int32_t do_decomposition(ucs2_t base)
 469 {
 470         int min = 0;
 471         int max = DECOMP_COUNT - 1;
 472         int mid;
 473         u_int32_t sought = base;
 474         u_int32_t result, that;
 475
 476         /* binary search */
 477         while (max >= min) {
 478                 mid = (min + max) / 2;
 479                 that = decompositions[mid].replacement;
 480                 if (that < sought) {
 481                         min = mid + 1;
 482                 } else if (that > sought) {
 483                         max = mid - 1;
 484                 } else {
 485                         result = (decompositions[mid].base << 16) | (decompositions[mid].comb);
 486                         return result;
 487                 }
 488         }
 489         /* no match */
 490         return 0;
 491 }
 492
 493 /* -------------------------- */
 494 static u_int64_t do_decomposition_sp(unsigned int base_sp)
 495 {
 496         int min = 0;
 497         int max = DECOMP_SP_COUNT - 1;
 498         int mid;
 499         u_int32_t sought_sp = base_sp;
 500         u_int32_t that_sp;
 501         u_int64_t result_sp;
 502
 503         /* binary search */
 504         while (max >= min) {
 505                 mid = (min + max) / 2;
 506                 that_sp = decompositions_sp[mid].replacement_sp;
 507                 if (that_sp < sought_sp) {
 508                         min = mid + 1;
 509                 } else if (that_sp > sought_sp) {
 510                         max = mid - 1;
 511                 } else {
 512                         result_sp = ((u_int64_t)decompositions_sp[mid].base_sp << 32) | ((u_int64_t)decompositions_sp[mid].comb_sp);
 513                         return result_sp;
 514                 }
 515         }
 516         /* no match */
 517         return 0;
 518 }
 519
 520 /*******************************************************************
 521 pre|decomposition
 522
 523    we can't use static, this stuff needs to be reentrant
 524    static char comp[MAXPATHLEN +1];
 525
 526    We don't implement Singleton and Canonical Ordering.
 527    We ignore CompositionExclusions.txt.
 528    because they cause the problem of the roundtrip
 529    such as Dancing Icon.
 530
 531    exclude U2000-U2FFF, UFE30-UFE4F and U2F800-U2FA1F ranges
 532    in precompose.h from composition according to AFP 3.x spec
 533 ********************************************************************/
 534
 535 size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 536 {
 537         size_t i;
 538         ucs2_t base, comb;
 539         u_int32_t base_sp, comb_sp;
 540         ucs2_t *in, *out;
 541         ucs2_t lindex, vindex;
 542         ucs2_t result;
 543         u_int32_t result_sp;
 544         size_t o_len = *outlen;
 545
 546         if (!inplen || (inplen & 1) || inplen > o_len)
 547                 return (size_t)-1;
 548
 549         i = 0;
 550         in  = name;
 551         out = comp;
 552
 553         base = *in;
 554         while (*outlen > 2) {
 555                 i += 2;
 556                 if (i == inplen) {
 557                         *out = base;
 558                         out++;
 559                         *out = 0;
 560                         *outlen -= 2;
 561                         return o_len - *outlen;
 562                 }
 563                 in++;
 564                 comb = *in;
 565                 result = 0;
 566
 567                 /* Non-Combination Character */
 568                 if (comb < 0x300) ;
 569
 570                 /* Unicode Standard Annex #15 A10.3 Hangul Composition */
 571                 /* Step 1 <L,V> */
 572                 else if ((VBASE <= comb) && (comb <= VBASE + VCOUNT)) {
 573                         if ((LBASE <= base) && (base < LBASE + LCOUNT)) {
 574                                 result = 1;
 575                                 lindex = base - LBASE;
 576                                 vindex = comb - VBASE;
 577                                 base = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
 578                         }
 579                 }
 580
 581                 /* Step 2 <LV,T> */
 582                 else if ((TBASE < comb) && (comb < TBASE + TCOUNT)) {
 583                         if ((SBASE <= base) && (base < SBASE + SCOUNT) && (((base - SBASE) % TCOUNT) == 0)) {
 584                                 result = 1;
 585                                 base += comb - TBASE;
 586                         }
 587                 }
 588
 589                 /* Binary Search for Surrogate Pair */
 590                 else if ((0xD800 <= base) && (base < 0xDC00)) {
 591                         if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
 592                                 base_sp = ((u_int32_t)base << 16) | (u_int32_t)comb;
 593                                 do {
 594                                         comb_sp = ((u_int32_t)in[1] << 16) | (u_int32_t)in[2];
 595                                         if (result_sp = do_precomposition_sp(base_sp, comb_sp)) {
 596                                                 base_sp = result_sp;
 597                                                 i += 4;
 598                                                 in +=2;
 599                                         }
 600                                 } while ((i + 6 <= inplen) && result_sp) ;
 601
 602                                 *out = base_sp >> 16;
 603                                 out++;
 604                                 *outlen -= 2;
 605
 606                                 if (*outlen <= 2) {
 607                                         errno = E2BIG;
 608                                         return (size_t)-1;
 609                                 }
 610
 611                                 *out = base_sp & 0xFFFF;
 612                                 out++;
 613                                 *outlen -= 2;
 614
 615                                 i += 2;
 616                                 if (i == inplen) {
 617                                         out++;
 618                                         *out = 0;
 619                                         return o_len - *outlen;
 620                                 }
 621                                 in++;
 622                                 base = *in;
 623
 624                                 result = 1;
 625                         }
 626                 }
 627
 628                 /* Binary Search for BMP */
 629                 else if (result = do_precomposition(base, comb)) {
 630                         base = result;
 631                 }
 632
 633                 if (!result) {
 634                         *out = base;
 635                         out++;
 636                         *outlen -= 2;
 637                         base = comb;
 638                 }
 639         }
 640
 641         errno = E2BIG;
 642         return (size_t)-1;
 643 }
 644
 645 /* --------------- */
 646 size_t decompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 647 {
 648         size_t i;
 649         size_t comblen;
 650         ucs2_t base, comb[COMBBUFLEN];
 651         u_int32_t base_sp;
 652         ucs2_t sindex, tjamo;
 653         ucs2_t *in, *out;
 654         unsigned int result;
 655         u_int64_t result_sp;
 656         size_t o_len = *outlen;
 657
 658         if (!inplen || (inplen & 1))
 659                 return (size_t)-1;
 660         i = 0;
 661         in  = name;
 662         out = comp;
 663
 664         while (i < inplen) {
 665                 base = *in;
 666                 comblen = 0;
 667
 668                 /* check ASCII first. this is frequent. */
 669                 if (base <= 0x007f) ;
 670
 671                 /* Unicode Standard Annex #15 A10.2 Hangul Decomposition */
 672                 else if ((SBASE <= base) && (base < SBASE + SCOUNT)) {
 673                         sindex = base - SBASE;
 674                         base = LBASE + sindex / NCOUNT;
 675                         comb[COMBBUFLEN-2] = VBASE + (sindex % NCOUNT) / TCOUNT;
 676
 677                         /* <L,V> */
 678                         if ((tjamo = TBASE + sindex % TCOUNT) == TBASE) {
 679                                 comb[COMBBUFLEN-1] = comb[COMBBUFLEN-2];
 680                                 comblen = 1;
 681                         }
 682
 683                         /* <L,V,T> */
 684                         else {
 685                                 comb[COMBBUFLEN-1] = tjamo;
 686                                 comblen = 2;
 687                         }
 688                 }
 689
 690                 /* Binary Search for Surrogate Pair */
 691                 else if ((0xD800 <= base) && (base < 0xDC00)) {
 692                         if (i + 2 < inplen) {
 693                                 base_sp =  ((u_int32_t)base << 16) | (u_int32_t)in[1];
 694                                 do {
 695                                         if ( !(result_sp = do_decomposition_sp(base_sp))) break;
 696                                         comblen += 2;
 697                                         base_sp = result_sp >> 32;
 698                                         comb[COMBBUFLEN-comblen] = (result_sp >> 16) & 0xFFFF;  /* hi */
 699                                         comb[COMBBUFLEN-comblen+1] = result_sp & 0xFFFF;        /* lo */
 700                                 } while (comblen < MAXCOMBSPLEN);
 701
 702                                 if (*outlen < (comblen + 1) << 1) {
 703                                         errno = E2BIG;
 704                                         return (size_t)-1;
 705                                 }
 706
 707                                 *out = base_sp >> 16;   /* hi */
 708                                 out++;
 709                                 *outlen -= 2;
 710
 711                                 base = base_sp & 0xFFFF; /* lo */
 712
 713                                 i += 2;
 714                                 in++;
 715                         }
 716                 }
 717
 718                 /* Binary Search for BMP */
 719                 else {
 720                         do {
 721                                 if ( !(result = do_decomposition(base))) break;
 722                                 comblen++;
 723                                 base = result  >> 16;
 724                                 comb[COMBBUFLEN-comblen] = result & 0xFFFF;
 725                         } while ((0x007f < base) && (comblen < MAXCOMBLEN));
 726                 }
 727
 728                 if (*outlen < (comblen + 1) << 1) {
 729                         errno = E2BIG;
 730                         return (size_t)-1;
 731                 }
 732
 733                 *out = base;
 734                 out++;
 735                 *outlen -= 2;
 736
 737                 while ( comblen > 0 ) {
 738                         *out = comb[COMBBUFLEN-comblen];
 739                         out++;
 740                         *outlen -= 2;
 741                         comblen--;
 742                 }
 743
 744                 i += 2;
 745                 in++;
 746         }
 747
 748         *out = 0;
 749         return o_len-*outlen;
 750 }
 751
 752 /*******************************************************************
 753 length of UTF-8 character and string
 754 ********************************************************************/
 755
 756 size_t utf8_charlen ( char* utf8 )
 757 {
 758         unsigned char *p;
 759
 760         p = (unsigned char*) utf8;
 761
 762         if ( *p < 0x80 )
 763                 return (1);
 764         else if ( *p > 0xC1 && *p < 0xe0 && *(p+1) > 0x7f && *(p+1) < 0xC0)
 765                 return (2);
 766         else if ( *p == 0xe0 && *(p+1) > 0x9f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
 767                 return (3);
 768         else if ( *p > 0xe0  && *p < 0xf0 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
 769                 return (3);
 770         else if ( *p == 0xf0 && *(p+1) > 0x8f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
 771                 return (4);
 772         else if ( *p > 0xf0 && *p < 0xf4 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
 773                 return (4);
 774         else if ( *p == 0xf4 && *(p+1) > 0x7f && *(p+1) < 0x90 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
 775                 return (4);
 776         else
 777                 return ((size_t) -1);
 778 }
 779
 780
 781 size_t utf8_strlen_validate ( char * utf8 )
 782 {
 783         size_t len;
 784         unsigned char *p;
 785
 786         p = (unsigned char*) utf8;
 787         len = 0;
 788
 789         /* see http://www.unicode.org/unicode/reports/tr27/ for an explanation */
 790
 791         while ( *p != '\0')
 792         {
 793                 if ( *p < 0x80 )
 794                         p++;
 795
 796                 else if ( *p > 0xC1 && *p < 0xe0 && *(p+1) > 0x7f && *(p+1) < 0xC0)
 797                         p += 2;
 798
 799                 else if ( *p == 0xe0 && *(p+1) > 0x9f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
 800                         p += 3;
 801
 802                 else if ( *p > 0xe0  && *p < 0xf0 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0)
 803                         p += 3;
 804
 805                 else if ( *p == 0xf0 && *(p+1) > 0x8f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
 806                         p += 4;
 807
 808                 else if ( *p > 0xf0 && *p < 0xf4 && *(p+1) > 0x7f && *(p+1) < 0xc0 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
 809                         p += 4;
 810
 811                 else if ( *p == 0xf4 && *(p+1) > 0x7f && *(p+1) < 0x90 && *(p+2) > 0x7f && *(p+2) < 0xc0 && *(p+3) > 0x7f && *(p+3) < 0xc0 )
 812                         p += 4;
 813
 814                 else
 815                         return ((size_t) -1);
 816
 817                 len++;
 818         }
 819
 820         return (len);
 821 }