libatalk/unicode/utf8.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002,2003
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software
  19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21    From samba 3.0 beta and GNU libiconv-1.8
   Unfortunately, most of the time we can't use the libc iconv service:
   - it doesn't round-trip for most encodings
   - it doesn't know about the Apple extensions
  25 */
  26
  27 #ifdef HAVE_CONFIG_H
  28 #include "config.h"
  29 #endif /* HAVE_CONFIG_H */
  30 #include <stdlib.h>
  31 #include <errno.h>
  32 #include <arpa/inet.h>
  33
  34 #include <atalk/unicode.h>
  35 #include <atalk/logger.h>
  36 #include <atalk/unicode.h>
  37 #include <atalk/byteorder.h>
  38
  39 /* Given a trailing UTF-8 byte, get the contribution from it to
  40  * the Unicode scalar value for a particular bit shift amount
  41  */
  42 #define GETUCVAL(utf8_trailbyte,shift)  ((unsigned int) (( utf8_trailbyte & 0x3F) << shift))
  43
  44 /* Given a unicode scalar, get a trail UTF-8 byte for a particular bit shift amount */
  45 #define GETUTF8TRAILBYTE(uc,shift)      ((char)( 0x80 | ((uc >> shift) & 0x3F) ) )
  46
  47
  48
  49 static size_t   utf8_pull(void *,char **, size_t *, char **, size_t *);
  50 static size_t   utf8_push(void *,char **, size_t *, char **, size_t *);
  51
  52 struct charset_functions charset_utf8 =
  53 {
  54         "UTF8",
  55         0x08000103,
  56         utf8_pull,
  57         utf8_push,
  58         CHARSET_VOLUME | CHARSET_MULTIBYTE | CHARSET_PRECOMPOSED,
  59         NULL,
  60         NULL, NULL
  61 };
  62
  63 struct charset_functions charset_utf8_mac =
  64 {
  65         "UTF8-MAC",
  66         0x08000103,
  67         utf8_pull,
  68         utf8_push,
  69         CHARSET_VOLUME | CHARSET_CLIENT | CHARSET_MULTIBYTE | CHARSET_DECOMPOSED,
  70         NULL,
  71         NULL, NULL
  72 };
  73
  74 /* The Unicode Standard Version 6.2 – Core Specification          */
  75 /* http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf          */
  76 /*                                                                */
  77 /* Scalar Value               First    Second   Third    Fourth   */
  78 /* 00000000 0xxxxxxx          0xxxxxxx                            */
  79 /* 00000yyy yyxxxxxx          110yyyyy 10xxxxxx                   */
  80 /* zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx          */
  81 /* 000uuuuu zzzzyyyy yyxxxxxx 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx */
  82
  83
  84 /* ------------------- Convert from UTF-8 to UTF-16 -------------------*/
  85
  86 /* Code Points        First   Second  Third    Fourth  */
  87 /* U+0000..U+007F      00..7F                          */
  88 /* U+0080..U+07FF      C2..DF  80..BF                  */
  89 /* U+0800..U+0FFF      E0      A0..BF  80..BF          */
  90 /* U+1000..U+CFFF      E1..EC  80..BF  80..BF          */
  91 /* U+D000..U+D7FF      ED      80..9F  80..BF          */
  92 /* U+E000..U+FFFF      EE..EF  80..BF  80..BF          */
  93 /* U+10000..U+3FFFF    F0      90..BF  80..BF  80..BF  */
  94 /* U+40000..U+FFFFF    F1..F3  80..BF  80..BF  80..BF  */
  95 /* U+100000..U+10FFFF  F4      80..8F  80..BF  80..BF  */
  96
  97 static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft,
  98                          char **outbuf, size_t *outbytesleft)
  99 {
 100         ucs2_t uc = 0;
 101         unsigned int codepoint;
 102         int len;
 103
 104         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 105                 unsigned char *c = (unsigned char *)*inbuf;
 106                 len = 1;
 107
 108                 /* Arrange conditionals in the order of most frequent occurrence
 109                  * for users of Latin-based chars */
 110                 if ((c[0] & 0x80) == 0) {                                /* 0xxx xxxx */ /* 1 byte  */
 111                         uc = c[0];
 112                 } else if ((c[0] & 0xe0) == 0xc0) {                             /* 110y yyyy */ /* 2 bytes */
 113                         if (*inbytesleft < 2) goto inval;
 114                         if (c[0] < 0xc2) goto ilseq;                                /* C2-DF */
 115                         if ((c[1] & 0xc0) != 0x80) goto ilseq;                      /* 80-BF */
 116                         uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ;
 117                         len = 2;
 118                 } else if ((c[0] & 0xf0) == 0xe0) {                                  /* 1110 zzzz */ /* 3 bytes */
 119                         if (*inbytesleft < 3) goto inval;
 120                         if (!((c[0] == 0xe0                && (c[1] & 0xe0) == 0xa0)  || /* E0    A0-BF*/
 121                                   (0xe1 <= c[0] && c[0] <= 0xec && (c[1] & 0xc0) == 0x80) || /* E1-EC 80-BF */
 122                                   (c[0] == 0xed                 && (c[1] & 0xe0) == 0x80) || /* ED    80-9F */
 123                                   ((c[0] & 0xfe) == 0xee        && (c[1] & 0xc0) == 0x80)))  /* EE-EF 80-BF */
 124                                 goto ilseq;
 125                         if ((c[2] & 0xc0) != 0x80) goto ilseq;                           /* 80-BF */
 126                         uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ;
 127                         len = 3;
 128                 } else if ((c[0] & 0xf8) == 0xf0) {                                         /* 1111 0uuu */ /* 4 bytes */
 129                         if (*inbytesleft < 4) goto inval;
 130                         if (*outbytesleft < 4) goto toobig;
 131                         if (c[0] > 0xf4) goto ilseq;                                            /* happens for surrogate pairs only */
 132                         if (!((c[0] == 0xf0                 && 0x90 <= c[1] && c[1] <= 0xbf) || /* F0    90-BF */
 133                                   (0xf1 <= c[0] && c[0] <= 0xf3 && (c[1] & 0xc0) == 0x80) ||        /* F1-F3 80-BF */
 134                                   (c[0] == 0xf4                 && (c[1] & 0xc0) == 0x80)))         /* F4    80-8F */
 135                                 goto ilseq;
 136                         if ((c[2] & 0xc0) != 0x80) goto ilseq;                                  /* 80-BF */
 137                         if ((c[3] & 0xc0) != 0x80) goto ilseq;                                  /* 80-BF */
 138                         codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) |
 139                                 GETUCVAL(c[2],6) |  GETUCVAL(c[3],0);
 140                         SSVAL(*outbuf,0,(((codepoint - 0x10000) >> 10) + 0xD800)); /* hi  */
 141                         SSVAL(*outbuf,2,(0xDC00 + (codepoint & 0x03FF)));          /* low */
 142                         len = 4;
 143                         (*inbuf)  += 4;
 144                         (*inbytesleft)  -= 4;
 145                         (*outbytesleft) -= 4;
 146                         (*outbuf) += 4;
 147                         continue;
 148                 } else {
 149                         goto ilseq;
 150                 }
 151
 152                 SSVAL(*outbuf,0,uc);
 153                 (*inbuf)  += len;
 154                 (*inbytesleft)  -= len;
 155                 (*outbytesleft) -= 2;
 156                 (*outbuf) += 2;
 157         }
 158
 159         if (*inbytesleft > 0) {
 160                 goto toobig;
 161         }
 162
 163         return 0;
 164
 165 toobig:
 166         LOG(log_debug, logtype_default, "short ucs-2 write");
 167         errno = E2BIG;
 168         return -1;
 169
 170 ilseq:
 171         LOG(log_debug, logtype_default, "malformed utf8 sequence");
 172         errno = EILSEQ;
 173         return -1;
 174
 175 inval:
 176         LOG(log_debug, logtype_default, "short utf8 char");
 177         errno = EINVAL;
 178         return -1;
 179 }
 180
 181 /* --------------------- Convert from UTF-16 to UTF-8 -----------*/
 182 static size_t utf8_push(void *cd _U_, char **inbuf, size_t *inbytesleft,
 183                          char **outbuf, size_t *outbytesleft)
 184 {
 185         ucs2_t uc=0;
 186         ucs2_t hi, low;
 187         unsigned int codepoint;
 188         int olen, ilen;
 189
 190         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 191                 unsigned char *c = (unsigned char *)*outbuf;
 192                 uc = SVAL((*inbuf),0);
 193                 olen=1;
 194                 ilen=2;
 195
 196                 /* Arrange conditionals in the order of most frequent occurrence for
 197                    users of Latin-based chars */
 198                 if (uc < 0x80) {
 199                         c[0] = uc;
 200                 } else if (uc < 0x800) {
 201                         if (*outbytesleft < 2) {
 202                                 LOG(log_debug, logtype_default, "short utf8 write");
 203                                 goto toobig;
 204                         }
 205                         c[1] = GETUTF8TRAILBYTE(uc, 0);
 206                         c[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
 207                         olen = 2;
 208                 }
 209                 else if ( uc >= 0x202a && uc <= 0x202e ) {
 210                         /* ignore bidi hint characters */
 211                         olen = 0;
 212                 }
 213                 /*
 214                  * A 2-byte uc value represents a stand-alone Unicode character if
 215                  *     0 <= uc < 0xd800 or 0xdfff < uc <= 0xffff.
 216                  * If  0xd800 <= uc <= 0xdfff, uc itself does not represent a Unicode character.
 217                  * Rather, it is just part of a surrogate pair.  A surrogate pair consists of
 218                  * a high surrogate in the range [0xd800 ... 0xdbff] and a low surrogate in the
 219                  * range [0xdc00 ... 0xdfff].  Together the pair maps to a single Unicode character
 220                  * whose scalar value is 64K or larger.  It is this scalar value that is transformed
 221                  * to UTF-8, not the individual surrogates.
 222                  *
 223                  * See www.unicode.org/faq/utf_bom.html for more info.
 224                  */
 225
 226                 else if ( 0xd800 <= uc && uc <= 0xdfff) {
 227                         /* surrogate - needs 4 bytes from input and 4 bytes for output to UTF-8 */
 228                         if (*outbytesleft < 4) {
 229                                 LOG(log_debug, logtype_default, "short utf8 write");
 230                                 goto toobig;
 231                         }
 232                         if (*inbytesleft < 4) {
 233                                 errno = EINVAL;
 234                                 return -1;
 235                         }
 236                         hi =  SVAL((*inbuf),0);
 237                         low = SVAL((*inbuf),2);
 238                         if ( 0xd800 <= hi && hi <= 0xdbff && 0xdc00 <= low && low <= 0xdfff) {
 239                                 codepoint = ((hi - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 240                                 c[3] = GETUTF8TRAILBYTE(codepoint, 0);
 241                                 c[2] = GETUTF8TRAILBYTE(codepoint, 6);
 242                                 c[1] = GETUTF8TRAILBYTE(codepoint, 12);
 243                                 c[0] = (char)(0xf0 | ((codepoint >> 18) & 0x07));
 244                                 ilen = olen = 4;
 245                         } else { /* invalid values for surrogate */
 246                                 errno = EINVAL;
 247                                 return -1;
 248                         }
 249                 } else {
 250                         if (*outbytesleft < 3) {
 251                                 LOG(log_debug, logtype_default, "short utf8 write");
 252                                 goto toobig;
 253                         }
 254                         c[2] = GETUTF8TRAILBYTE(uc, 0);
 255                         c[1] = GETUTF8TRAILBYTE(uc, 6);
 256                         c[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
 257                         olen = 3;
 258                 }
 259
 260                 (*inbytesleft)  -= ilen;
 261                 (*outbytesleft) -= olen;
 262                 (*inbuf)  += ilen;
 263                 (*outbuf) += olen;
 264         }
 265
 266         if (*inbytesleft == 1) {
 267                 errno = EINVAL;
 268                 return -1;
 269         }
 270
 271         if (*inbytesleft > 1) {
 272                 errno = E2BIG;
 273                 return -1;
 274         }
 275
 276         return 0;
 277
 278 toobig:
 279         errno = E2BIG;
 280         return -1;
 281 }