2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002,2003
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 From samba 3.0 beta and GNU libiconv-1.8
22 Unfortunately, most of the time we can't use the libc iconv service:
23 - it doesn't round-trip for most encodings
24 - it doesn't know about the Apple extensions
29 #endif /* HAVE_CONFIG_H */
#include <errno.h>
#include <arpa/inet.h>

#include <atalk/unicode.h>
#include <atalk/logger.h>
#include <atalk/unicode.h>
#include <atalk/byteorder.h>
/* Given a trailing UTF-8 byte, get the contribution from it to
 * the Unicode scalar value for a particular bit shift amount.
 *
 * Only the low six payload bits of the trail byte are used. Both arguments
 * are parenthesized so that compound expressions (e.g. `a | b`, or a
 * conditional shift) expand with the intended precedence.
 */
#define GETUCVAL(utf8_trailbyte,shift) ((unsigned int) (((utf8_trailbyte) & 0x3F) << (shift)))
/* Given a unicode scalar, get a trail UTF-8 byte for a particular bit shift amount.
 *
 * The result is 10xxxxxx, where xxxxxx are the six bits of `uc` starting at
 * bit `shift`. Both arguments are parenthesized so that compound expressions
 * expand with the intended precedence.
 */
#define GETUTF8TRAILBYTE(uc,shift) ((char)( 0x80 | (((uc) >> (shift)) & 0x3F) ) )
/* Forward declarations of the two converters defined later in this file:
 * utf8_pull converts UTF-8 to UTF-16, utf8_push converts UTF-16 to UTF-8. */
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
52 struct charset_functions charset_utf8 =
58 CHARSET_VOLUME | CHARSET_MULTIBYTE | CHARSET_PRECOMPOSED,
63 struct charset_functions charset_utf8_mac =
69 CHARSET_VOLUME | CHARSET_CLIENT | CHARSET_MULTIBYTE | CHARSET_DECOMPOSED,
74 /* The Unicode Standard Version 6.2 – Core Specification */
75 /* http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf */
77 /* Scalar Value First Second Third Fourth */
78 /* 00000000 0xxxxxxx 0xxxxxxx */
79 /* 00000yyy yyxxxxxx 110yyyyy 10xxxxxx */
80 /* zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx */
81 /* 000uuuuu zzzzyyyy yyxxxxxx 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx */
84 /* ------------------- Convert from UTF-8 to UTF-16 -------------------*/
86 /* Code Points First Second Third Fourth */
87 /* U+0000..U+007F 00..7F */
88 /* U+0080..U+07FF C2..DF 80..BF */
89 /* U+0800..U+0FFF E0 A0..BF 80..BF */
90 /* U+1000..U+CFFF E1..EC 80..BF 80..BF */
91 /* U+D000..U+D7FF ED 80..9F 80..BF */
92 /* U+E000..U+FFFF EE..EF 80..BF 80..BF */
93 /* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF */
94 /* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF */
95 /* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF */
97 static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft,
98 char **outbuf, size_t *outbytesleft)
101 unsigned int codepoint;
104 while (*inbytesleft >= 1 && *outbytesleft >= 2) {
105 unsigned char *c = (unsigned char *)*inbuf;
108 /* Arrange conditionals in the order of most frequent occurrence
109 * for users of Latin-based chars */
110 if ((c[0] & 0x80) == 0) { /* 0xxx xxxx */ /* 1 byte */
112 } else if ((c[0] & 0xe0) == 0xc0) { /* 110y yyyy */ /* 2 bytes */
113 if (*inbytesleft < 2) goto inval;
114 if (c[0] < 0xc2) goto ilseq; /* C2-DF */
115 if ((c[1] & 0xc0) != 0x80) goto ilseq; /* 80-BF */
116 uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ;
118 } else if ((c[0] & 0xf0) == 0xe0) { /* 1110 zzzz */ /* 3 bytes */
119 if (*inbytesleft < 3) goto inval;
120 if (!((c[0] == 0xe0 && (c[1] & 0xe0) == 0xa0) || /* E0 A0-BF*/
121 (0xe1 <= c[0] && c[0] <= 0xec && (c[1] & 0xc0) == 0x80) || /* E1-EC 80-BF */
122 (c[0] == 0xed && (c[1] & 0xe0) == 0x80) || /* ED 80-9F */
123 ((c[0] & 0xfe) == 0xee && (c[1] & 0xc0) == 0x80))) /* EE-EF 80-BF */
125 if ((c[2] & 0xc0) != 0x80) goto ilseq; /* 80-BF */
126 uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ;
128 } else if ((c[0] & 0xf8) == 0xf0) { /* 1111 0uuu */ /* 4 bytes */
129 if (*inbytesleft < 4) goto inval;
130 if (*outbytesleft < 4) goto toobig;
131 if (c[0] > 0xf4) goto ilseq; /* happens for surrogate pairs only */
132 if (!((c[0] == 0xf0 && 0x90 <= c[1] && c[1] <= 0xbf) || /* F0 90-BF */
133 (0xf1 <= c[0] && c[0] <= 0xf3 && (c[1] & 0xc0) == 0x80) || /* F1-F3 80-BF */
134 (c[0] == 0xf4 && (c[1] & 0xc0) == 0x80))) /* F4 80-8F */
136 if ((c[2] & 0xc0) != 0x80) goto ilseq; /* 80-BF */
137 if ((c[3] & 0xc0) != 0x80) goto ilseq; /* 80-BF */
138 codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) |
139 GETUCVAL(c[2],6) | GETUCVAL(c[3],0);
140 SSVAL(*outbuf,0,(((codepoint - 0x10000) >> 10) + 0xD800)); /* hi */
141 SSVAL(*outbuf,2,(0xDC00 + (codepoint & 0x03FF))); /* low */
145 (*outbytesleft) -= 4;
154 (*inbytesleft) -= len;
155 (*outbytesleft) -= 2;
159 if (*inbytesleft > 0) {
166 LOG(log_debug, logtype_default, "short ucs-2 write");
171 LOG(log_debug, logtype_default, "malformed utf8 sequence");
176 LOG(log_debug, logtype_default, "short utf8 char");
181 /* --------------------- Convert from UTF-16 to UTF-8 -----------*/
182 static size_t utf8_push(void *cd _U_, char **inbuf, size_t *inbytesleft,
183 char **outbuf, size_t *outbytesleft)
187 unsigned int codepoint;
190 while (*inbytesleft >= 2 && *outbytesleft >= 1) {
191 unsigned char *c = (unsigned char *)*outbuf;
192 uc = SVAL((*inbuf),0);
196 /* Arrange conditionals in the order of most frequent occurrence for
197 users of Latin-based chars */
200 } else if (uc < 0x800) {
201 if (*outbytesleft < 2) {
202 LOG(log_debug, logtype_default, "short utf8 write");
205 c[1] = GETUTF8TRAILBYTE(uc, 0);
206 c[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
209 else if ( uc >= 0x202a && uc <= 0x202e ) {
210 /* ignore bidi hint characters */
214 * A 2-byte uc value represents a stand-alone Unicode character if
215 * 0 <= uc < 0xd800 or 0xdfff < uc <= 0xffff.
216 * If 0xd800 <= uc <= 0xdfff, uc itself does not represent a Unicode character.
217 * Rather, it is just part of a surrogate pair. A surrogate pair consists of
218 * a high surrogate in the range [0xd800 ... 0xdbff] and a low surrogate in the
219 * range [0xdc00 ... 0xdfff]. Together the pair maps to a single Unicode character
220 * whose scalar value is 64K or larger. It is this scalar value that is transformed
221 * to UTF-8, not the individual surrogates.
223 * See www.unicode.org/faq/utf_bom.html for more info.
226 else if ( 0xd800 <= uc && uc <= 0xdfff) {
227 /* surrogate - needs 4 bytes from input and 4 bytes for output to UTF-8 */
228 if (*outbytesleft < 4) {
229 LOG(log_debug, logtype_default, "short utf8 write");
232 if (*inbytesleft < 4) {
236 hi = SVAL((*inbuf),0);
237 low = SVAL((*inbuf),2);
238 if ( 0xd800 <= hi && hi <= 0xdbff && 0xdc00 <= low && low <= 0xdfff) {
239 codepoint = ((hi - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
240 c[3] = GETUTF8TRAILBYTE(codepoint, 0);
241 c[2] = GETUTF8TRAILBYTE(codepoint, 6);
242 c[1] = GETUTF8TRAILBYTE(codepoint, 12);
243 c[0] = (char)(0xf0 | ((codepoint >> 18) & 0x07));
245 } else { /* invalid values for surrogate */
250 if (*outbytesleft < 3) {
251 LOG(log_debug, logtype_default, "short utf8 write");
254 c[2] = GETUTF8TRAILBYTE(uc, 0);
255 c[1] = GETUTF8TRAILBYTE(uc, 6);
256 c[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
260 (*inbytesleft) -= ilen;
261 (*outbytesleft) -= olen;
266 if (*inbytesleft == 1) {
271 if (*inbytesleft > 1) {