]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/utf8.c
Move byteorder.h to include/atalk
[netatalk.git] / libatalk / unicode / utf8.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002,2003
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20    
21    From samba 3.0 beta and GNU libiconv-1.8
22    It's bad but most of the time we can't use libc iconv service:
23    - it doesn't round trip for most encoding
24    - it doesn't know about Apple extension
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif /* HAVE_CONFIG_H */
30 #include <stdlib.h>
31 #include <errno.h>
32 #include <arpa/inet.h>
33
34 #include <atalk/unicode.h>
35 #include <atalk/logger.h>
36 #include <atalk/unicode.h>
37 #include <atalk/byteorder.h>
38
/* Given a trailing UTF-8 byte, get the contribution from it to
 * the Unicode scalar value for a particular bit shift amount.
 *
 * The low six bits of utf8_trailbyte are kept and moved left by
 * shift bits.  Both arguments are parenthesised so that compound
 * expressions (e.g. "a | b") are masked/shifted as a whole.
 */
#define GETUCVAL(utf8_trailbyte,shift)  ((unsigned int) (((utf8_trailbyte) & 0x3F) << (shift)))
43
/* Given a unicode scalar, get a trail UTF-8 byte for a particular bit shift amount.
 *
 * Six bits of uc, starting at bit position shift, are combined with the
 * 10xxxxxx trail-byte marker.  Both arguments are parenthesised so that
 * compound expressions are shifted as a whole.
 */
#define GETUTF8TRAILBYTE(uc,shift)      ((char)( 0x80 | (((uc) >> (shift)) & 0x3F) ) )
46
47
48
/* Forward declarations: the two converters are referenced by the charset
 * descriptor tables before their definitions appear further down. */
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
51
52 struct charset_functions charset_utf8 =
53 {
54         "UTF8",
55         0x08000103,
56         utf8_pull,
57         utf8_push,
58         CHARSET_VOLUME | CHARSET_MULTIBYTE | CHARSET_PRECOMPOSED,
59         NULL,
60         NULL, NULL
61 };
62
63 struct charset_functions charset_utf8_mac =
64 {
65         "UTF8-MAC",
66         0x08000103,
67         utf8_pull,
68         utf8_push,
69         CHARSET_VOLUME | CHARSET_CLIENT | CHARSET_MULTIBYTE | CHARSET_DECOMPOSED,
70         NULL,
71         NULL, NULL
72 };
73
74 /* ------------------- Convert from UTF-8 to UTF-16 -------------------*/
75 static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft,
76                          char **outbuf, size_t *outbytesleft)
77 {
78         ucs2_t uc = 0;
79         unsigned int codepoint;
80         int len;
81
82         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
83                 unsigned char *c = (unsigned char *)*inbuf;
84                 len = 1;
85
86                 /* Arrange conditionals in the order of most frequent occurrence 
87                  * for users of Latin-based chars */
88                 if ((c[0] & 0x80) == 0) {
89                         uc = c[0];
90                 } else if ((c[0] & 0xe0) == 0xc0) {
91                         if (*inbytesleft < 2) {
92                                 LOG(log_debug, logtype_default, "short utf8 char");
93                                 goto badseq;
94                         }
95                         uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ;
96                         len = 2;
97                 } else if ((c[0] & 0xf0) == 0xe0) {
98                         if (*inbytesleft < 3) {
99                                 LOG(log_debug, logtype_default, "short utf8 char");
100                                 goto badseq;
101                         }
102                         uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ;
103                         len = 3;
104                 } else if ((c[0] & 0xf8) == 0xf0) {
105                         /* 4 bytes, which happens for surrogate pairs only */
106                         if (*inbytesleft < 4) {
107                                 LOG(log_debug, logtype_default, "short utf8 char");
108                                 goto badseq;
109                         }
110                         if (*outbytesleft < 4) {
111                                 LOG(log_debug, logtype_default, "short ucs-2 write");
112                                 errno = E2BIG;
113                                 return -1;
114                         }
115                         codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) |
116                                 GETUCVAL(c[2],6) |  GETUCVAL(c[3],0);
117                         SSVAL(*outbuf,0,(((codepoint - 0x10000) >> 10) + 0xD800)); /* hi  */
118                         SSVAL(*outbuf,2,(0xDC00 + (codepoint & 0x03FF)));          /* low */
119                         len = 4;
120                         (*inbuf)  += 4;
121                         (*inbytesleft)  -= 4;
122                         (*outbytesleft) -= 4;
123                         (*outbuf) += 4;
124                         continue;
125                 }
126                 else {
127                         errno = EINVAL;
128                         return -1;
129                 }
130
131                 SSVAL(*outbuf,0,uc);
132                 (*inbuf)  += len;
133                 (*inbytesleft)  -= len;
134                 (*outbytesleft) -= 2;
135                 (*outbuf) += 2;
136         }
137
138         if (*inbytesleft > 0) {
139                 errno = E2BIG;
140                 return -1;
141         }
142         
143         return 0;
144
145 badseq:
146         errno = EINVAL;
147         return -1;
148 }
149
150 /* --------------------- Convert from UTF-16 to UTF-8 -----------*/
151 static size_t utf8_push(void *cd _U_, char **inbuf, size_t *inbytesleft,
152                          char **outbuf, size_t *outbytesleft)
153 {
154         ucs2_t uc=0;
155         ucs2_t hi, low;
156         unsigned int codepoint;
157         int olen, ilen;
158
159         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
160                 unsigned char *c = (unsigned char *)*outbuf;
161                 uc = SVAL((*inbuf),0);
162                 olen=1;
163                 ilen=2;
164
165                 /* Arrange conditionals in the order of most frequent occurrence for
166                    users of Latin-based chars */
167                 if (uc < 0x80) {
168                         c[0] = uc;
169                 } else if (uc < 0x800) {
170                         if (*outbytesleft < 2) {
171                                 LOG(log_debug, logtype_default, "short utf8 write");
172                                 goto toobig;
173                         }
174                         c[1] = GETUTF8TRAILBYTE(uc, 0);
175                         c[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
176                         olen = 2;
177                 }
178                 else if ( uc >= 0x202a && uc <= 0x202e ) {
179                         /* ignore bidi hint characters */
180                         olen = 0;
181                 }
182                 /*
183                  * A 2-byte uc value represents a stand-alone Unicode character if
184                  *     0 <= uc < 0xd800 or 0xdfff < uc <= 0xffff.
185                  * If  0xd800 <= uc <= 0xdfff, uc itself does not represent a Unicode character.
186                  * Rather, it is just part of a surrogate pair.  A surrogate pair consists of 
187                  * a high surrogate in the range [0xd800 ... 0xdbff] and a low surrogate in the
188                  * range [0xdc00 ... 0xdfff].  Together the pair maps to a single Unicode character
189                  * whose scalar value is 64K or larger.  It is this scalar value that is transformed
190                  * to UTF-8, not the individual surrogates.
191                  *
192                  * See www.unicode.org/faq/utf_bom.html for more info.
193                  */
194
195                 else if ( 0xd800 <= uc && uc <= 0xdfff) {
196                         /* surrogate - needs 4 bytes from input and 4 bytes for output to UTF-8 */
197                         if (*outbytesleft < 4) {
198                                 LOG(log_debug, logtype_default, "short utf8 write");
199                                 goto toobig;
200                         }
201                         if (*inbytesleft < 4) {
202                                 errno = EINVAL;
203                                 return -1;
204                         }
205                         hi =  SVAL((*inbuf),0);
206                         low = SVAL((*inbuf),2);
207                         if ( 0xd800 <= hi && hi <= 0xdbff && 0xdc00 <= low && low <= 0xdfff) {
208                                 codepoint = ((hi - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
209                                 c[3] = GETUTF8TRAILBYTE(codepoint, 0);
210                                 c[2] = GETUTF8TRAILBYTE(codepoint, 6);
211                                 c[1] = GETUTF8TRAILBYTE(codepoint, 12);
212                                 c[0] = (char)(0xf0 | ((codepoint >> 18) & 0x07));
213                                 ilen = olen = 4;
214                         } else { /* invalid values for surrogate */
215                                 errno = EINVAL;
216                                 return -1;
217                         }
218                 } else {
219                         if (*outbytesleft < 3) {
220                                 LOG(log_debug, logtype_default, "short utf8 write");
221                                 goto toobig;
222                         }
223                         c[2] = GETUTF8TRAILBYTE(uc, 0);
224                         c[1] = GETUTF8TRAILBYTE(uc, 6);
225                         c[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
226                         olen = 3;
227                 }
228
229                 (*inbytesleft)  -= ilen;
230                 (*outbytesleft) -= olen;
231                 (*inbuf)  += ilen;
232                 (*outbuf) += olen;
233         }
234
235         if (*inbytesleft == 1) {
236                 errno = EINVAL;
237                 return -1;
238         }
239
240         if (*inbytesleft > 1) {
241                 errno = E2BIG;
242                 return -1;
243         }
244         
245         return 0;
246
247 toobig:
248         errno = E2BIG;
249         return -1;
250 }