]> arthur.barton.de Git - netatalk.git/blob - libatalk/unicode/utf8.c
Fix for ACLs not being shown when filesystem uid or gid couldn't be resolved because...
[netatalk.git] / libatalk / unicode / utf8.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002,2003
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20    
21    From samba 3.0 beta and GNU libiconv-1.8
22    It's bad but most of the time we can't use libc iconv service:
23    - it doesn't round trip for most encoding
24    - it doesn't know about Apple extension
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif /* HAVE_CONFIG_H */
30 #include <stdlib.h>
31 #include <errno.h>
32
33 #include <netatalk/endian.h>
34 #include <atalk/unicode.h>
35 #include <atalk/logger.h>
36 #include <atalk/unicode.h>
37 #include "byteorder.h"
38
/* Given a trailing UTF-8 byte, get the contribution from it to
 * the Unicode scalar value for a particular bit shift amount.
 *
 * Only the low six payload bits of the byte are used; the two marker
 * bits are masked off.  Both arguments are parenthesized so compound
 * expressions (e.g. "a | b" or "cond ? 6 : 0") expand with the
 * intended precedence.
 */
#define GETUCVAL(utf8_trailbyte,shift)  ((unsigned int) (((utf8_trailbyte) & 0x3F) << (shift)))
43
/* Given a unicode scalar, get a trail UTF-8 byte for a particular bit shift amount.
 *
 * The result is 0x80 or'ed with the six bits of uc found at bit offset
 * "shift".  Both arguments are parenthesized so compound expressions
 * expand with the intended precedence.
 */
#define GETUTF8TRAILBYTE(uc,shift)      ((char)( 0x80 | (((uc) >> (shift)) & 0x3F) ) )
46
47
48
/* Forward declarations of the two converters defined later in this file,
 * needed because the charset tables below take their addresses. */
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
51
52 struct charset_functions charset_utf8 =
53 {
54         "UTF8",
55         0x08000103,
56         utf8_pull,
57         utf8_push,
58         CHARSET_VOLUME | CHARSET_MULTIBYTE | CHARSET_PRECOMPOSED,
59         NULL,
60         NULL, NULL
61 };
62
63 struct charset_functions charset_utf8_mac =
64 {
65         "UTF8-MAC",
66         0x08000103,
67         utf8_pull,
68         utf8_push,
69         CHARSET_VOLUME | CHARSET_CLIENT | CHARSET_MULTIBYTE | CHARSET_DECOMPOSED,
70         NULL,
71         NULL, NULL
72 };
73
74 /* ------------------- Convert from UTF-8 to UCS-2 -------------------*/
75 static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft,
76                          char **outbuf, size_t *outbytesleft)
77 {
78         ucs2_t uc = 0;
79         ucs2_t hi, low;     /* surrogate pair */
80         unsigned int codepoint, surrogate;
81         int len;
82
83         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
84                 unsigned char *c = (unsigned char *)*inbuf;
85                 len = 1;
86
87                 /* Arrange conditionals in the order of most frequent occurrence 
88                  * for users of Latin-based chars */
89                 if ((c[0] & 0x80) == 0) {
90                         uc = c[0];
91                 } else if ((c[0] & 0xe0) == 0xc0) {
92                         if (*inbytesleft < 2) {
93                                 LOG(log_debug, logtype_default, "short utf8 char");
94                                 goto badseq;
95                         }
96                         uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ;
97                         len = 2;
98                 } else if ((c[0] & 0xf0) == 0xe0) {
99                         if (*inbytesleft < 3) {
100                                 LOG(log_debug, logtype_default, "short utf8 char");
101                                 goto badseq;
102                         }
103                         uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ;
104                         len = 3;
105                 } else if ((c[0] & 0xf8) == 0xf0) {
106                         /* 4 bytes, which happens for surrogate pairs only */
107                         if (*inbytesleft < 4) {
108                                 LOG(log_debug, logtype_default, "short utf8 char");
109                                 goto badseq;
110                         }
111                         if (*outbytesleft < 4) {
112                                 LOG(log_debug, logtype_default, "short ucs-2 write");
113                                 errno = E2BIG;
114                                 return -1;
115                         }
116                         codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) |
117                                 GETUCVAL(c[2],6) |  GETUCVAL(c[3],0);
118                         hi = (ucs2_t)( ((codepoint - 0x10000) >> 10) + 0xD800);
119                         low = (ucs2_t)(0xDC00 + (codepoint & 0x03FF));
120                         surrogate = (hi << 16) | low;
121                         SIVAL(*outbuf,0,surrogate);
122                         len = 4;
123                         (*inbuf)  += 4;
124                         (*inbytesleft)  -= 4;
125                         (*outbytesleft) -= 4;
126                         (*outbuf) += 4;
127                         continue;
128                 }
129                 else {
130                         errno = EINVAL;
131                         return -1;
132                 }
133
134                 SSVAL(*outbuf,0,uc);
135                 (*inbuf)  += len;
136                 (*inbytesleft)  -= len;
137                 (*outbytesleft) -= 2;
138                 (*outbuf) += 2;
139         }
140
141         if (*inbytesleft > 0) {
142                 errno = E2BIG;
143                 return -1;
144         }
145         
146         return 0;
147
148 badseq:
149         errno = EINVAL;
150         return -1;
151 }
152
153 /* --------------------- Convert from UCS-2 to UTF-8 -----------*/
154 static size_t utf8_push(void *cd _U_, char **inbuf, size_t *inbytesleft,
155                          char **outbuf, size_t *outbytesleft)
156 {
157         ucs2_t uc=0;
158         ucs2_t hi, low;
159         unsigned int surrogatepair, codepoint;
160         int olen, ilen;
161
162         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
163                 unsigned char *c = (unsigned char *)*outbuf;
164                 uc = SVAL((*inbuf),0);
165                 olen=1;
166                 ilen=2;
167
168                 /* Arrange conditionals in the order of most frequent occurrence for
169                    users of Latin-based chars */
170                 if (uc < 0x80) {
171                         c[0] = uc;
172                 } else if (uc < 0x800) {
173                         if (*outbytesleft < 2) {
174                                 LOG(log_debug, logtype_default, "short utf8 write");
175                                 goto toobig;
176                         }
177                         c[1] = GETUTF8TRAILBYTE(uc, 0);
178                         c[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
179                         olen = 2;
180                 }
181                 else if ( uc >= 0x202a && uc <= 0x202e ) {
182                         /* ignore bidi hint characters */
183                         olen = 0;
184                 }
185                 /*
186                  * A 2-byte uc value represents a stand-alone Unicode character if
187                  *     0 <= uc < 0xd800 or 0xdfff < uc <= 0xffff.
188                  * If  0xd800 <= uc <= 0xdfff, uc itself does not represent a Unicode character.
189                  * Rather, it is just part of a surrogate pair.  A surrogate pair consists of 
190                  * a high surrogate in the range [0xd800 ... 0xdbff] and a low surrogate in the
191                  * range [0xdc00 ... 0xdfff].  Together the pair maps to a single Unicode character
192                  * whose scalar value is 64K or larger.  It is this scalar value that is transformed
193                  * to UTF-8, not the individual surrogates.
194                  *
195                  * See www.unicode.org/faq/utf_bom.html for more info.
196                  */
197
198                 else if ( 0xd800 <= uc && uc <= 0xdfff) {
199                         /* surrogate - needs 4 bytes from input and 4 bytes for output to UTF-8 */
200                         if (*outbytesleft < 4) {
201                                 LOG(log_debug, logtype_default, "short utf8 write");
202                                 goto toobig;
203                         }
204                         if (*inbytesleft < 4) {
205                                 errno = EINVAL;
206                                 return -1;
207                         }
208                         surrogatepair = IVAL((*inbuf),0);
209                         low = (ucs2_t)surrogatepair;
210                         hi = (ucs2_t)(surrogatepair >> 16);
211                         if ( 0xd800 <= hi && hi <= 0xdbff && 0xdc00 <= low && low <= 0xdfff) {
212                                 codepoint = ((hi - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
213                                 c[3] = GETUTF8TRAILBYTE(codepoint, 0);
214                                 c[2] = GETUTF8TRAILBYTE(codepoint, 6);
215                                 c[1] = GETUTF8TRAILBYTE(codepoint, 12);
216                                 c[0] = (char)(0xf0 | ((codepoint >> 18) & 0x07));
217                                 ilen = olen = 4;
218                         } else { /* invalid values for surrogate */
219                                 errno = EINVAL;
220                                 return -1;
221                         }
222                 } else {
223                         if (*outbytesleft < 3) {
224                                 LOG(log_debug, logtype_default, "short utf8 write");
225                                 goto toobig;
226                         }
227                         c[2] = GETUTF8TRAILBYTE(uc, 0);
228                         c[1] = GETUTF8TRAILBYTE(uc, 6);
229                         c[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
230                         olen = 3;
231                 }
232
233                 (*inbytesleft)  -= ilen;
234                 (*outbytesleft) -= olen;
235                 (*inbuf)  += ilen;
236                 (*outbuf) += olen;
237         }
238
239         if (*inbytesleft == 1) {
240                 errno = EINVAL;
241                 return -1;
242         }
243
244         if (*inbytesleft > 1) {
245                 errno = E2BIG;
246                 return -1;
247         }
248         
249         return 0;
250
251 toobig:
252         errno = E2BIG;
253         return -1;
254 }