2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Martin Pool 2003
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 #endif /* HAVE_CONFIG_H */
32 #include <sys/param.h>
34 #include <atalk/logger.h>
37 #include <netatalk/endian.h>
38 #include <atalk/unicode.h>
40 #ifdef HAVE_USABLE_ICONV
48 * @brief Character-set conversion routines built on our iconv.
50 * @note Samba's internal character set (at least in the 3.0 series)
51 * is always the same as the one for the Unix filesystem. It is
52 * <b>not</b> necessarily UTF-8 and may be different on machines that
53 * need i18n filenames to be compatible with Unix software. It does
54 * have to be a superset of ASCII. All multibyte sequences must start
55 * with a byte with the high bit set.
/* Capacity of the two tables below; add_charset() rejects a new handle that
 * would be >= this value. */
61 #define MAX_CHARSETS 10
/* Cached conversion descriptors, indexed as conv_handles[from][to]. */
63 static atalk_iconv_t conv_handles[MAX_CHARSETS][MAX_CHARSETS];
/* Charset names indexed by charset_t; add_charset() stores a strdup()ed
 * copy of the registered name here. */
65 static char* charset_names[MAX_CHARSETS];
/* Members of a structure whose opening line is not visible in this excerpt;
 * prev/next are pointers to struct charset. */
69 charset_t ch_charset_t;
70 struct charset *prev, *next;
74 * Return the name of a charset to give to iconv().
/*
 * Mapping visible below: CH_UCS2 -> "UCS-2LE", CH_UNIX -> "ASCII",
 * CH_MAC -> "MAC", CH_UTF8 -> "UTF8". The name can also be read from
 * charset_names[ch]; the condition guarding that lookup is not visible in
 * this excerpt.
 */
76 static const char *charset_name(charset_t ch)
78 const char *ret = NULL;
80 if (ch == CH_UCS2) ret = "UCS-2LE";
81 else if (ch == CH_UNIX) ret = "ASCII"; /*lp_unix_charset();*/
82 else if (ch == CH_MAC) ret = "MAC"; /*lp_display_charset();*/
83 else if (ch == CH_UTF8) ret = "UTF8";
86 ret = charset_names[ch];
/* A NULL or empty name is replaced by "ASCII". */
88 if (!ret || !*ret) ret = "ASCII";
/*
 * Initialisation hook called by the conversion routines before they read
 * conv_handles. Only the static `initialized` flag is visible in this
 * excerpt. NOTE(review): presumably the flag makes the setup run once and
 * the missing lines call init_iconv() -- confirm against the full file.
 */
92 void lazy_initialize_conv(void)
94 static int initialized = 0;
/**
 * Register an additional charset under the given name.
 *
 * Steps visible below:
 *  - the names of charsets 0..max_charset_t are compared with @p name
 *    (what happens on a match is not visible in this excerpt);
 *  - the candidate handle is max_charset_t + 1 and is refused, with a
 *    debug log, when it is >= MAX_CHARSETS;
 *  - converters between @p name and the UCS-2 charset are opened in both
 *    directions; a failure is logged as an error and the slot reset to NULL;
 *  - a strdup()ed copy of @p name is stored in charset_names[];
 *  - every (c1, c2) pair up to the new handle is visited; descriptors whose
 *    from/to names do not match are closed and reopened, and pairs that
 *    cannot be opened are logged at debug level and left NULL.
 *
 * @param name charset name handed to atalk_iconv_open()
 * @returns the new handle on the path shown at the end; the values returned
 *          on the lookup-hit and failure paths are not visible here.
 *
 * NOTE(review): the strdup() result is not checked in the visible lines, and
 * the "%u" log formats assume charset_t is an unsigned int -- confirm.
 */
102 charset_t add_charset(char* name)
104 static charset_t max_charset_t = NUM_CHARSETS-1;
105 charset_t cur_charset_t = max_charset_t+1;
108 for (c1=0; c1<=max_charset_t;c1++) {
109 if ( strcmp(name, charset_name(c1)) == 0)
113 if ( cur_charset_t >= MAX_CHARSETS ) {
114 LOG (log_debug, logtype_default, "Adding charset %s failed, too many charsets (max. %u allowed)",
119 /* First try to setup the required conversions */
121 conv_handles[cur_charset_t][CH_UCS2] = atalk_iconv_open( charset_name(CH_UCS2), name);
122 if (conv_handles[cur_charset_t][CH_UCS2] == (atalk_iconv_t)-1) {
123 LOG(log_error, logtype_default, "Required conversion from %s to %s not supported\n",
124 name, charset_name(CH_UCS2));
125 conv_handles[cur_charset_t][CH_UCS2] = NULL;
129 conv_handles[CH_UCS2][cur_charset_t] = atalk_iconv_open( name, charset_name(CH_UCS2));
130 if (conv_handles[CH_UCS2][cur_charset_t] == (atalk_iconv_t)-1) {
131 LOG(log_error, logtype_default, "Required conversion from %s to %s not supported\n",
132 charset_name(CH_UCS2), name);
133 conv_handles[CH_UCS2][cur_charset_t] = NULL;
137 /* register the new charset_t name */
138 charset_names[cur_charset_t] = strdup(name);
141 for (c1=0;c1<=cur_charset_t;c1++) {
142 for (c2=0;c2<=cur_charset_t;c2++) {
143 const char *n1 = charset_name((charset_t)c1);
144 const char *n2 = charset_name((charset_t)c2);
145 if (conv_handles[c1][c2] &&
146 strcmp(n1, conv_handles[c1][c2]->from_name) == 0 &&
147 strcmp(n2, conv_handles[c1][c2]->to_name) == 0)
150 if (conv_handles[c1][c2])
151 atalk_iconv_close(conv_handles[c1][c2]);
153 conv_handles[c1][c2] = atalk_iconv_open(n2,n1);
154 if (conv_handles[c1][c2] == (atalk_iconv_t)-1) {
155 LOG(log_debug, logtype_default, "Conversion from %s to %s not supported\n",
156 charset_name((charset_t)c1), charset_name((charset_t)c2));
157 conv_handles[c1][c2] = NULL;
164 LOG(log_debug, logtype_default, "Added charset %s with handle %u", name, cur_charset_t);
165 return (cur_charset_t);
169 * Initialize iconv conversion descriptors.
171 * This is called the first time it is needed, and also called again
172 * every time the configuration is reloaded, because the charset or
173 * codepage might have changed.
/*
 * Steps visible below: the CH_UNIX<->CH_UCS2 descriptors are opened first
 * (as "ASCII" <-> "UCS-2LE") when their slots are empty, then every
 * (c1, c2) pair of the NUM_CHARSETS built-in charsets is visited. An
 * existing descriptor is closed before atalk_iconv_open(n2, n1) is called
 * for the pair; a pair that cannot be opened is logged at debug level and
 * its slot set to NULL. The statement that follows the from_name/to_name
 * comparison is not visible in this excerpt (presumably it skips pairs
 * whose cached descriptor is still valid -- confirm).
 */
175 void init_iconv(void)
179 /* so that charset_name() works we need to get the UNIX<->UCS2 going
181 if (!conv_handles[CH_UNIX][CH_UCS2])
182 conv_handles[CH_UNIX][CH_UCS2] = atalk_iconv_open("UCS-2LE", "ASCII");
184 if (!conv_handles[CH_UCS2][CH_UNIX])
185 conv_handles[CH_UCS2][CH_UNIX] = atalk_iconv_open("ASCII", "UCS-2LE");
187 for (c1=0;c1<NUM_CHARSETS;c1++) {
188 for (c2=0;c2<NUM_CHARSETS;c2++) {
189 const char *n1 = charset_name((charset_t)c1);
190 const char *n2 = charset_name((charset_t)c2);
191 if (conv_handles[c1][c2] &&
192 strcmp(n1, conv_handles[c1][c2]->from_name) == 0 &&
193 strcmp(n2, conv_handles[c1][c2]->to_name) == 0)
196 if (conv_handles[c1][c2])
197 atalk_iconv_close(conv_handles[c1][c2]);
199 conv_handles[c1][c2] = atalk_iconv_open(n2,n1);
200 if (conv_handles[c1][c2] == (atalk_iconv_t)-1) {
201 LOG(log_debug, logtype_default, "Conversion from %s to %s not supported\n",
202 charset_name((charset_t)c1), charset_name((charset_t)c2));
203 conv_handles[c1][c2] = NULL;
210 * Convert a string from one encoding to another, with error checking.
212 * @param src pointer to source string (multibyte or singlebyte)
213 * @param srclen length of the source string in bytes
214 * @param dest pointer to destination string (multibyte or singlebyte)
215 * @param destlen maximal length allowed for string
216 * @returns the number of bytes occupied in the destination
/*
 * Behaviour visible in the body:
 *  - srclen == (size_t)-1 means src is NUL-terminated; strlen(src)+1 is
 *    used, so the terminator is converted too.
 *  - When conv_handles[from][to] is 0 or -1 there is no conversion and
 *    MIN(srclen, destlen) bytes are copied unchanged.
 *  - Otherwise atalk_iconv() converts; when it returns (size_t)-1 one of
 *    the reason strings below is selected. The tests that select it, and
 *    the value returned on error, are not visible in this excerpt.
 *  - The last visible path returns destlen - o_len.
 */
218 size_t convert_string(charset_t from, charset_t to,
219 void const *src, size_t srclen,
220 void *dest, size_t destlen)
224 const char* inbuf = (const char*)src;
225 char* outbuf = (char*)dest;
226 atalk_iconv_t descriptor;
228 if (srclen == (size_t)-1)
229 srclen = strlen(src)+1;
231 lazy_initialize_conv();
233 descriptor = conv_handles[from][to];
235 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) {
236 /* conversion not supported, use as is */
237 size_t len = MIN(srclen,destlen);
238 memcpy(dest,src,len);
244 retval = atalk_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
245 if(retval==(size_t)-1) {
246 const char *reason="unknown error";
249 reason="Incomplete multibyte sequence";
252 reason="No more room";
253 LOG(log_debug, logtype_default, "convert_string: Required %d, available %d\n",
255 /* we are not sure we need srclen bytes,
256 may be more, may be less.
257 We only know we need more than destlen
261 reason="Illegal multibyte sequence";
265 /* smb_panic(reason); */
267 return destlen-o_len;
271 * Convert between character sets, allocating a new buffer for the result.
273 * @param srclen length of source buffer.
274 * @param dest always set at least to NULL
275 * @note -1 is not accepted for srclen.
277 * @returns Size in bytes of the converted string; or -1 in case of error.
/*
 * Behaviour visible in the body:
 *  - src == NULL or srclen == (size_t)-1 is tested up front (the statement
 *    taken is not visible in this excerpt; the contract says -1 is an error).
 *  - Unlike convert_string(), a missing descriptor for (from, to) is not
 *    copied through; it is logged as "conversion not supported".
 *  - The output size starts at MAX(srclen, 512) and is doubled before the
 *    buffer is obtained with realloc().
 *  - After conversion the buffer is shrunk with realloc() to
 *    destlen - o_len bytes and stored in *dest.
 * NOTE(review): the retry control flow and the cleanup on the error paths
 * are not visible here; presumably the caller owns and frees *dest --
 * confirm against the full file.
 */
280 size_t convert_string_allocate(charset_t from, charset_t to,
281 void const *src, size_t srclen, void **dest)
283 size_t i_len, o_len, destlen;
285 const char *inbuf = (const char *)src;
287 atalk_iconv_t descriptor;
291 if (src == NULL || srclen == (size_t)-1)
294 lazy_initialize_conv();
296 descriptor = conv_handles[from][to];
298 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) {
299 /* conversion not supported, return -1*/
300 LOG(log_debug, logtype_default, "convert_string_allocate: conversion not supported!\n");
304 destlen = MAX(srclen, 512);
307 destlen = destlen * 2;
308 ob = (char *)realloc(outbuf, destlen);
310 LOG(log_debug, logtype_default,"convert_string_allocate: realloc failed!\n");
318 retval = atalk_iconv(descriptor,
321 if(retval == (size_t)-1) {
322 const char *reason="unknown error";
325 reason="Incomplete multibyte sequence";
330 reason="Illegal multibyte sequence";
333 LOG(log_debug, logtype_default,"Conversion error: %s(%s)\n",reason,inbuf);
334 /* smb_panic(reason); */
338 destlen = destlen - o_len;
339 *dest = (char *)realloc(ob,destlen);
340 if (destlen && !*dest) {
341 LOG(log_debug, logtype_default, "convert_string_allocate: out of memory!\n");
/*
 * Case-converts a CH_UNIX string by way of UCS-2: src is converted into a
 * buffer obtained from convert_string_allocate(), strupper_w() is applied to
 * that buffer, and the result is converted back to CH_UNIX into dest (at
 * most destlen bytes) with convert_string().
 * The branch taken when strupper_w() returns 0 and dest == src, the error
 * handling and the release of the buffer are not visible in this excerpt.
 */
350 size_t unix_strupper(const char *src, size_t srclen, char *dest, size_t destlen)
355 size = convert_string_allocate(CH_UNIX, CH_UCS2, src, srclen,
361 if (!strupper_w(buffer) && (dest == src)) {
366 size = convert_string(CH_UCS2, CH_UNIX, buffer, size, dest, destlen);
/*
 * Case-converts a CH_UNIX string by way of UCS-2: src is converted into a
 * buffer obtained from convert_string_allocate(), strlower_w() is applied to
 * that buffer, and the result is converted back to CH_UNIX into dest (at
 * most destlen bytes) with convert_string().
 * The branch taken when strlower_w() returns 0 and dest == src, the error
 * handling and the release of the buffer are not visible in this excerpt.
 */
371 size_t unix_strlower(const char *src, size_t srclen, char *dest, size_t destlen)
376 size = convert_string_allocate(CH_UNIX, CH_UCS2, src, srclen,
381 /* smb_panic("failed to create UCS2 buffer");*/
383 if (!strlower_w(buffer) && (dest == src)) {
387 size = convert_string(CH_UCS2, CH_UNIX, buffer, size, dest, destlen);
/*
 * Case-converts a CH_UTF8 string by way of UCS-2: src is converted into a
 * buffer obtained from convert_string_allocate(), strupper_w() is applied to
 * that buffer, and the result is converted back to CH_UTF8 into dest (at
 * most destlen bytes) with convert_string().
 * The branch taken when strupper_w() returns 0 and dest == src, the error
 * handling and the release of the buffer are not visible in this excerpt.
 */
392 size_t utf8_strupper(const char *src, size_t srclen, char *dest, size_t destlen)
397 size = convert_string_allocate(CH_UTF8, CH_UCS2, src, srclen,
403 if (!strupper_w(buffer) && (dest == src)) {
408 size = convert_string(CH_UCS2, CH_UTF8, buffer, size, dest, destlen);
/*
 * Case-converts a CH_UTF8 string by way of UCS-2: src is converted into a
 * buffer obtained from convert_string_allocate(), strlower_w() is applied to
 * that buffer, and the result is converted back to CH_UTF8 into dest (at
 * most destlen bytes) with convert_string().
 * The branch taken when strlower_w() returns 0 and dest == src, the error
 * handling and the release of the buffer are not visible in this excerpt.
 */
413 size_t utf8_strlower(const char *src, size_t srclen, char *dest, size_t destlen)
418 size = convert_string_allocate(CH_UTF8, CH_UCS2, src, srclen,
424 if (!strlower_w(buffer) && (dest == src)) {
429 size = convert_string(CH_UCS2, CH_UTF8, buffer, size, dest, destlen);
/*
 * Case-converts a CH_MAC string by way of UCS-2: src is converted into a
 * buffer obtained from convert_string_allocate(), strupper_w() is applied to
 * that buffer, and the result is converted back to CH_MAC into dest (at
 * most destlen bytes) with convert_string().
 * The branch taken when strupper_w() returns 0 and dest == src, the error
 * handling and the release of the buffer are not visible in this excerpt.
 */
434 size_t mac_strupper(const char *src, size_t srclen, char *dest, size_t destlen)
439 size = convert_string_allocate(CH_MAC, CH_UCS2, src, srclen,
445 if (!strupper_w(buffer) && (dest == src)) {
450 size = convert_string(CH_UCS2, CH_MAC, buffer, size, dest, destlen);
/*
 * Case-converts a CH_MAC string by way of UCS-2: src is converted into a
 * buffer obtained from convert_string_allocate(), strlower_w() is applied to
 * that buffer, and the result is converted back to CH_MAC into dest (at
 * most destlen bytes) with convert_string().
 * The branch taken when strlower_w() returns 0 and dest == src, the error
 * handling and the release of the buffer are not visible in this excerpt.
 */
455 size_t mac_strlower(const char *src, size_t srclen, char *dest, size_t destlen)
460 size = convert_string_allocate(CH_MAC, CH_UCS2, src, srclen,
466 if (!strlower_w(buffer) && (dest == src)) {
471 size = convert_string(CH_UCS2, CH_MAC, buffer, size, dest, destlen);
477 * Copy a string from a mac char* src to a UCS2 destination, allocating a buffer
479 * @param dest always set at least to NULL
481 * @returns The number of bytes occupied by the string in the destination
482 * or -1 in case of error.
/*
 * Thin wrapper around convert_string_allocate(CH_MAC, CH_UCS2, ...).
 * The length passed on is strlen(src)+1, so the terminating NUL is
 * converted as well. strlen() runs in the initializer, so src must not be
 * NULL.
 */
485 size_t mac_to_ucs2_allocate(ucs2_t **dest, const char *src)
487 size_t src_len = strlen(src)+1;
490 return convert_string_allocate(CH_MAC, CH_UCS2, src, src_len, (void **)dest);
494 * Copy a string from a mac char* src to a UTF-8 destination, allocating a buffer
496 * @param dest always set at least to NULL
498 * @returns The number of bytes occupied by the string in the destination
/*
 * Thin wrapper around convert_string_allocate(CH_MAC, CH_UTF8, ...).
 * The length passed on is strlen(src)+1, so the terminating NUL is
 * converted as well. strlen() runs in the initializer, so src must not be
 * NULL.
 */
501 size_t mac_to_utf8_allocate(char **dest, const char *src)
503 size_t src_len = strlen(src)+1;
506 return convert_string_allocate(CH_MAC, CH_UTF8, src, src_len, (void **)dest);
510 * Copy a string from a UCS2 src to a mac char * destination, allocating a buffer
512 * @param dest always set at least to NULL
514 * @returns The number of bytes occupied by the string in the destination
/*
 * Thin wrapper around convert_string_allocate(CH_UCS2, CH_MAC, ...).
 * The length passed on is in bytes: (strlen_w(src)+1) * sizeof(ucs2_t),
 * i.e. the UCS-2 terminator is included. src must not be NULL.
 */
517 size_t ucs2_to_mac_allocate(char **dest, const ucs2_t *src)
519 size_t src_len = (strlen_w(src)+1) * sizeof(ucs2_t);
521 return convert_string_allocate(CH_UCS2, CH_MAC, src, src_len, (void **)dest);
525 * Copy a string from a UTF-8 src to a mac char * destination, allocating a buffer
527 * @param dest always set at least to NULL
529 * @returns The number of bytes occupied by the string in the destination
/* File-scope scratch buffer shared by the UTF-8 helpers in this file; every
 * call overwrites it. */
532 static char convbuf[MAXPATHLEN+1];
/*
 * Precomposes src (including its terminating NUL) into convbuf, limited to
 * MAXPATHLEN bytes, then converts the precomposed text from CH_UTF8 to
 * CH_MAC into a newly allocated buffer stored in *dest.
 * The utf8_precompose() result is passed straight on as the source length;
 * a (size_t)-1 result is caught by the srclen test at the top of
 * convert_string_allocate(), whose outcome is not visible in this excerpt.
 */
533 size_t utf8_to_mac_allocate(void **dest, const char *src)
535 size_t src_len = strlen(src)+1;
538 src_len = utf8_precompose ( (char *) src, src_len, convbuf, MAXPATHLEN);
539 return convert_string_allocate(CH_UTF8, CH_MAC, convbuf, src_len, dest);
/*
 * Precomposes src_len bytes of src into the shared convbuf (limit
 * MAXPATHLEN), then converts the result from CH_UTF8 to CH_MAC into dest
 * (at most dest_len bytes).
 * NOTE(review): the utf8_precompose() result is not checked. If it is
 * (size_t)-1, convert_string() treats that as "NUL-terminated input" and
 * converts whatever convbuf currently holds -- confirm this is intended.
 * NOTE(review): utf8_precompose() also uses convbuf for its intermediate
 * UCS-2 text, so dst aliases its scratch space here; this relies on
 * precompose_w() returning a separate buffer -- confirm.
 */
542 size_t utf8_to_mac ( char* src, size_t src_len, char* dest, size_t dest_len)
544 src_len = utf8_precompose ( (char *) src, src_len, convbuf, MAXPATHLEN);
545 return convert_string(CH_UTF8, CH_MAC, convbuf, src_len, dest, dest_len);
/* Static output buffer for debug_out(); a later call overwrites the text
 * produced by an earlier one. */
548 static char debugbuf[ MAXPATHLEN +1 ];
/*
 * Formats the bytes of seq as two-digit hex values, each followed by '.',
 * using sprintf(). The declarations, pointer advances and return statement
 * are not visible in this excerpt; presumably the text goes into debugbuf
 * and a pointer to it is returned.
 * NOTE(review): len is a size_t, so for len == 0 the bound (len-1) wraps to
 * SIZE_MAX and the loop does not stop at the end of seq.
 * NOTE(review): sprintf() emits 3 characters per byte with no bound check;
 * if the target is debugbuf, inputs longer than about MAXPATHLEN/3 bytes
 * would overrun it -- confirm and consider snprintf().
 */
549 char * debug_out ( char * seq, size_t len)
555 p = (unsigned char*) seq;
558 for ( i = 0; i<=(len-1); i++)
560 sprintf(q, "%2.2x.", *p);
/*
 * Precomposes a UTF-8 string in three visible steps:
 *  1. convert inlen bytes of src from CH_UTF8 to CH_UCS2 into the shared
 *     convbuf (limit MAXPATHLEN bytes);
 *  2. run precompose_w() on that UCS-2 text, which reports the resulting
 *     length through ilen;
 *  3. convert the result back from CH_UCS2 to CH_UTF8 into dst (limit
 *     outlen bytes).
 * Each step is tested for failure ((size_t)-1 or NULL). The statements taken
 * on failure and the final return value are not visible in this excerpt;
 * callers in this file use the return value as a byte length.
 */
570 size_t utf8_precompose ( char * src, size_t inlen, char * dst, size_t outlen)
576 if ((size_t)(-1) == (len = convert_string(CH_UTF8, CH_UCS2, src, inlen, convbuf, MAXPATHLEN)) )
579 if ( NULL == (u = precompose_w((ucs2_t *)convbuf, len, &ilen)) )
582 if ((size_t)(-1) == (len = convert_string( CH_UCS2, CH_UTF8, u, ilen, dst, outlen)) )
/*
 * Decomposes a UTF-8 string in three visible steps:
 *  1. convert inlen bytes of src from CH_UTF8 to CH_UCS2 into the shared
 *     convbuf (limit MAXPATHLEN bytes);
 *  2. run decompose_w() on that UCS-2 text, which reports the resulting
 *     length through ilen;
 *  3. convert the result back from CH_UCS2 to CH_UTF8 into dst (limit
 *     outlen bytes).
 * Each step is tested for failure ((size_t)-1 or NULL). The statements taken
 * on failure and the final return value are not visible in this excerpt.
 */
589 size_t utf8_decompose ( char * src, size_t inlen, char * dst, size_t outlen)
595 if ((size_t)(-1) == (len = convert_string(CH_UTF8, CH_UCS2, src, inlen, convbuf, MAXPATHLEN)) )
598 if ( NULL == (u = decompose_w((ucs2_t *)convbuf, len, &ilen)) )
601 if ((size_t)(-1) == (len = convert_string( CH_UCS2, CH_UTF8, u, ilen, dst, outlen)) )
/*
 * Converts precomposed UTF-8 to the charset identified by ch.
 *
 * Visible steps: src_len+1 bytes of src are precomposed into the shared
 * convbuf (limit MAXPATHLEN); the descriptor conv_handles[CH_UTF8][ch] is
 * looked up and a missing one is logged as an error; atalk_iconv_ignore()
 * converts convbuf into dest and is handed the mangle pointer; dest is
 * NUL-terminated at offset dest_len - o_len and that offset is returned.
 * The statements taken on the error paths are not visible in this excerpt.
 *
 * NOTE(review): presumably the extra byte in src_len+1 is the terminating
 * NUL of src -- confirm against callers.
 * NOTE(review): if o_len can reach 0, the terminator is written at
 * dest[dest_len], one past a buffer of dest_len bytes; the initial value of
 * o_len is not visible here -- confirm it reserves room for the NUL.
 */
609 size_t utf8_to_mac_charset ( charset_t ch, char* src, size_t src_len, char* dest, size_t dest_len, int* mangle)
614 char* outbuf = (char*)dest;
615 atalk_iconv_t descriptor;
617 lazy_initialize_conv();
619 src_len = utf8_precompose ( (char *) src, src_len+1, convbuf, MAXPATHLEN);
621 descriptor = conv_handles[CH_UTF8][ch];
623 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) {
624 LOG(log_error, logtype_default, "Conversion not supported ( UTF8 to %s )", charset_name(ch));
628 inbuf = (const char*) convbuf;
632 retval = atalk_iconv_ignore(descriptor, &inbuf, &i_len, &outbuf, &o_len, mangle);
634 if(retval==(size_t)-1)
637 dest[dest_len-o_len] = 0;
638 return dest_len-o_len;