From: HAT Date: Tue, 23 Sep 2014 13:00:32 +0000 (+0900) Subject: Handling of malformed UTF8 strings, bug #524 X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?p=netatalk.git;a=commitdiff_plain;h=8f6ad94065ea107b7c7c7e10d1905920328f3169 Handling of malformed UTF8 strings, bug #524 Authored-by: HAT Reviewed-by: Ralph Boehme --- diff --git a/NEWS b/NEWS index d0e32b56..a41419cf 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,7 @@ Changes in 3.1.7 * FIX: netatalk: SIGHUP would kill the process instead of being resent to the other Netatalk processes, bug #579 * FIX: afpd: Solaris locking problem, bug #559 +* FIX: Handling of malformed UTF8 strings, bug #524 Changes in 3.1.6 ================ diff --git a/libatalk/unicode/utf8.c b/libatalk/unicode/utf8.c index 4a300398..04dbb9a2 100644 --- a/libatalk/unicode/utf8.c +++ b/libatalk/unicode/utf8.c @@ -71,7 +71,29 @@ struct charset_functions charset_utf8_mac = NULL, NULL }; +/* The Unicode Standard Version 6.2 – Core Specification */ +/* http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf */ +/* */ +/* Scalar Value First Second Third Fourth */ +/* 00000000 0xxxxxxx 0xxxxxxx */ +/* 00000yyy yyxxxxxx 110yyyyy 10xxxxxx */ +/* zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx */ +/* 000uuuuu zzzzyyyy yyxxxxxx 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx */ + + /* ------------------- Convert from UTF-8 to UTF-16 -------------------*/ + +/* Code Points First Second Third Fourth */ +/* U+0000..U+007F 00..7F */ +/* U+0080..U+07FF C2..DF 80..BF */ +/* U+0800..U+0FFF E0 A0..BF 80..BF */ +/* U+1000..U+CFFF E1..EC 80..BF 80..BF */ +/* U+D000..U+D7FF ED 80..9F 80..BF */ +/* U+E000..U+FFFF EE..EF 80..BF 80..BF */ +/* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF */ +/* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF */ +/* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF */ + static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { @@ -85,33 +107,34 @@ static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft, /* Arrange conditionals in the order of most frequent occurrence * for users of Latin-based chars */ - if ((c[0] & 0x80) == 0) { + if ((c[0] & 0x80) == 0) { /* 0xxx xxxx */ /* 1 byte */ uc = c[0]; - } else if ((c[0] & 0xe0) == 0xc0) { - if (*inbytesleft < 2) { - LOG(log_debug, logtype_default, "short utf8 char"); - goto badseq; - } + } else if ((c[0] & 0xe0) == 0xc0) { /* 110y yyyy */ /* 2 bytes */ + if (*inbytesleft < 2) goto inval; + if (c[0] < 0xc2) goto ilseq; /* C2-DF */ + if ((c[1] & 0xc0) != 0x80) goto ilseq; /* 80-BF */ uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ; len = 2; - } else if ((c[0] & 0xf0) == 0xe0) { - if (*inbytesleft < 3) { - LOG(log_debug, logtype_default, "short utf8 char"); - goto badseq; - } + } else if ((c[0] & 0xf0) == 0xe0) { /* 1110 zzzz */ /* 3 bytes */ + if (*inbytesleft < 3) goto inval; + if (!((c[0] == 0xe0 && (c[1] & 0xe0) == 0xa0) || /* E0 A0-BF*/ + (0xe1 <= c[0] && c[0] <= 0xec && (c[1] & 0xc0) == 0x80) || /* E1-EC 80-BF */ + (c[0] == 0xed && (c[1] & 0xe0) == 0x80) || /* ED 80-9F */ + ((c[0] & 0xfe) == 0xee && (c[1] & 0xc0) == 0x80))) /* EE-EF 80-BF */ + goto ilseq; + if ((c[2] & 0xc0) != 0x80) goto ilseq; /* 80-BF */ uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ; len = 3; - } else if ((c[0] & 0xf8) == 0xf0) { - /* 4 bytes, which happens for surrogate pairs only */ - if (*inbytesleft < 4) { - LOG(log_debug, logtype_default, "short utf8 char"); - goto badseq; - } - if (*outbytesleft < 4) { - LOG(log_debug, logtype_default, "short ucs-2 write"); - errno = E2BIG; - return -1; - } + } else if ((c[0] & 0xf8) == 0xf0) { /* 1111 0uuu */ /* 4 bytes */ + if (*inbytesleft < 4) goto inval; + if (*outbytesleft < 4) goto toobig; + if (c[0] > 0xf4) goto ilseq; /* happens for surrogate pairs only */ + if (!((c[0] == 0xf0 && 0x90 <= c[1] && c[1] <= 0xbf) || /* F0 90-BF */ + (0xf1 <= c[0] && c[0] <= 0xf3 && (c[1] & 0xc0) == 0x80) || /* F1-F3 80-BF */ + (c[0] == 0xf4 && (c[1] & 0xc0) == 0x80))) /* F4 80-8F */ + goto ilseq; + if ((c[2] & 0xc0) != 0x80) goto ilseq; /* 80-BF */ + if ((c[3] & 0xc0) != 0x80) goto ilseq; /* 80-BF */ codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) | GETUCVAL(c[2],6) | GETUCVAL(c[3],0); SSVAL(*outbuf,0,(((codepoint - 0x10000) >> 10) + 0xD800)); /* hi */ @@ -122,10 +145,8 @@ static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft, (*outbytesleft) -= 4; (*outbuf) += 4; continue; - } - else { - errno = EINVAL; - return -1; + } else { + goto ilseq; } SSVAL(*outbuf,0,uc); @@ -136,13 +157,23 @@ static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft, } if (*inbytesleft > 0) { - errno = E2BIG; - return -1; + goto toobig; } - + return 0; -badseq: +toobig: + LOG(log_debug, logtype_default, "short ucs-2 write"); + errno = E2BIG; + return -1; + +ilseq: + LOG(log_debug, logtype_default, "malformed utf8 sequence"); + errno = EILSEQ; + return -1; + +inval: + LOG(log_debug, logtype_default, "short utf8 char"); errno = EINVAL; return -1; }