X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=libatalk%2Funicode%2Futil_unistr.c;h=c163d6f6e3f6366d75d7503ba64787dd3356e255;hb=654f86c6e05414423af719716ad028f7af7a65f5;hp=9b018baa97217b0eddd15f0947c0d40691051a45;hpb=80fe6e63eb8ec9a396b4383cb8210fd5ded6ad35;p=netatalk.git

diff --git a/libatalk/unicode/util_unistr.c b/libatalk/unicode/util_unistr.c
index 9b018baa..c163d6f6 100644
--- a/libatalk/unicode/util_unistr.c
+++ b/libatalk/unicode/util_unistr.c
@@ -1,3 +1,13 @@
+/*******************************************************************
+  NOTE:
+  The early netatalk 2.x was based on UCS-2.
+  UCS-2 don't support chars above U+10000.
+  Recent netatalk is based on UTF-16.
+  UTF-16 can support chars above U+10000, using Surrogate Pair.
+  However, Surrogate Pair is complex, dirty, filthy and disagreeable.
+  There might still be latent bugs...
+********************************************************************/
+
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif /* HAVE_CONFIG_H */
@@ -12,21 +22,38 @@
 #include <arpa/inet.h>
 
 #include <atalk/unicode.h>
+#include <atalk/byteorder.h>
+
 #include "precompose.h"
-#include "byteorder.h"
 
 /*******************************************************************
  Convert a string to lower case.
  return True if any char is converted
 ********************************************************************/
+/* surrogate pair support */
+
 int strlower_w(ucs2_t *s)
 {
 	int ret = 0;
+
 	while (*s) {
-		ucs2_t v = tolower_w(*s);
-		if (v != *s) {
-			*s = v;
-			ret = 1;
+		if ((0xD800 <= *s) && (*s < 0xDC00)) {
+			if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
+				uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1];
+				uint32_t v_sp = tolower_sp(s_sp);
+				if (v_sp != s_sp) {
+					*s = v_sp >> 16;
+					s++;
+					*s = v_sp & 0xFFFF;
+					ret = 1;
+				}
+			}
+		} else {
+			ucs2_t v = tolower_w(*s);
+			if (v != *s) {
+				*s = v;
+				ret = 1;
+			}
 		}
 		s++;
 	}
@@ -37,41 +64,74 @@ int strlower_w(ucs2_t *s)
  Convert a string to upper case.
  return True if any char is converted
 ********************************************************************/
+/* surrogate pair support */
+
 int strupper_w(ucs2_t *s)
 {
 	int ret = 0;
+
 	while (*s) {
-		ucs2_t v = toupper_w(*s);
-		if (v != *s) {
-			*s = v;
-			ret = 1;
+		if ((0xD800 <= *s) && (*s < 0xDC00)) {
+			if ((0xDC00 <= s[1]) && (s[1] < 0xE000)) {
+				uint32_t s_sp = (uint32_t)*s << 16 | (uint32_t)s[1];
+				uint32_t v_sp = toupper_sp(s_sp);
+				if (v_sp != s_sp) {
+					*s = v_sp >> 16;
+					s++;
+					*s = v_sp & 0xFFFF;
+					ret = 1;
+				}
+			}
+		} else {
+			ucs2_t v = toupper_w(*s);
+			if (v != *s) {
+				*s = v;
+				ret = 1;
+			}
 		}
 		s++;
 	}
 	return ret;
 }
 
-
 /*******************************************************************
+wide & sp islower()
 determine if a character is lowercase
 ********************************************************************/
+/* These functions are not used. */
+
 int islower_w(ucs2_t c)
 {
 	return ( c == tolower_w(c));
 }
 
+int islower_sp(uint32_t c_sp)
+{
+	return ( c_sp == tolower_sp(c_sp));
+}
+
 /*******************************************************************
+wide & sp isupper()
 determine if a character is uppercase
 ********************************************************************/
+/* These functions are not used. */
+
 int isupper_w(ucs2_t c)
 {
 	return ( c == toupper_w(c));
 }
 
+int isupper_sp(uint32_t c_sp)
+{
+	return ( c_sp == toupper_sp(c_sp));
+}
 
 /*******************************************************************
- Count the number of characters in a ucs2_t string.
+wide strlen()
+ Count the number of characters in a UTF-16 string.
 ********************************************************************/
+/* NOTE: one surrogate pair is two characters. */
+
 size_t strlen_w(const ucs2_t *src)
 {
 	size_t len;
@@ -82,8 +142,11 @@ size_t strlen_w(const ucs2_t *src)
 }
 
 /*******************************************************************
- Count up to max number of characters in a ucs2_t string.
+wide strnlen()
+ Count up to max number of characters in a UTF-16 string.
 ********************************************************************/
+/* NOTE: one surrogate pair is two characters. */
+
 size_t strnlen_w(const ucs2_t *src, size_t max)
 {
 	size_t len;
@@ -96,6 +159,8 @@ size_t strnlen_w(const ucs2_t *src, size_t max)
 /*******************************************************************
 wide strchr()
 ********************************************************************/
+/* NOTE: hi and lo of surrogate pair are separately processed. */
+
 ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
 {
 	while (*s != 0) {
@@ -107,11 +172,15 @@ ucs2_t *strchr_w(const ucs2_t *s, ucs2_t c)
 	return NULL;
 }
 
+/*******************************************************************
+wide & sp strcasechr()
+********************************************************************/
+/* NOTE: separately process BMP and surrogate pair */
+
 ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
 {
 	while (*s != 0) {
-/*		LOG(log_debug, logtype_default, "Comparing %X to %X (%X - %X)", c, *s, toupper_w(c), toupper_w(*s));*/
-		if (toupper_w(c) == toupper_w(*s)) return (ucs2_t *)s;
+		if (tolower_w(c) == tolower_w(*s)) return (ucs2_t *)s;
 		s++;
 	}
 	if (c == *s) return (ucs2_t *)s;
@@ -119,6 +188,21 @@ ucs2_t *strcasechr_w(const ucs2_t *s, ucs2_t c)
 	return NULL;
 }
 
+ucs2_t *strcasechr_sp(const ucs2_t *s, uint32_t c_sp)
+{
+	if (*s == 0) return NULL;
+	while (s[1] != 0) {
+		if (tolower_sp(c_sp) == tolower_sp((uint32_t)*s << 16 | (uint32_t)s[1])) return (ucs2_t *)s;
+		s++;
+	}
+
+	return NULL;
+}
+
+/*******************************************************************
+wide strcmp()
+********************************************************************/
+/* no problem of surrogate pair */
 
 int strcmp_w(const ucs2_t *a, const ucs2_t *b)
 {
@@ -129,6 +213,11 @@ int strcmp_w(const ucs2_t *a, const ucs2_t *b)
 	   string is longer */
 }
 
+/*******************************************************************
+wide strncmp()
+********************************************************************/
+/* no problem of surrogate pair */
+
 int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
 {
 	size_t n = 0;
@@ -139,6 +228,8 @@ int strncmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
 /*******************************************************************
 wide strstr()
 ********************************************************************/
+/* no problem of surrogate pair */
+
 ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
 {
 	ucs2_t *r;
@@ -155,6 +246,11 @@ ucs2_t *strstr_w(const ucs2_t *s, const ucs2_t *ins)
 	return NULL;
 }
 
+/*******************************************************************
+wide strcasestr()
+********************************************************************/
+/* surrogate pair support */
+
 ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
 {
 	ucs2_t *r;
@@ -164,39 +260,86 @@ ucs2_t *strcasestr_w(const ucs2_t *s, const ucs2_t *ins)
 	slen = strlen_w(s);
 	inslen = strlen_w(ins);
 	r = (ucs2_t *)s;
-	while ((r = strcasechr_w(r, *ins))) {
-		if (strncasecmp_w(r, ins, inslen) == 0) return r;
-		r++;
+
+	if ((0xD800 <= *ins) && (*ins < 0xDC00)) {
+		if ((0xDC00 <= ins[1]) && (ins[1] < 0xE000)) {
+			uint32_t ins_sp = (uint32_t)*ins << 16 | (uint32_t)ins[1];
+			while ((r = strcasechr_sp(r, ins_sp))) {
+				if (strncasecmp_w(r, ins, inslen) == 0) return r;
+				r++;
+			}
+		} else {
+			return NULL; /* illegal sequence */
+		}
+	} else {
+		while ((r = strcasechr_w(r, *ins))) {
+			if (strncasecmp_w(r, ins, inslen) == 0) return r;
+			r++;
+		}
 	}
 	return NULL;
 }
 
-
-
-
 /*******************************************************************
+wide strcasecmp()
 case insensitive string comparison
 ********************************************************************/
+/* surrogate pair support */
+
 int strcasecmp_w(const ucs2_t *a, const ucs2_t *b)
 {
-	while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; }
+	int ret;
+
+	while (*a && *b) {
+		if ((0xD800 <= *a) && (*a < 0xDC00)) {
+			if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret;
+			a++;
+			b++;
+			if (!(*a && *b)) return (tolower_w(*a) - tolower_w(*b)); /* avoid buffer over run */
+		} else {
+			if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
+		}
+		a++;
+		b++;
+	}
 	return (tolower_w(*a) - tolower_w(*b));
 }
 
 /*******************************************************************
-case insensitive string comparison, lenght limited
+wide strncasecmp()
+case insensitive string comparison, length limited
 ********************************************************************/
+/* NOTE: compare up to 'len+1' if 'len' isolate surrogate pair  */
+
 int strncasecmp_w(const ucs2_t *a, const ucs2_t *b, size_t len)
 {
 	size_t n = 0;
-	while ((n < len) && *b && (toupper_w(*a) == toupper_w(*b))) { a++; b++; n++; }
+	int ret;
+
+	while ((n < len) && *a && *b) {
+		if ((0xD800 <= *a) && (*a < 0xDC00)) {
+			if (ret = tolower_sp((uint32_t)*a << 16 | (uint32_t)a[1]) - tolower_sp((uint32_t)*b << 16 | (uint32_t)b[1])) return ret;
+			a++;
+			b++;
+			n++;
+			if (!((n < len) && *a && *b)) return (tolower_w(*a) - tolower_w(*b));
+		} else {
+			if (ret = tolower_w(*a) - tolower_w(*b)) return ret;
+		}
+		a++;
+		b++;
+		n++;
+	}
 	return (len - n)?(tolower_w(*a) - tolower_w(*b)):0;
 }
 
 /*******************************************************************
+wide strndup()
 duplicate string
 ********************************************************************/
+/* NOTE: not check isolation of surrogate pair */
 /* if len == 0 then duplicate the whole string */
+
 ucs2_t *strndup_w(const ucs2_t *src, size_t len)
 {
 	ucs2_t *dest;
@@ -214,6 +357,12 @@ ucs2_t *strndup_w(const ucs2_t *src, size_t len)
 	return dest;
 }
 
+/*******************************************************************
+wide strdup()
+duplicate string
+********************************************************************/
+/* no problem of surrogate pair */
+
 ucs2_t *strdup_w(const ucs2_t *src)
 {
 	return strndup_w(src, 0);
@@ -222,6 +371,8 @@ ucs2_t *strdup_w(const ucs2_t *src)
 /*******************************************************************
 copy a string with max len
 ********************************************************************/
+/* This function is not used. */
+/* NOTE: not check isolation of surrogate pair */
 
 ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
 {
@@ -241,7 +392,9 @@ ucs2_t *strncpy_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
 /*******************************************************************
 append a string of len bytes and add a terminator
 ********************************************************************/
+/* These functions are not used. */
 
+/* NOTE: not check isolation of surrogate pair */
 ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
 {
 	size_t start;
@@ -258,7 +411,7 @@ ucs2_t *strncat_w(ucs2_t *dest, const ucs2_t *src, const size_t max)
 	return dest;
 }
 
-
+/* no problem of surrogate pair */
 ucs2_t *strcat_w(ucs2_t *dest, const ucs2_t *src)
 {
 	size_t start;
@@ -416,8 +569,6 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 	base = *in;
 	while (*outlen > 2) {
 		i += 2;
-		in++;
-
 		if (i == inplen) {
 			*out = base;
 			out++;
@@ -425,7 +576,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 			*outlen -= 2;
 			return o_len - *outlen;
 		}
-
+		in++;
 		comb = *in;
 		result = 0;
 
@@ -453,7 +604,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 		
 		/* Binary Search for Surrogate Pair */
 		else if ((0xD800 <= base) && (base < 0xDC00)) {
-			if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 4 <= inplen)) {
+			if ((0xDC00 <= comb) && (comb < 0xE000) && (i + 6 <= inplen)) {
 				base_sp = ((uint32_t)base << 16) | (uint32_t)comb;
 				do {
 					comb_sp = ((uint32_t)in[1] << 16) | (uint32_t)in[2];
@@ -462,7 +613,7 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 						i += 4;
 						in +=2;
 					}
-				} while ((i + 4 <= inplen) && result_sp) ;
+				} while ((i + 6 <= inplen) && result_sp) ;
 
 				*out = base_sp >> 16;
 				out++;
@@ -478,6 +629,11 @@ size_t precompose_w (ucs2_t *name, size_t inplen, ucs2_t *comp, size_t *outlen)
 				*outlen -= 2;
 
 				i += 2;
+				if (i == inplen) {
+					out++;
+					*out = 0;
+					return o_len - *outlen;
+				}
 				in++;
 				base = *in;