From: Jonas Fonseca <fonseca@diku.dk>
Date: Tue, 30 Mar 2010 11:03:17 +0000 (-0400)
Subject: utf8: move unicode related functions below the string helpers
X-Git-Url: https://git.tokkee.org/?a=commitdiff_plain;h=61a871f4f1214475d54b6e83d4521b7826455800;p=tig.git

utf8: move unicode related functions below the string helpers
---

diff --git a/tig.c b/tig.c
index d4a5591..75f7944 100644
--- a/tig.c
+++ b/tig.c
@@ -68,8 +68,6 @@
 static void __NORETURN die(const char *err, ...);
 static void warn(const char *msg, ...);
 static void report(const char *msg, ...);
-static size_t utf8_length(const char **start, size_t skip, int *width, size_t max_width, int *trimmed, bool reserve, int tab_size);
-static inline unsigned char utf8_char_length(const char *string, const char *end);
 
 #define ABS(x)		((x) >= 0  ? (x) : -(x))
 #define MIN(x, y)	((x) < (y) ? (x) :  (y))
@@ -359,6 +357,166 @@ suffixcmp(const char *str, int slen, const char *suffix)
 }
 
 
+/*
+ * Unicode / UTF-8 handling
+ *
+ * NOTE: Much of the following code for dealing with Unicode is derived from
+ * ELinks' UTF-8 code developed by Scrool <scroolik@gmail.com>. Origin file is
+ * src/intl/charset.c from the UTF-8 branch commit elinks-0.11.0-g31f2c28.
+ */
+
+static inline int
+unicode_width(unsigned long c, int tab_size)
+{
+	if (c >= 0x1100 &&
+	   (c <= 0x115f				/* Hangul Jamo */
+	    || c == 0x2329
+	    || c == 0x232a
+	    || (c >= 0x2e80  && c <= 0xa4cf && c != 0x303f)
+						/* CJK ... Yi */
+	    || (c >= 0xac00  && c <= 0xd7a3)	/* Hangul Syllables */
+	    || (c >= 0xf900  && c <= 0xfaff)	/* CJK Compatibility Ideographs */
+	    || (c >= 0xfe30  && c <= 0xfe6f)	/* CJK Compatibility Forms */
+	    || (c >= 0xff00  && c <= 0xff60)	/* Fullwidth Forms */
+	    || (c >= 0xffe0  && c <= 0xffe6)
+	    || (c >= 0x20000 && c <= 0x2fffd)
+	    || (c >= 0x30000 && c <= 0x3fffd)))
+		return 2;
+
+	if (c == '\t')
+		return tab_size;
+
+	return 1;
+}
+
+/* Number of bytes used for encoding a UTF-8 character indexed by first byte.
+ * Illegal bytes are set one. */
+static const unsigned char utf8_bytes[256] = {
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
+	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
+};
+
+static inline unsigned char
+utf8_char_length(const char *string, const char *end)
+{
+	int c = *(unsigned char *) string;
+
+	return utf8_bytes[c];
+}
+
+/* Decode UTF-8 multi-byte representation into a Unicode character. */
+static inline unsigned long
+utf8_to_unicode(const char *string, size_t length)
+{
+	unsigned long unicode;
+
+	switch (length) {
+	case 1:
+		unicode  =   string[0];
+		break;
+	case 2:
+		unicode  =  (string[0] & 0x1f) << 6;
+		unicode +=  (string[1] & 0x3f);
+		break;
+	case 3:
+		unicode  =  (string[0] & 0x0f) << 12;
+		unicode += ((string[1] & 0x3f) << 6);
+		unicode +=  (string[2] & 0x3f);
+		break;
+	case 4:
+		unicode  =  (string[0] & 0x0f) << 18;
+		unicode += ((string[1] & 0x3f) << 12);
+		unicode += ((string[2] & 0x3f) << 6);
+		unicode +=  (string[3] & 0x3f);
+		break;
+	case 5:
+		unicode  =  (string[0] & 0x0f) << 24;
+		unicode += ((string[1] & 0x3f) << 18);
+		unicode += ((string[2] & 0x3f) << 12);
+		unicode += ((string[3] & 0x3f) << 6);
+		unicode +=  (string[4] & 0x3f);
+		break;
+	case 6:
+		unicode  =  (string[0] & 0x01) << 30;
+		unicode += ((string[1] & 0x3f) << 24);
+		unicode += ((string[2] & 0x3f) << 18);
+		unicode += ((string[3] & 0x3f) << 12);
+		unicode += ((string[4] & 0x3f) << 6);
+		unicode +=  (string[5] & 0x3f);
+		break;
+	default:
+		return 0;
+	}
+
+	/* Invalid characters could return the special 0xfffd value but NUL
+	 * should be just as good. */
+	return unicode > 0xffff ? 0 : unicode;
+}
+
+/* Calculates how much of string can be shown within the given maximum width
+ * and sets trimmed parameter to non-zero value if all of string could not be
+ * shown. If the reserve flag is TRUE, it will reserve at least one
+ * trailing character, which can be useful when drawing a delimiter.
+ *
+ * Returns the number of bytes to output from string to satisfy max_width. */
+static size_t
+utf8_length(const char **start, size_t skip, int *width, size_t max_width, int *trimmed, bool reserve, int tab_size)
+{
+	const char *string = *start;
+	const char *end = strchr(string, '\0');
+	unsigned char last_bytes = 0;
+	size_t last_ucwidth = 0;
+
+	*width = 0;
+	*trimmed = 0;
+
+	while (string < end) {
+		unsigned char bytes = utf8_char_length(string, end);
+		size_t ucwidth;
+		unsigned long unicode;
+
+		if (string + bytes > end)
+			break;
+
+		/* Change representation to figure out whether
+		 * it is a single- or double-width character. */
+
+		unicode = utf8_to_unicode(string, bytes);
+		/* FIXME: Graceful handling of invalid Unicode character. */
+		if (!unicode)
+			break;
+
+		ucwidth = unicode_width(unicode, tab_size);
+		if (skip > 0) {
+			skip -= ucwidth <= skip ? ucwidth : skip;
+			*start += bytes;
+		}
+		*width  += ucwidth;
+		if (*width > max_width) {
+			*trimmed = 1;
+			*width -= ucwidth;
+			if (reserve && *width == max_width) {
+				string -= last_bytes;
+				*width -= last_ucwidth;
+			}
+			break;
+		}
+
+		string  += bytes;
+		last_bytes = ucwidth ? bytes : 0;
+		last_ucwidth = ucwidth;
+	}
+
+	return string - *start;
+}
+
+
 #define DATE_INFO \
 	DATE_(NO), \
 	DATE_(DEFAULT), \
@@ -6748,166 +6906,6 @@ static struct view_ops main_ops = {
 };
 
 
-/*
- * Unicode / UTF-8 handling
- *
- * NOTE: Much of the following code for dealing with Unicode is derived from
- * ELinks' UTF-8 code developed by Scrool <scroolik@gmail.com>. Origin file is
- * src/intl/charset.c from the UTF-8 branch commit elinks-0.11.0-g31f2c28.
- */
-
-static inline int
-unicode_width(unsigned long c, int tab_size)
-{
-	if (c >= 0x1100 &&
-	   (c <= 0x115f				/* Hangul Jamo */
-	    || c == 0x2329
-	    || c == 0x232a
-	    || (c >= 0x2e80  && c <= 0xa4cf && c != 0x303f)
-						/* CJK ... Yi */
-	    || (c >= 0xac00  && c <= 0xd7a3)	/* Hangul Syllables */
-	    || (c >= 0xf900  && c <= 0xfaff)	/* CJK Compatibility Ideographs */
-	    || (c >= 0xfe30  && c <= 0xfe6f)	/* CJK Compatibility Forms */
-	    || (c >= 0xff00  && c <= 0xff60)	/* Fullwidth Forms */
-	    || (c >= 0xffe0  && c <= 0xffe6)
-	    || (c >= 0x20000 && c <= 0x2fffd)
-	    || (c >= 0x30000 && c <= 0x3fffd)))
-		return 2;
-
-	if (c == '\t')
-		return tab_size;
-
-	return 1;
-}
-
-/* Number of bytes used for encoding a UTF-8 character indexed by first byte.
- * Illegal bytes are set one. */
-static const unsigned char utf8_bytes[256] = {
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
-	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
-};
-
-static inline unsigned char
-utf8_char_length(const char *string, const char *end)
-{
-	int c = *(unsigned char *) string;
-
-	return utf8_bytes[c];
-}
-
-/* Decode UTF-8 multi-byte representation into a Unicode character. */
-static inline unsigned long
-utf8_to_unicode(const char *string, size_t length)
-{
-	unsigned long unicode;
-
-	switch (length) {
-	case 1:
-		unicode  =   string[0];
-		break;
-	case 2:
-		unicode  =  (string[0] & 0x1f) << 6;
-		unicode +=  (string[1] & 0x3f);
-		break;
-	case 3:
-		unicode  =  (string[0] & 0x0f) << 12;
-		unicode += ((string[1] & 0x3f) << 6);
-		unicode +=  (string[2] & 0x3f);
-		break;
-	case 4:
-		unicode  =  (string[0] & 0x0f) << 18;
-		unicode += ((string[1] & 0x3f) << 12);
-		unicode += ((string[2] & 0x3f) << 6);
-		unicode +=  (string[3] & 0x3f);
-		break;
-	case 5:
-		unicode  =  (string[0] & 0x0f) << 24;
-		unicode += ((string[1] & 0x3f) << 18);
-		unicode += ((string[2] & 0x3f) << 12);
-		unicode += ((string[3] & 0x3f) << 6);
-		unicode +=  (string[4] & 0x3f);
-		break;
-	case 6:
-		unicode  =  (string[0] & 0x01) << 30;
-		unicode += ((string[1] & 0x3f) << 24);
-		unicode += ((string[2] & 0x3f) << 18);
-		unicode += ((string[3] & 0x3f) << 12);
-		unicode += ((string[4] & 0x3f) << 6);
-		unicode +=  (string[5] & 0x3f);
-		break;
-	default:
-		return 0;
-	}
-
-	/* Invalid characters could return the special 0xfffd value but NUL
-	 * should be just as good. */
-	return unicode > 0xffff ? 0 : unicode;
-}
-
-/* Calculates how much of string can be shown within the given maximum width
- * and sets trimmed parameter to non-zero value if all of string could not be
- * shown. If the reserve flag is TRUE, it will reserve at least one
- * trailing character, which can be useful when drawing a delimiter.
- *
- * Returns the number of bytes to output from string to satisfy max_width. */
-static size_t
-utf8_length(const char **start, size_t skip, int *width, size_t max_width, int *trimmed, bool reserve, int tab_size)
-{
-	const char *string = *start;
-	const char *end = strchr(string, '\0');
-	unsigned char last_bytes = 0;
-	size_t last_ucwidth = 0;
-
-	*width = 0;
-	*trimmed = 0;
-
-	while (string < end) {
-		unsigned char bytes = utf8_char_length(string, end);
-		size_t ucwidth;
-		unsigned long unicode;
-
-		if (string + bytes > end)
-			break;
-
-		/* Change representation to figure out whether
-		 * it is a single- or double-width character. */
-
-		unicode = utf8_to_unicode(string, bytes);
-		/* FIXME: Graceful handling of invalid Unicode character. */
-		if (!unicode)
-			break;
-
-		ucwidth = unicode_width(unicode, tab_size);
-		if (skip > 0) {
-			skip -= ucwidth <= skip ? ucwidth : skip;
-			*start += bytes;
-		}
-		*width  += ucwidth;
-		if (*width > max_width) {
-			*trimmed = 1;
-			*width -= ucwidth;
-			if (reserve && *width == max_width) {
-				string -= last_bytes;
-				*width -= last_ucwidth;
-			}
-			break;
-		}
-
-		string  += bytes;
-		last_bytes = ucwidth ? bytes : 0;
-		last_ucwidth = ucwidth;
-	}
-
-	return string - *start;
-}
-
-
 /*
  * Status management
  */