[sword-svn] r2075 - trunk/src/mgr

Tue Sep 4 00:10:14 MST 2007

Author: scribe
Date: 2007-09-04 00:10:14 -0700 (Tue, 04 Sep 2007)
New Revision: 2075

Modified:
   trunk/src/mgr/stringmgr.cpp
Log:
added DM's isValidUTF8 method for better determining if we have latin1 or
UTF8 text.


Modified: trunk/src/mgr/stringmgr.cpp
===================================================================

--- trunk/src/mgr/stringmgr.cpp	2007-09-04 03:02:55 UTC (rev 2074)
+++ trunk/src/mgr/stringmgr.cpp	2007-09-04 07:10:14 UTC (rev 2075)
@@ -48,7 +48,71 @@
 	~__staticsystemStringMgr() { if (StringMgr::systemStringMgr) delete StringMgr::systemStringMgr; StringMgr::systemStringMgr = 0; }
 } _staticsystemStringMgr;
 
+/**
+ * Determine whether the string contains a valid unicode sequence. The following table gives the pattern of a valid UTF-8 character.
+ * Unicode Range            1st       2nd       3rd       4th       5th       6th
+ * U-00000000 - U-0000007F  0nnnnnnn
+ * U-00000080 - U-000007FF  110nnnnn  10nnnnnn
+ * U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
+ * U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-00200000 - U-03FFFFFF  111110nn  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-04000000 - U-7FFFFFFF  1111110n  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * Note:
+ *   The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
+ *   The number of bits of the leading byte before the first 0 is the total number of bytes
+ *   The "n" are the bits of the unicode codepoint.
+ *
+ * This routine does not check to see if the code point is in the range. It could.
+ * The text is only read, never written; a null txt is reported as having no high order characters.
+ * @param txt the text to check
+ * @return  1 if all high order characters form a valid unicode sequence
+ *         -1 if there are no high order characters
+ *          0 if there are high order characters that do not form a valid unicode sequence
+ * @author DM Smith [dmsmith555 at yahoo dot com]
+ */
+int isValidUTF8(unsigned char *txt) {
+	unsigned int countUTF8 = 0;
 
+	// Nothing to scan: report "no high order characters"
+	if (!txt) {
+		return -1;
+	}
+
+	// Read-only cursor: the caller's buffer must never be modified
+	const unsigned char *p = txt;
+	while (*p) {
+		// Plain ascii: nothing to validate, move on to the next byte
+		if (!(*p & 0x80)) {
+			p++;
+			continue;
+		}
+		// count the number of high order bits that are set; this is the
+		// total number of bytes in the sequence, including this lead byte
+		unsigned char parts = 0;
+		for (unsigned char i = *p; i & 0x80; i <<= 1) {
+			parts++;
+		}
+
+		// 10nnnnnn cannot start a character; 0xFE and 0xFF (7 or 8 bits) are never valid
+		if (parts == 1 || parts > 6) {
+			return 0;
+		}
+
+		// Each following byte must be 10nnnnnn. A NUL fails this test too, so
+		// a sequence cut short by the end of the string is rejected here.
+		for (p++; --parts; p++) {
+			if ((*p & 0xc0) != 0x80) {
+				return 0;
+			}
+		}
+		countUTF8++;
+	}
+
+	// At this point it is either UTF-8 or ascii
+	return countUTF8 ? 1 : -1;
+}
+
+
 #ifdef _ICU_
 
 //here comes our ICUStringMgr reimplementation
@@ -111,15 +175,33 @@
 }
 
 
-/** Converts the param to an upper case Utf8 string
-* @param The text encoded in utf8 which should be turned into an upper case string
-*/	
+/**
+ * This is a fallback method.  It should never be called.
+ * If UTF8 support is desired, then a UTF8 StringMgr needs
+ * to be used.
+ *
+ * Here we just do our best.
+ *
+ * Converts the param to an upper case UTF8 string
+ * @param t - The text encoded in utf8 which should be turned into an upper case string
+ *
+ */	
 char *StringMgr::upperUTF8(char *t, unsigned int maxlen) const {
 	// try to decide if it's worth trying to toupper.  Do we have more
 	// characters which are probably lower latin than not?
+	// we still don't use isValidUTF8 optimally. what if we have 1 unicode
+	// character in the string?  should we not try to upper any of the string?
+	// dunno.  Best solution is to upper all other characters. Don't have
+	// time to write that before release.
 	long performOp = 0;
-	for (const char *ch = t; *ch; ch++)
-		performOp += (*ch > 0) ? 1 : -1;
+	if (!isValidUTF8((unsigned char *)t)) {
+		performOp = 1;
+	}
+	else {
+		for (const char *ch = t; *ch; ch++) {
+			performOp += (*ch > 0) ? 1 : -1;
+		}
+	}
 
 	if (performOp > 0) {
 		return upperLatin1(t);
@@ -128,10 +210,12 @@
 	return t;
 }
 
-/** Converts the param to an uppercase latin1 string
-* @param The text encoded in latin1 which should be turned into an upper case string
-*/	
-char* StringMgr::upperLatin1(char* buf, unsigned int maxlen) const {
+
+/**
+ * Converts the param to an uppercase latin1 string
+ * @param The text encoded in latin1 which should be turned into an upper case string
+ */	
+char *StringMgr::upperLatin1(char *buf, unsigned int maxlen) const {
 	if (!buf)
 		return 0;