[sword-svn] r3714 - in trunk: include src/mgr src/modules/filters src/utilfuns

Fri Apr 10 16:43:12 MST 2020

Author: scribe
Date: 2020-04-10 16:43:12 -0700 (Fri, 10 Apr 2020)
New Revision: 3714

Modified:
   trunk/include/stringmgr.h
   trunk/include/swbuf.h
   trunk/include/utilstr.h
   trunk/src/mgr/stringmgr.cpp
   trunk/src/modules/filters/osishtmlhref.cpp
   trunk/src/utilfuns/swbuf.cpp
   trunk/src/utilfuns/utilstr.cpp
Log:
Expanded StringMgr to cover: lowerUTF8, isUpper, isLower, isDigit, isAlpha
Extended SWBuf c-tor initial size param to truncate an initialization string if it is longer than initialSize



Modified: trunk/include/stringmgr.h
===================================================================

--- trunk/include/stringmgr.h	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/include/stringmgr.h	2020-04-10 23:43:12 UTC (rev 3714)
@@ -35,6 +35,8 @@
  * Each platform, if it's up-to-date, should provide functions to handle unicode and utf8. This class makes it possible to implement Unicode support on the user-side and not in Sword itself.
  */
 class SWDLLEXPORT StringMgr {
+private:
+	static StringMgr *systemStringMgr;
 public:
 
 	/** Sets the global StringMgr handle
@@ -61,6 +63,17 @@
 	* @return text buffer (only for convenience)
 	*/	
 	virtual char *upperUTF8(char *text, unsigned int max = 0) const;
+	/** Converts the param to a lower case Utf8 string
+	* @param text The text encoded in utf8 which should be turned into a lower case string
+	* @param max Max buffer size
+	* @return text buffer (only for convenience)
+	*/	
+	virtual char *lowerUTF8(char *text, unsigned int max = 0) const;
+
+	virtual bool isUpper(__u32 character) const;
+	virtual bool isLower(__u32 character) const;
+	virtual bool isDigit(__u32 character) const;
+	virtual bool isAlpha(__u32 character) const;
    
 	/** Converts the param to an uppercase latin1 string
 	* @param text The text encoded in latin1 which should be turned into an upper case string
@@ -86,35 +99,25 @@
 	virtual ~StringMgr();
 	
 	virtual bool supportsUnicode() const;
-
-private:
-	static StringMgr *systemStringMgr;
 };
 
+
 inline char *toupperstr(char *t, unsigned int max = 0) {
 	return StringMgr::getSystemStringMgr()->upperUTF8(t, max);
 }
+
+inline char *tolowerstr(char *t, unsigned int max = 0) {
+	return StringMgr::getSystemStringMgr()->lowerUTF8(t, max);
+}
 	
+/*
+ * @deprecated - SWBuf assumed to be UTF-8 now.
+ */
 inline char *toupperstr_utf8(char *t, unsigned int max = 0) {
 	return StringMgr::getSystemStringMgr()->upperUTF8(t, max);
 }
-	
-/**
- * Converts an SWBuf filled with UTF-8 to upper case
- *
- * @param b SWBuf to change to upper case
- * 
- * @return b for convenience
- */
-inline SWBuf &toupperstr(SWBuf &b) {
-	char *utf8 = 0;
-	stdstr(&utf8, b.c_str(), 2);
-	toupperstr(utf8, (unsigned int)strlen(utf8)*2);
-	b = utf8;
-	delete [] utf8;
-	return b;
-}
 
+
 SWORD_NAMESPACE_END
 
 

Modified: trunk/include/swbuf.h
===================================================================
--- trunk/include/swbuf.h	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/include/swbuf.h	2020-04-10 23:43:12 UTC (rev 3714)
@@ -99,7 +99,7 @@
 	inline SWBuf(const char *initVal, unsigned long initSize = 0) {
 		init(initSize);
 		if (initVal)
-			set(initVal);
+			set(initVal, initSize);
 	}
 
 	/******************************************************************************
@@ -211,9 +211,10 @@
 	* If the allocated memory is bigger than the new string, it will NOT be resized.
 	* @param newVal the value to set this buffer to. 
 	*/
-	inline void set(const char *newVal) {
+	inline void set(const char *newVal, unsigned long maxSize = 0) {
 		if (newVal) {
 			unsigned long len = strlen(newVal) + 1;
+			if (maxSize && maxSize < (len-1)) len = maxSize + 1;
 			assureSize(len);
 			memcpy(buf, newVal, len);
 			end = buf + (len - 1);
@@ -221,8 +222,8 @@
 		else {
 			assureSize(1);
 			end = buf;
-			*end = 0;
 		}
+		*end = 0;
 	}
 
 	/**
@@ -463,7 +464,16 @@
 	 * @return returns true if this buffer starts with the specified prefix
 	 */
 	inline bool startsWith(const SWBuf &prefix) const { return !strncmp(c_str(), prefix.c_str(), prefix.size()); }
-	void toUpper();
+	/**
+	 * Converts this SWBuf to uppercase
+	 * @return this
+	 */
+	SWBuf &toUpper();
+	/**
+	 * Converts this SWBuf to lowercase
+	 * @return this
+	 */
+	SWBuf &toLower();
 
 	/**
 	 * @return returns true if this buffer ends with the specified postfix

Modified: trunk/include/utilstr.h
===================================================================
--- trunk/include/utilstr.h	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/include/utilstr.h	2020-04-10 23:43:12 UTC (rev 3714)
@@ -36,6 +36,7 @@
  * ENT:	ipstr	- pointer to a string pointer to set if necessary
  *	istr	- string to set to *ipstr
  *			0 - only get
+ *	memPadFactor - memory will be allocated the size of istr * memPadFactor
  *
  * RET:	*ipstr
  */
@@ -66,14 +67,12 @@
 #define SW_toupper(c) SW_toupper_array[(unsigned char)c]
 
 /******************************************************************************
- * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
- * 					and increments buf to start of next codepoint
+ * SW_tolower - array of lowercase values for any given Latin-1 value
  *
- * ENT:	buf - address of a utf8 buffer
- *
- * RET:	buf - incremented past last byte used in computing the current codepoint
- * 		unicode codepoint value (0 with buf incremented is invalid UTF8 byte
+ * use this instead of tolower() for fast lookups on accented characters
  */
+extern const unsigned char SW_tolower_array[256];
+#define SW_tolower(c) SW_tolower_array[(unsigned char)c]
 
 
 /******************************************************************************

Modified: trunk/src/mgr/stringmgr.cpp
===================================================================
--- trunk/src/mgr/stringmgr.cpp	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/src/mgr/stringmgr.cpp	2020-04-10 23:43:12 UTC (rev 3714)
@@ -119,6 +119,21 @@
 	#endif
 		return countUTF8 ? 1 : -1;
 	}
+
+	char *lowerLatin1(char *buf, unsigned int maxlen = 0) {
+		if (!buf)
+			return 0;
+			
+		char *ret = buf;
+		bool checkMax = maxlen;
+
+		while (*buf && (!checkMax || maxlen--)) {
+			*buf = SW_tolower(*buf);
+			buf++;
+		}
+
+		return ret;
+	}
 }
 
 
@@ -128,6 +143,11 @@
 class ICUStringMgr : public StringMgr {
 public:
 	virtual char *upperUTF8(char *, unsigned int maxlen = 0) const;
+	virtual char *lowerUTF8(char *, unsigned int maxlen = 0) const;
+	virtual bool isUpper(__u32 character) const;
+	virtual bool isLower(__u32 character) const;
+	virtual bool isDigit(__u32 character) const;
+	virtual bool isAlpha(__u32 character) const;
 	
 protected:
 	virtual bool supportsUnicode() const { return true; };
@@ -221,6 +241,56 @@
 
 
 /**
+ * This is a fallback method.  It should never be called.
+ * If UTF8 support is desired, then a UTF8 StringMgr needs
+ * to be used.
+ *
+ * Here we just do our best.
+ *
+ * Converts the param to a lower case UTF8 string
+ * @param t - The text encoded in utf8 which should be turned into a lower case string
+ *
+ */	
+char *StringMgr::lowerUTF8(char *t, unsigned int maxlen) const {
+	// try to decide if it's worth trying to tolower.  Do we have more
+	// characters which are probably lower latin than not?
+	// we still don't use isValidUTF8 optimally. what if we have 1 unicode
+	// character in the string?  should we not try to lower any of the string?
+	// dunno.  Best solution is to lower all other characters. Don't have
+	// time to write that before release.
+	long performOp = 0;
+	if (!isValidUTF8((unsigned char *)t)) {
+		performOp = 1;
+	}
+	else {
+		for (const char *ch = t; *ch; ch++) {
+			performOp += (*ch > 0) ? 1 : -1;
+		}
+	}
+
+	if (performOp > 0) {
+		return lowerLatin1(t);
+	}
+
+	return t;
+}
+
+bool StringMgr::isUpper(__u32 character) const {
+	return isupper(character);
+}
+bool StringMgr::isLower(__u32 character) const {
+	return islower(character);
+}
+bool StringMgr::isDigit(__u32 character) const {
+	return isdigit(character);
+}
+bool StringMgr::isAlpha(__u32 character) const {
+	return isalpha(character);
+}
+
+
+
+/**
  * Converts the param to an uppercase latin1 string
  * @param The text encoded in latin1 which should be turned into an upper case string
  */	
@@ -281,6 +351,55 @@
 	delete [] upperStr;
 	return ret;
 }
+
+char *ICUStringMgr::lowerUTF8(char *buf, unsigned int maxlen) const {
+	char *ret = buf;
+	int max = (int)((maxlen) ? maxlen : strlen(buf));
+		
+	UErrorCode err = U_ZERO_ERROR;
+		
+	if (!buf || !max) {
+		return ret;
+	}
+		
+	UChar *sourceStr = new UChar[max+10];
+	UChar *resultStr = new UChar[max+10];
+		
+	u_strFromUTF8(sourceStr, max+9, 0, buf, -1, &err);
+	if (err != U_ZERO_ERROR) {
+//		SWLog::getSystemLog()->logError("from: %s", u_errorName(err));
+		delete [] sourceStr;
+		delete [] resultStr;
+		return ret;
+	}
+
+	u_strToLower(resultStr, max+9, sourceStr, -1, 0, &err);
+	if (err != U_ZERO_ERROR) {
+//		SWLog::getSystemLog()->logError("lowerCase: %s", u_errorName(err));
+		delete [] sourceStr;
+		delete [] resultStr;
+		return ret;
+	}
+
+	ret = u_strToUTF8(ret, max, 0, resultStr, -1, &err);
+		
+	delete [] sourceStr;
+	delete [] resultStr;
+	return ret;
+}
+
+bool ICUStringMgr::isUpper(__u32 character) const {
+	return u_isupper(character);
+}
+bool ICUStringMgr::isLower(__u32 character) const {
+	return u_islower(character);
+}
+bool ICUStringMgr::isDigit(__u32 character) const {
+	return u_isdigit(character);
+}
+bool ICUStringMgr::isAlpha(__u32 character) const {
+	return u_isalpha(character);
+}
 	
 #endif
 

Modified: trunk/src/modules/filters/osishtmlhref.cpp
===================================================================
--- trunk/src/modules/filters/osishtmlhref.cpp	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/src/modules/filters/osishtmlhref.cpp	2020-04-10 23:43:12 UTC (rev 3714)
@@ -446,7 +446,7 @@
 				SWBuf lastText = u->lastSuspendSegment.c_str();
 				u->suspendTextPassThru = (--u->suspendLevel);
 				if (lastText.size()) {
-					toupperstr(lastText);
+					lastText.toUpper();
 					scratch.setFormatted("%c<font size=\"-1\">%s</font>", lastText[0], lastText.c_str()+1);
 
 					const unsigned char *tmpBuf = (const unsigned char *)lastText.c_str();

Modified: trunk/src/utilfuns/swbuf.cpp
===================================================================
--- trunk/src/utilfuns/swbuf.cpp	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/src/utilfuns/swbuf.cpp	2020-04-10 23:43:12 UTC (rev 3714)
@@ -122,6 +122,30 @@
 }
 
 
-void SWBuf::toUpper() { assureSize(size()*3); toupperstr(buf, size()*3-1); }
+/**
+ * Converts an SWBuf filled with UTF-8 to upper case
+ *
+ * Operates on this buffer; there is no separate SWBuf parameter.
+ * 
+ * @return *this for convenience
+ */
+SWBuf &SWBuf::toUpper() { 
+	char *utf8 = 0;
+	stdstr(&utf8, c_str(), 3);
+	sword::toupperstr(utf8, (unsigned int)size()*3-1);
+	*this = utf8;
+	delete [] utf8;
 
+	return *this;
+}
+SWBuf &SWBuf::toLower() {
+	char *utf8 = 0;
+	stdstr(&utf8, c_str(), 3);
+	sword::tolowerstr(utf8, (unsigned int)size()*3-1);
+	*this = utf8;
+	delete [] utf8;
+
+	return *this;
+}
+
 SWORD_NAMESPACE_END

Modified: trunk/src/utilfuns/utilstr.cpp
===================================================================
--- trunk/src/utilfuns/utilstr.cpp	2020-04-10 22:58:37 UTC (rev 3713)
+++ trunk/src/utilfuns/utilstr.cpp	2020-04-10 23:43:12 UTC (rev 3714)
@@ -66,8 +66,47 @@
 	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff
 };
 
+// WARNING, this is simply a copy of toupper right now
+// It needs to be inverted, but actually should never
+// be used.  A StringMgr which supports UTF-8 should
+// be used instead
+const unsigned char SW_tolower_array[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7,
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf,
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
 
 
+
 /******************************************************************************
  * strstrip - Removes leading and trailing spaces from a string
  *