[sword-svn] r2295 - in trunk: include src/modules/filters src/utilfuns tests utilities

Sun Mar 29 10:11:27 MST 2009

Author: scribe
Date: 2009-03-29 10:11:27 -0700 (Sun, 29 Mar 2009)
New Revision: 2295

Modified:
   trunk/include/filemgr.h
   trunk/include/utilstr.h
   trunk/src/modules/filters/osishtmlhref.cpp
   trunk/src/modules/filters/utf8utf16.cpp
   trunk/src/utilfuns/utilstr.cpp
   trunk/tests/filtertest.cpp
   trunk/utilities/imp2vs.cpp
Log:
extracted UTF8 codepoint logic from utf8->utf16 filter
into utilstr
Applied Ben Morgan's patch to handle multibyte divine name data


Modified: trunk/include/filemgr.h
===================================================================

--- trunk/include/filemgr.h	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/include/filemgr.h	2009-03-29 17:11:27 UTC (rev 2295)
@@ -143,7 +143,7 @@
 	* Will only close the file if it was created by this FileMgr object.
 	* @param file The file to close.
 	*/
-	void close(FileDesc * file);
+	void close(FileDesc *file);
 
 	/** Cacher methods overridden
 	 */

Modified: trunk/include/utilstr.h
===================================================================
--- trunk/include/utilstr.h	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/include/utilstr.h	2009-03-29 17:11:27 UTC (rev 2295)
@@ -23,6 +23,7 @@
 #define UTILSTR_H
 
 #include <defs.h>
+#include <sysdata.h>
 
 SWORD_NAMESPACE_START
 
@@ -42,5 +43,18 @@
 extern const unsigned char SW_toupper_array[256];
 #define SW_toupper(c) SW_toupper_array[(unsigned char)c]
 
+/******************************************************************************
+ * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
+ * 					and increments buf to start of next codepoint
+ *
+ * ENT:	buf - address of a utf8 buffer
+ *
+ * RET:	buf - incremented past last byte used in computing the current codepoint
+ * 		unicode codepoint value (0 with buf incremented indicates an invalid UTF8 byte)
+ */
+
+__u32 getUniCharFromUTF8(const unsigned char **buf);
+
+
 SWORD_NAMESPACE_END
 #endif

Modified: trunk/src/modules/filters/osishtmlhref.cpp
===================================================================
--- trunk/src/modules/filters/osishtmlhref.cpp	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/src/modules/filters/osishtmlhref.cpp	2009-03-29 17:11:27 UTC (rev 2295)
@@ -432,6 +432,16 @@
 				if (lastText.size()) {
 					toupperstr(lastText);
 					scratch.setFormatted("%c<font size=\"-1\">%s</font>", lastText[0], lastText.c_str()+1);
+
+					const unsigned char *tmpBuf = (const unsigned char *)lastText.c_str();
+					getUniCharFromUTF8(&tmpBuf);
+					int char_length = (tmpBuf - (const unsigned char *)lastText.c_str());
+					scratch.setFormatted("%.*s<font size=\"-1\">%s</font>", 
+						char_length, 
+						lastText.c_str(),
+						lastText.c_str() + char_length
+					);
+					
 					outText(scratch.c_str(), buf, u);
 				}               
 			} 

Modified: trunk/src/modules/filters/utf8utf16.cpp
===================================================================
--- trunk/src/modules/filters/utf8utf16.cpp	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/src/modules/filters/utf8utf16.cpp	2009-03-29 17:11:27 UTC (rev 2295)
@@ -23,7 +23,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include <sysdata.h>
 #include <utf8utf16.h>
+#include <utilstr.h>
 #include <swbuf.h>
 
 SWORD_NAMESPACE_START
@@ -31,61 +33,36 @@
 UTF8UTF16::UTF8UTF16() {
 }
 
+
 char UTF8UTF16::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
 	const unsigned char *from;
-	unsigned long ch;
-        signed short utf16;
-	unsigned char from2[7];
-
 	SWBuf orig = text;
 
 	from = (const unsigned char *)orig.c_str();
 
 	// -------------------------------
-	for (text = ""; *from; from++) {
-		ch = 0;
-                //case: ANSI
-		if ((*from & 128) != 128) {
+	text = "";
+	while (*from) {
+
+		__u32 ch = getUniCharFromUTF8(&from);
+
+		if (!ch) continue;	// invalid char
+
+		if (ch < 0x10000) {
 			text.setSize(text.size()+2);
-			*((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)*from;
-			continue;
+			*((__u16 *)(text.getRawData()+(text.size()-2))) = (__u16)ch;
 		}
-                //case: Invalid UTF-8 (illegal continuing byte in initial position)
-		if ((*from & 128) && ((*from & 64) != 64)) {
-			continue;
+		else {
+			__u16 utf16;
+			utf16 = (__s16)((ch - 0x10000) / 0x400 + 0xD800);
+			text.setSize(text.size()+4);
+			*((__u16 *)(text.getRawData()+(text.size()-4))) = utf16;
+			utf16 = (__s16)((ch - 0x10000) % 0x400 + 0xDC00);
+			*((__u16 *)(text.getRawData()+(text.size()-2))) = utf16;
 		}
-                //case: 2+ byte codepoint
-		from2[0] = *from;
-		from2[0] <<= 1;
-		int subsequent;
-		for (subsequent = 1; (from2[0] & 128) && (subsequent < 7); subsequent++) {
-			from2[0] <<= 1;
-			from2[subsequent] = from[subsequent];
-			from2[subsequent] &= 63;
-			ch <<= 6;
-			ch |= from2[subsequent];
-		}
-		subsequent--;
-		from2[0] <<= 1;
-		char significantFirstBits = 8 - (2+subsequent);
-		
-		ch |= (((short)from2[0]) << (((6*subsequent)+significantFirstBits)-8));
-		from += subsequent;
-			if (ch < 0x10000) {
-				text.setSize(text.size()+2);
-				*((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)ch;
-			 }
-			else {
-				utf16 = (signed short)((ch - 0x10000) / 0x400 + 0xD800);
-				text.setSize(text.size()+2);
-				*((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)utf16;
-				utf16 = (signed short)((ch - 0x10000) % 0x400 + 0xDC00);
-				text.setSize(text.size()+2);
-				*((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)utf16;
-			}
 	}
 	text.setSize(text.size()+2);
-	*((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)0;
+	*((__u16 *)(text.getRawData()+(text.size()-2))) = (__u16)0;
 	   
 	return 0;
 

Modified: trunk/src/utilfuns/utilstr.cpp
===================================================================
--- trunk/src/utilfuns/utilstr.cpp	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/src/utilfuns/utilstr.cpp	2009-03-29 17:11:27 UTC (rev 2295)
@@ -19,20 +19,9 @@
 #include <ctype.h>
 #include <string.h>
 
-#include <localemgr.h>
+#include <sysdata.h>
 
 
-#ifdef _ICU_
-#include <unicode/utypes.h>
-#include <unicode/ucnv.h>
-#include <unicode/ustring.h>
-#include <unicode/uchar.h>
-
-#include <unicode/unistr.h>
-#include <unicode/translit.h>
-
-#endif
-
 SWORD_NAMESPACE_START
 
 const unsigned char SW_toupper_array[256] =
@@ -192,68 +181,58 @@
 #endif
 }
 
+
 /******************************************************************************
- * toupperstr - converts a string to uppercase string
+ * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
+ * 					and increments buf to start of next codepoint
  *
- * ENT:	target - string to convert
+ * ENT:	buf - address of a utf8 buffer
  *
- * RET:	target
+ * RET:	buf - incremented past last byte used in computing the current codepoint
+ * 		unicode codepoint value (0 with buf incremented indicates an invalid UTF8 byte)
  */
 
-// char *toupperstr(char *buf) {
-// 	char *ret = buf;
-// 
-// 	/*if (StringHelper::getSystemStringHelper()) {
-// 		StringHelper::getSystemStringHelper()->upperStringLatin1( ret );
-// 	}
-// 	else*/ {
-// 		while (*buf) {
-// 			*buf++ = SW_toupper(*buf);
-// 		}
-// // 	}
-// 	return ret;
-// }
+__u32 getUniCharFromUTF8(const unsigned char **buf) {
+	__u32 ch = 0;
+	unsigned char multibuf[7];
 
+	//case: We're at the end
+	if (!(**buf)) {
+		return ch;
+	}
 
-/******************************************************************************
- * toupperstr - converts a string to uppercase string
- *
- * ENT:	target - string to convert
- *
- * RET:	target
- */
+	//case: ASCII (single byte, high bit clear)
+	if (!(**buf & 128)) {
+		ch = **buf;
+		(*buf)++;
+		return ch;
+	}
 
-// char *toupperstr_utf8(char *buf, unsigned int max) {
-// 	char *ret = buf;
-// 
-// /*	if (StringHelper::getSystemStringHelper()) {
-// 		StringHelper::getSystemStringHelper()->upperStringUtf8( ret );
-// 		return ret;
-// 	}*/
-// 	
-// #ifndef _ICU_
-// 	// try to decide if it's worth trying to toupper.  Do we have more
-// 	// characters that are probably lower latin than not?
-// 	long performOp = 0;
-// 	for (const char *ch = buf; *ch; ch++)
-// 		performOp += (*ch > 0) ? 1 : -1;
-// 
-// 	if (performOp > 0) {
-// 		while (*buf)
-// 			*buf = SW_toupper(*buf++);
-// 	}
-// #else
-// 	if (!max)
-// 		max = strlen(ret);
-// 	UErrorCode err = U_ZERO_ERROR;
-// 	UConverter *conv = ucnv_open("UTF-8", &err);
-// 	UnicodeString str(buf, -1, conv, err);
-// 	UnicodeString ustr = str.toUpper();
-// 	ustr.extract(ret, max, conv, err);
-// 	ucnv_close(conv);
-// #endif
-// 
-// 	return ret;
-// }
+	//case: Invalid UTF-8 (illegal continuation byte in initial position)
+	if ((**buf & 128) && (!(**buf & 64))) {
+		(*buf)++;
+		return ch;
+	}
 
+	//case: 2+ byte codepoint
+	multibuf[0] = **buf;
+	multibuf[0] <<= 1;
+	int subsequent;
+	for (subsequent = 1; (multibuf[0] & 128) && (subsequent < 7); subsequent++) {
+		multibuf[0] <<= 1;
+		multibuf[subsequent] = (*buf)[subsequent];
+		multibuf[subsequent] &= 63;
+		ch <<= 6;
+		ch |= multibuf[subsequent];
+	}
+	subsequent--;
+	multibuf[0] <<= 1;
+	char significantFirstBits = 8 - (2+subsequent);
+	
+	ch |= (((__s16)multibuf[0]) << (((6*subsequent)+significantFirstBits)-8));
+	*buf += (subsequent+1);
+	return ch;
+}
+
+
 SWORD_NAMESPACE_END

Modified: trunk/tests/filtertest.cpp
===================================================================
--- trunk/tests/filtertest.cpp	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/tests/filtertest.cpp	2009-03-29 17:11:27 UTC (rev 2295)
@@ -17,7 +17,9 @@
 
 #include <iostream>
 #include <swbuf.h>
+#include <filemgr.h>
 #include <papyriplain.h>
+#include <utf8utf16.h>
 //#include <swmgr.h>
 #ifndef NO_SWORD_NAMESPACE
 using namespace sword;
@@ -26,15 +28,42 @@
 
 
 int main(int argc, char **argv) {
-//	SWMgr mgr;
-//	SWModule *module = mgr.getModule("KJV");
-	PapyriPlain filter;
-	SWBuf buf;
-	buf = "This is t<e>xt which has papy-\nri markings in it.\n  L[et's be] sure it gets--\n cleaned up well for s(earching)";
-	std::cout << "Original:\n\n" << buf << "\n\n-------\n\n";
-	filter.processText(buf);
-//	filter.processText(buf, module->getKey(), module);
-	std::cout << buf << "\n\n+++++++\n";
+	UTF8UTF16 filter;
+//	PapyriPlain filter;
+//
+	FileDesc *fd = (argc > 1) ? FileMgr::getSystemFileMgr()->open(argv[1], FileMgr::RDONLY) : 0;
 
+	SWBuf lineBuffer = "This is t<e>xt which has papy-\nri markings in it.\n  L[et's be] sure it gets--\n cleaned up well for s(earching)";
+
+	std::cout << "Original:\n\n";
+
+	while (!fd || FileMgr::getLine(fd, lineBuffer)) {
+		cout << lineBuffer << "\n";
+		if (!fd) break;
+	}
+
+ 	cout << "\n\n-------\n\n";
+
+	if (fd) {
+		FileMgr::getSystemFileMgr()->close(fd);
+		fd = FileMgr::getSystemFileMgr()->open(argv[1], FileMgr::RDONLY);
+	}
+
+	while (!fd || FileMgr::getLine(fd, lineBuffer)) {
+		filter.processText(lineBuffer);
+		for (unsigned int i = 0; i < lineBuffer.size(); i++) {
+			printf("%c", lineBuffer[i]);
+		}
+		cout << "\n";
+		if (!fd) break;
+	}
+
+	std::cout << "\n\n+++++++\n";
+
+	if (fd) {
+		FileMgr::getSystemFileMgr()->close(fd);
+	}
+
 	return 0;
 }
+

Modified: trunk/utilities/imp2vs.cpp
===================================================================
--- trunk/utilities/imp2vs.cpp	2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/utilities/imp2vs.cpp	2009-03-29 17:11:27 UTC (rev 2295)
@@ -131,6 +131,8 @@
 	}
 	writeEntry(currentKey, currentEntry, module);
 
+	FileMgr::getSystemFileMgr()->close(fd);
+
 	delete vkey;
 
 	return 0;