[sword-svn] r3083 - in trunk: include src/mgr src/modules/filters

chrislit at crosswire.org chrislit at crosswire.org
Thu Mar 6 01:13:10 MST 2014


Author: chrislit
Date: 2014-03-06 01:13:10 -0700 (Thu, 06 Mar 2014)
New Revision: 3083

Modified:
   trunk/include/scsuutf8.h
   trunk/src/mgr/encfiltmgr.cpp
   trunk/src/modules/filters/scsuutf8.cpp
Log:
Fixed compilation when configured with --without-icu (i.e. when _ICU_ is not defined).
First cut of an update to R. Czyborra's SCSU decoder code.


Modified: trunk/include/scsuutf8.h
===================================================================
--- trunk/include/scsuutf8.h	2014-03-05 21:49:39 UTC (rev 3082)
+++ trunk/include/scsuutf8.h	2014-03-06 08:13:10 UTC (rev 3083)
@@ -43,8 +43,15 @@
 	UErrorCode err;
 #else
 	// without ICU, we'll attempt to use Roman Czyborra's SCSU decoder code
+	unsigned char active;
+	bool mode;
 	unsigned long c, d;
-	unsigned char* UTF8Output(unsigned long, unsigned char* text);
+
+	static unsigned short start[8];
+	static unsigned short slide[8];
+	static unsigned short win[256];
+
+	int UTF8Output(unsigned long, SWBuf* utf8Buf);
 #endif
   
 public:

Modified: trunk/src/mgr/encfiltmgr.cpp
===================================================================
--- trunk/src/mgr/encfiltmgr.cpp	2014-03-05 21:49:39 UTC (rev 3082)
+++ trunk/src/mgr/encfiltmgr.cpp	2014-03-06 08:13:10 UTC (rev 3083)
@@ -33,7 +33,10 @@
 #include <utf8latin1.h>
 #include <utf8utf16.h>
 #include <utf8html.h>
+
+#ifdef _ICU_
 #include <utf8scsu.h>
+#endif 
 
 #include <swmodule.h>
 
@@ -63,7 +66,9 @@
 		case ENC_UTF16:  targetenc = new UTF8UTF16();  break;
 		case ENC_RTF:    targetenc = new UnicodeRTF(); break;
 		case ENC_HTML:   targetenc = new UTF8HTML();   break;
+#ifdef _ICU_
 		case ENC_SCSU:   targetenc = new UTF8SCSU();   break;
+#endif
 		default: // i.e. case ENC_UTF8
 			targetenc = NULL;
 	}
@@ -121,7 +126,9 @@
 			case ENC_UTF16:  targetenc = new UTF8UTF16();  break;
 			case ENC_RTF:    targetenc = new UnicodeRTF(); break;
 			case ENC_HTML:   targetenc = new UTF8HTML();   break;
+#ifdef _ICU_
 			case ENC_SCSU:   targetenc = new UTF8SCSU();   break;
+#endif
 			default: // i.e. case ENC_UTF8
 				targetenc = NULL;
 		}

Modified: trunk/src/modules/filters/scsuutf8.cpp
===================================================================
--- trunk/src/modules/filters/scsuutf8.cpp	2014-03-05 21:49:39 UTC (rev 3082)
+++ trunk/src/modules/filters/scsuutf8.cpp	2014-03-06 08:13:10 UTC (rev 3083)
@@ -23,10 +23,7 @@
 
 /* This class is based on:
  * http://czyborra.com/scsu/scsu.c written by Roman Czyborra at dds.nl
- * on Andrea's balcony in North Amsterdam on 1998-08-04
- * Thanks to Richard Verhoeven <rcb5 at win.tue.nl> for his suggestion
- * to correct the haphazard "if" after UQU to "else if" on 1998-10-01
- * 
+ *
  * This is a deflator to UTF-8 output for input compressed in SCSU,
  * the (Reuters) Standard Compression Scheme for Unicode as described
  * in http://www.unicode.org/unicode/reports/tr6.html
@@ -35,6 +32,7 @@
 #include <scsuutf8.h>
 #include <swbuf.h>
 
+
 SWORD_NAMESPACE_START
 
 
@@ -43,59 +41,100 @@
 	// initialize SCSU converter
 	scsuConv = ucnv_open("SCSU", &err);
 	// initialize UTF-8 converter
-	utf8Conv = ucnv_open("UTF-8", &err);
+	utf8Conv = ucnv_open("UTF-8", &err);	
+#else
+	active = 0;
+	mode = 0;
 #endif
 }
 
 SCSUUTF8::~SCSUUTF8() {
 #ifdef _ICU_
-         ucnv_close(scsuConv);
-         ucnv_close(utf8Conv);
+	ucnv_close(scsuConv);
+	ucnv_close(utf8Conv);
 #endif
 }
 
-
 #ifndef _ICU_
-unsigned char* SCSUUTF8::UTF8Output(unsigned long uchar, unsigned char* text)
-{
-	/* join UTF-16 surrogates without any pairing sanity checks */
+unsigned short SCSUUTF8::start[] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
+unsigned short SCSUUTF8::slide[] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
+unsigned short SCSUUTF8::win[] = {
+	0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
+	0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
+	0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
+	0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
+	0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
+	0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
+	0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
+	0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
+	0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
+	0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
+	0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
+	0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
+	0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
+	0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
+	0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
+	0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
+	0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
+	0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
+	0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
+	0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
+	0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60,
+};
 
-	static int d;
-  
-	if (uchar >= 0xd800 && uchar <= 0xdbff) { d = uchar & 0x3f; return text;  }
-	if (uchar >= 0xdc00 && uchar <= 0xdfff) { uchar = uchar + 0x2400 + d * 0x400; }
-  
-	/* output one character as UTF-8 multibyte sequence */
-  
+int SCSUUTF8::UTF8Output(unsigned long uchar, SWBuf* utf8Buf)
+{
+	// join UTF-16 surrogates without any pairing sanity checks
+	if (uchar >= 0xd800 && uchar <= 0xdbff) {
+		d = uchar & 0x3ff;
+		return 0;
+	}
+	if (uchar >= 0xdc00 && uchar <= 0xdfff) {
+		uchar = uchar + 0x2400 + d * 0x400;
+	}
+	
+	// output one character as UTF-8 multibyte sequence
+	
 	if (uchar < 0x80) {
-		*text++ = c;
+		utf8Buf += uchar;
 	}
-	else if (uchar < 0x800) { 
-		*text++ = 0xc0 | uchar >> 6; 
-		*text++ = 0x80 | (uchar & 0x3f);
+	else if (uchar < 0x800) {
+		utf8Buf += (0xc0 | (uchar>>6));
+		utf8Buf += (0x80 | (uchar & 0x3f));
 	}
 	else if (uchar < 0x10000) {
-		*text++ = 0xe0 | uchar >> 12; 
-		*text++ = 0x80 | (uchar >> 6 & 0x3f);
-		*text++ = 0x80 | (uchar & 0x3f);
+		utf8Buf += (0xe0 | (uchar>>12));
+		utf8Buf += (0x80 | (uchar>>6 & 0x3f));
+		utf8Buf += (0x80 | (uchar & 0x3f));
 	}
 	else if (uchar < 0x200000) {
-		*text++ = 0xf0 | uchar >> 18;
-		*text++ = 0x80 | (uchar >> 12 & 0x3f);
-		*text++ = 0x80 | (uchar >> 6 & 0x3f);
-		*text++ = 0x80 | (uchar & 0x3f);
-	}  
+		utf8Buf += (0xf0 | (uchar>>18));
+		utf8Buf += (0x80 | (uchar>>12 & 0x3f));
+		utf8Buf += (0x80 | (uchar>>6 & 0x3f));
+		utf8Buf += (0x80 | (uchar & 0x3f));
+	}
 	
-	return text;
+	return 0;
 }
 #endif
 
 char SCSUUTF8::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
-#ifdef _ICU_
-
 	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
 		return -1;
-
+	
+#ifdef _ICU_
+	// Try decoding with ICU if possible
 	err = U_ZERO_ERROR;
 	UnicodeString utf16Text(text.getRawData(), text.length(), scsuConv, err);
 	err = U_ZERO_ERROR;
@@ -104,86 +143,36 @@
 		text.setSize(len+1);
 		int32_t len = utf16Text.extract(text.getRawData(), text.size(), scsuConv, err);
 	}
-
 #else
+	// If ICU is unavailable, decode using Czyborra's decoder
+	SWBuf utf8Buf = "";
+	int len = text.length();
+	const char* scsuString = text.c_str();
 
-	unsigned char *to, *from;
-	unsigned long buflen = len * FILTERPAD;
-	char active = 0, mode = 0;
-	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
-		return -1;
-	
-	static unsigned short start[8] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
-	static unsigned short slide[8] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
-	static unsigned short win[256]   = {
-		0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
-		0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
-		0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
-		0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
-		0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
-		0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
-		0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
-		0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
-		0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
-		0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
-		0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
-		0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
-		0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
-		0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
-		0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
-		0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
-		0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
-		0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
-		0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
-		0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
-		0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-		0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
-	};
-	
-	if (!len)
-		return 0;
-	
-	memmove(&text[buflen - len], text, len);
-	from = (unsigned char*)&text[buflen - len];
-	to = (unsigned char *)text;
-	
-	// -------------------------------
-	
 	for (int i = 0; i < len;) {
 		
-		
 		if (i >= len) break;
-		c = from[i++];
+		c = scsuString[i++];
 		
 		if (c >= 0x80)
 		{
-			to = UTF8Output (c - 0x80 + slide[active], to);
+			UTF8Output(c - 0x80 + slide[active], &utf8Buf);
 		}
 		else if (c >= 0x20 && c <= 0x7F)
 		{
-			to = UTF8Output (c, to);
+			UTF8Output(c, &utf8Buf);
 		}
 		else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
 		{
-			to = UTF8Output (c, to);
+			UTF8Output(c, &utf8Buf);
 		}
 		else if (c >= 0x1 && c <= 0x8) // SQn
 		{
 			if (i >= len) break;
-			d = from[i++]; // single quote
+			d = scsuString[i++]; // single quote
 			
-			to = UTF8Output (d < 0x80 ? d + start [c - 0x1] :
-					 d - 0x80 + slide [c - 0x1], to);
+			UTF8Output(d < 0x80 ? d + start[c - 0x1] :
+				    d - 0x80 + slide[c - 0x1], &utf8Buf);
 		}
 		else if (c >= 0x10 && c <= 0x17) // SCn
 		{
@@ -193,25 +182,25 @@
 		{
 			active = c - 0x18;  // define window
 			if (i >= len) break;
-			slide [active] = win [from[i++]];
+			slide[active] = win[scsuString[i++]];
 		}
 		else if (c == 0xB) // SDX
 		{
 			if (i >= len) break;
-			c = from[i++];
+			c = scsuString[i++];
 			
 			if (i >= len) break;
-			d = from[i++];
+			d = scsuString[i++];
 			
-			slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
+			slide[active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
 		}
 		else if (c == 0xE) // SQU
 		{
 			if (i >= len) break;
-			c = from[i++]; // SQU
+			c = scsuString[i++]; // SQU
 			
 			if (i >= len) break;
-			to = UTF8Output (c << 8 | from[i++], to);
+			UTF8Output(c << 8 | scsuString[i++], &utf8Buf);
 		}
 		else if (c == 0xF) // SCU
 		{
@@ -220,50 +209,47 @@
 			while (mode)
 			{
 				if (i >= len) break;
-				c = from[i++];
+				c = scsuString[i++];
 				
 				if (c <= 0xDF || c >= 0xF3)
 				{
 					if (i >= len) break;
-					to = UTF8Output (c << 8 | from[i++], to);
+					UTF8Output(c << 8 | scsuString[i++], &utf8Buf);
 				}
 				else if (c == 0xF0) // UQU
 				{
 					if (i >= len) break;
-					c = from[i++];
+					c = scsuString[i++];
 					
 					if (i >= len) break;
-					to = UTF8Output (c << 8 | from[i++], to);
+					UTF8Output(c << 8 | scsuString[i++], &utf8Buf);
 				}
 				else if (c >= 0xE0 && c <= 0xE7) // UCn
 				{
-					active = c - 0xE0; mode = 0;
+					active = c - 0xE0;
+					mode = 0;
 				}
 				else if (c >= 0xE8 && c <= 0xEF) // UDn
 				{
 					if (i >= len) break;
-					slide [active=c-0xE8] = win [from[i++]]; mode = 0;
+					slide[active=c-0xE8] = win[scsuString[i++]];
+					mode = 0;
 				}
 				else if (c == 0xF1) // UDX
 				{
 					if (i >= len) break;
-					c = from[i++];
+					c = scsuString[i++];
 					
 					if (i >= len) break;
-					d = from[i++];
+					d = scsuString[i++];
 					
-					slide [active = c>>5] =
-						0x10000 + (((c & 0x1F) << 8 | d) << 7); mode = 0;
+					slide[active = c>>5] =
+						0x10000 + (((c & 0x1F) << 8 | d) << 7);
+					mode = 0;
 				}
 			}
 		}
-		
-		
 	}
-	
-	*to++ = 0;
-	*to = 0;
-
 #endif
 	
 	return 0;




More information about the sword-cvs mailing list