[sword-svn] r3084 - trunk/utilities

chrislit at crosswire.org chrislit at crosswire.org
Sat Mar 8 00:16:58 MST 2014


Author: chrislit
Date: 2014-03-08 00:16:58 -0700 (Sat, 08 Mar 2014)
New Revision: 3084

Modified:
   trunk/utilities/osis2mod.cpp
Log:
first cut of output encoding conversion (to UTF-16 or SCSU)


Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp	2014-03-06 08:13:10 UTC (rev 3083)
+++ trunk/utilities/osis2mod.cpp	2014-03-08 07:16:58 UTC (rev 3084)
@@ -59,7 +59,9 @@
 #ifdef _ICU_
 #include <utf8nfc.h>
 #include <latin1utf8.h>
+#include <utf8scsu.h>
 #endif
+#include <utf8utf16.h>
 
 #ifndef NO_SWORD_NAMESPACE
 using namespace sword;
@@ -90,6 +92,8 @@
 UTF8NFC    normalizer;
 Latin1UTF8 converter;
 #endif
+SWFilter*  outputConverter;     
+
 int normalized = 0;
 int converted  = 0;
 
@@ -121,7 +125,8 @@
  * U-00000000 - U-0000007F  0nnnnnnn
  * U-00000080 - U-000007FF  110nnnnn  10nnnnnn
  * U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
- * U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-00010000 - U-0010FFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ *
  * Note:
  *   1.  The latest UTF-8 RFC allows for a max of 4 bytes.
  *       Earlier allowed 6.
@@ -533,9 +538,17 @@
 			}
 		}
 
+		// If the desired output encoding is non-UTF-8, convert to that encoding
+		if (outputConverter) {
+			outputConverter->processText(activeVerseText, (SWKey *)2);  // HACK: 2 is a dummy non-null value passed in place of a real key. TODO: remove all hacks
+		}
+
 		// If the entry already exists, then append this entry to the text.
 		// This is for verses that are outside the chosen versification. They are appended to the prior verse.
 		// The space should not be needed if we retained verse tags.
+		// TODO: in the case of SCSU output, very slightly better compression might be
+		// achieved by decoding the currentText & activeVerseText, concatenating them,
+		// and re-encoding them as SCSU
 		SWBuf currentText = module->getRawEntry();
 		if (currentText.length()) {
 			cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
@@ -968,7 +981,7 @@
 
 			if (tokenName != topToken.getName()) {
 				cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
-//				exit(EXIT_BAD_NESTING);	// (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here.
+//				exit(EXIT_BAD_NESTING);	// (OSK) I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here.
 						// So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
 						// (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
 			}
@@ -1302,8 +1315,10 @@
 	fprintf(stderr, "  -c <cipher_key>\t encipher module using supplied key\n");
 	fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
 
-#ifdef _ICU_       
-	fprintf(stderr, "  -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n");
+#ifdef _ICU_
+	fprintf(stderr, "  -e <1|2|s>\t\t convert Unicode encoding (default: 1)\n");
+	fprintf(stderr, "\t\t\t\t 1 - UTF-8 ; 2 - UTF-16 ; s - SCSU\n");
+	fprintf(stderr, "  -N\t\t\t do not normalize to NFC\n");
 	if (verboseHelp) {
 		fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
 		fprintf(stderr, "\t\t\t\t  and then normalize to NFC)\n");
@@ -1594,11 +1609,11 @@
 			if (entrySize) usage(*argv, "Cannot specify both -z and -s");
 			compType = "ZIP";
 			if (i+1 < argc && argv[i+1][0] != '-') {
-				switch (argv[i+1][0]) {
-				case 'l': compType = "LZSS";
-				case 'z': compType = "ZIP";
-				case 'b': compType = "BZIP2";
-				case 'x': compType = "XZ";
+				switch (argv[++i][0]) {
+				case 'l': compType = "LZSS"; break;
+				case 'z': compType = "ZIP"; break;
+				case 'b': compType = "BZIP2"; break;
+				case 'x': compType = "XZ"; break;
 				}
 			}
 		}
@@ -1617,6 +1632,18 @@
 		else if (!strcmp(argv[i], "-N")) {
 			normalize = false;
 		}
+		else if (!strcmp(argv[i], "-e")) {
+			if (i+1 < argc) {
+				switch (argv[++i][0]) {
+				case '1': outputConverter = NULL; break; // leave as UTF-8
+				case '2': outputConverter = new UTF8UTF16(); break;
+#ifdef _ICU_
+				case 's': outputConverter = new UTF8SCSU(); break;
+#endif
+				default: outputConverter = NULL;
+				}
+			} 
+		}
 		else if (!strcmp(argv[i], "-c")) {
 			if (i+1 < argc) cipherKey = argv[++i];
 			else usage(*argv, "-c requires <cipher_key>");
@@ -1787,6 +1814,8 @@
 	delete module;
 	if (cipherFilter)
 		delete cipherFilter;
+	if (outputConverter)
+		delete outputConverter;
 
 	fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
 	exit(0); // success




More information about the sword-cvs mailing list