[sword-svn] r2399 - trunk/utilities

Sun May 10 18:21:41 MST 2009

Author: dmsmith
Date: 2009-05-10 18:21:41 -0700 (Sun, 10 May 2009)
New Revision: 2399

Modified:
   trunk/utilities/osis2mod.cpp
Log:
Osis2mod improvements:
1) changed -4 to -s <2|4>, matching tei2mod. -4 was added recently, so the change had not been released.
2) API-113 resolved by adding the ability to read from standard input.


Modified: trunk/utilities/osis2mod.cpp
===================================================================

--- trunk/utilities/osis2mod.cpp	2009-05-09 23:03:59 UTC (rev 2398)
+++ trunk/utilities/osis2mod.cpp	2009-05-11 01:21:41 UTC (rev 2399)
@@ -80,6 +80,7 @@
 
 SWText *module = 0;
 VerseKey currentVerse;
+SWBuf v11n     = "KJV";
 char activeOsisID[255];
 char currentOsisID[255];
 
@@ -94,8 +95,8 @@
 
 bool isOSISAbbrev(const char *buf) {
 	VerseMgr *vmgr = VerseMgr::getSystemVerseMgr();
-	const VerseMgr::System *v11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem());
-	return v11n->getBookNumberByOSISName(buf) >= 0;
+	const VerseMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem());	// renamed from v11n, which is now also the name of a file-scope SWBuf
+	return av11n->getBookNumberByOSISName(buf) >= 0;	// true when the lookup of buf yields a non-negative book number
 }
 
 /**
@@ -1258,6 +1259,8 @@
 	if (error) fprintf(stderr, "\n%s: %s\n", app, error);
 
 	fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app);
+	fprintf(stderr, "  <output/path>\t\t an existing folder that the module will be written\n");
+	fprintf(stderr, "  <osisDoc>\t\t path to the validated OSIS document, or '-' to read from standard input\n");
 	fprintf(stderr, "  -a\t\t\t augment module if exists (default is to create new)\n");
 	fprintf(stderr, "  -z\t\t\t use ZIP compression (default no compression)\n");
 	fprintf(stderr, "  -Z\t\t\t use LZSS compression (default no compression)\n");
@@ -1269,7 +1272,7 @@
 	fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
 	fprintf(stderr, "\t\t\t\t  and then normalize to NFC)\n");
 	fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n");
-	fprintf(stderr, "  -4\t\t\t use 4 byte size entries (default is 2).\n");
+	fprintf(stderr, "  -s <2|4>\t\t max text size per entry (default is 2).\n");
 	fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large entries\n");
 	fprintf(stderr, "\t\t\t\t       in uncompressed modules (default is 65535 bytes)\n");
 	fprintf(stderr, "  -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n");
@@ -1302,6 +1305,93 @@
 	exit(EXIT_BAD_ARG);
 }
 
+/**
+ * Reads an OSIS document from infile one character at a time. Text between
+ * '<' and '>' is collected as a tag and passed through transformBSP() and
+ * handleToken(); other text has whitespace runs merged to one blank and a
+ * bare '>' escaped. Finally forces out the last entry and writes the links.
+ * Works on the file-scope module, currentVerse and v11n.
+ * @param infile stream to read the OSIS document from
+ */
+void processOSIS(istream& infile) {
+	activeOsisID[0] = '\0';
+	strcpy(currentOsisID,"N/A");
+	currentVerse.setVersificationSystem(v11n);
+	currentVerse.AutoNormalize(0);
+	currentVerse.Headings(1);	// turn on mod/testmnt/book/chap headings
+	currentVerse.Persist(1);
+	module->setKey(currentVerse);
+	module->setPosition(TOP);
+
+	SWBuf token;
+	SWBuf text;
+	bool intoken = false;
+	bool inWhitespace = false;
+	bool seeingSpace = false;
+	char curChar = '\0';
+
+	// Compare get()'s int result with eof() before narrowing it to char: a char
+	// tested against -1 drops byte 0xFF (signed char) or never matches (unsigned).
+	istream::int_type nextChar;
+	while ((nextChar = infile.get()) != istream::traits_type::eof()) {
+		curChar = istream::traits_type::to_char_type(nextChar);
+
+		if (!intoken && curChar == '<') {
+			intoken = true;
+			token = "<";
+			continue;
+		}
+
+		// Outside of tokens merge adjacent whitespace
+		if (!intoken) {
+			// the <cctype> tests need a value representable as unsigned char
+			seeingSpace = isspace((unsigned char)curChar);
+			if (seeingSpace) {
+				if (inWhitespace) {
+					continue;
+				}
+				// convert all whitespace to blanks
+				curChar = ' ';
+			}
+			inWhitespace = seeingSpace;
+		}
+
+		if (intoken && curChar == '>') {
+			intoken = false;
+			inWhitespace = false;
+			token.append('>');
+			// take this isalpha if out to check for bugs in text
+			if (isalpha((unsigned char)token[1]) || isalpha((unsigned char)token[2])) {
+				XMLTag t = transformBSP(token.c_str());
+				if (!handleToken(text, t)) {
+					text.append(t);
+				}
+			}
+			continue;
+		}
+
+		if (intoken) {
+			token.append(curChar);
+		}
+		else {
+			switch (curChar) {
+				case '>' : text.append("&gt;"); break;
+				case '<' : text.append("&lt;"); break;
+				default  : text.append(curChar); break;
+			}
+		}
+	}
+
+	// Force the last entry from the text buffer.
+	text = "";
+	writeEntry(text, true);
+	writeLinks();
+#ifdef _ICU_
+	if (converted)  fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
+	if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
+#endif
+}
+
 int main(int argc, char **argv) {
 
 	fprintf(stderr, "You are running osis2mod: $Rev$\n");
@@ -1312,16 +1402,15 @@
 	}
 
 	// variables for arguments, holding defaults
-	const char* program = argv[0];
-	const char* path    = argv[1];
-	const char* osisDoc = argv[2];
-	int append          = 0;
-	int compType        = 0;
-	int iType           = 4;
-	int largeEntry      = 0;
-	SWBuf cipherKey     = "";
-	SWBuf v11n          = "KJV";
-
+	const char* program    = argv[0];
+	const char* path       = argv[1];
+	const char* osisDoc    = argv[2];
+	int append             = 0;
+	SWBuf compType         = "";
+	bool isCommentary      = false;
+	int iType              = 4;
+	int entrySize          = 0;
+	SWBuf cipherKey        = "";
 	SWCompress *compressor = 0;
 
 	for (int i = 3; i < argc; i++) {
@@ -1329,14 +1418,14 @@
 			append = 1;
 		}
 		else if (!strcmp(argv[i], "-z")) {
-			if (compType) usage(*argv, "Cannot specify both -z and -Z");
-			if (largeEntry) usage(*argv, "Cannot specify both -z and -4");
-			compType = 2;
+			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+			if (entrySize) usage(*argv, "Cannot specify both -z and -s");
+			compType = "ZIP";
 		}
 		else if (!strcmp(argv[i], "-Z")) {
-			if (compType) usage(*argv, "Cannot specify both -z and -Z");
-			if (largeEntry) usage(*argv, "Cannot specify both -Z and -4");
-			compType = 1;
+			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+			if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
+			compType = "LZSS";
 		}
 		else if (!strcmp(argv[i], "-b")) {
 			if (i+1 < argc) {
@@ -1356,10 +1445,19 @@
 			if (i+1 < argc) v11n = argv[++i];
 			else usage(*argv, "-v requires <v11n>");
 		}
-		else if (!strcmp(argv[i], "-4")) {
-			if (compType) usage(*argv, "Cannot specify -4 and -z or -Z");
-			largeEntry = 1;
+		else if (!strcmp(argv[i], "-s")) {
+			if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z");
+                        if (i+1 < argc) {
+                                entrySize = atoi(argv[++i]);
+                                if (entrySize == 2 || entrySize == 4) {
+                                        continue;
+                                }
+                        }
+                        usage(*argv, "-s requires one of <2|4>");
 		}
+		else if (!strcmp(argv[i], "-C")) {
+			isCommentary = true;
+		}
 #ifdef DEBUG
 		else if (!strcmp(argv[i], "-d")) {
 			if (i+1 < argc) debug |= atoi(argv[++i]);
@@ -1369,11 +1467,12 @@
 		else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
 	}
 
-	switch (compType) {	// these are deleted by zText
-		case 0: break;
-		case 1: compressor = new LZSSCompress(); break;
-		case 2: compressor = new ZipCompress(); break;
-	}
+        if (compType == "ZIP") {	// compressors are deleted by zText
+                compressor = new ZipCompress();
+        }
+        else if (compType == "LZSS") {	// must compare: '=' would assign to compType instead of testing it
+                compressor = new LZSSCompress();
+        }
 
 #ifndef _ICU_
 	if (normalize) {
@@ -1388,7 +1487,6 @@
 	}
 #endif
 
-
 	if (!append) {	// == 0 then create module
 	// Try to initialize a default set of datafiles and indicies at our
 	// datapath location passed to us from the user.
@@ -1398,7 +1496,7 @@
 				exit(EXIT_NO_CREATE);
 			}
 		}
-		else if (largeEntry) {
+		else if (entrySize == 4) {
 			if (RawText4::createModule(path, v11n)) {
 				fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
 				exit(EXIT_NO_CREATE);
@@ -1412,13 +1510,6 @@
 		}
 	}
 
-	// Let's see if we can open our input file
-	ifstream infile(osisDoc);
-	if (infile.fail()) {
-		fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
-		exit(EXIT_NO_READ);
-	}
-
 	// Do some initialization stuff
 	if (compressor) {
 		// Create a compressed text module allowing very large entries
@@ -1437,7 +1528,7 @@
 				v11n		// versification
                        );
 	}
-	else if (largeEntry) {
+	else if (entrySize == 4) {
 		// Create a raw text module allowing very large entries
 		// Taking defaults except for first and last argument
 		module = new RawText4(
@@ -1481,95 +1572,26 @@
 		exit(EXIT_NO_WRITE);
 	}
 
-	activeOsisID[0] = '\0';
-
-	strcpy(currentOsisID,"N/A");
-
-	currentVerse.setVersificationSystem(v11n);
-	currentVerse.AutoNormalize(0);
-	currentVerse.Headings(1);	// turn on mod/testmnt/book/chap headings
-	currentVerse.Persist(1);
-
-	module->setKey(currentVerse);
-	module->setPosition(TOP);
-
-	SWBuf token;
-	SWBuf text;
-	bool intoken = false;
-	bool inWhitespace = false;
-	bool seeingSpace = false;
-	char curChar = '\0';
-
-	while (infile.good()) {
-		
-		curChar = infile.get();
-
-		// skip the character if it is bad. infile.good() will catch the problem
-		if (curChar == -1) {
-			continue;
+	// Either read from std::cin (aka stdin), when the argument is a '-'
+	// or from a specified file.
+	if (!strcmp(osisDoc, "-")) {
+		processOSIS(cin);
+	}
+	else {
+		// Let's see if we can open our input file
+		ifstream infile(osisDoc);
+		if (infile.fail()) {
+			fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
+			exit(EXIT_NO_READ);
 		}
-
-		if (!intoken && curChar == '<') {
-			intoken = true;
-			token = "<";
-			continue;
-		}
-
-		// Outside of tokens merge adjacent whitespace
-		if (!intoken) {
-			seeingSpace = isspace(curChar);
-			if (seeingSpace) {
-				if (inWhitespace) {
-					continue;
-				}
-				// convert all whitespace to blanks
-				curChar = ' ';
-			}
-			inWhitespace = seeingSpace;
-		}
-
-		if (intoken && curChar == '>') {
-			intoken = false;
-			inWhitespace = false;
-			token.append('>');
-			// take this isalpha if out to check for bugs in text
-			if ((isalpha(token[1])) || (isalpha(token[2]))) {
-				//cout << "Handle:" << token.c_str() << endl;
-				XMLTag t = transformBSP(token.c_str());
-
-				if (!handleToken(text, t)) {
-					text.append(t);
-				}
-			}
-			continue;
-		}
-
-		if (intoken) {
-			token.append(curChar);
-		}
-		else {
-			switch (curChar) {
-				case '>' : text.append("&gt;"); break;
-				case '<' : text.append("&lt;"); break;
-				default  : text.append(curChar); break;
-			}
-		}
+		processOSIS(infile);
+		infile.close();
 	}
 
-	// Force the last entry from the text buffer.
-	text = "";
-	writeEntry(text, true);
-	writeLinks();
-
 	delete module;
 	if (cipherFilter)
 		delete cipherFilter;
-	infile.close();
 
-#ifdef _ICU_
-	if (converted)  fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
-	if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
-#endif
 	exit(0); // success
 }