[sword-svn] r2142 - trunk/utilities

Sat Mar 1 11:53:20 MST 2008

Author: dmsmith
Date: 2008-03-01 11:53:18 -0700 (Sat, 01 Mar 2008)
New Revision: 2142

Added:
   trunk/utilities/tei2mod.cpp
Modified:
   trunk/utilities/Makefile.am
Log:
added tei2mod to build TEI dictionaries

Modified: trunk/utilities/Makefile.am
===================================================================

--- trunk/utilities/Makefile.am	2008-02-29 20:01:00 UTC (rev 2141)
+++ trunk/utilities/Makefile.am	2008-03-01 18:53:18 UTC (rev 2142)
@@ -6,7 +6,7 @@
 stepdump step2vpl gbfidx modwrite addvs emptyvss \
 addgb genbookutil treeidxutil
 
-bin_PROGRAMS = mod2imp mod2osis osis2mod vs2osisref mod2vpl \
+bin_PROGRAMS = mod2imp mod2osis osis2mod tei2mod vs2osisref mod2vpl \
 	mkfastmod vpl2mod imp2vs installmgr xml2gbs imp2gbs
 
 #if INSTALLMGR
@@ -56,6 +56,7 @@
 mod2osis_SOURCES = mod2osis.cpp
 xml2gbs_SOURCES = xml2gbs.cpp
 osis2mod_SOURCES = osis2mod.cpp
+tei2mod_SOURCES = tei2mod.cpp
 vs2osisref_SOURCES = vs2osisref.cpp
 genbookutil_SOURCES = genbookutil.cpp
 treeidxutil_SOURCES = treeidxutil.cpp

Added: trunk/utilities/tei2mod.cpp
===================================================================
--- trunk/utilities/tei2mod.cpp	                        (rev 0)
+++ trunk/utilities/tei2mod.cpp	2008-03-01 18:53:18 UTC (rev 2142)
@@ -0,0 +1,529 @@
+/**
+ * This program handles xml files of the form:
+ * <TEI.2>
+ *   <text>
+ *     <body>
+ *       <entry      key="xxx">...</entry>
+ *       <entryFree  key="yyy">...</entryFree>
+ *       <superentry key="zzz">...</superentry>
+ *     </body>
+ *   </text>
+ * </TEI.2>
+ * The document is assumed to be well-formed and valid.
+ * Three kinds of entries are allowed,
+ *    <entry> - a very restricted form of a dictionary entry.
+ *    <entryFree> - a very unrestricted form of a dictionary entry.
+ *    <superentry> - an entry which can have other entries.
+ * The value of the key attribute is used as the key for the entry in the module.
+ * Note, for a <superentry> only its key becomes a SWORD key.
+ * Keys of entries internal to it are not used.
+ *
+ * The entries must be sorted according to an ASCII collation of their bytes.
+ * This should be the same for Latin-1 and for UTF-8
+ *
+ * Sword will allow for any tags, but only a few have any styling.
+ *
+ * author DM Smith
+ */
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iostream>
+#include <swbuf.h>
+#include <utilxml.h>
+#include <rawld.h>
+#include <rawld4.h>
+#include <zld.h>
+#include <zipcomprs.h>
+#include <lzsscomprs.h>
+#include <stdio.h>
+#include <cipherfil.h>
+
+#ifdef _ICU_
+#include <utf8nfc.h>
+#include <latin1utf8.h>
+#endif
+
+#ifndef NO_SWORD_NAMESPACE
+using namespace sword;
+#endif
+
+using namespace std;
+
+#ifdef _ICU_
+UTF8NFC normalizer;    // filter applied by normalizeInput() to text detected as UTF-8
+int normalized = 0;    // count of entries whose text was changed by the normalizer
+
+Latin1UTF8 converter;  // filter applied by normalizeInput() to text that is not valid UTF-8
+int converted = 0;     // count of entries passed through the converter
+#endif
+
+//#define DEBUG
+
+SWLD  *module       = NULL;  // module being written; allocated in main()
+SWKey *currentKey   = NULL;  // key of the entry being accumulated; created in main()
+bool   normalize    = true;  // false when -N is given or ICU support is not compiled in
+
+/**
+ * Determine whether the string contains a valid unicode sequence.
+ * The following table gives the pattern of a valid UTF-8 character.
+ * Unicode Range               1st       2nd       3rd       4th
+ * U-00000000 - U-0000007F  0nnnnnnn
+ * U-00000080 - U-000007FF  110nnnnn  10nnnnnn
+ * U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
+ * U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * Note:
+ *   1.  The latest UTF-8 RFC allows for a max of 4 bytes.
+ *       Earlier allowed 6.
+ *   2.  The number of bits of the leading byte before the first 0
+ *       is the total number of bytes.
+ *   3.  The "n" are the bits of the unicode codepoint.
+ * This routine does not check to see if the code point is in the range.
+ * It could.
+ *
+ * param  txt the text to check
+ * return   1 if all high order characters form a valid unicode sequence
+ *         -1 if there are no high order characters.
+ *            Note: this is also a valid unicode sequence
+ *          0 if there are high order characters that do not form
+ *            a valid unicode sequence
+ * author DM Smith
+ */
+int detectUTF8(const char *txt) {
+    // Work on unsigned bytes so that the bit tests behave the same
+    // regardless of whether plain char is signed.
+    const unsigned char *cursor = (const unsigned char *) txt;
+    // Set once at least one complete multi-byte character has been seen.
+    bool sawMultiByte = false;
+
+    for (; *cursor; ++cursor) {
+        const unsigned char lead = *cursor;
+
+        // 0nnnnnnn: plain 7-bit character, nothing to verify.
+        if (!(lead & 0x80)) {
+            continue;
+        }
+
+        // The number of leading 1 bits in the first byte is the
+        // total length, in bytes, of the encoded character.
+        int length = 0;
+        for (unsigned char bits = lead; bits & 0x80; bits <<= 1) {
+            ++length;
+        }
+
+        // Reject anything that cannot start a character:
+        //   length 1      : 10nnnnnn is a continuation byte
+        //   length 5 to 8 : 111110nn, 1111110n, 11111110 and 11111111
+        //                   are not legal lead bytes
+        if (length < 2 || length > 4) {
+            return 0;
+        }
+
+        // Each of the remaining (length - 1) bytes must look like 10nnnnnn.
+        // The terminating NUL fails this test too, so a character that is
+        // cut short by the end of the string is rejected without reading
+        // past the terminator.
+        for (int remaining = length - 1; remaining > 0; --remaining) {
+            ++cursor;
+            if ((*cursor & 0xc0) != 0x80) {
+                return 0;
+            }
+        }
+
+        sawMultiByte = true;
+    }
+
+    // No multi-byte characters at all means the text is pure 7-bit ASCII.
+    return sawMultiByte ? 1 : -1;
+}
+
+void normalizeInput(SWKey &key, SWBuf &text) {
+#ifdef _ICU_
+	// Leave the text untouched when normalization is switched off (-N).
+	if (!normalize) {
+		return;
+	}
+
+	// detectUTF8() yields 0 for text that is not valid UTF-8. Such text is
+	// assumed to be Latin1 (cp1252) and is converted to UTF-8.
+	int state = detectUTF8(text.c_str());
+	if (state == 0) {
+		cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
+		converter.processText(text, (SWKey *)2);  // the 2 is a hack that mimics a real key
+		++converted;
+
+		// Check the converter's output: the new state decides whether the
+		// result is reported as an error or goes on to be normalized.
+		state = detectUTF8(text.c_str());
+		if (state == 0) {
+			cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
+		}
+	}
+
+	// Pure ASCII (state < 0) is skipped; only text detected as UTF-8
+	// is handed to the normalizer. Count the entries it actually changes.
+	if (state > 0) {
+		SWBuf original = text;
+		normalizer.processText(text, (SWKey *)2);  // the 2 is a hack that mimics a real key
+		if (original != text) ++normalized;
+	}
+#endif
+}
+
+void writeEntry(SWKey &key, SWBuf &text) {
+#if defined(DEBUG)
+	std::cout << key << std::endl;	// trace each key as it is written
+#endif
+
+	// Position the module on the key, clean up the text in place
+	// (see normalizeInput), then store it as that key's entry.
+	module->setKey(key);
+	normalizeInput(key, text);
+	module->setEntry(text);
+}
+
+void linkToEntry(SWBuf &keyBuf, vector<string> &linkBuf) {
+	// Currently a no-op: the linking code below is commented out and neither parameter is used.
+/*
+	char links = linkBuf.size();
+	for (int i = 0; i < links; i++) {
+		SWKey tmpkey = linkBuf[i].c_str();
+		module->linkEntry(&tmpkey);
+		cout << "Linking: " << linkBuf[i] << endl;
+	}
+*/
+}
+
+// Track entry boundaries and write each completed entry to the module.
+// text is the content accumulated so far for the current entry.
+// Return true  if the tag was consumed here (the end tag of the current entry).
+//        false if the caller is to append the tag to the accumulated text.
+bool handleToken(SWBuf &text, XMLTag *token) {
+	// Which kind of top-level entry we are inside, if any.
+	// At most one of these is true at any time.
+	static bool inEntry      = false;
+	static bool inEntryFree  = false;
+	static bool inSuperEntry = false;
+
+	const char *tokenName = token->getName();
+
+//-- START TAG -------------------------------------------------------------------------
+	if (!token->isEndTag()) {
+		// Only look for the start of an entry when we are not already in one,
+		// so entries nested in a <superentry> stay part of its text.
+		if (!inEntry && !inEntryFree && !inSuperEntry) {
+			inEntry      = !strcmp(tokenName, "entry");
+			inEntryFree  = !strcmp(tokenName, "entryFree");
+			inSuperEntry = !strcmp(tokenName, "superentry");
+			if (inEntry || inEntryFree || inSuperEntry) {
+#ifdef DEBUG
+				cout << "Entering " << tokenName << endl;
+#endif
+				// Start a fresh entry keyed by the tag's key attribute.
+				text        = "";
+				*currentKey = token->getAttribute("key");
+
+				return false; // make tag be part of the output
+			}
+		}
+	}
+
+//-- END TAG (isEndTag() is true) ------------------------------------------------------
+	else {
+		// If we see the end of the entry that we are in, then leave it.
+		if ((inEntry      && !strcmp(tokenName, "entry"     )) ||
+		    (inEntryFree  && !strcmp(tokenName, "entryFree" )) ||
+		    (inSuperEntry && !strcmp(tokenName, "superentry"))) {
+#ifdef DEBUG
+			cout << "Leaving " << tokenName << endl;
+#endif
+			// Only one flag is true coming into here,
+			// but all must be false on leaving.
+			inEntry       = false;
+			inEntryFree   = false;
+			inSuperEntry  = false;
+
+			// The end tag belongs to the entry: add it, then store the entry.
+			text         += token->toString();
+			writeEntry(*currentKey, text);
+
+			// Since we consumed the text, clear it
+			// and tell the caller that the tag was consumed.
+			text = "";
+			return true;
+		}
+	}
+	return false;
+}
+
+void usage(const char *app, const char *error = 0) {
+	// Print the optional error and the help text to stderr, then exit with status -1.
+	if (error) fprintf(stderr, "\n%s: %s\n", app, error);
+
+	fprintf(stderr, "TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for the SWORD Project\n");
+	fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app);
+	fprintf(stderr, "  -z\t\t\t use ZIP compression (default no compression)\n");
+	fprintf(stderr, "  -Z\t\t\t use LZSS compression (default no compression)\n");
+	fprintf(stderr, "  -s <2|4>\t\t max text size per entry(default 4):\n");
+	fprintf(stderr, "  -c <cipher_key>\t encipher module using supplied key\n");
+	fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
+	fprintf(stderr, "  -N\t\t\t Do not convert to UTF-8 or normalize UTF-8 to NFC\n");
+	fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC)\n");
+	fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n");
+	fprintf(stderr, "-z, -Z, and -s are mutually exclusive\n");
+	exit(-1);
+}
+
+int main(int argc, char **argv) {
+	// Convert a TEI document into a SWORD lexicon/dictionary module, then print a suggested conf.
+	SWBuf program = argv[0];
+	fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]);
+
+	// Let's test our command line arguments
+	if (argc < 3) {
+		usage(*argv);
+	}
+
+	// variables for arguments, holding defaults
+	SWBuf path             = argv[1];
+	SWBuf teiDoc           = argv[2];
+	SWBuf compType	       = "";
+	SWBuf modDrv           = "";
+	SWBuf recommendedPath  = "./modules/lexdict/";
+	SWBuf cipherKey        = "";
+	SWCompress *compressor = 0;
+
+	for (int i = 3; i < argc; i++) {
+		if (!strcmp(argv[i], "-z")) {
+			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+			if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s");
+			compType = "ZIP";
+			modDrv = "zLD";
+			recommendedPath += "zld/";
+		}
+		else if (!strcmp(argv[i], "-Z")) {
+			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+			if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s");
+			compType = "LZSS";
+			modDrv = "zLD";	// LZSS needs the compressed driver too; otherwise a RawLD4 module is built
+			recommendedPath += "zld/";
+		}
+		else if (!strcmp(argv[i], "-s")) {
+			if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z");
+			if (i+1 < argc) {
+				int size = atoi(argv[++i]);
+				if (size == 2) {
+					modDrv           = "RawLD";
+					recommendedPath += "rawld/";
+					continue;
+				}
+				if (size == 4) {
+					modDrv           = "RawLD4";
+					recommendedPath += "rawld4/";
+					continue;
+				}
+			}
+			usage(*argv, "-s requires one of <2|4>");
+		}
+		else if (!strcmp(argv[i], "-N")) {
+			normalize = false;
+		}
+		else if (!strcmp(argv[i], "-c")) {
+			if (i+1 < argc) cipherKey = argv[++i];
+			else usage(*argv, "-c requires <cipher_key>");
+		}
+		else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
+	}
+	if (!modDrv.size()) {
+		modDrv           = "RawLD4";
+		recommendedPath += "rawld4/";
+	}
+
+#ifndef _ICU_
+	if (normalize) {
+		normalize = false;
+		cout << program << " is not compiled with support for ICU. Setting -N flag." << endl;
+	}
+#endif
+
+	if (compType == "ZIP") {
+		compressor = new ZipCompress();
+	}
+	else if (compType == "LZSS") {
+		compressor = new LZSSCompress();
+	}
+
+#ifdef DEBUG
+	cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n";
+#endif
+
+	// An empty path has no last character to examine; reject it up front.
+	int pathlen   = path.length();
+	if (!pathlen) {
+		usage(*argv, "<output/path> must not be empty");
+	}
+
+	SWBuf modName = path;
+	char lastChar = path[pathlen - 1];
+	if (lastChar != '/' && lastChar != '\\') {
+		modName += "/";
+	}
+	modName += "dict";
+
+	// Create the module files, then open them through the driver selected above.
+	if (modDrv == "zLD") {
+		if (zLD::createModule(modName)) {
+			fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 
+			exit(-3);
+		}
+		module = new zLD(modName, 0, 0, 30, compressor);
+	}
+	else if (modDrv == "RawLD") {
+		if (RawLD::createModule(modName)) {
+			fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 
+			exit(-3);
+		}
+		module = new RawLD(modName);
+	}
+	else {
+		if (RawLD4::createModule(modName)) {
+			fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 
+			exit(-3);
+		}
+		module = new RawLD4(modName);
+	}
+
+	SWFilter *cipherFilter = 0;
+
+	if (cipherKey.size()) {
+		fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
+		cipherFilter = new CipherFilter(cipherKey.c_str());
+		module->AddRawFilter(cipherFilter);
+	}
+
+	if (!module->isWritable()) {
+		fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
+		exit(-1);
+	}
+
+	// Let's see if we can open our input file
+	ifstream infile(teiDoc);
+	if (infile.fail()) {
+		fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str());
+		exit(-2);
+	}
+
+	currentKey = module->CreateKey();
+	currentKey->Persist(1);
+	module->setKey(*currentKey);
+
+	(*module) = TOP;
+
+	SWBuf token;
+	SWBuf text;
+	bool intoken = false;
+	int  nextByte = 0;
+
+	// Read into an int so that end of file (EOF) can be told apart from the
+	// byte 0xFF, which is an ordinary character in Latin-1 input.
+	while ((nextByte = infile.get()) != EOF) {
+		const char curChar = (char) nextByte;
+
+		if (!intoken && curChar == '<') {
+			intoken = true;
+			token = "<";
+			continue;
+		}
+
+		if (intoken && curChar == '>') {
+			intoken = false;
+			token.append('>');
+
+			// The tag is only needed for this one call, so keep it on the
+			// stack instead of leaking a heap allocation per tag.
+			XMLTag t(token.c_str());
+			if (!handleToken(text, &t)) {
+				text.append(t);
+			}
+			continue;
+		}
+
+		if (intoken)
+			token.append(curChar);
+		else
+			switch (curChar) {
+				case '>' : text.append("&gt;"); break;
+				case '<' : text.append("&lt;"); break;
+				default  : text.append(curChar); break;
+			}
+	}
+
+	delete module;
+	delete currentKey;
+	if (cipherFilter)
+		delete cipherFilter;
+	infile.close();
+
+#ifdef _ICU_
+	if (converted)  fprintf(stderr, "tei2mod converted %d entries to UTF-8\n", converted);
+	if (normalized) fprintf(stderr, "tei2mod normalized %d entries to NFC\n", normalized);
+#endif
+
+	/*
+	 * Suggested module name detection.
+	 * Only used for suggesting a conf.
+	 *
+	 * Various forms of path.
+	 * . and .. - no module name given, use "???".
+	 * Or one of the following where z is the module name
+	 * and x may be . or ..
+	 * z
+	 * x/y/z
+	 * x/y/z/
+	 * x/y/z/z
+	 */
+	SWBuf suggestedModuleName = path;
+	if (lastChar == '/' || lastChar == '\\') {
+		suggestedModuleName.setSize(--pathlen);
+	}
+
+	// pathlen is 0 here when the path was only a separator: no name to suggest.
+	lastChar = pathlen ? suggestedModuleName[pathlen - 1] : '.';
+	if (lastChar == '.') {
+		suggestedModuleName = "???";
+	}
+	else {
+		/* At this point the suggestion is either
+		 * what follows the last / or \
+		 * or the entire string
+		 */
+		const char *m = strrchr(suggestedModuleName.c_str(), '/');
+		if (!m) {
+			m = strrchr(suggestedModuleName.c_str(), '\\');
+		}
+		if (m) {
+			suggestedModuleName = m+1;
+		}
+	}
+
+	recommendedPath += suggestedModuleName;
+	recommendedPath += "/dict";
+
+	const char *encoding = "???";	// the normalized counter exists only in ICU builds
+#ifdef _ICU_
+	if (normalized) encoding = "UTF-8";
+#endif
+	fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n");
+	fprintf(stderr, "[%s]\n", suggestedModuleName.c_str());
+	fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str());
+	fprintf(stderr, "Description=???\n");
+	fprintf(stderr, "SourceType=TEI\n");
+	fprintf(stderr, "Encoding=%s\n", encoding);
+	fprintf(stderr, "ModDrv=%s\n", modDrv.c_str());
+	if (compressor) {
+		fprintf(stderr, "CompressType=%s\n", compType.c_str());
+	}
+	if (cipherKey.size()) {
+		fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str());
+	}
+}