utilities/tei2mod.cpp File Reference

#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <swbuf.h>
#include <utilxml.h>
#include <rawld.h>
#include <rawld4.h>
#include <zld.h>
#include <zipcomprs.h>
#include <lzsscomprs.h>
#include <stdio.h>
#include <cipherfil.h>
Include dependency graph for tei2mod.cpp:

Go to the source code of this file.

Defines

#define DEBUG

Functions

int detectUTF8 (const char *txt)
bool handleToken (SWBuf &text, XMLTag *token)
void linkToEntry (const SWBuf &keyBuf, const SWBuf &linkBuf)
int main (int argc, char **argv)
void normalizeInput (SWKey &key, SWBuf &text)
void usage (const char *app, const char *error=0)
void writeEntry (SWKey &key, SWBuf &text)

Variables

SWKey * currentKey = NULL
unsigned long entryCount = 0
SWBuf keyStr
SWLD * module = NULL
bool normalize = true

Define Documentation

#define DEBUG

Definition at line 83 of file tei2mod.cpp.


Function Documentation

int detectUTF8 ( const char *  txt  ) 

Determine whether the string contains a valid unicode sequence. The following table gives the pattern of a valid UTF-8 character.

Unicode Range            1st       2nd       3rd       4th
U-00000000 - U-0000007F  0nnnnnnn
U-00000080 - U-000007FF  110nnnnn  10nnnnnn
U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn

Note:
1. The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
2. The number of 1 bits in the leading byte before the first 0 is the total number of bytes.
3. The "n" are the bits of the unicode codepoint.

This routine does not check to see if the code point is in the range. It could.

Parameters:
    txt  the text to check

Returns:
    1 if all high order characters form a valid unicode sequence
    -1 if there are no high order characters. Note: this is also a valid unicode sequence
    0 if there are high order characters that do not form a valid unicode sequence

Author:
    DM Smith

Definition at line 117 of file tei2mod.cpp.

/**
 * Classify a NUL-terminated byte string as 7-bit ASCII, well-formed UTF-8,
 * or neither.
 *
 * Every byte with the high bit set must start a 2-, 3- or 4-byte sequence
 * whose continuation bytes all match 10nnnnnn. Only this bit pattern is
 * checked: the decoded code point is not range-checked, so overlong forms
 * and surrogate values are not rejected.
 *
 * @param txt the text to check; a null pointer is treated like empty text
 * @return  1 if every high order byte belongs to a well-formed sequence,
 *         -1 if there are no high order bytes at all (plain ASCII, empty
 *            or null input),
 *          0 if some high order byte does not fit the UTF-8 pattern
 */
int detectUTF8(const char *txt) {
    unsigned int countUTF8 = 0;

    // Nothing to scan: report "no high order characters" instead of
    // dereferencing a null pointer.
    if (!txt) return -1;

    // Work on unsigned bytes so masking and shifting behave predictably.
    const unsigned char *p = (const unsigned char *) txt;
    while (*p) {
        // Is the high order bit set?
        if (*p & 0x80) {
            // The number of leading 1 bits of the first byte is the total
            // length of the sequence in bytes.
            int count = 0;
            unsigned char i = *p;
            for (count = 0; i & 0x80; count++) {
                i <<= 1;
            }

            // Count 1      : 10nnnnnn is a continuation byte, not a start.
            // Count 5 to 8 : 111110nn and above are not legal starts.
            if (count < 2 || count > 4) return 0;

            // Expect (count - 1) continuation bytes of the form 10nnnnnn.
            // The loop stops early, with count still non-zero, when the
            // terminating NUL is reached.
            while (--count && *++p) {
                if ((0xc0 & *p) != 0x80) return 0;
            }

            // Ran out of bytes in the middle of a sequence: not UTF-8.
            if (count) return 0;

            // One complete multi-byte character seen.
            countUTF8++;
        }

        // Advance to the next byte to examine.
        p++;
    }

    // At this point it is either UTF-8 or 7-bit ASCII.
    return countUTF8 ? 1 : -1;
}

bool handleToken ( SWBuf &  text,
XMLTag *  token 
)

Definition at line 220 of file tei2mod.cpp.

00220                                              {
00221         // The start token for the current entry;
00222     static XMLTag startTag;
00223 
00224         // Flags to indicate whether we are in a entry, entryFree or superentry
00225         static bool inEntry      = false;
00226         static bool inEntryFree  = false;
00227         static bool inSuperEntry = false;
00228 
00229     const char *tokenName = token->getName();
00230 
00231         static const char *splitPtr, *splitPtr2 = NULL;
00232         static char *splitBuffer    = new char[4096];
00233     static SWKey tmpKey;
00234 //-- START TAG -------------------------------------------------------------------------
00235     if (!token->isEndTag()) {
00236 
00237         // If we are not in an "entry" and we see one, then enter it.
00238         if (!inEntry && !inEntryFree && !inSuperEntry) {
00239             inEntry      = !strcmp(tokenName, "entry");
00240             inEntryFree  = !strcmp(tokenName, "entryFree");
00241             inSuperEntry = !strcmp(tokenName, "superentry");
00242                         if (inEntry || inEntryFree || inSuperEntry) {
00243 #ifdef DEBUG
00244                 cout << "Entering " << tokenName << endl;
00245 #endif
00246                 startTag    = *token;
00247                 text        = "";
00248 
00249                                 keyStr = token->getAttribute("n"); // P5 with linking and/or non-URI chars
00250                                 if (!strlen(keyStr)) {
00251                                     keyStr = token->getAttribute("sortKey"); // P5 otherwise
00252                                     if (!strlen(keyStr)) {
00253                             keyStr = token->getAttribute("key"); // P4
00254                                         }
00255                                 }
00256 
00257                 return false; // make tag be part of the output
00258             }
00259         }
00260     }
00261 
00262 //-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
00263     else {
00264 
00265         // ENTRY end
00266         // If we see the end of an entry that we are in, then leave it
00267         if ((inEntry      && !strcmp(tokenName, "entry"     )) ||
00268             (inEntryFree  && !strcmp(tokenName, "entryFree" )) ||
00269             (inSuperEntry && !strcmp(tokenName, "superentry"))) {
00270 #ifdef DEBUG
00271             cout << "Leaving " << tokenName << endl;
00272 #endif
00273             // Only one is false coming into here,
00274             // but all must be on leaving.
00275             inEntry       = false;
00276             inEntryFree   = false;
00277             inSuperEntry  = false;
00278             text         += token->toString();
00279 
00280                         entryCount++;
00281 #ifdef DEBUG
00282             cout << "keyStr: " << keyStr << endl;
00283 #endif
00284                         splitPtr = strstr(keyStr, "|");
00285                         if (splitPtr) {
00286                                 strncpy (splitBuffer, keyStr.c_str(), splitPtr - keyStr.c_str());
00287                                 splitBuffer[splitPtr - keyStr.c_str()] = 0;
00288                 *currentKey = splitBuffer;
00289 #ifdef DEBUG
00290                 cout << "splitBuffer: " << splitBuffer << endl;
00291                 cout << "currentKey: " << *currentKey << endl;
00292 #endif
00293                 writeEntry(*currentKey, text);
00294 #if 1
00295                                 while (splitPtr) {
00296                                     splitPtr += 1;
00297                                     splitPtr2 = strstr(splitPtr, "|");
00298                                         entryCount++;
00299                                         if (splitPtr2) {
00300                         strncpy (splitBuffer, splitPtr, splitPtr2 - splitPtr);
00301                                                 splitBuffer[splitPtr2 - splitPtr] = 0;
00302 #ifdef DEBUG
00303                         cout << "splitBuffer: " << splitBuffer << endl;
00304                         cout << "currentKey: " << *currentKey << endl;
00305 #endif
00306                         linkToEntry(currentKey->getText(), splitBuffer);
00307                                             splitPtr = splitPtr2;
00308                                         }
00309                                         else {
00310                         strcpy (splitBuffer, splitPtr);
00311 #ifdef DEBUG
00312                                 cout << "splitBuffer: " << splitBuffer << endl;
00313                         cout << "currentKey: " << *currentKey << endl;
00314 #endif
00315                         linkToEntry(currentKey->getText(), splitBuffer);
00316                                                 splitPtr = 0;
00317                                         }
00318                                 }
00319 #endif
00320                         }
00321                         else {
00322                 *currentKey = keyStr;
00323                 writeEntry(*currentKey, text);
00324                         }
00325 
00326             // Since we consumed the text, clear it
00327             // and tell the caller that the tag was consumed.
00328             text = "";
00329             return true;
00330         }
00331     }
00332     return false;
00333 }

void linkToEntry ( const SWBuf &  keyBuf,
const SWBuf &  linkBuf 
)

Definition at line 210 of file tei2mod.cpp.

00210                                                             {
00211         SWKey tmpkey = linkBuf.c_str();
00212     module->linkEntry(&tmpkey);
00213 #ifdef DEBUG
00214     cout << "(" << entryCount << ") " << "Linking: " << linkBuf << endl;
00215 #endif
00216 }

int main ( int  argc,
char **  argv 
)

Definition at line 354 of file tei2mod.cpp.

00354                                 {
00355 
00356     SWBuf program = argv[0];
00357     fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]);
00358 
00359     // Let's test our command line arguments
00360     if (argc < 3) {
00361         usage(*argv);
00362     }
00363 
00364     // variables for arguments, holding defaults
00365     SWBuf path             = argv[1];
00366     SWBuf teiDoc           = argv[2];
00367     SWBuf compType         = "";
00368     SWBuf modDrv           = "";
00369     SWBuf recommendedPath  = "./modules/lexdict/";
00370     SWBuf cipherKey        = "";
00371     SWCompress *compressor = 0;
00372 
00373     for (int i = 3; i < argc; i++) {
00374         if (!strcmp(argv[i], "-z")) {
00375             if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
00376             if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s");
00377             compType = "ZIP";
00378             modDrv = "zLD";
00379             recommendedPath += "zld/";
00380         }
00381         else if (!strcmp(argv[i], "-Z")) {
00382             if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
00383             if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s");
00384             compType = "LZSS";
00385             recommendedPath += "zld/";
00386         }
00387         else if (!strcmp(argv[i], "-s")) {
00388             if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z");
00389             if (i+1 < argc) {
00390                 int size = atoi(argv[++i]);
00391                 if (size == 2) {
00392                     modDrv           = "RawLD";
00393                     recommendedPath += "rawld/";
00394                     continue;
00395                 }
00396                 if (size == 4) {
00397                     modDrv           = "RawLD4";
00398                     recommendedPath += "rawld4/";
00399                     continue;
00400                 }
00401             }
00402             usage(*argv, "-s requires one of <2|4>");
00403         }
00404         else if (!strcmp(argv[i], "-N")) {
00405             normalize = false;
00406         }
00407         else if (!strcmp(argv[i], "-c")) {
00408             if (i+1 < argc) cipherKey = argv[++i];
00409             else usage(*argv, "-c requires <cipher_key>");
00410         }
00411         else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
00412     }
00413     if (!modDrv.size()) {
00414         modDrv           = "RawLD4";
00415         recommendedPath += "rawld4/";
00416     }
00417 
00418 #ifndef _ICU_
00419     if (normalize) {
00420         normalize = false;
00421         cout << program << " is not compiled with support for ICU. Setting -N flag." << endl;
00422     }
00423 #endif
00424 
00425     if (compType == "ZIP") {
00426 #ifndef EXCLUDEZLIB
00427         compressor = new ZipCompress();
00428 #else
00429         usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library");
00430 #endif
00431     }
00432     else if (compType == "LZSS") {
00433         compressor = new LZSSCompress();
00434     }
00435 
00436 #ifdef DEBUG
00437     // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n";
00438     cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n";
00439     cout << "";
00440 //      exit(-3);
00441 #endif
00442 
00443     SWBuf modName = path;
00444     int pathlen   = path.length();
00445     char lastChar = path[pathlen - 1];
00446     if (lastChar != '/' && lastChar != '\\') {
00447         modName += "/";
00448     }
00449     modName += "dict";
00450 
00451     SWBuf keyBuf;
00452     SWBuf entBuf;
00453     SWBuf lineBuf;
00454     vector<string> linkBuf;
00455 
00456     if (modDrv == "zLD") {
00457         if (zLD::createModule(modName)) {
00458             fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 
00459             exit(-3);
00460         }
00461         module = new zLD(modName, 0, 0, 30, compressor);
00462     }
00463     else if (modDrv == "RawLD") {
00464         if (RawLD::createModule(modName)) {
00465             fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 
00466             exit(-3);
00467         }
00468         module = new RawLD(modName);
00469     }
00470     else {
00471         if (RawLD4::createModule(modName)) {
00472             fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 
00473             exit(-3);
00474         }
00475         module = new RawLD4(modName);
00476     }
00477 
00478     SWFilter *cipherFilter = 0;
00479 
00480     if (cipherKey.size()) {
00481         fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
00482         cipherFilter = new CipherFilter(cipherKey.c_str());
00483         module->addRawFilter(cipherFilter);
00484     }
00485 
00486         if (!module->isWritable()) {
00487                 fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
00488                 exit(-1);
00489         }
00490 
00491     // Let's see if we can open our input file
00492     ifstream infile(teiDoc);
00493     if (infile.fail()) {
00494         fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str());
00495         exit(-2);
00496     }
00497 
00498     currentKey = module->createKey();
00499     currentKey->setPersist(true);
00500     module->setKey(*currentKey);
00501 
00502     (*module) = TOP;
00503 
00504     SWBuf token;
00505     SWBuf text;
00506     bool intoken = false;
00507     char curChar = '\0';
00508 
00509     while (infile.good()) {
00510 
00511         curChar = infile.get();
00512 
00513         // skip the character if it is bad. infile.good() will catch the problem
00514         if (curChar == -1) {
00515             continue;
00516         }
00517 
00518         if (!intoken && curChar == '<') {
00519             intoken = true;
00520             token = "<";
00521             continue;
00522         }
00523 
00524         if (intoken && curChar == '>') {
00525             intoken = false;
00526             token.append('>');
00527 
00528             XMLTag *t = new XMLTag(token.c_str());
00529             if (!handleToken(text, t)) {
00530                 text.append(*t);
00531             }
00532                         delete t;
00533             continue;
00534         }
00535 
00536         if (intoken)
00537             token.append(curChar);
00538         else
00539             switch (curChar) {
00540                 case '>' : text.append("&gt;"); break;
00541                 case '<' : text.append("&lt;"); break;
00542                 default  : text.append(curChar); break;
00543             }
00544     }
00545 
00546     // Force the last entry from the text buffer.
00547     //text = "";
00548     //writeEntry(*currentKey, text);
00549 
00550     delete module;
00551     delete currentKey;
00552     if (cipherFilter)
00553         delete cipherFilter;
00554     infile.close();
00555 
00556 #ifdef _ICU_
00557     if (converted)  fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted);
00558     if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized);
00559 #endif
00560 
00561     /*
00562      * Suggested module name detection.
00563      * Only used for suggesting a conf.
00564      *
00565      * Various forms of path.
00566      * . and .. - no module name given, use "dict".
00567      * Or one of the following where z is the module name
00568      * and x may be . or ..
00569      * z 
00570      * x/y/z
00571      * x/y/z/
00572      * x/y/z/z
00573      */
00574     SWBuf suggestedModuleName = path;
00575     if (lastChar == '/' || lastChar == '\\') {
00576         suggestedModuleName.setSize(--pathlen);
00577     }
00578 
00579     lastChar = suggestedModuleName[pathlen - 1];
00580     if (lastChar == '.') {
00581         suggestedModuleName = "???";
00582     }
00583     else {
00584         /* At this point the suggestion is either
00585          * what follows the last / or \
00586          * or the entire string
00587          */
00588         const char *m = strrchr(suggestedModuleName.c_str(), '/');
00589         if (!m) {
00590             m = strrchr(suggestedModuleName.c_str(), '\\');
00591         }
00592         if (m) {
00593             suggestedModuleName = m+1;
00594         }
00595     }
00596 
00597     recommendedPath += suggestedModuleName;
00598     recommendedPath += "/dict";
00599 
00600     fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n");
00601     fprintf(stderr, "[%s]\n", suggestedModuleName.c_str());
00602     fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str());
00603     fprintf(stderr, "Description=???\n");
00604     fprintf(stderr, "SourceType=TEI\n");
00605     fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???"));
00606     fprintf(stderr, "ModDrv=%s\n", modDrv.c_str());
00607     if (compressor) {
00608         fprintf(stderr, "CompressType=%s\n", compType.c_str());
00609     }
00610     if (cipherKey.size()) {
00611         fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str());
00612     }
00613 }

void normalizeInput ( SWKey &  key,
SWBuf &  text 
)

Definition at line 165 of file tei2mod.cpp.

00165                                              {
00166 #ifdef _ICU_
00167     int utf8State = detectUTF8(text.c_str());
00168     if (normalize) {
00169         // Don't need to normalize text that is ASCII
00170         // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
00171         if (!utf8State) {
00172             cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
00173             converter.processText(text, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
00174             converted++;
00175 
00176             // Prepare for double check. This probably can be removed.
00177             // But for now we are running the check again.
00178             // This is to determine whether we need to normalize output of the conversion.
00179             utf8State = detectUTF8(text.c_str());
00180         }
00181 
00182         // Double check. This probably can be removed.
00183         if (!utf8State) {
00184             cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
00185         }
00186 
00187         if (utf8State > 0) {
00188             SWBuf before = text;
00189             normalizer.processText(text, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
00190             if (before != text) {
00191                 normalized++;
00192             }
00193         }
00194     }
00195 #endif
00196 }

void usage ( const char *  app,
const char *  error = 0 
)

Definition at line 335 of file tei2mod.cpp.

/**
 * Print an optional error message followed by the command line help to
 * stderr, then terminate the program with exit status -1.
 *
 * @param app   the program name to show in the messages
 * @param error the error to report first, or 0 to print only the help
 */
void usage(const char *app, const char *error = 0) {

    if (error) {
        fprintf(stderr, "\n%s: %s\n", app, error);
    }

    fputs("TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for\n\tThe SWORD Project\n", stderr);
    fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app);

    // The option descriptions contain no format directives, so they are
    // written as plain text.
    fputs(
        "  -z\t\t\t use ZIP compression (default no compression)\n"
        "  -Z\t\t\t use LZSS compression (default no compression)\n"
        "  -s <2|4>\t\t max text size per entry(default 4):\n"
        "  -c <cipher_key>\t encipher module using supplied key\n"
        "\t\t\t\t (default no enciphering)\n"
        "  -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n"
        "\t\t\t\t (default is to convert to UTF-8, if needed,\n"
        "\t\t\t\t  and then normalize to NFC. Note: all UTF-8\n"
        "\t\t\t\t  texts should be normalized to NFC.)\n",
        stderr);
    fputs("\n\tThe options -z, -Z, and -s are mutually exclusive.\n", stderr);

    exit(-1);
}

void writeEntry ( SWKey &  key,
SWBuf &  text 
)

Definition at line 198 of file tei2mod.cpp.

00198                                          {
00199 #ifdef DEBUG
00200     cout << "(" << entryCount << ") " << key << endl;
00201 #endif
00202 
00203     module->setKey(key);
00204 
00205     normalizeInput(key, text);
00206 
00207     module->setEntry(text);
00208 }


Variable Documentation

SWKey* currentKey = NULL

Definition at line 86 of file tei2mod.cpp.

unsigned long entryCount = 0

Definition at line 90 of file tei2mod.cpp.

SWBuf keyStr

Definition at line 88 of file tei2mod.cpp.

SWLD* module = NULL

Definition at line 85 of file tei2mod.cpp.

bool normalize = true

Definition at line 87 of file tei2mod.cpp.


Generated on 18 Mar 2013 for The SWORD Project by  doxygen 1.6.1