#include <string>#include <vector>#include <fstream>#include <iostream>#include <swbuf.h>#include <utilxml.h>#include <rawld.h>#include <rawld4.h>#include <zld.h>#include <zipcomprs.h>#include <lzsscomprs.h>#include <stdio.h>#include <cipherfil.h>
Go to the source code of this file.
Defines | |
| #define | DEBUG |
Functions | |
| int | detectUTF8 (const char *txt) |
| bool | handleToken (SWBuf &text, XMLTag *token) |
| void | linkToEntry (const SWBuf &keyBuf, const SWBuf &linkBuf) |
| int | main (int argc, char **argv) |
| void | normalizeInput (SWKey &key, SWBuf &text) |
| void | usage (const char *app, const char *error=0) |
| void | writeEntry (SWKey &key, SWBuf &text) |
Variables | |
| SWKey * | currentKey = NULL |
| unsigned long | entryCount = 0 |
| SWBuf | keyStr |
| SWLD * | module = NULL |
| bool | normalize = true |
| #define DEBUG |
Definition at line 83 of file tei2mod.cpp.
| int detectUTF8 | ( | const char * | txt | ) |
Determine whether the string contains a valid Unicode (UTF-8) sequence. The following table gives the bit pattern of a valid UTF-8 character.

| Unicode Range | 1st | 2nd | 3rd | 4th |
|---|---|---|---|---|
| U-00000000 - U-0000007F | 0nnnnnnn | | | |
| U-00000080 - U-000007FF | 110nnnnn | 10nnnnnn | | |
| U-00000800 - U-0000FFFF | 1110nnnn | 10nnnnnn | 10nnnnnn | |
| U-00010000 - U-001FFFFF | 11110nnn | 10nnnnnn | 10nnnnnn | 10nnnnnn |

Note:
1. The latest UTF-8 RFC allows for a maximum of 4 bytes. Earlier versions allowed 6.
2. The number of bits set in the leading byte before the first 0 is the total number of bytes in the character.
3. The "n" are the bits of the Unicode code point.

This routine does not check whether the code point is in the valid range. It could.
Parameters: txt - the text to check.
Returns: 1 if all high-order characters form a valid Unicode sequence; -1 if there are no high-order characters (note: this is also a valid Unicode sequence); 0 if there are high-order characters that do not form a valid Unicode sequence.
Author: DM Smith
Definition at line 117 of file tei2mod.cpp.
/**
 * Determine whether a NUL-terminated string is well-formed UTF-8.
 *
 * Every byte with the high bit set must be part of a 2-, 3- or 4-byte
 * sequence (lead byte 110nnnnn, 1110nnnn or 11110nnn followed by the
 * matching number of 10nnnnnn continuation bytes).  In addition to the
 * bit patterns, the decoded code point is checked: overlong encodings,
 * UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF are
 * rejected.
 *
 * @param txt the text to check; a null pointer is treated like an empty
 *            string
 * @return  1 if the text contains high-order bytes and all of them form
 *            valid UTF-8 sequences
 *         -1 if there are no high-order bytes at all (7-bit ASCII, which
 *            is also valid UTF-8)
 *          0 if there are high-order bytes that do not form valid UTF-8
 */
int detectUTF8(const char *txt) {
	/* Smallest code point that legitimately needs a sequence of the
	 * given length (index = total number of bytes); anything smaller is
	 * an overlong encoding. */
	static const unsigned long minCodePoint[] = { 0, 0, 0x80, 0x800, 0x10000 };

	unsigned int countUTF8 = 0;

	if (!txt) return -1;

	/* Work on unsigned bytes to make masking and shifting well defined. */
	const unsigned char *p = (const unsigned char *) txt;

	while (*p) {
		if (*p & 0x80) {
			/* The number of leading 1 bits of the first byte is the
			 * total number of bytes in the sequence. */
			int count = 0;
			for (unsigned char bits = *p; bits & 0x80; bits <<= 1) {
				count++;
			}

			/* 1 leading bit is a stray continuation byte (10nnnnnn);
			 * 5 or more are not legal lead bytes. */
			if (count < 2 || count > 4) return 0;

			/* Payload bits of the lead byte: 5, 4 or 3 of them. */
			unsigned long codePoint = *p & (0x7F >> count);

			for (int remaining = count - 1; remaining > 0; remaining--) {
				++p;
				/* Each following byte must be 10nnnnnn.  The terminating
				 * NUL fails this test too, so a truncated sequence is
				 * rejected without reading past the end of the string. */
				if ((*p & 0xC0) != 0x80) return 0;
				codePoint = (codePoint << 6) | (unsigned long) (*p & 0x3F);
			}

			if (codePoint < minCodePoint[count]) return 0;             /* overlong    */
			if (codePoint >= 0xD800 && codePoint <= 0xDFFF) return 0;   /* surrogate   */
			if (codePoint > 0x10FFFF) return 0;                         /* out of range */

			countUTF8++;
		}

		/* Advance to the next character to examine. */
		p++;
	}

	/* At this point the text is either UTF-8 or 7-bit ASCII. */
	return countUTF8 ? 1 : -1;
}
| bool handleToken | ( | SWBuf & | text, | |
| XMLTag * | token | |||
| ) |
Definition at line 220 of file tei2mod.cpp.
00220 { 00221 // The start token for the current entry; 00222 static XMLTag startTag; 00223 00224 // Flags to indicate whether we are in a entry, entryFree or superentry 00225 static bool inEntry = false; 00226 static bool inEntryFree = false; 00227 static bool inSuperEntry = false; 00228 00229 const char *tokenName = token->getName(); 00230 00231 static const char *splitPtr, *splitPtr2 = NULL; 00232 static char *splitBuffer = new char[4096]; 00233 static SWKey tmpKey; 00234 //-- START TAG ------------------------------------------------------------------------- 00235 if (!token->isEndTag()) { 00236 00237 // If we are not in an "entry" and we see one, then enter it. 00238 if (!inEntry && !inEntryFree && !inSuperEntry) { 00239 inEntry = !strcmp(tokenName, "entry"); 00240 inEntryFree = !strcmp(tokenName, "entryFree"); 00241 inSuperEntry = !strcmp(tokenName, "superentry"); 00242 if (inEntry || inEntryFree || inSuperEntry) { 00243 #ifdef DEBUG 00244 cout << "Entering " << tokenName << endl; 00245 #endif 00246 startTag = *token; 00247 text = ""; 00248 00249 keyStr = token->getAttribute("n"); // P5 with linking and/or non-URI chars 00250 if (!strlen(keyStr)) { 00251 keyStr = token->getAttribute("sortKey"); // P5 otherwise 00252 if (!strlen(keyStr)) { 00253 keyStr = token->getAttribute("key"); // P4 00254 } 00255 } 00256 00257 return false; // make tag be part of the output 00258 } 00259 } 00260 } 00261 00262 //-- EMPTY and END TAG --------------------------------------------------------------------------------------------- 00263 else { 00264 00265 // ENTRY end 00266 // If we see the end of an entry that we are in, then leave it 00267 if ((inEntry && !strcmp(tokenName, "entry" )) || 00268 (inEntryFree && !strcmp(tokenName, "entryFree" )) || 00269 (inSuperEntry && !strcmp(tokenName, "superentry"))) { 00270 #ifdef DEBUG 00271 cout << "Leaving " << tokenName << endl; 00272 #endif 00273 // Only one is false coming into here, 00274 // but all must be on leaving. 
00275 inEntry = false; 00276 inEntryFree = false; 00277 inSuperEntry = false; 00278 text += token->toString(); 00279 00280 entryCount++; 00281 #ifdef DEBUG 00282 cout << "keyStr: " << keyStr << endl; 00283 #endif 00284 splitPtr = strstr(keyStr, "|"); 00285 if (splitPtr) { 00286 strncpy (splitBuffer, keyStr.c_str(), splitPtr - keyStr.c_str()); 00287 splitBuffer[splitPtr - keyStr.c_str()] = 0; 00288 *currentKey = splitBuffer; 00289 #ifdef DEBUG 00290 cout << "splitBuffer: " << splitBuffer << endl; 00291 cout << "currentKey: " << *currentKey << endl; 00292 #endif 00293 writeEntry(*currentKey, text); 00294 #if 1 00295 while (splitPtr) { 00296 splitPtr += 1; 00297 splitPtr2 = strstr(splitPtr, "|"); 00298 entryCount++; 00299 if (splitPtr2) { 00300 strncpy (splitBuffer, splitPtr, splitPtr2 - splitPtr); 00301 splitBuffer[splitPtr2 - splitPtr] = 0; 00302 #ifdef DEBUG 00303 cout << "splitBuffer: " << splitBuffer << endl; 00304 cout << "currentKey: " << *currentKey << endl; 00305 #endif 00306 linkToEntry(currentKey->getText(), splitBuffer); 00307 splitPtr = splitPtr2; 00308 } 00309 else { 00310 strcpy (splitBuffer, splitPtr); 00311 #ifdef DEBUG 00312 cout << "splitBuffer: " << splitBuffer << endl; 00313 cout << "currentKey: " << *currentKey << endl; 00314 #endif 00315 linkToEntry(currentKey->getText(), splitBuffer); 00316 splitPtr = 0; 00317 } 00318 } 00319 #endif 00320 } 00321 else { 00322 *currentKey = keyStr; 00323 writeEntry(*currentKey, text); 00324 } 00325 00326 // Since we consumed the text, clear it 00327 // and tell the caller that the tag was consumed. 00328 text = ""; 00329 return true; 00330 } 00331 } 00332 return false; 00333 }
| void linkToEntry | ( | const SWBuf & | keyBuf, | |
| const SWBuf & | linkBuf | |||
| ) |
Definition at line 210 of file tei2mod.cpp.
00210 { 00211 SWKey tmpkey = linkBuf.c_str(); 00212 module->linkEntry(&tmpkey); 00213 #ifdef DEBUG 00214 cout << "(" << entryCount << ") " << "Linking: " << linkBuf << endl; 00215 #endif 00216 }
| int main | ( | int | argc, | |
| char ** | argv | |||
| ) |
Definition at line 354 of file tei2mod.cpp.
00354 { 00355 00356 SWBuf program = argv[0]; 00357 fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]); 00358 00359 // Let's test our command line arguments 00360 if (argc < 3) { 00361 usage(*argv); 00362 } 00363 00364 // variables for arguments, holding defaults 00365 SWBuf path = argv[1]; 00366 SWBuf teiDoc = argv[2]; 00367 SWBuf compType = ""; 00368 SWBuf modDrv = ""; 00369 SWBuf recommendedPath = "./modules/lexdict/"; 00370 SWBuf cipherKey = ""; 00371 SWCompress *compressor = 0; 00372 00373 for (int i = 3; i < argc; i++) { 00374 if (!strcmp(argv[i], "-z")) { 00375 if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); 00376 if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s"); 00377 compType = "ZIP"; 00378 modDrv = "zLD"; 00379 recommendedPath += "zld/"; 00380 } 00381 else if (!strcmp(argv[i], "-Z")) { 00382 if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); 00383 if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s"); 00384 compType = "LZSS"; 00385 recommendedPath += "zld/"; 00386 } 00387 else if (!strcmp(argv[i], "-s")) { 00388 if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z"); 00389 if (i+1 < argc) { 00390 int size = atoi(argv[++i]); 00391 if (size == 2) { 00392 modDrv = "RawLD"; 00393 recommendedPath += "rawld/"; 00394 continue; 00395 } 00396 if (size == 4) { 00397 modDrv = "RawLD4"; 00398 recommendedPath += "rawld4/"; 00399 continue; 00400 } 00401 } 00402 usage(*argv, "-s requires one of <2|4>"); 00403 } 00404 else if (!strcmp(argv[i], "-N")) { 00405 normalize = false; 00406 } 00407 else if (!strcmp(argv[i], "-c")) { 00408 if (i+1 < argc) cipherKey = argv[++i]; 00409 else usage(*argv, "-c requires <cipher_key>"); 00410 } 00411 else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); 00412 } 00413 if (!modDrv.size()) { 00414 modDrv = "RawLD4"; 00415 recommendedPath += "rawld4/"; 00416 } 00417 00418 #ifndef _ICU_ 00419 if (normalize) { 00420 normalize = false; 
00421 cout << program << " is not compiled with support for ICU. Setting -N flag." << endl; 00422 } 00423 #endif 00424 00425 if (compType == "ZIP") { 00426 #ifndef EXCLUDEZLIB 00427 compressor = new ZipCompress(); 00428 #else 00429 usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library"); 00430 #endif 00431 } 00432 else if (compType == "LZSS") { 00433 compressor = new LZSSCompress(); 00434 } 00435 00436 #ifdef DEBUG 00437 // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n"; 00438 cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n"; 00439 cout << ""; 00440 // exit(-3); 00441 #endif 00442 00443 SWBuf modName = path; 00444 int pathlen = path.length(); 00445 char lastChar = path[pathlen - 1]; 00446 if (lastChar != '/' && lastChar != '\\') { 00447 modName += "/"; 00448 } 00449 modName += "dict"; 00450 00451 SWBuf keyBuf; 00452 SWBuf entBuf; 00453 SWBuf lineBuf; 00454 vector<string> linkBuf; 00455 00456 if (modDrv == "zLD") { 00457 if (zLD::createModule(modName)) { 00458 fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 00459 exit(-3); 00460 } 00461 module = new zLD(modName, 0, 0, 30, compressor); 00462 } 00463 else if (modDrv == "RawLD") { 00464 if (RawLD::createModule(modName)) { 00465 fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 00466 exit(-3); 00467 } 00468 module = new RawLD(modName); 00469 } 00470 else { 00471 if (RawLD4::createModule(modName)) { 00472 fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); 00473 exit(-3); 00474 } 00475 module = new RawLD4(modName); 00476 } 00477 00478 SWFilter 
*cipherFilter = 0; 00479 00480 if (cipherKey.size()) { 00481 fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); 00482 cipherFilter = new CipherFilter(cipherKey.c_str()); 00483 module->addRawFilter(cipherFilter); 00484 } 00485 00486 if (!module->isWritable()) { 00487 fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" ); 00488 exit(-1); 00489 } 00490 00491 // Let's see if we can open our input file 00492 ifstream infile(teiDoc); 00493 if (infile.fail()) { 00494 fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str()); 00495 exit(-2); 00496 } 00497 00498 currentKey = module->createKey(); 00499 currentKey->setPersist(true); 00500 module->setKey(*currentKey); 00501 00502 (*module) = TOP; 00503 00504 SWBuf token; 00505 SWBuf text; 00506 bool intoken = false; 00507 char curChar = '\0'; 00508 00509 while (infile.good()) { 00510 00511 curChar = infile.get(); 00512 00513 // skip the character if it is bad. infile.good() will catch the problem 00514 if (curChar == -1) { 00515 continue; 00516 } 00517 00518 if (!intoken && curChar == '<') { 00519 intoken = true; 00520 token = "<"; 00521 continue; 00522 } 00523 00524 if (intoken && curChar == '>') { 00525 intoken = false; 00526 token.append('>'); 00527 00528 XMLTag *t = new XMLTag(token.c_str()); 00529 if (!handleToken(text, t)) { 00530 text.append(*t); 00531 } 00532 delete t; 00533 continue; 00534 } 00535 00536 if (intoken) 00537 token.append(curChar); 00538 else 00539 switch (curChar) { 00540 case '>' : text.append(">"); break; 00541 case '<' : text.append("<"); break; 00542 default : text.append(curChar); break; 00543 } 00544 } 00545 00546 // Force the last entry from the text buffer. 
00547 //text = ""; 00548 //writeEntry(*currentKey, text); 00549 00550 delete module; 00551 delete currentKey; 00552 if (cipherFilter) 00553 delete cipherFilter; 00554 infile.close(); 00555 00556 #ifdef _ICU_ 00557 if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted); 00558 if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized); 00559 #endif 00560 00561 /* 00562 * Suggested module name detection. 00563 * Only used for suggesting a conf. 00564 * 00565 * Various forms of path. 00566 * . and .. - no module name given, use "dict". 00567 * Or one of the following where z is the module name 00568 * and x may be . or .. 00569 * z 00570 * x/y/z 00571 * x/y/z/ 00572 * x/y/z/z 00573 */ 00574 SWBuf suggestedModuleName = path; 00575 if (lastChar == '/' || lastChar == '\\') { 00576 suggestedModuleName.setSize(--pathlen); 00577 } 00578 00579 lastChar = suggestedModuleName[pathlen - 1]; 00580 if (lastChar == '.') { 00581 suggestedModuleName = "???"; 00582 } 00583 else { 00584 /* At this point the suggestion is either 00585 * what follows the last / or \ 00586 * or the entire string 00587 */ 00588 const char *m = strrchr(suggestedModuleName.c_str(), '/'); 00589 if (!m) { 00590 m = strrchr(suggestedModuleName.c_str(), '\\'); 00591 } 00592 if (m) { 00593 suggestedModuleName = m+1; 00594 } 00595 } 00596 00597 recommendedPath += suggestedModuleName; 00598 recommendedPath += "/dict"; 00599 00600 fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n"); 00601 fprintf(stderr, "[%s]\n", suggestedModuleName.c_str()); 00602 fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str()); 00603 fprintf(stderr, "Description=???\n"); 00604 fprintf(stderr, "SourceType=TEI\n"); 00605 fprintf(stderr, "Encoding=%s\n", (normalize ? 
"UTF-8" : "???")); 00606 fprintf(stderr, "ModDrv=%s\n", modDrv.c_str()); 00607 if (compressor) { 00608 fprintf(stderr, "CompressType=%s\n", compType.c_str()); 00609 } 00610 if (cipherKey.size()) { 00611 fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str()); 00612 } 00613 }
| void normalizeInput | ( | SWKey & | key, | |
| SWBuf & | text | |||
| ) |
Definition at line 165 of file tei2mod.cpp.
00165 { 00166 #ifdef _ICU_ 00167 int utf8State = detectUTF8(text.c_str()); 00168 if (normalize) { 00169 // Don't need to normalize text that is ASCII 00170 // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8 00171 if (!utf8State) { 00172 cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl; 00173 converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks 00174 converted++; 00175 00176 // Prepare for double check. This probably can be removed. 00177 // But for now we are running the check again. 00178 // This is to determine whether we need to normalize output of the conversion. 00179 utf8State = detectUTF8(text.c_str()); 00180 } 00181 00182 // Double check. This probably can be removed. 00183 if (!utf8State) { 00184 cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl; 00185 } 00186 00187 if (utf8State > 0) { 00188 SWBuf before = text; 00189 normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks 00190 if (before != text) { 00191 normalized++; 00192 } 00193 } 00194 } 00195 #endif 00196 }
| void usage | ( | const char * | app, | |
| const char * | error = 0 | |||
| ) |
Definition at line 335 of file tei2mod.cpp.
/**
 * Print the command-line help to stderr and terminate the program.
 *
 * @param app   program name shown in the messages
 * @param error optional error message printed before the help text;
 *              nothing extra is printed when it is null
 *
 * This function does not return; it always calls exit(-1).
 */
void usage(const char *app, const char *error = 0) {

	if (error) {
		fprintf(stderr, "\n%s: %s\n", app, error);
	}

	fputs("TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for\n\tThe SWORD Project\n", stderr);
	fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app);
	fputs(
		" -z\t\t\t use ZIP compression (default no compression)\n"
		" -Z\t\t\t use LZSS compression (default no compression)\n"
		" -s <2|4>\t\t max text size per entry(default 4):\n"
		" -c <cipher_key>\t encipher module using supplied key\n"
		"\t\t\t\t (default no enciphering)\n"
		" -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n"
		"\t\t\t\t (default is to convert to UTF-8, if needed,\n"
		"\t\t\t\t and then normalize to NFC. Note: all UTF-8\n"
		"\t\t\t\t texts should be normalized to NFC.)\n"
		"\n\tThe options -z, -Z, and -s are mutually exclusive.\n",
		stderr);

	exit(-1);
}
| void writeEntry | ( | SWKey & | key, | |
| SWBuf & | text | |||
| ) |
Definition at line 198 of file tei2mod.cpp.
00198 { 00199 #ifdef DEBUG 00200 cout << "(" << entryCount << ") " << key << endl; 00201 #endif 00202 00203 module->setKey(key); 00204 00205 normalizeInput(key, text); 00206 00207 module->setEntry(text); 00208 }
| SWKey* currentKey = NULL |
Definition at line 86 of file tei2mod.cpp.
| unsigned long entryCount = 0 |
Definition at line 90 of file tei2mod.cpp.
| SWBuf keyStr |
Definition at line 88 of file tei2mod.cpp.
| SWLD* module = NULL |
Definition at line 85 of file tei2mod.cpp.
| bool normalize = true |
Definition at line 87 of file tei2mod.cpp.
1.6.1