The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tei2mod.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * tei2mod.cpp - Utility to import documents encoded as TEI
4  *
5  * $Id: tei2mod.cpp 3416 2016-03-15 14:07:18Z dmsmith $
6  *
7  * Copyright 2008-2014 CrossWire Bible Society (http://www.crosswire.org)
8  * CrossWire Bible Society
9  * P. O. Box 2528
10  * Tempe, AZ 85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 /******************************************************************************
24  * This program handles xml files of the form:
25  * <TEI.2>
26  * <text>
27  * <body>
28  * <entry key="xxx">...</entry>
29  * <entryFree key="yyy">...</entryFree>
30  * <superentry key="zzz">...</superentry>
31  * </body>
32  * </text>
33  * </TEI.2>
34  * The document is assumed to be well-formed and valid.
35  * Three kinds of entries are allowed,
36  * <entry> - a very restricted form of a dictionary entry.
37  * <entryFree> - a very unrestricted form of a dictionary entry.
38  * <superentry> - an entry which can have other entries.
39  * The value of the key attribute is used as the key for the entry in the module.
40  * Note, for a <superentry> only its key becomes a SWORD key.
41  * Keys of entries internal to it are not used.
42  *
43  * The entries must be sorted according to an ASCII collation of their bytes.
44  * This should be the same for Latin-1 and for UTF-8
45  *
46  * Sword will allow for any tags, but only a few have any styling.
47  *
48  * author DM Smith
49  */
50 
51 
52 #ifdef _MSC_VER
53  #pragma warning( disable: 4251 )
54 #endif
55 
56 #include <string>
57 #include <vector>
58 #include <fstream>
59 #include <iostream>
60 #include <swbuf.h>
61 #include <utilxml.h>
62 #include <rawld.h>
63 #include <rawld4.h>
64 #include <zld.h>
65 #include <lzsscomprs.h>
66 #ifndef EXCLUDEZLIB
67 #include <zipcomprs.h>
68 #endif
69 #ifndef EXCLUDEBZIP2
70 #include <bz2comprs.h>
71 #endif
72 #ifndef EXCLUDEXZ
73 #include <xzcomprs.h>
74 #endif
75 #include <stdio.h>
76 #include <cipherfil.h>
77 
78 #ifdef _ICU_
79 #include <utf8nfc.h>
80 #include <latin1utf8.h>
81 #endif
82 
83 #ifndef NO_SWORD_NAMESPACE
84 using namespace sword;
85 #endif
86 
87 using namespace std;
88 
89 #ifdef _ICU_
90 UTF8NFC *normalizer = 0;
91 int normalized = 0;
92 
93 Latin1UTF8 converter;
94 int converted = 0;
95 #endif
96 
97 #define DEBUG
98 
100 SWKey *currentKey = NULL;
101 bool normalize = true;
102 SWBuf keyStr;
103 
104 unsigned long entryCount = 0;
105 
/**
 * Classify a NUL-terminated byte string as 7-bit ASCII, well-formed UTF-8,
 * or neither.
 *
 * Every multi-byte sequence is checked against the well-formed byte ranges
 * of RFC 3629, so overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F),
 * UTF-16 surrogates (ED A0..BF) and values above U+10FFFF (F4 90.., F5 and up)
 * are rejected instead of being mistaken for UTF-8.
 *
 * @param txt text to examine; a null pointer is treated like an empty string
 * @return  1 if the text contains at least one multi-byte character and all
 *            of them are well-formed UTF-8,
 *         -1 if the text is pure 7-bit ASCII (or empty/null),
 *          0 if the text contains a byte sequence that is not valid UTF-8
 */
int detectUTF8(const char *txt) {
	if (!txt) return -1;

	// Work on unsigned bytes so range comparisons behave regardless of
	// whether plain char is signed.
	const unsigned char *p = reinterpret_cast<const unsigned char *>(txt);
	unsigned long countUTF8 = 0;

	while (*p) {
		const unsigned char lead = *p++;

		// Plain ASCII: nothing to validate.
		if (lead < 0x80) continue;

		// Number of continuation bytes and the allowed range of the FIRST
		// continuation byte. Only the first one is ever narrowed; it is what
		// excludes overlong forms, surrogates and out-of-range code points.
		int trailing = 0;
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead >= 0xC2 && lead <= 0xDF) {
			trailing = 1;
		}
		else if (lead >= 0xE0 && lead <= 0xEF) {
			trailing = 2;
			if (lead == 0xE0) lo = 0xA0;        // below would be overlong
			else if (lead == 0xED) hi = 0x9F;   // above would be a surrogate
		}
		else if (lead >= 0xF0 && lead <= 0xF4) {
			trailing = 3;
			if (lead == 0xF0) lo = 0x90;        // below would be overlong
			else if (lead == 0xF4) hi = 0x8F;   // above would exceed U+10FFFF
		}
		else {
			// 80..BF is a stray continuation byte; C0, C1 and F5..FF can
			// never start a well-formed sequence.
			return 0;
		}

		for (int i = 0; i < trailing; ++i, ++p) {
			// A terminating NUL is below every allowed range, so running
			// out of bytes is caught here without reading past the end.
			if (*p < lo || *p > hi) return 0;
			lo = 0x80;
			hi = 0xBF;
		}

		// We have a valid UTF-8 character, so count it
		++countUTF8;
	}

	// At this point it is either UTF-8 or 7-bit ascii
	return countUTF8 ? 1 : -1;
}
178 
179 void normalizeInput(SWKey &key, SWBuf &text) {
180 #ifdef _ICU_
181  int utf8State = detectUTF8(text.c_str());
182  if (normalize) {
183  // Don't need to normalize text that is ASCII
184  // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
185  if (!utf8State) {
186  cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
187  converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
188  converted++;
189 
190  // Prepare for double check. This probably can be removed.
191  // But for now we are running the check again.
192  // This is to determine whether we need to normalize output of the conversion.
193  utf8State = detectUTF8(text.c_str());
194  }
195 
196  // Double check. This probably can be removed.
197  if (!utf8State) {
198  cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
199  }
200 
201  if (utf8State > 0) {
202  SWBuf before = text;
203  normalizer->processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
204  if (before != text) {
205  normalized++;
206  }
207  }
208  }
209 #endif
210 }
211 
212 void writeEntry(SWKey &key, SWBuf &text) {
213 #ifdef DEBUG
214  cout << "(" << entryCount << ") " << key << endl;
215 #endif
216 
217  module->setKey(key);
218 
219  normalizeInput(key, text);
220 
221  module->setEntry(text);
222 }
223 
224 void linkToEntry(const SWBuf &keyBuf, const SWBuf &linkBuf) {
225  SWKey tmpkey = linkBuf.c_str();
226  module->linkEntry(&tmpkey);
227 #ifdef DEBUG
228  cout << "(" << entryCount << ") " << "Linking: " << linkBuf << endl;
229 #endif
230 }
231 
232 // Return true if the content was handled or is to be ignored.
233 // false if the what has been seen is to be accumulated and considered later.
234 bool handleToken(SWBuf &text, XMLTag *token) {
235  // The start token for the current entry;
236  static XMLTag startTag;
237 
238  // Flags to indicate whether we are in a entry, entryFree or superentry
239  static bool inEntry = false;
240  static bool inEntryFree = false;
241  static bool inSuperEntry = false;
242 
243  const char *tokenName = token->getName();
244 
245  static const char *splitPtr, *splitPtr2 = NULL;
246  static char *splitBuffer = new char[4096];
247  static SWKey tmpKey;
248 //-- START TAG -------------------------------------------------------------------------
249  if (!token->isEndTag()) {
250 
251  // If we are not in an "entry" and we see one, then enter it.
252  if (!inEntry && !inEntryFree && !inSuperEntry) {
253  inEntry = !strcmp(tokenName, "entry");
254  inEntryFree = !strcmp(tokenName, "entryFree");
255  inSuperEntry = !strcmp(tokenName, "superentry");
256  if (inEntry || inEntryFree || inSuperEntry) {
257 #ifdef DEBUG
258  cout << "Entering " << tokenName << endl;
259 #endif
260  startTag = *token;
261  text = "";
262 
263  keyStr = token->getAttribute("n"); // P5 with linking and/or non-URI chars
264  if (!strlen(keyStr)) {
265  keyStr = token->getAttribute("sortKey"); // P5 otherwise
266  if (!strlen(keyStr)) {
267  keyStr = token->getAttribute("key"); // P4
268  }
269  }
270 
271  return false; // make tag be part of the output
272  }
273  }
274  }
275 
276 //-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
277  else {
278 
279  // ENTRY end
280  // If we see the end of an entry that we are in, then leave it
281  if ((inEntry && !strcmp(tokenName, "entry" )) ||
282  (inEntryFree && !strcmp(tokenName, "entryFree" )) ||
283  (inSuperEntry && !strcmp(tokenName, "superentry"))) {
284 #ifdef DEBUG
285  cout << "Leaving " << tokenName << endl;
286 #endif
287  // Only one is false coming into here,
288  // but all must be on leaving.
289  inEntry = false;
290  inEntryFree = false;
291  inSuperEntry = false;
292  text += token->toString();
293 
294  entryCount++;
295 #ifdef DEBUG
296  cout << "keyStr: " << keyStr << endl;
297 #endif
298  splitPtr = strstr(keyStr, "|");
299  if (splitPtr) {
300  strncpy (splitBuffer, keyStr.c_str(), splitPtr - keyStr.c_str());
301  splitBuffer[splitPtr - keyStr.c_str()] = 0;
302  *currentKey = splitBuffer;
303 #ifdef DEBUG
304  cout << "splitBuffer: " << splitBuffer << endl;
305  cout << "currentKey: " << *currentKey << endl;
306 #endif
307  writeEntry(*currentKey, text);
308 #if 1
309  while (splitPtr) {
310  splitPtr += 1;
311  splitPtr2 = strstr(splitPtr, "|");
312  entryCount++;
313  if (splitPtr2) {
314  strncpy (splitBuffer, splitPtr, splitPtr2 - splitPtr);
315  splitBuffer[splitPtr2 - splitPtr] = 0;
316 #ifdef DEBUG
317  cout << "splitBuffer: " << splitBuffer << endl;
318  cout << "currentKey: " << *currentKey << endl;
319 #endif
320  linkToEntry(currentKey->getText(), splitBuffer);
321  splitPtr = splitPtr2;
322  }
323  else {
324  strcpy (splitBuffer, splitPtr);
325 #ifdef DEBUG
326  cout << "splitBuffer: " << splitBuffer << endl;
327  cout << "currentKey: " << *currentKey << endl;
328 #endif
329  linkToEntry(currentKey->getText(), splitBuffer);
330  splitPtr = 0;
331  }
332  }
333 #endif
334  }
335  else {
336  *currentKey = keyStr;
337  writeEntry(*currentKey, text);
338  }
339 
340  // Since we consumed the text, clear it
341  // and tell the caller that the tag was consumed.
342  text = "";
343  return true;
344  }
345  }
346  return false;
347 }
348 
/**
 * Print the command line help to stderr and terminate the program.
 *
 * @param app   program name shown in the usage line
 * @param error optional error message printed before the help text
 *
 * Never returns: always ends with exit(-1).
 */
void usage(const char *app, const char *error = 0) {

	if (error) {
		fprintf(stderr, "\n%s: %s\n", app, error);
	}

	fprintf(stderr, "%s", "TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for\n\tThe SWORD Project\n");
	fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app);

	// Fixed part of the help text, emitted verbatim in this order.
	static const char *const optionHelp[] = {
		" -z <l|z|b|x>\t\t use compression (default: none)\n",
		"\t\t\t\t l - LZSS; z - ZIP; b - bzip2; x - xz\n",
		" -s <2|4>\t\t max text size per entry (default: 4)\n",
		" -c <cipher_key>\t encipher module using supplied key\n",
		"\t\t\t\t (default: none)\n",
		" -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n",
		"\t\t\t\t (default is to convert to UTF-8, if needed,\n",
		"\t\t\t\t and then normalize to NFC. Note: all UTF-8\n",
		"\t\t\t\t texts should be normalized to NFC.)\n",
		"\n\tThe options -z and -s are mutually exclusive.\n"
	};
	const unsigned int helpLines = sizeof(optionHelp) / sizeof(optionHelp[0]);
	for (unsigned int line = 0; line < helpLines; ++line) {
		fprintf(stderr, "%s", optionHelp[line]);
	}

	exit(-1);
}
367 
368 int main(int argc, char **argv) {
369 #ifdef _ICU_
370  UTF8NFC normalizr;
371  normalizer = &normalizr;
372 #endif
373 
374  SWBuf program = argv[0];
375  fprintf(stderr, "You are running %s: $Rev: 3416 $\n", argv[0]);
376 
377  // Let's test our command line arguments
378  if (argc < 3) {
379  usage(*argv);
380  }
381 
382  // variables for arguments, holding defaults
383  SWBuf path = argv[1];
384  SWBuf teiDoc = argv[2];
385  SWBuf compType = "";
386  SWBuf modDrv = "";
387  SWBuf recommendedPath = "./modules/lexdict/";
388  SWBuf cipherKey = "";
389  SWCompress *compressor = 0;
390 
391  for (int i = 3; i < argc; i++) {
392  if (!strcmp(argv[i], "-z")) {
393  if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s");
394  compType = "ZIP";
395  if (i+1 < argc && argv[i+1][0] != '-') {
396  switch (argv[++i][0]) {
397  case 'l': compType = "LZSS"; break;
398  case 'z': compType = "ZIP"; break;
399  case 'b': compType = "BZIP2"; break;
400  case 'x': compType = "XZ"; break;
401  }
402  }
403  modDrv = "zLD";
404  recommendedPath += "zld/";
405  }
406  else if (!strcmp(argv[i], "-Z")) {
407  if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
408  if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s");
409  compType = "LZSS";
410  modDrv = "zLD";
411  recommendedPath += "zld/";
412  }
413  else if (!strcmp(argv[i], "-s")) {
414  if (compType.size()) usage(*argv, "Cannot specify both -s and -z");
415  if (i+1 < argc) {
416  int size = atoi(argv[++i]);
417  if (size == 2) {
418  modDrv = "RawLD";
419  recommendedPath += "rawld/";
420  continue;
421  }
422  if (size == 4) {
423  modDrv = "RawLD4";
424  recommendedPath += "rawld4/";
425  continue;
426  }
427  }
428  usage(*argv, "-s requires one of <2|4>");
429  }
430  else if (!strcmp(argv[i], "-N")) {
431  normalize = false;
432  }
433  else if (!strcmp(argv[i], "-c")) {
434  if (i+1 < argc) cipherKey = argv[++i];
435  else usage(*argv, "-c requires <cipher_key>");
436  }
437  else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
438  }
439  if (!modDrv.size()) {
440  modDrv = "RawLD4";
441  recommendedPath += "rawld4/";
442  }
443 
444 #ifndef _ICU_
445  if (normalize) {
446  normalize = false;
447  cout << program << " is not compiled with support for ICU. Setting -N flag." << endl;
448  }
449 #endif
450 
451  if (compType == "LZSS") {
452  compressor = new LZSSCompress();
453  }
454  else if (compType == "ZIP") {
455 #ifndef EXCLUDEZLIB
456  compressor = new ZipCompress();
457 #else
458  usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library");
459 #endif
460  }
461  else if (compType == "BZIP2") {
462 #ifndef EXCLUDEBZIP2
463  compressor = new Bzip2Compress();
464 #else
465  usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library");
466 #endif
467  }
468  else if (compType == "XZ") {
469 #ifndef EXCLUDEXZ
470  compressor = new XzCompress();
471 #else
472  usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library");
473 #endif
474  }
475 
476 #ifdef DEBUG
477  // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n";
478  cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n";
479  cout << "";
480 // exit(-3);
481 #endif
482 
483  SWBuf modName = path;
484  int pathlen = path.length();
485  char lastChar = path[pathlen - 1];
486  if (lastChar != '/' && lastChar != '\\') {
487  modName += "/";
488  }
489  modName += "dict";
490 
491  SWBuf keyBuf;
492  SWBuf entBuf;
493  SWBuf lineBuf;
494  vector<string> linkBuf;
495 
496  if (modDrv == "zLD") {
497  if (zLD::createModule(modName)) {
498  fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
499  exit(-3);
500  }
501  module = new zLD(modName, 0, 0, 30, compressor);
502  }
503  else if (modDrv == "RawLD") {
504  if (RawLD::createModule(modName)) {
505  fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
506  exit(-3);
507  }
508  module = new RawLD(modName);
509  }
510  else {
511  if (RawLD4::createModule(modName)) {
512  fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
513  exit(-3);
514  }
515  module = new RawLD4(modName);
516  }
517 
518  SWFilter *cipherFilter = 0;
519 
520  if (cipherKey.size()) {
521  fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
522  cipherFilter = new CipherFilter(cipherKey.c_str());
523  module->addRawFilter(cipherFilter);
524  }
525 
526  if (!module->isWritable()) {
527  fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
528  exit(-1);
529  }
530 
531  // Let's see if we can open our input file
532  ifstream infile(teiDoc);
533  if (infile.fail()) {
534  fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str());
535  exit(-2);
536  }
537 
539  currentKey->setPersist(true);
541 
542  (*module) = TOP;
543 
544  SWBuf token;
545  SWBuf text;
546  bool intoken = false;
547  char curChar = '\0';
548 
549  while (infile.good()) {
550 
551  curChar = infile.get();
552 
553  // skip the character if it is bad. infile.good() will catch the problem
554  if (curChar == -1) {
555  continue;
556  }
557 
558  if (!intoken && curChar == '<') {
559  intoken = true;
560  token = "<";
561  continue;
562  }
563 
564  if (intoken && curChar == '>') {
565  intoken = false;
566  token.append('>');
567 
568  XMLTag *t = new XMLTag(token.c_str());
569  if (!handleToken(text, t)) {
570  text.append(*t);
571  }
572  delete t;
573  continue;
574  }
575 
576  if (intoken)
577  token.append(curChar);
578  else
579  switch (curChar) {
580  case '>' : text.append("&gt;"); break;
581  case '<' : text.append("&lt;"); break;
582  default : text.append(curChar); break;
583  }
584  }
585 
586  // Force the last entry from the text buffer.
587  //text = "";
588  //writeEntry(*currentKey, text);
589 
590  delete module;
591  delete currentKey;
592  if (cipherFilter)
593  delete cipherFilter;
594  infile.close();
595 
596 #ifdef _ICU_
597  if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted);
598  if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized);
599 #endif
600 
601  /*
602  * Suggested module name detection.
603  * Only used for suggesting a conf.
604  *
605  * Various forms of path.
606  * . and .. - no module name given, use "dict".
607  * Or one of the following where z is the module name
608  * and x may be . or ..
609  * z
610  * x/y/z
611  * x/y/z/
612  * x/y/z/z
613  */
614  SWBuf suggestedModuleName = path;
615  if (lastChar == '/' || lastChar == '\\') {
616  suggestedModuleName.setSize(--pathlen);
617  }
618 
619  lastChar = suggestedModuleName[pathlen - 1];
620  if (lastChar == '.') {
621  suggestedModuleName = "???";
622  }
623  else {
624  /* At this point the suggestion is either
625  * what follows the last / or \
626  * or the entire string
627  */
628  const char *m = strrchr(suggestedModuleName.c_str(), '/');
629  if (!m) {
630  m = strrchr(suggestedModuleName.c_str(), '\\');
631  }
632  if (m) {
633  suggestedModuleName = m+1;
634  }
635  }
636 
637  recommendedPath += suggestedModuleName;
638  recommendedPath += "/dict";
639 
640  fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n");
641  fprintf(stderr, "[%s]\n", suggestedModuleName.c_str());
642  fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str());
643  fprintf(stderr, "Description=???\n");
644  fprintf(stderr, "SourceType=TEI\n");
645  fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???"));
646  fprintf(stderr, "ModDrv=%s\n", modDrv.c_str());
647  if (compressor) {
648  fprintf(stderr, "CompressType=%s\n", compType.c_str());
649  }
650  if (cipherKey.size()) {
651  fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str());
652  }
653 }
#define TOP
Definition: swkey.h:68
virtual SWKey * createKey() const
Definition: swtext.cpp:67
void normalizeInput(SWKey &key, SWBuf &text)
Definition: tei2mod.cpp:179
bool normalize
Definition: tei2mod.cpp:101
virtual void setEntry(const char *inbuf, long len=-1)
Definition: swmodule.cpp:1680
const char * getName() const
Definition: utilxml.h:58
SWText * module
Definition: osis2mod.cpp:105
Definition: utilxml.h:38
bool handleToken(SWBuf &text, XMLTag token)
Definition: osis2mod.cpp:617
virtual char setKey(const SWKey *ikey)
Definition: swmodule.cpp:298
int main(int argc, char **argv)
Definition: addcomment.cpp:32
const char * toString() const
Definition: utilxml.cpp:285
unsigned long entryCount
Definition: tei2mod.cpp:104
static char createModule(const char *path)
Definition: rawld4.h:53
virtual void linkEntry(const SWKey *sourceKey)
Definition: swmodule.cpp:1683
int detectUTF8(const char *txt)
Definition: osis2mod.cpp:152
return NULL
Definition: regex.c:7953
SWBuf keyStr
Definition: tei2mod.cpp:102
static char createModule(const char *path)
Definition: rawld.h:53
virtual bool isWritable() const
Definition: swmodule.h:506
int normalized
Definition: osis2mod.cpp:102
void writeEntry(SWModule *book, SWBuf keyBuffer, SWBuf entBuffer)
Definition: imp2gbs.cpp:131
virtual SWModule & addRawFilter(SWFilter *newFilter)
Definition: swmodule.h:694
int converted
Definition: osis2mod.cpp:103
const char * getAttribute(const char *attribName, int partNum=-1, char partSplit= '|') const
Definition: utilxml.cpp:230
void usage(const char *app)
Definition: imp2gbs.cpp:65
int size
Definition: regex.c:5043
bool isEndTag(const char *eID=0) const
Definition: utilxml.cpp:323
SWKey * currentKey
Definition: tei2mod.cpp:100
static char createModule(const char *path)
Definition: zld.h:49
Definition: swld.h:36
void linkToEntry(VerseKey &linkKey, VerseKey &dest)
Definition: osis2mod.cpp:595