The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tei2mod.cpp File Reference
#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <swbuf.h>
#include <utilxml.h>
#include <rawld.h>
#include <rawld4.h>
#include <zld.h>
#include <lzsscomprs.h>
#include <zipcomprs.h>
#include <bz2comprs.h>
#include <xzcomprs.h>
#include <stdio.h>
#include <cipherfil.h>
+ Include dependency graph for tei2mod.cpp:

Go to the source code of this file.

Macros

#define DEBUG
 

Functions

int detectUTF8 (const char *txt)
 
bool handleToken (SWBuf &text, XMLTag *token)
 
void linkToEntry (const SWBuf &keyBuf, const SWBuf &linkBuf)
 
int main (int argc, char **argv)
 
void normalizeInput (SWKey &key, SWBuf &text)
 
void usage (const char *app, const char *error=0)
 
void writeEntry (SWKey &key, SWBuf &text)
 

Variables

SWKey * currentKey = NULL
 
unsigned long entryCount = 0
 
SWBuf keyStr
 
SWLD * module = NULL
 
bool normalize = true
 

Macro Definition Documentation

#define DEBUG

Definition at line 97 of file tei2mod.cpp.

Function Documentation

int detectUTF8 ( const char *  txt)

Determine whether the string contains a valid unicode sequence. The following table gives the pattern of a valid UTF-8 character.

  Unicode Range              1st       2nd       3rd       4th
  U-00000000 - U-0000007F    0nnnnnnn
  U-00000080 - U-000007FF    110nnnnn  10nnnnnn
  U-00000800 - U-0000FFFF    1110nnnn  10nnnnnn  10nnnnnn
  U-00010000 - U-001FFFFF    11110nnn  10nnnnnn  10nnnnnn  10nnnnnn

Note:

  1. The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
  2. The number of bits of the leading byte before the first 0 is the total number of bytes.
  3. The "n" are the bits of the unicode codepoint. This routine does not check to see if the code point is in the range. It could.

Parameters:
  txt  the text to check

Returns:
   1 if all high order characters form a valid unicode sequence
  -1 if there are no high order characters. Note: this is also a valid unicode sequence
   0 if there are high order characters that do not form a valid unicode sequence

Author: DM Smith

Definition at line 131 of file tei2mod.cpp.

/**
 * Classify a NUL-terminated string as 7-bit ASCII, structurally valid
 * UTF-8, or neither.
 *
 * Only the byte structure is validated: a lead byte announcing 2 to 4 bytes
 * must be followed by the matching number of 10nnnnnn continuation bytes.
 * Overlong encodings and code points outside the Unicode range are NOT
 * rejected.
 *
 * @param txt the text to check; NULL is treated like an empty string
 * @return  1 if every high order byte is part of a well-formed sequence,
 *         -1 if there are no high order bytes at all (plain ASCII, empty
 *            or NULL input),
 *          0 if some high order byte is not part of a well-formed sequence
 */
int detectUTF8(const char *txt) {
	unsigned int countUTF8 = 0;

	// Nothing to scan: report "no high order characters", exactly as for "".
	if (!txt) return -1;

	// Work on unsigned bytes to make masking and shifting easier
	const unsigned char *p = (const unsigned char *) txt;
	while (*p) {
		// Is the high order bit set?
		if (*p & 0x80) {
			// The number of leading 1 bits in the lead byte is the total
			// number of bytes in the encoded character.
			int count = 0;
			for (unsigned char lead = *p; lead & 0x80; lead <<= 1) {
				count++;
			}

			// Count 1 is a bare continuation byte (10nnnnnn), which cannot
			// start a character. Counts 5 to 8 (111110nn ... 11111111) are
			// not legal starts, either.
			if (count < 2 || count > 4) return 0;

			// Expect (count - 1) following bytes of the pattern 10nnnnnn.
			// The loop also stops early when the terminating NUL is reached.
			while (--count && *++p) {
				if ((0xc0 & *p) != 0x80) return 0;
			}

			// Ran out of bytes in the middle of a character: cannot be UTF-8
			if (count) return 0;

			// We have a well-formed UTF-8 character, so count it
			countUTF8++;
		}

		// Advance to the next byte to examine.
		p++;
	}

	// At this point it is either UTF-8 or 7-bit ASCII
	return countUTF8 ? 1 : -1;
}
bool handleToken ( SWBuf & text,
XMLTag * token 
)

Definition at line 234 of file tei2mod.cpp.

234  {
235  // The start token for the current entry;
236  static XMLTag startTag;
237 
238  // Flags to indicate whether we are in a entry, entryFree or superentry
239  static bool inEntry = false;
240  static bool inEntryFree = false;
241  static bool inSuperEntry = false;
242 
243  const char *tokenName = token->getName();
244 
245  static const char *splitPtr, *splitPtr2 = NULL;
246  static char *splitBuffer = new char[4096];
247  static SWKey tmpKey;
248 //-- START TAG -------------------------------------------------------------------------
249  if (!token->isEndTag()) {
250 
251  // If we are not in an "entry" and we see one, then enter it.
252  if (!inEntry && !inEntryFree && !inSuperEntry) {
253  inEntry = !strcmp(tokenName, "entry");
254  inEntryFree = !strcmp(tokenName, "entryFree");
255  inSuperEntry = !strcmp(tokenName, "superentry");
256  if (inEntry || inEntryFree || inSuperEntry) {
257 #ifdef DEBUG
258  cout << "Entering " << tokenName << endl;
259 #endif
260  startTag = *token;
261  text = "";
262 
263  keyStr = token->getAttribute("n"); // P5 with linking and/or non-URI chars
264  if (!strlen(keyStr)) {
265  keyStr = token->getAttribute("sortKey"); // P5 otherwise
266  if (!strlen(keyStr)) {
267  keyStr = token->getAttribute("key"); // P4
268  }
269  }
270 
271  return false; // make tag be part of the output
272  }
273  }
274  }
275 
276 //-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
277  else {
278 
279  // ENTRY end
280  // If we see the end of an entry that we are in, then leave it
281  if ((inEntry && !strcmp(tokenName, "entry" )) ||
282  (inEntryFree && !strcmp(tokenName, "entryFree" )) ||
283  (inSuperEntry && !strcmp(tokenName, "superentry"))) {
284 #ifdef DEBUG
285  cout << "Leaving " << tokenName << endl;
286 #endif
287  // Only one is false coming into here,
288  // but all must be on leaving.
289  inEntry = false;
290  inEntryFree = false;
291  inSuperEntry = false;
292  text += token->toString();
293 
294  entryCount++;
295 #ifdef DEBUG
296  cout << "keyStr: " << keyStr << endl;
297 #endif
298  splitPtr = strstr(keyStr, "|");
299  if (splitPtr) {
300  strncpy (splitBuffer, keyStr.c_str(), splitPtr - keyStr.c_str());
301  splitBuffer[splitPtr - keyStr.c_str()] = 0;
302  *currentKey = splitBuffer;
303 #ifdef DEBUG
304  cout << "splitBuffer: " << splitBuffer << endl;
305  cout << "currentKey: " << *currentKey << endl;
306 #endif
307  writeEntry(*currentKey, text);
308 #if 1
309  while (splitPtr) {
310  splitPtr += 1;
311  splitPtr2 = strstr(splitPtr, "|");
312  entryCount++;
313  if (splitPtr2) {
314  strncpy (splitBuffer, splitPtr, splitPtr2 - splitPtr);
315  splitBuffer[splitPtr2 - splitPtr] = 0;
316 #ifdef DEBUG
317  cout << "splitBuffer: " << splitBuffer << endl;
318  cout << "currentKey: " << *currentKey << endl;
319 #endif
320  linkToEntry(currentKey->getText(), splitBuffer);
321  splitPtr = splitPtr2;
322  }
323  else {
324  strcpy (splitBuffer, splitPtr);
325 #ifdef DEBUG
326  cout << "splitBuffer: " << splitBuffer << endl;
327  cout << "currentKey: " << *currentKey << endl;
328 #endif
329  linkToEntry(currentKey->getText(), splitBuffer);
330  splitPtr = 0;
331  }
332  }
333 #endif
334  }
335  else {
336  *currentKey = keyStr;
337  writeEntry(*currentKey, text);
338  }
339 
340  // Since we consumed the text, clear it
341  // and tell the caller that the tag was consumed.
342  text = "";
343  return true;
344  }
345  }
346  return false;
347 }
const char * getName() const
Definition: utilxml.h:58
Definition: utilxml.h:38
const char * toString() const
Definition: utilxml.cpp:285
unsigned long entryCount
Definition: tei2mod.cpp:104
return NULL
Definition: regex.c:7953
SWBuf keyStr
Definition: tei2mod.cpp:102
void writeEntry(SWModule *book, SWBuf keyBuffer, SWBuf entBuffer)
Definition: imp2gbs.cpp:131
const char * getAttribute(const char *attribName, int partNum=-1, char partSplit= '|') const
Definition: utilxml.cpp:230
bool isEndTag(const char *eID=0) const
Definition: utilxml.cpp:323
SWKey * currentKey
Definition: tei2mod.cpp:100
void linkToEntry(VerseKey &linkKey, VerseKey &dest)
Definition: osis2mod.cpp:595
void linkToEntry ( const SWBuf & keyBuf,
const SWBuf & linkBuf 
)

Definition at line 224 of file tei2mod.cpp.

224  {
225  SWKey tmpkey = linkBuf.c_str();
226  module->linkEntry(&tmpkey);
227 #ifdef DEBUG
228  cout << "(" << entryCount << ") " << "Linking: " << linkBuf << endl;
229 #endif
230 }
SWText * module
Definition: osis2mod.cpp:105
unsigned long entryCount
Definition: tei2mod.cpp:104
virtual void linkEntry(const SWKey *sourceKey)
Definition: swmodule.cpp:1683
int main ( int  argc,
char **  argv 
)

Definition at line 368 of file tei2mod.cpp.

// main: command-line driver for tei2mod.
//
// Takes <output/path> from argv[1] and <teiDoc> from argv[2], parses the
// options that follow, creates a zLD, RawLD or RawLD4 module at
// "<path>/dict", feeds the TEI document one character at a time through
// handleToken(), and finally prints a suggested conf stanza to stderr.
//
// Exit paths visible here: usage() (which ends in exit(-1)) on bad
// arguments, exit(-3) when the module cannot be created, exit(-1) when the
// module is not writable, and exit(-2) when the input file cannot be opened.
//
// NOTE(review): this listing skips source lines 538 and 540 (see the note
// further down), so it is not a complete copy of the function body.
368  {
369 #ifdef _ICU_
370  UTF8NFC normalizr;
371  normalizer = &normalizr;
372 #endif
373 
374  SWBuf program = argv[0];
375  fprintf(stderr, "You are running %s: $Rev: 3416 $\n", argv[0]);
376 
377  // Let's test our command line arguments
378  if (argc < 3) {
379  usage(*argv);
380  }
381 
382  // variables for arguments, holding defaults
383  SWBuf path = argv[1];
384  SWBuf teiDoc = argv[2];
385  SWBuf compType = "";
386  SWBuf modDrv = "";
387  SWBuf recommendedPath = "./modules/lexdict/";
388  SWBuf cipherKey = "";
389  SWCompress *compressor = 0;
390 
391  for (int i = 3; i < argc; i++) {
// -z [l|z|b|x]: zLD driver. The optional next argument picks the compressor
// by its first character; any other character leaves the "ZIP" default.
392  if (!strcmp(argv[i], "-z")) {
393  if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s");
394  compType = "ZIP";
395  if (i+1 < argc && argv[i+1][0] != '-') {
396  switch (argv[++i][0]) {
397  case 'l': compType = "LZSS"; break;
398  case 'z': compType = "ZIP"; break;
399  case 'b': compType = "BZIP2"; break;
400  case 'x': compType = "XZ"; break;
401  }
402  }
403  modDrv = "zLD";
404  recommendedPath += "zld/";
405  }
// -Z: zLD driver with LZSS compression. This flag is not listed by usage().
406  else if (!strcmp(argv[i], "-Z")) {
407  if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
408  if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s");
409  compType = "LZSS";
410  modDrv = "zLD";
411  recommendedPath += "zld/";
412  }
// -s <2|4>: uncompressed RawLD or RawLD4 driver; any other value, or a
// missing value, falls through to usage().
413  else if (!strcmp(argv[i], "-s")) {
414  if (compType.size()) usage(*argv, "Cannot specify both -s and -z");
415  if (i+1 < argc) {
416  int size = atoi(argv[++i]);
417  if (size == 2) {
418  modDrv = "RawLD";
419  recommendedPath += "rawld/";
420  continue;
421  }
422  if (size == 4) {
423  modDrv = "RawLD4";
424  recommendedPath += "rawld4/";
425  continue;
426  }
427  }
428  usage(*argv, "-s requires one of <2|4>");
429  }
430  else if (!strcmp(argv[i], "-N")) {
431  normalize = false;
432  }
433  else if (!strcmp(argv[i], "-c")) {
434  if (i+1 < argc) cipherKey = argv[++i];
435  else usage(*argv, "-c requires <cipher_key>");
436  }
437  else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
438  }
// No driver was chosen on the command line: fall back to RawLD4.
439  if (!modDrv.size()) {
440  modDrv = "RawLD4";
441  recommendedPath += "rawld4/";
442  }
443 
// Built without ICU: normalization is switched off, as if -N had been given.
444 #ifndef _ICU_
445  if (normalize) {
446  normalize = false;
447  cout << program << " is not compiled with support for ICU. Setting -N flag." << endl;
448  }
449 #endif
450 
// Instantiate the compressor named by compType; each optional backend is
// compiled out when its EXCLUDE* macro is defined, in which case usage() exits.
451  if (compType == "LZSS") {
452  compressor = new LZSSCompress();
453  }
454  else if (compType == "ZIP") {
455 #ifndef EXCLUDEZLIB
456  compressor = new ZipCompress();
457 #else
458  usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library");
459 #endif
460  }
461  else if (compType == "BZIP2") {
462 #ifndef EXCLUDEBZIP2
463  compressor = new Bzip2Compress();
464 #else
465  usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library");
466 #endif
467  }
468  else if (compType == "XZ") {
469 #ifndef EXCLUDEXZ
470  compressor = new XzCompress();
471 #else
472  usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library");
473 #endif
474  }
475 
476 #ifdef DEBUG
477  // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n";
478  cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n";
479  cout << "";
480 // exit(-3);
481 #endif
482 
// The module files are created under "<path>/dict"; a separator is added
// only when path does not already end in '/' or '\'.
// NOTE(review): if argv[1] is an empty string, pathlen is 0 and
// path[pathlen - 1] indexes position -1 -- confirm how SWBuf handles that.
483  SWBuf modName = path;
484  int pathlen = path.length();
485  char lastChar = path[pathlen - 1];
486  if (lastChar != '/' && lastChar != '\\') {
487  modName += "/";
488  }
489  modName += "dict";
490 
// NOTE(review): keyBuf, entBuf, lineBuf and linkBuf are not referenced
// again anywhere in this listing.
491  SWBuf keyBuf;
492  SWBuf entBuf;
493  SWBuf lineBuf;
494  vector<string> linkBuf;
495 
// Create the on-disk module for the selected driver, then open it.
// A non-zero result from createModule() is treated as failure.
496  if (modDrv == "zLD") {
497  if (zLD::createModule(modName)) {
498  fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
499  exit(-3);
500  }
// NOTE(review): the meaning of the constructor arguments (0, 0, 30) is not
// visible in this listing -- confirm against zld.h.
501  module = new zLD(modName, 0, 0, 30, compressor);
502  }
503  else if (modDrv == "RawLD") {
504  if (RawLD::createModule(modName)) {
505  fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
506  exit(-3);
507  }
508  module = new RawLD(modName);
509  }
510  else {
511  if (RawLD4::createModule(modName)) {
512  fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
513  exit(-3);
514  }
515  module = new RawLD4(modName);
516  }
517 
518  SWFilter *cipherFilter = 0;
519 
// NOTE(review): the cipher phrase is echoed to stderr here and again in the
// suggested conf at the end.
520  if (cipherKey.size()) {
521  fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
522  cipherFilter = new CipherFilter(cipherKey.c_str());
523  module->addRawFilter(cipherFilter);
524  }
525 
526  if (!module->isWritable()) {
527  fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
528  exit(-1);
529  }
530 
531  // Let's see if we can open our input file
532  ifstream infile(teiDoc);
533  if (infile.fail()) {
534  fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str());
535  exit(-2);
536  }
537 
// NOTE(review): the listing jumps from 537 to 539 and from 539 to 541, so
// the statements on source lines 538 and 540 are not shown. currentKey is
// documented as initialised to NULL and is dereferenced on 539, so line 538
// presumably assigns it (the page lists createKey() and setKey() among the
// functions referenced by main) -- confirm against the real source file.
539  currentKey->setPersist(true);
541 
542  (*module) = TOP;
543 
544  SWBuf token;
545  SWBuf text;
546  bool intoken = false;
547  char curChar = '\0';
548 
// Scanner: characters between '<' and '>' are collected into token,
// everything else is appended to text.
549  while (infile.good()) {
550 
551  curChar = infile.get();
552 
553  // skip the character if it is bad. infile.good() will catch the problem
// NOTE(review): curChar is a plain char, so whether this test can match the
// end-of-file value, and whether it also matches a real 0xFF input byte,
// depends on the platform's char signedness -- consider keeping the result
// of get() in an int.
554  if (curChar == -1) {
555  continue;
556  }
557 
// '<' outside a tag starts a new token.
558  if (!intoken && curChar == '<') {
559  intoken = true;
560  token = "<";
561  continue;
562  }
563 
// '>' closes the token: parse it and let handleToken() decide whether it is
// consumed; a tag that is not consumed is appended to the entry text.
564  if (intoken && curChar == '>') {
565  intoken = false;
566  token.append('>');
567 
568  XMLTag *t = new XMLTag(token.c_str());
569  if (!handleToken(text, t)) {
570  text.append(*t);
571  }
572  delete t;
573  continue;
574  }
575 
// Inside a tag the raw character is collected. Outside a tag a stray '>' is
// escaped; the '<' case cannot be reached because '<' outside a tag was
// already handled above.
576  if (intoken)
577  token.append(curChar);
578  else
579  switch (curChar) {
580  case '>' : text.append("&gt;"); break;
581  case '<' : text.append("&lt;"); break;
582  default : text.append(curChar); break;
583  }
584  }
585 
586  // Force the last entry from the text buffer.
587  //text = "";
588  //writeEntry(*currentKey, text);
589 
// Release the module, the key and the optional cipher filter.
// NOTE(review): compressor is not deleted here; presumably the zLD module
// takes ownership of it -- confirm.
590  delete module;
591  delete currentKey;
592  if (cipherFilter)
593  delete cipherFilter;
594  infile.close();
595 
596 #ifdef _ICU_
597  if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted);
598  if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized);
599 #endif
600 
601  /*
602  * Suggested module name detection.
603  * Only used for suggesting a conf.
604  *
605  * Various forms of path.
606  * . and .. - no module name given, use "dict".
607  * Or one of the following where z is the module name
608  * and x may be . or ..
609  * z
610  * x/y/z
611  * x/y/z/
612  * x/y/z/z
613  */
// NOTE(review): the comment above says "dict" is used for "." and "..", but
// the code below substitutes "???" in that case.
614  SWBuf suggestedModuleName = path;
615  if (lastChar == '/' || lastChar == '\\') {
616  suggestedModuleName.setSize(--pathlen);
617  }
618 
619  lastChar = suggestedModuleName[pathlen - 1];
620  if (lastChar == '.') {
621  suggestedModuleName = "???";
622  }
623  else {
624  /* At this point the suggestion is either
625  * what follows the last / or \
626  * or the entire string
627  */
628  const char *m = strrchr(suggestedModuleName.c_str(), '/');
629  if (!m) {
630  m = strrchr(suggestedModuleName.c_str(), '\\');
631  }
632  if (m) {
633  suggestedModuleName = m+1;
634  }
635  }
636 
637  recommendedPath += suggestedModuleName;
638  recommendedPath += "/dict";
639 
// Print a conf template; fields that cannot be derived are left as "???".
640  fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n");
641  fprintf(stderr, "[%s]\n", suggestedModuleName.c_str());
642  fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str());
643  fprintf(stderr, "Description=???\n");
644  fprintf(stderr, "SourceType=TEI\n");
645  fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???"));
646  fprintf(stderr, "ModDrv=%s\n", modDrv.c_str());
647  if (compressor) {
648  fprintf(stderr, "CompressType=%s\n", compType.c_str());
649  }
650  if (cipherKey.size()) {
651  fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str());
652  }
653 }
#define TOP
Definition: swkey.h:68
virtual SWKey * createKey() const
Definition: swtext.cpp:67
bool normalize
Definition: tei2mod.cpp:101
SWText * module
Definition: osis2mod.cpp:105
Definition: utilxml.h:38
bool handleToken(SWBuf &text, XMLTag token)
Definition: osis2mod.cpp:617
virtual char setKey(const SWKey *ikey)
Definition: swmodule.cpp:298
static char createModule(const char *path)
Definition: rawld4.h:53
static char createModule(const char *path)
Definition: rawld.h:53
virtual bool isWritable() const
Definition: swmodule.h:506
int normalized
Definition: osis2mod.cpp:102
virtual SWModule & addRawFilter(SWFilter *newFilter)
Definition: swmodule.h:694
int converted
Definition: osis2mod.cpp:103
void usage(const char *app)
Definition: imp2gbs.cpp:65
int size
Definition: regex.c:5043
SWKey * currentKey
Definition: tei2mod.cpp:100
static char createModule(const char *path)
Definition: zld.h:49
void normalizeInput ( SWKey & key,
SWBuf & text 
)

Definition at line 179 of file tei2mod.cpp.

179  {
180 #ifdef _ICU_
181  int utf8State = detectUTF8(text.c_str());
182  if (normalize) {
183  // Don't need to normalize text that is ASCII
184  // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
185  if (!utf8State) {
186  cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
187  converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
188  converted++;
189 
190  // Prepare for double check. This probably can be removed.
191  // But for now we are running the check again.
192  // This is to determine whether we need to normalize output of the conversion.
193  utf8State = detectUTF8(text.c_str());
194  }
195 
196  // Double check. This probably can be removed.
197  if (!utf8State) {
198  cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
199  }
200 
201  if (utf8State > 0) {
202  SWBuf before = text;
203  normalizer->processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
204  if (before != text) {
205  normalized++;
206  }
207  }
208  }
209 #endif
210 }
bool normalize
Definition: tei2mod.cpp:101
int detectUTF8(const char *txt)
Definition: osis2mod.cpp:152
int normalized
Definition: osis2mod.cpp:102
int converted
Definition: osis2mod.cpp:103
void usage ( const char *  app,
const char *  error = 0 
)

Definition at line 349 of file tei2mod.cpp.

/**
 * Print the command-line help to stderr and terminate with exit(-1).
 *
 * @param app   program name shown in the usage line
 * @param error optional message printed before the help text; 0 for none
 */
void usage(const char *app, const char *error = 0) {

	if (error) {
		fprintf(stderr, "\n%s: %s\n", app, error);
	}

	fputs("TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for\n\tThe SWORD Project\n", stderr);
	fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app);

	// The option table contains no format specifiers, so it is written verbatim.
	fputs(
		" -z <l|z|b|x>\t\t use compression (default: none)\n"
		"\t\t\t\t l - LZSS; z - ZIP; b - bzip2; x - xz\n"
		" -s <2|4>\t\t max text size per entry (default: 4)\n"
		" -c <cipher_key>\t encipher module using supplied key\n"
		"\t\t\t\t (default: none)\n"
		" -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n"
		"\t\t\t\t (default is to convert to UTF-8, if needed,\n"
		"\t\t\t\t and then normalize to NFC. Note: all UTF-8\n"
		"\t\t\t\t texts should be normalized to NFC.)\n"
		"\n\tThe options -z and -s are mutually exclusive.\n",
		stderr);

	exit(-1);
}
void writeEntry ( SWKey & key,
SWBuf & text 
)

Definition at line 212 of file tei2mod.cpp.

212  {
213 #ifdef DEBUG
214  cout << "(" << entryCount << ") " << key << endl;
215 #endif
216 
217  module->setKey(key);
218 
219  normalizeInput(key, text);
220 
221  module->setEntry(text);
222 }
void normalizeInput(SWKey &key, SWBuf &text)
Definition: tei2mod.cpp:179
virtual void setEntry(const char *inbuf, long len=-1)
Definition: swmodule.cpp:1680
SWText * module
Definition: osis2mod.cpp:105
virtual char setKey(const SWKey *ikey)
Definition: swmodule.cpp:298
unsigned long entryCount
Definition: tei2mod.cpp:104

Variable Documentation

SWKey* currentKey = NULL

Definition at line 100 of file tei2mod.cpp.

unsigned long entryCount = 0

Definition at line 104 of file tei2mod.cpp.

SWBuf keyStr

Definition at line 102 of file tei2mod.cpp.

SWLD* module = NULL

Definition at line 99 of file tei2mod.cpp.

bool normalize = true

Definition at line 101 of file tei2mod.cpp.