The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
osis2mod.cpp File Reference
#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#include <stack>
#include <vector>
#include <iostream>
#include <fstream>
#include <utilstr.h>
#include <swmgr.h>
#include <rawtext.h>
#include <rawtext4.h>
#include <swbuf.h>
#include <utilxml.h>
#include <listkey.h>
#include <versekey.h>
#include <swversion.h>
#include <ztext.h>
#include <ztext4.h>
#include <lzsscomprs.h>
#include <zipcomprs.h>
#include <bz2comprs.h>
#include <xzcomprs.h>
#include <cipherfil.h>
#include <utf8utf16.h>
#include <utf16utf8.h>
+ Include dependency graph for osis2mod.cpp:

Go to the source code of this file.

Functions

int detectUTF8 (const char *txt)
 
bool handleToken (SWBuf &text, XMLTag token)
 
bool isOSISAbbrev (const char *buf)
 
bool isValidRef (const char *buf, const char *caller)
 
void linkToEntry (VerseKey &linkKey, VerseKey &dest)
 
int main (int argc, char **argv)
 
void makeValidRef (VerseKey &key)
 
void prepareSWText (const char *osisID, SWBuf &text)
 
void prepareSWVerseKey (SWBuf &buf)
 
void processOSIS (istream &infile)
 
XMLTag transformBSP (XMLTag t)
 
void usage (const char *app, const char *error=0, const bool verboseHelp=false)
 
void writeEntry (SWBuf &text, bool force=false)
 
void writeLinks ()
 

Variables

char activeOsisID [255]
 
SWBuf activeVerseText
 
int converted = 0
 
ListKey currentKeyIDs = ListKey()
 
char currentOsisID [255]
 
VerseKey currentVerse
 
int debug = 0
 
const int DEBUG_INTERVERSE = 16
 
const int DEBUG_OTHER = 512
 
const int DEBUG_QUOTE = 4
 
const int DEBUG_REF = 128
 
const int DEBUG_REV11N = 64
 
const int DEBUG_STACK = 256
 
const int DEBUG_TITLE = 8
 
const int DEBUG_VERSE = 2
 
const int DEBUG_WRITE = 1
 
const int DEBUG_XFORM = 32
 
const int EXIT_BAD_ARG = 1
 
const int EXIT_BAD_NESTING = 5
 
const int EXIT_NO_CREATE = 3
 
const int EXIT_NO_READ = 4
 
const int EXIT_NO_WRITE = 2
 
static bool inCanonicalOSISBook = true
 
std::vector< ListKey > linkedVerses
 
SWText * module = 0
 
static bool normalize = true
 
int normalized = 0
 
SWFilter * outputDecoder = NULL
 
SWFilter * outputEncoder = NULL
 
SWBuf v11n = "KJV"
 

Function Documentation

int detectUTF8 ( const char *  txt)

Determine whether the string contains a valid unicode sequence. The following table gives the pattern of a valid UTF-8 character.

Unicode Range            1st       2nd       3rd       4th
U-00000000 - U-0000007F  0nnnnnnn
U-00000080 - U-000007FF  110nnnnn  10nnnnnn
U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
U-00010000 - U-0010FFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn

Note:

  1. The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
  2. The number of bits of the leading byte before the first 0 is the total number of bytes.
  3. The "n" are the bits of the unicode codepoint. This routine does not check to see if the code point is in the range. It could.

Parameters: txt - the text to check.
Returns: 1 if all high order characters form a valid unicode sequence; -1 if there are no high order characters (note: this is also a valid unicode sequence); 0 if there are high order characters that do not form a valid unicode sequence.
Author: DM Smith

Definition at line 152 of file osis2mod.cpp.

152  {
     // Scans the NUL-terminated txt one byte at a time.
     // Returns 1 when every byte with the high bit set belongs to a well-formed
     // multi-byte sequence, -1 when no such byte occurs (pure 7-bit text) and
     // 0 as soon as a malformed sequence is met.
     // Only the lead/continuation bit patterns are checked: overlong forms and
     // code points above U-0010FFFF are not rejected.
153  unsigned int countUTF8 = 0;
154  int count = 0;
155 
156  // Cast it to make masking and shifting easier
157  const unsigned char *p = (const unsigned char*) txt;
158  while (*p) {
159  // Is the high order bit set?
160  if (*p & 0x80) {
161  // Then count the number of high order bits that are set.
162  // This determines the number of following bytes
163  // that are a part of the unicode character
164  unsigned char i = *p;
165  for (count = 0; i & 0x80; count++) {
166  i <<= 1;
167  }
168 
169  // Validate count:
170  // Count 0: cannot happen here because the high bit is set; it would mean a bug in this code
171  // Count 1: is a pattern of 10nnnnnn,
172  // which does not signal the start of a unicode character
173  // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111
174  // are not legal starts, either
175  if (count < 2 || count > 4) return 0;
176 
177  // At this point we expect (count - 1) following characters
178  // of the pattern 10nnnnnn
     // --count is evaluated first, so p only advances while a continuation
     // byte is still owed; hitting the terminating NUL leaves count non-zero.
179  while (--count && *++p) {
180  // The pattern of each following character must be: 10nnnnnn
181  // So, compare the top 2 bits.
182  if ((0xc0 & *p) != 0x80) return 0;
183  }
184 
185  // Oops, we've run out of bytes too soon: Cannot be UTF-8
186  if (count) return 0;
187 
188  // We have a valid UTF-8 character, so count it
189  countUTF8++;
190  }
191 
192  // Advance to the next character to examine.
     // (After a multi-byte sequence p rests on its last continuation byte.)
193  p++;
194  }
195 
196  // At this point it is either UTF-8 or 7-bit ascii
197  return countUTF8 ? 1 : -1;
198 }
bool handleToken ( SWBuf & text,
XMLTag  token 
)

Definition at line 617 of file osis2mod.cpp.

617  {
     // Per-token state machine for the OSIS stream.
     // text  - the content accumulated so far; this function appends to it, clears it,
     //         or hands it to writeEntry() depending on the token.
     // token - the start, end or empty (milestone) tag being examined.
     // All parsing state is held in function-local statics, so it persists from one
     // call to the next.
     // NOTE(review): the meaning of the bool result is defined by the caller, which is
     // not part of this listing - presumably true means the token was fully dealt with
     // here and false means the caller still has to handle it; confirm.
618 
619  // Everything between the begin book tag and the first begin chapter tag is inBookIntro
620  static bool inBookIntro = false;
621 
622  // Everything between the begin chapter tag and the first begin verse tag is inChapterIntro
623  static bool inChapterIntro = false;
624 
625  // Flag indicating whether we are processing the content of a chapter
626  static bool inChapter = false;
627 
628  // Flag indicating whether we are processing the content of a verse
629  static bool inVerse = false;
630 
631  // Flag indicating whether we are processing content that is to be prepended to a verse
632  static bool inPreVerse = false;
     // Counter used to build the "pvN" ids shared by the sID/eID pre-verse milestones
633  static int genID = 1;
634 
635  // Flag indicating whether we are in "Words of Christ"
636  static bool inWOC = false;
637  // Tag for WOC quotes within a verse
638  static XMLTag wocTag = "<q who=\"Jesus\" marker=\"\">";
639 
640  // Flag used to indicate where useful text begins
641  static bool firstDiv = false;
642  static bool headerEnded = false;
643 
644  // Retain the sID of book, chapter and verse (commentary) divs so that we can find them again.
645  // This relies on transformBSP.
646  static SWBuf sidBook = "";
647  static SWBuf sidChapter = "";
648  static SWBuf sidVerse = "";
649 
650  // Stack of quote elements used to handle Words of Christ
651  static std::stack<XMLTag> quoteStack;
652 
653  // Stack of elements used to validate that books, chapters and verses are well-formed
654  // This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse
655  // to be begin and end tags, too.
656  // It is an error if books and chapters are not well formed (though not required by OSIS)
657  // It is a warning that verses are not well formed (because some clients are not ready)
658  static std::stack<XMLTag> tagStack;
659 
660  // The following are used to validate well-formedness
661  static int chapterDepth = 0;
662  static int bookDepth = 0;
663  static int verseDepth = 0;
664 
665  int tagDepth = tagStack.size();
666  SWBuf tokenName = token.getName();
     // A milestone that carries an eID attribute is treated as an end tag
667  bool isEndTag = token.isEndTag() || token.getAttribute("eID");
668  SWBuf typeAttr = token.getAttribute("type");
669  SWBuf eidAttr = token.getAttribute("eID");
670 
671  // process start tags
672  if (!isEndTag) {
673 
674  // Remember non-empty start tags
675  if (!token.isEmpty()) {
676  tagStack.push(token);
677 
678  if (debug & DEBUG_STACK) {
679  cout << "DEBUG(STACK): " << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl;
680  }
681  }
682 
683  // throw away everything up to the first div (that is outside the header)
684  if (!firstDiv) {
685  if (headerEnded && (tokenName == "div")) {
686  if (debug & DEBUG_OTHER) {
687  cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl;
688  }
689 
690  // TODO: Save off the content to use it to suggest the module's conf.
691  firstDiv = true;
692  text = "";
693  }
694  else {
695  // Collect the content so it can be used to suggest the module's conf.
696  return false;
697  }
698  }
699 
700  //-- WITH osisID OR annotateRef -------------------------------------------------------------------------
701  // Handle Book, Chapter, and Verse (or commentary equivalent)
702  if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) {
703 
704  // BOOK START, <div type="book" ...>
705  if (tokenName == "div" && typeAttr == "book") {
706  if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case
707 
708  if (debug & DEBUG_TITLE) {
709  cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl;
710  cout << "\tinChapterIntro = " << inChapterIntro << endl;
711  cout << "\tinBookIntro = " << inBookIntro << endl;
712  }
713 
714  currentVerse.setTestament(0);
715  currentVerse.setBook(0);
716  currentVerse.setChapter(0);
717  currentVerse.setVerse(0);
718  writeEntry(text);
719  }
720  currentVerse = token.getAttribute("osisID");
721  currentVerse.setChapter(0);
722  currentVerse.setVerse(0);
723  strcpy(currentOsisID, currentVerse.getOSISRef());
724 
725  sidBook = token.getAttribute("sID");
726  inChapter = false;
727  inVerse = false;
728  inPreVerse = false;
729  inBookIntro = true;
730  inChapterIntro = false;
731 
732  if (debug & DEBUG_TITLE) {
733  cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for book introduction" << endl;
734  }
735 
736  bookDepth = tagStack.size();
737  chapterDepth = 0;
738  verseDepth = 0;
739 
740  inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID"));
741  if (!inCanonicalOSISBook) {
742  cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl;
743  }
744  else if (debug & DEBUG_OTHER) {
745  cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl;
746  }
747 
748  return false;
749  }
750 
751  // CHAPTER START, <chapter> or <div type="chapter" ...>
752  if ((tokenName == "chapter") ||
753  (tokenName == "div" && typeAttr == "chapter")
754  ) {
755  if (inBookIntro) {
756  if (debug & DEBUG_TITLE) {
757  cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl;
758  }
759 
760  writeEntry(text);
761  }
762 
763  currentVerse = token.getAttribute("osisID");
764  currentVerse.setVerse(0);
765 
766  if (debug & DEBUG_OTHER) {
767  cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << token.getAttribute("osisID") << ")" << endl;
768  }
769 
770  strcpy(currentOsisID, currentVerse.getOSISRef());
771 
772  sidChapter = token.getAttribute("sID");
773  inChapter = true;
774  inVerse = false;
775  inPreVerse = false;
776  inBookIntro = false;
777  inChapterIntro = true;
778 
779  if (debug & DEBUG_TITLE) {
780  cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl;
781  }
782 
783  chapterDepth = tagStack.size();
784  verseDepth = 0;
785 
786  return false;
787  }
788 
789  // VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...>
790  if ((tokenName == "verse") ||
791  (tokenName == "div" && token.getAttribute("annotateType"))
792  ) {
793  if (debug & DEBUG_OTHER) {
794  cout << "DEBUG(FOUND): Entering verse" << endl;
795  }
796 
797  if (inChapterIntro) {
798  if (debug & DEBUG_TITLE) {
799  cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl;
800  }
801 
802  if (text.length()) {
803  if (debug & DEBUG_TITLE) {
804  cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl;
805  }
806 
807  writeEntry(text);
808  }
809  }
810 
811  // Did we have pre-verse material that needs to be marked?
     // The closing milestone uses the current genID and then advances it for the next pair.
812  if (inPreVerse) {
813  char genBuf[200];
814  sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++);
815  text.append(genBuf);
816  }
817 
818  // Get osisID for verse or annotateRef for commentary
819  SWBuf keyVal = token.getAttribute(tokenName == "verse" ? "osisID" : "annotateRef");
820 
821  // Massage the key into a form that parseVerseList can accept
822  prepareSWVerseKey(keyVal);
823 
824  // The osisID or annotateRef can be more than a single verse
825  // The first or only one is the currentVerse
826  // Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing.
827  // This should never happen if the references are valid OSIS references
828  ListKey verseKeys = currentVerse.parseVerseList(keyVal, currentVerse, true);
829  int memberKeyCount = verseKeys.getCount();
830  if (memberKeyCount) {
831  verseKeys.setPosition(TOP);
832  // get the first single verse
833  currentVerse = verseKeys;
834  // See if this osisID or annotateRef refers to more than one verse.
835  // This can be done by incrementing, which will produce an error
836  // if there is only one verse.
837  verseKeys.increment(1);
838  if (!verseKeys.popError()) {
839  // If it does, save it until all verses have been seen.
840  // At that point we will output links.
841  cout << "DEBUG(LINK MASTER): " << currentVerse.getOSISRef() << endl;
842  linkedVerses.push_back(verseKeys);
843  }
844  }
845  else {
846  cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute((tokenName == "verse") ? "osisID" : "annotateRef") << endl;
847  }
848 
849  strcpy(currentOsisID, currentVerse.getOSISRef());
850 
851  if (debug & DEBUG_OTHER) {
852  cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl;
853  cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl;
854  }
855 
856  sidVerse = token.getAttribute("sID");
857  inVerse = true;
858  inPreVerse = false;
859  inBookIntro = false;
860  inChapterIntro = false;
861  verseDepth = tagStack.size();
862 
863  // Include the token if it is not a verse
864  if (tokenName != "verse") {
865  text.append(token);
866  }
     // The verse start is only emitted (as a milestone) when DEBUG_VERSE is enabled
867  else if (debug & DEBUG_VERSE)
868  {
869  // transform the verse into a milestone
870  XMLTag t = "<milestone resp=\"v\" />";
871  // copy all the attributes of the verse element to the milestone
872  StringList attrNames = token.getAttributeNames();
873  for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
874  const char* attr = (*loop).c_str();
875  t.setAttribute(attr, token.getAttribute(attr));
876  }
877  text.append(t);
878  }
879 
     // A WOC quote that is still open is re-opened at the start of this verse
880  if (inWOC) {
881  text.append(wocTag);
882  }
883  return true;
884  }
885  } // done with Handle Book, Chapter, and Verse (or commentary equivalent)
886 
887  // Now consider everything else.
888 
889 /*
890  // "majorSection" is code for the Book 1-5 of Psalms // This is incorrect assumption - majorSection can appear in any large book and can start and end inside chapters
891  if (tokenName == "div" && typeAttr == "majorSection") {
892  if (inBookIntro) {
893  if (debug & DEBUG_TITLE) {
894  cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl;
895  }
896  writeEntry(text);
897  }
898 
899  if (debug & DEBUG_OTHER) {
900  cout << "DEBUG(FOUND): majorSection found " << currentVerse.getOSISRef() << endl;
901  }
902 
903  strcpy(currentOsisID, currentVerse.getOSISRef());
904 
905 // as a result of the incorrect assumption these flags are set also incorrectly and cause problems in situations where majorSections do not follow the assumptions made during creation of this patch
906 
907  inChapter = false;
908  inVerse = false;
909  inPreVerse = false;
910  inBookIntro = false;
911  inChapterIntro = true;
912 
913  if (debug & DEBUG_TITLE) {
914  cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl;
915  }
916 
917  verseDepth = 0;
918 
919  return false;
920  }
921 */
922  // Handle WOC quotes.
923  // Note this requires transformBSP to make them into milestones
924  // Otherwise have to do it here
925  if (tokenName == "q") {
926  quoteStack.push(token);
927 
928  if (debug & DEBUG_QUOTE) {
929  cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl;
930  }
931 
932  if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
933  inWOC = true;
934 
935  // Output per verse WOC markup.
936  text.append(wocTag);
937 
938  // Output the quotation mark if appropriate, inside the WOC.
939  // If there is no marker attribute, let the SWORD engine manufacture one.
940  // If there is a marker attribute and it has content, then output that.
941  // If the marker attribute is present and empty, then there is nothing to do.
942  // And have it within the WOC markup
943  if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
944  token.setAttribute("who", 0); // remove the who="Jesus"
945  text.append(token);
946  }
947  return true;
948  }
949  return false;
950  }
951 
952  // Have we found the start of pre-verse material?
953  // Pre-verse material follows the following rules
954  // 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book.
955  // 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter
956  // and the first verse of the chapter.
957  // A <div> with a type of section will be taken as surrounding verses.
958  // A <title> of type other than main, chapter or sub, will be taken as a title for the verse.
959  // Once one of these conditions is met, the division between chapter introduction and pre-verse is set.
960  // 3) Between verses, the material is split between the prior verse and the next verse.
961  // Basically, while end and empty tags are found, they belong to the prior verse.
962  // Once a begin tag is found, it belongs to the next verse.
963  if (!inPreVerse && !inBookIntro) {
964  if (inChapterIntro) {
965  // Determine when we are no longer in a chapter heading, but in pre-verse material:
966  // If we see one of the following:
967  // a section div
968  // a title that is not main, chapter or sub or unclassified (no type attribute)
969  if ((tokenName == "div" && typeAttr == "section") ||
970  (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub")
971  ) {
972  if (debug & DEBUG_TITLE) {
973  cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl;
974  }
975 
976  if (text.length()) {
977  if (debug & DEBUG_TITLE) {
978  cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl;
979  }
980 
981  // Since we have found the boundary, we need to write out the chapter heading
982  writeEntry(text);
983  }
984  // And we are no longer in the chapter heading
985  inChapterIntro = false;
986  // But rather, we are now in pre-verse material
987  inPreVerse = true;
988  }
989  }
990  else if (!inVerse && inChapter) {
991  inPreVerse = true;
992  }
993 
     // Pre-verse material has just begun: emit the opening milestone.
     // genID is not advanced here; the matching eID milestone advances it.
994  if (inPreVerse) {
995  char genBuf[200];
996  sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID);
997  text.append(genBuf);
998  }
999  }
1000 
1001  if (debug & DEBUG_INTERVERSE) {
1002  if (!inVerse && !inBookIntro && !inChapterIntro) {
1003  cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl;
1004  }
1005  }
1006 
1007  return false;
1008  } // Done with processing start and empty tags
1009 
1010  // Process end tags
1011  else {
1012 
1013  if (tagStack.empty()) {
1014  cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl;
1015  exit(EXIT_BAD_NESTING);
1016  }
1017 
1018  // Note: empty end tags have the eID attribute
1019  if (!token.isEmpty()) {
1020  XMLTag topToken = tagStack.top();
1021  tagDepth = tagStack.size();
1022 
1023  if (debug & DEBUG_STACK) {
1024  cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl;
1025  }
1026 
1027  tagStack.pop();
1028 
1029  if (tokenName != topToken.getName()) {
1030  cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
1031 // exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here.
1032  // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
1033  // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
1034  }
1035  }
1036 
1037  // We haven't seen the first div outside the header so there is little to do.
1038  if (!firstDiv) {
1039  if (tokenName == "header") {
1040  headerEnded = true;
1041 
1042  if (debug & DEBUG_OTHER) {
1043  cout << "DEBUG(FOUND): End of header found" << endl;
1044  }
1045  }
1046 
1047  // Collect the content so it can be used to suggest the module's conf.
1048  return false;
1049  }
1050 
1051  // VERSE and COMMENTARY END
1052  if ((tokenName == "verse") ||
1053  (tokenName == "div" && eidAttr == sidVerse)
1054  ) {
1055 
1056  if (tagDepth != verseDepth) {
1057  cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
1058  }
1059 
1060  // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
1061  if (inWOC) {
1062  text.append("</q>");
1063  }
1064 
1065 
1066  // Include the token if it is not a verse
1067  if (tokenName != "verse") {
1068  text.append(token);
1069  }
      // The verse end is only emitted (as a milestone) when DEBUG_VERSE is enabled
1070  else if (debug & DEBUG_VERSE)
1071  {
1072  // transform the verse into a milestone
1073  XMLTag t = "<milestone resp=\"v\" />";
1074  // copy all the attributes of the verse element to the milestone
1075  StringList attrNames = token.getAttributeNames();
1076  for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
1077  const char* attr = (*loop).c_str();
1078  t.setAttribute(attr, token.getAttribute(attr));
1079  }
1080  text.append(t);
1081  }
1082 
1083  writeEntry(text);
1084 
1085  inVerse = false;
1086  inPreVerse = false;
1087  verseDepth = 0;
1088 
1089  return true;
1090  }
1091 
1092  // Handle WOC quotes.
1093  // Note this requires transformBSP to make them into milestones
1094  // Otherwise have to manage it here
1095  if (tokenName == "q") {
      // NOTE(review): top() and pop() are called without an empty() check, so an end <q>
      // that has no matching start would operate on an empty stack - confirm the input
      // is guaranteed to be balanced.
1096  XMLTag topToken = quoteStack.top();
1097 
1098  if (debug & DEBUG_QUOTE) {
1099  cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
1100  }
1101 
1102  quoteStack.pop();
1103 
1104  // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC
1105  // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
1106  if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
1107 
1108  if (debug & DEBUG_QUOTE) {
1109  cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
1110  }
1111 
1112  inWOC = false;
1113  const char *sID = topToken.getAttribute("sID");
1114  const char *eID = token.getAttribute("eID");
1115  if (!sID) {
1116  sID = "";
1117  }
1118  if (!eID) {
1119  eID = "";
1120  }
1121  if (strcmp(sID, eID)) {
1122  cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl;
1123  }
1124 
1125 
1126  // Output the quotation mark if appropriate, inside the WOC.
1127  // If there is no marker attribute, let the SWORD engine manufacture one.
1128  // If there is a marker attribute and it has content, then output that.
1129  // If the marker attribute is present and empty, then there is nothing to do.
1130  // And have it within the WOC markup
1131  if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
1132  token.setAttribute("who", 0); // remove the who="Jesus"
1133  text.append(token);
1134  }
1135 
1136  // Now close the WOC
1137  text.append("</q>");
1138  return true;
1139  }
1140  return false;
1141  }
1142 
1143  // Look for the end of document, book and chapter
1144  // Also for material that goes with last entry
1145  if (!inVerse && !inBookIntro && !inChapterIntro) {
1146  // Is this the end of a chapter.
1147  if ((tokenName == "chapter") ||
1148  (tokenName == "div" && eidAttr == sidChapter)
1149  ) {
1150  text.append(token);
1151  writeEntry(text);
1152  inChapter = false;
1153  sidChapter = "";
1154  chapterDepth = 0;
1155  verseDepth = 0;
1156  return true;
1157  }
1158 
1159  // Is it the end of a book
1160  if (tokenName == "div" && eidAttr == sidBook) {
1161  text.append(token);
1162  writeEntry(text);
1163  bookDepth = 0;
1164  chapterDepth = 0;
1165  verseDepth = 0;
1166  return true;
1167  }
1168 
1169  // Do not include the end of an osis document
1170  if (tokenName == "osisText" || tokenName == "osis") {
1171  bookDepth = 0;
1172  chapterDepth = 0;
1173  verseDepth = 0;
1174  text = "";
1175  return true;
1176  }
1177 
1178  // When we are not inPreVerse, the interverse tags get appended to the preceding verse.
1179  if (!inPreVerse) {
1180  text.append(token);
1181  writeEntry(text);
1182 
1183  if (debug & DEBUG_INTERVERSE) {
1184  cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
1185  }
1186 
1187  return true;
1188  }
1189 
1190  if (debug & DEBUG_INTERVERSE) {
1191  cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
1192  }
1193 
1194  return false;
1195  }
1196 
1197  return false;
1198  } // done with Processing end tags
1199 
1200  return false;
1201 }
#define TOP
Definition: swkey.h:68
const int DEBUG_STACK
Definition: osis2mod.cpp:85
void prepareSWVerseKey(SWBuf &buf)
Definition: osis2mod.cpp:253
const char * setAttribute(const char *attribName, const char *attribValue, int partNum=-1, char partSplit= '|')
Definition: utilxml.cpp:248
char currentOsisID[255]
Definition: osis2mod.cpp:109
const char * getName() const
Definition: utilxml.h:58
Definition: utilxml.h:38
static bool inCanonicalOSISBook
Definition: osis2mod.cpp:117
const int DEBUG_VERSE
Definition: osis2mod.cpp:78
SWBuf v11n
Definition: osis2mod.cpp:107
bool isEmpty() const
Definition: utilxml.h:60
int debug
Definition: osis2mod.cpp:76
bool isOSISAbbrev(const char *buf)
Definition: osis2mod.cpp:120
const int DEBUG_TITLE
Definition: osis2mod.cpp:80
const StringList getAttributeNames() const
Definition: utilxml.cpp:188
const int DEBUG_QUOTE
Definition: osis2mod.cpp:79
std::list< SWBuf > StringList
Definition: swmodule.cpp:91
std::vector< ListKey > linkedVerses
Definition: osis2mod.cpp:115
void writeEntry(SWModule *book, SWBuf keyBuffer, SWBuf entBuffer)
Definition: imp2gbs.cpp:131
const int DEBUG_OTHER
Definition: osis2mod.cpp:86
const int EXIT_BAD_NESTING
Definition: osis2mod.cpp:93
const int DEBUG_INTERVERSE
Definition: osis2mod.cpp:81
const char * getAttribute(const char *attribName, int partNum=-1, char partSplit= '|') const
Definition: utilxml.cpp:230
bool isEndTag(const char *eID=0) const
Definition: utilxml.cpp:323
VerseKey currentVerse
Definition: osis2mod.cpp:106
bool isOSISAbbrev ( const char *  buf)

Definition at line 120 of file osis2mod.cpp.

120  {
     // Reports whether buf is a book name known to the versification system that
     // currentVerse is using: a non-negative book number counts as a match.
     // Line 121 was missing from this listing, which left vmgr undeclared. It is
     // restored from the function's reference list (getSystemVersificationMgr);
     // verify the exact text against osis2mod.cpp.
121  VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr();
122  const VersificationMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem());
123  return av11n->getBookNumberByOSISName(buf) >= 0;
124 }
int getBookNumberByOSISName(const char *bookName) const
VerseKey currentVerse
Definition: osis2mod.cpp:106
static VersificationMgr * getSystemVersificationMgr()
const System * getVersificationSystem(const char *name) const
bool isValidRef ( const char *  buf,
const char *  caller 
)

Determine whether a verse as given is valid for the versification. This is done by comparing the before and after of normalization.

Definition at line 372 of file osis2mod.cpp.

372  {
     // buf    - the reference to test, as text.
     // caller - label that is only used in the DEBUG(V11N) message below.
     // Returns true when buf is a heading position (testament, book, chapter or
     // verse is 0) or when it reads the same with and without auto normalization;
     // returns false when normalization changes it.
373  // Create a VerseKey that does not do auto normalization
374  // Note: need to turn on headings so that a heading does not get normalized anyway
375  // And set it to the reference under question
376  VerseKey before;
377  before.setVersificationSystem(currentVerse.getVersificationSystem());
378  before.setAutoNormalize(false);
379  before.setIntros(true);
380  before.setText(buf);
381 
382  // If we are a heading we must bail
383  // These will autonormalize to the last verse of the prior chapter
384  if (!before.getTestament() || !before.getBook() || !before.getChapter() || !before.getVerse()) {
385  return true;
386  }
387 
388  // Create a VerseKey that does do auto normalization
389  // And set it to the reference under question
390  VerseKey after;
391  after.setVersificationSystem(currentVerse.getVersificationSystem());
392  after.setAutoNormalize(true);
393  after.setText(buf);
394 
     // Unchanged by normalization: the reference exists in this versification
395  if (before == after)
396  {
397  return true;
398  }
399 
400  // If we have gotten here the reference is not in the selected versification.
401  // cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl;
402  if (debug & DEBUG_REV11N) {
403  cout << "DEBUG(V11N)[" << caller << "]: " << before << " normalizes to " << after << endl;
404  }
405 
406  return false;
407 }
int debug
Definition: osis2mod.cpp:76
const int DEBUG_REV11N
Definition: osis2mod.cpp:83
VerseKey currentVerse
Definition: osis2mod.cpp:106
void linkToEntry ( VerseKey & linkKey,
VerseKey & dest 
)

Definition at line 595 of file osis2mod.cpp.

595  {
     // Links the entry at linkKey to the entry at dest through module->linkEntry().
     // Does nothing when isValidRef() rejects linkKey.
     // currentVerse is pointed at linkKey for the duration of the call and restored afterwards.
     // NOTE(review): presumably the module is positioned through currentVerse, which is
     // why it is swapped here - confirm against the module setup code.
596 
597  // Only link verses that are in the versification.
598  if (!isValidRef(linkKey, "linkToEntry")) {
599  return;
600  }
601 
     // Keep a copy of currentVerse with auto normalization off and intros on, so it can be restored below
602  VerseKey saveKey;
603  saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
604  saveKey.setAutoNormalize(0);
605  saveKey.setIntros(1);
606  saveKey = currentVerse;
607  currentVerse = linkKey;
608 
     // currentVerse now holds linkKey, so the message names the link source first
609  cout << "INFO(LINK): Linking " << currentVerse.getOSISRef() << " to " << dest.getOSISRef() << "\n";
610  module->linkEntry(&dest);
611 
     // Put the caller's position back
612  currentVerse = saveKey;
613 }
SWText * module
Definition: osis2mod.cpp:105
virtual void linkEntry(const SWKey *sourceKey)
Definition: swmodule.cpp:1683
bool isValidRef(const char *buf, const char *caller)
Definition: osis2mod.cpp:372
VerseKey currentVerse
Definition: osis2mod.cpp:106
int main ( int  argc,
char **  argv 
)

Definition at line 1847 of file osis2mod.cpp.

1847  { // Entry point: parses the command line, creates (unless appending) and opens the module, imports the OSIS document, then exits.
1848 
1849  fprintf(stderr, "You are running osis2mod: $Rev: 3769 $ (SWORD: %s)\n", SWVersion::currentVersion.getText());
1850  // A help request anywhere on the command line is handled before any other argument check.
1851  if (argc > 1) {
1852  for (int i = 1; i < argc; i++) {
1853  if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
1854  usage(*argv, "", true);
1855  }
1856  }
1857  }
1858 
1859  // Let's test our command line arguments: both <path> and <osisDoc> are required
1860  if (argc < 3) {
1861  usage(*argv);
1862  }
1863  // NOTE(review): argv[1] and argv[2] are read below, so usage() presumably does not return -- confirm
1864  // variables for arguments, holding defaults
1865  const char* program = argv[0];
1866  const char* path = argv[1];
1867  const char* osisDoc = argv[2];
1868  int append = 0;
1869  SWBuf compType = "";
1870  bool isCommentary = false;
1871  int iType = 4; // block type passed to the compressed drivers; -b accepts 2, 3 or 4
1872  int entrySize = 0; // 4 selects the zText4/RawText4 drivers, anything else zText/RawText
1873  SWBuf cipherKey = "";
1874  SWCompress *compressor = 0;
1875  int compLevel = 0;
1876  // Options start after the two positional arguments.
1877  for (int i = 3; i < argc; i++) {
1878  if (!strcmp(argv[i], "-a")) {
1879  append = 1;
1880  }
1881  else if (!strcmp(argv[i], "-z")) {
1882  compType = "ZIP"; // default when -z is not followed by a type argument
1883  if (i+1 < argc && argv[i+1][0] != '-') {
1884  switch (argv[++i][0]) { // only the first character of the type argument is examined; other letters leave ZIP selected
1885  case 'l': compType = "LZSS"; break;
1886  case 'z': compType = "ZIP"; break;
1887  case 'b': compType = "BZIP2"; break;
1888  case 'x': compType = "XZ"; break;
1889  }
1890  }
1891  }
1892  else if (!strcmp(argv[i], "-Z")) {
1893  if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
1894  compType = "LZSS";
1895  }
1896  else if (!strcmp(argv[i], "-b")) {
1897  if (i+1 < argc) {
1898  iType = atoi(argv[++i]);
1899  if ((iType >= 2) && (iType <= 4)) continue; // valid block type: go on to the next argument
1900  }
1901  usage(*argv, "-b requires one of <2|3|4>"); // reached when the value is missing or out of range
1902  }
1903  else if (!strcmp(argv[i], "-N")) {
1904  normalize = false;
1905  }
1906  else if (!strcmp(argv[i], "-e")) {
1907  if (i+1 < argc) { // NOTE(review): a trailing -e without a value is silently ignored, unlike the other options -- confirm
1908  switch (argv[++i][0]) { // only the first character of the value is examined
1909  case '1': // leave as UTF-8
1910  outputEncoder = NULL;
1911  outputDecoder = NULL;
1912  break;
1913 
1914  case '2':
1915  outputEncoder = new UTF8UTF16();
1916  outputDecoder = new UTF16UTF8();
1917  break;
1918 #ifdef _ICU_
1919  case 's':
1920  outputEncoder = new UTF8SCSU();
1921  outputDecoder = new SCSUUTF8();
1922  break;
1923 #endif
1924  default:
1925  outputEncoder = NULL;
1926  outputDecoder = NULL;
1927  }
1928  }
1929  }
1930  else if (!strcmp(argv[i], "-c")) {
1931  if (i+1 < argc) cipherKey = argv[++i];
1932  else usage(*argv, "-c requires <cipher_key>");
1933  }
1934  else if (!strcmp(argv[i], "-v")) {
1935  if (i+1 < argc) v11n = argv[++i];
1936  else usage(*argv, "-v requires <v11n>");
1937  }
1938  else if (!strcmp(argv[i], "-s")) {
1939  if (i+1 < argc) {
1940  entrySize = atoi(argv[++i]);
1941  if (entrySize == 2 || entrySize == 4) {
1942  continue;
1943  }
1944  }
1945  usage(*argv, "-s requires one of <2|4>");
1946  }
1947  else if (!strcmp(argv[i], "-C")) {
1948  isCommentary = true;
1949  }
1950  else if (!strcmp(argv[i], "-d")) {
1951  if (i+1 < argc) debug |= atoi(argv[++i]); // flags accumulate across repeated -d options
1952  else usage(*argv, "-d requires <flags>");
1953  }
1954  else if (!strcmp(argv[i], "-l")) {
1955  if (i+1 < argc) {
1956  compLevel = atoi(argv[++i]);
1957  }
1958  else usage(*argv, "-l requires a value from 1-9");
1959  // NOTE(review): the check below accepts 0 through 10 although the message says 1-9 -- confirm the intended range
1960  if (compLevel < 0 || compLevel > 10) {
1961  usage(*argv, "-l requires a value from 1-9");
1962  }
1963  }
1964  else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
1965  }
1966 
1967  if (isCommentary) isCommentary = true; // avoid unused warning for now
1968  // Instantiate the compressor named by compType; an empty compType leaves compressor null (uncompressed module).
1969  if (compType == "LZSS") {
1970  compressor = new LZSSCompress();
1971  }
1972  else if (compType == "ZIP") {
1973 #ifndef EXCLUDEZLIB
1974  compressor = new ZipCompress();
1975 #else
1976  usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library");
1977 #endif
1978  }
1979  else if (compType == "BZIP2") {
1980 #ifndef EXCLUDEBZIP2
1981  compressor = new Bzip2Compress();
1982 #else
1983  usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library");
1984 #endif
1985  }
1986  else if (compType == "XZ") {
1987 #ifndef EXCLUDEXZ
1988  compressor = new XzCompress();
1989 #else
1990  usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library");
1991 #endif
1992  }
1993  // A level of 0 (the default) leaves the compressor at its own default level.
1994  if (compressor && compLevel > 0) {
1995  compressor->setLevel(compLevel);
1996  }
1997 
1998 #ifndef _ICU_
1999  if (normalize) {
2000  normalize = false;
2001  cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl;
2002  }
2003 #endif
2004  // NOTE(review): the line labelled create: below prints the append flag -- confirm the label is intended
2005  if (debug & DEBUG_OTHER) {
2006  cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcompressLevel: " << compLevel << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
2007  }
2008 
2009  if (!append) { // == 0 then create module
2010  // Try to initialize a default set of datafiles and indices at our
2011  // datapath location passed to us from the user.
2012  if (compressor) {
2013  if (entrySize == 4) {
2014  if (zText4::createModule(path, iType, v11n)) {
2015  fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
2016  exit(EXIT_NO_CREATE);
2017  }
2018  }
2019  else {
2020  if (zText::createModule(path, iType, v11n)) {
2021  fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
2022  exit(EXIT_NO_CREATE);
2023  }
2024  }
2025  }
2026  else if (entrySize == 4) {
2027  if (RawText4::createModule(path, v11n)) {
2028  fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
2029  exit(EXIT_NO_CREATE);
2030  }
2031  }
2032  else {
2033  if (RawText::createModule(path, v11n)) {
2034  fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
2035  exit(EXIT_NO_CREATE);
2036  }
2037  }
2038  }
2039 
2040  // Do some initialization stuff: open the module with the same driver choice used for creation above
2041  if (compressor) {
2042  if (entrySize == 4) {
2043  // Create a compressed text module allowing very large entries
2044  // Taking defaults except for first, fourth, fifth and last argument
2045  module = new zText4(
2046  path, // ipath
2047  0, // iname
2048  0, // idesc
2049  iType, // iblockType
2050  compressor, // icomp
2051  0, // idisp
2052  ENC_UNKNOWN, // enc
2053  DIRECTION_LTR, // dir
2054  FMT_UNKNOWN, // markup
2055  0, // lang
2056  v11n // versification
2057  );
2058  }
2059  else {
2060  // Create a compressed text module allowing reasonable sized entries
2061  // Taking defaults except for first, fourth, fifth and last argument
2062  module = new zText(
2063  path, // ipath
2064  0, // iname
2065  0, // idesc
2066  iType, // iblockType
2067  compressor, // icomp
2068  0, // idisp
2069  ENC_UNKNOWN, // enc
2070  DIRECTION_LTR, // dir
2071  FMT_UNKNOWN, // markup
2072  0, // lang
2073  v11n // versification
2074  );
2075  }
2076  }
2077  else if (entrySize == 4) {
2078  // Create a raw text module allowing very large entries
2079  // Taking defaults except for first and last argument
2080  module = new RawText4(
2081  path, // ipath
2082  0, // iname
2083  0, // idesc
2084  0, // idisp
2085  ENC_UNKNOWN, // encoding
2086  DIRECTION_LTR, // dir
2087  FMT_UNKNOWN, // markup
2088  0, // ilang
2089  v11n // versification
2090  );
2091  }
2092  else {
2093  // Create a raw text module allowing reasonable sized entries
2094  // Taking defaults except for first and last argument
2095  module = new RawText(
2096  path, // ipath
2097  0, // iname
2098  0, // idesc
2099  0, // idisp
2100  ENC_UNKNOWN, // encoding
2101  DIRECTION_LTR, // dir
2102  FMT_UNKNOWN, // markup
2103  0, // ilang
2104  v11n // versification
2105  );
2106  }
2107 
2108  SWFilter *cipherFilter = 0;
2109  // NOTE(review): the cipher phrase is echoed to stderr below -- confirm this is acceptable
2110  if (cipherKey.length()) {
2111  fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
2112  cipherFilter = new CipherFilter(cipherKey.c_str());
2113  module->addRawFilter(cipherFilter);
2114  }
2115 
2116  if (!module->isWritable()) {
2117  fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
2118  exit(EXIT_NO_WRITE);
2119  }
2120 
2121  // Either read from std::cin (aka stdin), when the argument is a '-'
2122  // or from a specified file.
2123  if (!strcmp(osisDoc, "-")) {
2124  processOSIS(cin);
2125  }
2126  else {
2127  // Let's see if we can open our input file
2128  ifstream infile(osisDoc);
2129  if (infile.fail()) {
2130  fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
2131  exit(EXIT_NO_READ);
2132  }
2133  processOSIS(infile);
2134  infile.close();
2135  }
2136  // Release the module and the filters allocated above before reporting success.
2137  delete module;
2138  if (cipherFilter)
2139  delete cipherFilter;
2140  if (outputEncoder)
2141  delete outputEncoder;
2142  if (outputDecoder)
2143  delete outputDecoder;
2144 
2145  fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
2146  exit(0); // success
2147 }
Definition: ztext4.h:39
static char createModule(const char *path, const char *v11n="KJV")
Definition: rawtext.h:46
void processOSIS(istream &infile)
Definition: osis2mod.cpp:1428
virtual void setLevel(int l)
Definition: swcomprs.h:53
SWText * module
Definition: osis2mod.cpp:105
SWFilter * outputEncoder
Definition: osis2mod.cpp:99
SWBuf v11n
Definition: osis2mod.cpp:107
static bool normalize
Definition: osis2mod.cpp:118
const int EXIT_NO_CREATE
Definition: osis2mod.cpp:91
const int EXIT_NO_READ
Definition: osis2mod.cpp:92
int debug
Definition: osis2mod.cpp:76
return NULL
Definition: regex.c:7953
virtual bool isWritable() const
Definition: swmodule.h:506
const int DEBUG_OTHER
Definition: osis2mod.cpp:86
static char createModule(const char *path, const char *v11n="KJV")
Definition: rawtext4.h:48
virtual SWModule & addRawFilter(SWFilter *newFilter)
Definition: swmodule.h:694
static char createModule(const char *path, int blockBound, const char *v11n="KJV")
Definition: ztext.h:62
void usage(const char *app)
Definition: imp2gbs.cpp:65
static SWVersion currentVersion
Definition: swversion.h:69
SWFilter * outputDecoder
Definition: osis2mod.cpp:100
const int EXIT_NO_WRITE
Definition: osis2mod.cpp:90
static char createModule(const char *path, int blockBound, const char *v11n="KJV")
Definition: ztext4.h:62
Definition: ztext.h:39
void makeValidRef ( VerseKey key)

This routine is used to ensure that all the text in the input is saved to the module. Assumption: The input orders all the verses for a chapter in numerical order. Thus, any verses that are not in the chosen versification (v11n) follow those that are.

The prior implementation of this adjusted the verse to the last one that is in the chosen v11n. If the chapter was extra, then it was appended to the last verse of the last chapter in the chosen v11n for that book. If it was just extra verses for a chapter, then they were appended to the last verse of the chapter.

The problem with this is when an OSIS verse refers to more than one verse, e.g. osisID="Gen.1.29 Gen.1.30 Gen.1.31" (Gen.1.31 is the last verse of the chapter in the chosen v11n) and then it is followed by Gen.1.32.

This routine assumes that linking is postponed to the end so that in the example Gen.1.30-31 are not linked but rather empty. This routine will then find the last verse in the computed chapter that has content.

Alternatively, we could have done linking as we went, but then this routine would have needed to find the first entry in the link set, and elsewhere in the code, when appending to a verse, the verse would have needed to be checked for adjacent links and those would have needed to be adjusted.

Parameter key: the key that may need to be adjusted.

Definition at line 433 of file osis2mod.cpp.

433  { // Re-targets key, which is outside the chosen versification, at the nearest earlier position that already has an entry in the module.
434  VerseKey saveKey;
435  saveKey.setVersificationSystem(key.getVersificationSystem());
436  saveKey.setAutoNormalize(false);
437  saveKey.setIntros(true);
438  saveKey = key;
439  // saveKey keeps the original reference; it is only used for the INFO message at the end.
440  // Since isValidRef returned false constrain the key to the nearest prior reference.
441  // If we are past the last chapter set the reference to the last chapter
442  int chapterMax = key.getChapterMax();
443  if (key.getChapter() > chapterMax) {
444  key.setChapter(chapterMax);
445  }
446 
447  // Either we set the chapter to the last chapter and now need to set to the last verse in the chapter
448  // Or the verse is beyond the end of the chapter.
449  // In any case we need to constrain the verse to its chapter.
450  int verseMax = key.getVerseMax();
451  key.setVerse(verseMax);
452 
453  if (debug & DEBUG_REV11N) {
454  cout << "DEBUG(V11N) Chapter max:" << chapterMax << ", Verse Max:" << verseMax << endl;
455  }
456 
457  // There are three cases we want to handle:
458  // In the examples we are using the KJV versification where the last verse of Matt.7 is Matt.7.29.
459  // In each of these cases the out-of-versification, extra verse is Matt.7.30.
460  // 1) The "extra" verse follows the last verse in the chapter.
461  // <verse osisID="Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
462  // In this case re-versify Matt.7.30 as Matt.7.29.
463  //
464  // 2) The "extra" verse follows a range (a set of linked verses).
465  // <verse osisID="Matt.7.28-Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
466  // In this case, re-versify Matt.7.30 as Matt.7.28, the first verse in the linked set.
467  // Since we are postponing linking, we want to re-versify to the last entry in the module.
468  //
469  // 3) The last verse in the chapter is not in the input. There may be other verses missing as well.
470  // <verse osisID="Matt.7.8">...</verse><verse osisID="Matt.7.30">...</verse>
471  // In this case we should re-versify Matt.7.30 as Matt.7.29.
472  // However, since this and 2) are ambiguous, we'll re-versify to the last entry in the module.
473  // Step backwards from the end of the chapter until a position with an entry is found or the key reports an error.
474  while (!key.popError() && !module->hasEntry(&key)) {
475  key.decrement(1);
476  }
477 
478  cout << "INFO(V11N): " << saveKey.getOSISRef()
479  << " is not in the " << key.getVersificationSystem()
480  << " versification. Appending content to " << key.getOSISRef() << endl;
481 }
SWText * module
Definition: osis2mod.cpp:105
int debug
Definition: osis2mod.cpp:76
virtual bool hasEntry(const SWKey *) const
Definition: swmodule.h:809
const int DEBUG_REV11N
Definition: osis2mod.cpp:83
void prepareSWText ( const char *  osisID,
SWBuf text 
)

Definition at line 200 of file osis2mod.cpp.

201 { // Checks the entry text for UTF-8 validity and, when built with ICU and normalization is enabled, converts and normalizes it in place.
202  // Always check on UTF8 and report on non-UTF8 entries
203  int utf8State = detectUTF8(text.c_str()); // 0 is treated below as not UTF-8 and > 0 as UTF-8 to normalize; other values presumably mean plain ASCII -- confirm in detectUTF8
204 
205  // Trust, but verify.
206  if (!normalize && !utf8State) {
207  cout << "WARNING(UTF8): " << osisID << ": Should be converted to UTF-8 (" << text << ")" << endl;
208  }
209 
210 #ifdef _ICU_
211  if (normalize) {
212  // Don't need to normalize text that is ASCII
213  // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
214  if (!utf8State) {
215  cout << "INFO(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
216  converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
217  converted++;
218 
219  // Prepare for double check. This probably can be removed.
220  // But for now we are running the check again.
221  // This is to determine whether we need to normalize output of the conversion.
222  utf8State = detectUTF8(text.c_str());
223  }
224 
225  // Double check. This probably can be removed.
226  if (!utf8State) {
227  cout << "ERROR(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
228  }
229  // Normalize, and count the entry only when normalization actually changed the text.
230  if (utf8State > 0) {
231  SWBuf before = text;
232  normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
233  if (before != text) {
234  normalized++;
235  }
236  }
237  }
238 #endif
239 }
static bool normalize
Definition: osis2mod.cpp:118
int detectUTF8(const char *txt)
Definition: osis2mod.cpp:152
int normalized
Definition: osis2mod.cpp:102
int converted
Definition: osis2mod.cpp:103
void prepareSWVerseKey ( SWBuf buf)

Definition at line 253 of file osis2mod.cpp.

253  { // Rewrites a list of osisIDs in place: drops work prefixes (text up to a colon) and grain suffixes (from ! onward) and replaces runs of spaces between references with a single semicolon.
254  // This routine modifies the buf in place
255  char* s = buf.getRawData(); // write cursor; never gets ahead of the read cursor p
256  char* p = s; // read cursor
257  bool inRange = false;
258  while (*p) {
259  if (inRange) {
260  if (debug & DEBUG_REF) {
261  cout << "DEBUG(REF): Copy range marker:" << *p << endl;;
262  }
263 
264  // Range markers are copied as is
265  *s++ = *p++;
266  }
267 
268  // Look ahead to see if we are in a work prefix
269  // but don't look past an osisID
270  char *n = p;
271  while (*n && *n != ':' && *n != ' ' && *n != '-') {
272  n++;
273  }
274 
275  // We have found a work prefix
276  if (*n == ':') {
277  // set p to skip the work prefix
278  p = n + 1;
279  // NOTE(review): the debug loop below starts at the write cursor s, which may lag behind the start of the prefix once earlier text has been dropped; this only affects debug output -- confirm
280  if (debug & DEBUG_REF) {
281  cout << "DEBUG(REF): Found a work prefix ";
282  for (char *x = s; x <= n; x++) {
283  cout << *x;
284  }
285  cout << endl;
286  }
287  }
288 
289  // Now we are in the meat of an osisID.
290  // Copy it to its end but stop on a grain marker of '!'
291  if (debug & DEBUG_REF) {
292  cout << "DEBUG(REF): Copy osisID:";
293  }
294 
295  while (*p && *p != '!' && *p != ' ' && *p != '-') {
296  if (debug & DEBUG_REF) {
297  cout << *p;
298  }
299 
300  *s++ = *p++;
301  }
302 
303  if (debug & DEBUG_REF) {
304  cout << endl;
305  }
306 
307  // The ! and everything following until we hit
308  // the end of the osisID is part of the grain reference
309  if (*p == '!') {
310  n = p;
311  while (*n && *n != ' ' && *n != '-') {
312  n++;
313  }
314 
315  if (debug & DEBUG_REF) {
316  cout << "DEBUG(REF): Found a grain suffix ";
317  for (char *x = p; x < n; x++) {
318  cout << *x;
319  }
320  cout << endl;
321  }
322  // Skip the grain suffix without copying it
323  p = n;
324  }
325 
326  // At this point we have processed an osisID
327 
328  // if we are not in a range and the next character is a -
329  // then we are entering a range
330  inRange = !inRange && *p == '-';
331 
332  if (debug & DEBUG_REF) {
333  if (inRange) {
334  cout << "DEBUG(REF): Found a range" << endl;
335  }
336  }
337 
338  // between ranges and stand alone osisIDs we might have whitespace
339  if (!inRange && *p == ' ') {
340  // skip this and subsequent spaces
341  while (*p == ' ') {
342  p++;
343  }
344 
345  // replacing them all with a ';'
346  *s++ = ';';
347 
348  if (debug & DEBUG_REF) {
349  cout << "DEBUG(REF): replacing space with ;. Remaining: " << p << endl;
350  }
351  }
352  }
353 
354  // Determine whether we have modified the buffer
355  // We have modified the buffer if s is not sitting on the null byte of the original
356  if (*s) {
357  // null terminate the reference
358  *s = '\0';
359  // Since we modified the swbuf, we need to tell it what we have done
360  buf.setSize(s - buf.c_str());
361 
362  if (debug & DEBUG_REF) {
363  cout << "DEBUG(REF): shortended keyVal to`" << buf.c_str() << "`"<< endl;
364  }
365  }
366 }
int debug
Definition: osis2mod.cpp:76
const int DEBUG_REF
Definition: osis2mod.cpp:84
void processOSIS ( istream &  infile)

Definition at line 1428 of file osis2mod.cpp.

1428  { // Reads the OSIS document from infile one byte at a time, splitting it into tags (handed to handleToken) and text, then flushes the last entry and writes the links.
1429  typedef enum { // states of the XML comment recognizer
1430  CS_NOT_IN_COMMENT, // or seen starting "<"
1431  CS_SEEN_STARTING_EXCLAMATION,
1432  CS_SEEN_STARTING_HYPHEN,
1433  CS_IN_COMMENT,
1434  CS_SEEN_ENDING_HYPHEN,
1435  CS_SEEN_SECOND_ENDING_HYPHEN,
1436  CS_SEEN_ENDING_GREATER_THAN
1437  } t_commentstate;
1438 
1439  typedef enum { // classification of the entity currently being scanned
1440  ET_NUM,
1441  ET_HEX,
1442  ET_CHAR,
1443  ET_NONE,
1444  ET_ERR
1445  } t_entitytype;
1446 
1447  activeOsisID[0] = '\0';
1448 
1449  strcpy(currentOsisID,"N/A");
1450 
1451  currentVerse.setVersificationSystem(v11n);
1452  currentVerse.setAutoNormalize(false);
1453  currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings
1454  currentVerse.setPersist(true);
1455 
1458 
1459  SWBuf token; // the tag being accumulated, including its angle brackets
1460  SWBuf text; // character data accumulated between tags
1461  bool incomment = false;
1462  t_commentstate commentstate = CS_NOT_IN_COMMENT;
1463  bool intoken = false;
1464  bool inWhitespace = false;
1465  bool seeingSpace = false;
1466  unsigned char curChar = '\0';
1467  SWBuf entityToken; // the entity being accumulated, starting with its ampersand
1468  bool inentity = false;
1469  t_entitytype entitytype = ET_NONE;
1470  unsigned char attrQuoteChar = '\0';
1471  bool inattribute = false;
1472  unsigned int linePos = 1;
1473  unsigned int charPos = 0;
1474 
1475  while (infile.good()) {
1476 
1477  int possibleChar = infile.get();
1478 
1479  // skip the character if it is bad. infile.good() will catch the problem
1480  if (possibleChar == -1) {
1481  continue;
1482  }
1483 
1484  curChar = (unsigned char) possibleChar;
1485 
1486  // All newlines are simply whitespace
1487  // Does a SWORD module actually require this?
1488  if (curChar == '\n') {
1489  curChar = ' ';
1490  charPos = 0;
1491  linePos++;
1492  }
1493  charPos++;
1494 
1495  // Look for entities:
1496  // These are of the form &#dddd;, &xHHHH; or &llll;
1497  // where dddd is a sequence of digits
1498  // HHHH is a sequence of [A-Fa-f0-9]
1499  // llll is amp, lt, gt, quot or apos
1500  // but we will look for a sequence of [A-Za-z0-9]
1501  // All but &amp;, &lt;, &gt;, &quot;, &apos; will produce a WARNING
1502  // In the future:
1503  // &#dddd; and &xHHHH; should be converted to UTF-8,
1504  // with a WARNING if the text is not UTF-8
1505  // &llll; other than the xml standard 5 should produce a WARNING
1506  // NOTE(review): XML writes hexadecimal character references as &#xHHHH;. In the scanner below # selects ET_NUM and the following x is then rejected as a non-digit, so such references are reported as malformed -- confirm
1507  // For entity diagnostics track whether the text is an attribute value
1508  if (inattribute && (curChar == '\'' || curChar == '"')) {
1509  if (attrQuoteChar == curChar) {
1510  inattribute = false;
1511  attrQuoteChar = '\0';
1512  }
1513  else {
1514  attrQuoteChar = curChar;
1515  }
1516  }
1517  if (intoken && curChar == '=') {
1518  inattribute = true;
1519  attrQuoteChar = '\0';
1520  }
1521  // An ampersand opens a candidate entity; it is buffered in entityToken until it ends or proves malformed.
1522  if (!inentity && curChar == '&') {
1523  inentity = true;
1524  entitytype = ET_NONE;
1525  entityToken = "&";
1526  continue;
1527  }
1528 
1529  if (inentity) {
1530  if (curChar == ';') {
1531  inentity = false;
1532  }
1533  else {
1534  switch (entitytype) {
1535  case ET_NONE:
1536  // A hex entity cannot start with X in XML, but it can in HTML
1537  // Allow for it here and complain later
1538  if (curChar == 'x' || curChar == 'X') {
1539  entitytype = ET_HEX;
1540  }
1541  else
1542  if (curChar == '#') {
1543  entitytype = ET_NUM;
1544  }
1545  else
1546  if ((curChar >= 'A' && curChar <= 'Z') ||
1547  (curChar >= 'a' && curChar <= 'z') ||
1548  (curChar >= '0' && curChar <= '9')) {
1549  entitytype = ET_CHAR;
1550  }
1551  else {
1552  inentity = false;
1553  entitytype = ET_ERR;
1554  }
1555  break;
1556 
1557  case ET_NUM :
1558  if (!(curChar >= '0' && curChar <= '9')) {
1559  inentity = false;
1560  entitytype = ET_ERR;
1561  }
1562  break;
1563  case ET_HEX :
1564  if ((curChar >= 'G' && curChar <= 'Z') ||
1565  (curChar >= 'g' && curChar <= 'z')) {
1566  // Starts out as a HEX entity, but it isn't one
1567  entitytype = ET_CHAR;
1568  }
1569  else
1570  if (!((curChar >= 'A' && curChar <= 'F') ||
1571  (curChar >= 'a' && curChar <= 'f') ||
1572  (curChar >= '0' && curChar <= '9'))) {
1573  inentity = false;
1574  entitytype = ET_ERR;
1575  }
1576  break;
1577  case ET_CHAR :
1578  if (!((curChar >= 'A' && curChar <= 'Z') ||
1579  (curChar >= 'a' && curChar <= 'z') ||
1580  (curChar >= '0' && curChar <= '9'))) {
1581  inentity = false;
1582  entitytype = ET_ERR;
1583  }
1584  break;
1585  default:
1586  cout << "FATAL(ENTITY): unknown entitytype on entity end: " << entitytype << endl;
1587  exit(EXIT_BAD_NESTING);
1588  }
1589  }
1590  // The character that invalidated an entity is not added to entityToken; it is processed as ordinary input further down.
1591  if (entitytype != ET_ERR) {
1592  entityToken.append((char) curChar);
1593  }
1594 
1595  // It is an entity, perhaps invalid, if curChar is ';', error otherwise
1596  // Test to see if we now have an entity or a failure
1597  // It may not be a valid entity.
1598  if (!inentity) {
1599  switch (entitytype) {
1600  case ET_ERR :
1601  // Remove the leading &
1602  entityToken << 1;
1603  cout << "WARNING(PARSE): malformed entity, replacing &" << entityToken << " with &amp;" << entityToken << endl;
1604  if (intoken) {
1605  token.append("&amp;");
1606  token.append(entityToken);
1607  }
1608  else {
1609  text.append("&amp;");
1610  text.append(entityToken);
1611  }
1612  break;
1613  case ET_HEX :
1614  if (entityToken[1] != 'x') {
1615  cout << "WARNING(PARSE): HEX entity must begin with &x, found " << entityToken << endl;
1616  }
1617  else {
1618  cout << "WARNING(PARSE): SWORD does not search HEX entities, found " << entityToken << endl;
1619  }
1620  break;
1621  case ET_CHAR :
1622  if (strcmp(entityToken, "&amp;") &&
1623  strcmp(entityToken, "&lt;") &&
1624  strcmp(entityToken, "&gt;") &&
1625  strcmp(entityToken, "&quot;") &&
1626  strcmp(entityToken, "&apos;")) {
1627  cout << "WARNING(PARSE): XML only supports 5 Character entities &amp;, &lt;, &gt;, &quot; and &apos;, found " << entityToken << endl;
1628  }
1629  else
1630  if (!strcmp(entityToken, "&apos;")) {
1631  cout << "WARNING(PARSE): While valid for XML, XHTML does not support &apos;." << endl;
1632  if (!inattribute) {
1633  cout << "WARNING(PARSE): &apos; is unnecessary outside of attribute values. Replacing with '. " << endl;
1634  entityToken = "'";
1635  }
1636  else {
1637  switch (attrQuoteChar) {
1638  case '"' :
1639  cout << "WARNING(PARSE): &apos; is unnecessary inside double quoted attribute values. Replacing with '. " << endl;
1640  entityToken = "'";
1641  break;
1642  case '\'' :
1643  cout << "WARNING(PARSE): &apos; is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << endl;
1644  break;
1645  }
1646  }
1647  }
1648  else
1649  if (!strcmp(entityToken, "&quot;")) {
1650  cout << "WARNING(PARSE): While valid for XML, &quot; is only needed within double quoted attribute values" << endl;
1651  if (!inattribute) {
1652  cout << "WARNING(PARSE): &quot; is unnecessary outside of attribute values. Replace with \"." << endl;
1653  entityToken = "\"";
1654  }
1655  else {
1656  switch (attrQuoteChar) {
1657  case '"' :
1658  cout << "WARNING(PARSE): &quot; is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << endl;
1659  break;
1660  case '\'' :
1661  cout << "WARNING(PARSE): &quot; is unnecessary inside single quoted attribute values. Replace with \"." << endl;
1662  entityToken = "\"";
1663  break;
1664  }
1665  }
1666  }
1667  break;
1668  case ET_NUM :
1669  cout << "WARNING(PARSE): SWORD does not search numeric entities, found " << entityToken << endl;
1670  break;
1671  case ET_NONE :
1672  default:
1673  break;
1674  }
1675  // NOTE(review): in the ET_ERR case the entity text was already appended above, so the append below appears to add it a second time -- confirm
1676  // Put the entity into the stream.
1677  if (intoken) {
1678  token.append(entityToken);
1679  }
1680  else {
1681  text.append(entityToken);
1682  }
1683 
1684  if (curChar == ';') {
1685  // The character was handled, so go get the next one.
1686  continue;
1687  }
1688  }
1689  else {
1690  // The character was handled, so go get the next one.
1691  continue;
1692  }
1693  }
1694 
1695  // A < outside of a token starts a new token and resets attribute tracking.
1696  if (!intoken && curChar == '<') {
1697  intoken = true;
1698  token = "<";
1699  inattribute = false;
1700  attrQuoteChar = '\0';
1701  continue;
1702  }
1703 
1704  // Handle XML comments starting with "<!--", ending with "-->"
1705  if (intoken && !incomment) {
1706  switch (commentstate) {
1707  case CS_NOT_IN_COMMENT :
1708  if (curChar == '!') {
1709  commentstate = CS_SEEN_STARTING_EXCLAMATION;
1710  token.append((char) curChar);
1711  continue;
1712  } else {
1713  break;
1714  }
1715 
1716  case CS_SEEN_STARTING_EXCLAMATION :
1717  if (curChar == '-') {
1718  commentstate = CS_SEEN_STARTING_HYPHEN;
1719  token.append((char) curChar);
1720  continue;
1721  } else {
1722  commentstate = CS_NOT_IN_COMMENT;
1723  break;
1724  }
1725 
1726  case CS_SEEN_STARTING_HYPHEN :
1727  if (curChar == '-') {
1728  incomment = true;
1729  commentstate = CS_IN_COMMENT;
1730  token.append((char) curChar);
1731 
1732  if (debug & DEBUG_OTHER) {
1733  cout << "DEBUG(COMMENTS): in comment" << endl;
1734  }
1735 
1736  continue;
1737  } else {
1738  commentstate = CS_NOT_IN_COMMENT;
1739  break;
1740  }
1741 
1742  default:
1743  cout << "FATAL(COMMENTS): unknown commentstate on comment start: " << commentstate << endl;
1744  exit(EXIT_BAD_NESTING);
1745  }
1746  }
1747  // Inside a comment every character is discarded; only the closing --> sequence is tracked.
1748  if (incomment) {
1749  switch (commentstate) {
1750  case CS_IN_COMMENT:
1751  if (curChar == '-') {
1752  commentstate = CS_SEEN_ENDING_HYPHEN;
1753  continue;
1754  } else {
1755  // ignore the character
1756  continue;
1757  }
1758 
1759  case CS_SEEN_ENDING_HYPHEN :
1760  if (curChar == '-') {
1761  commentstate = CS_SEEN_SECOND_ENDING_HYPHEN;
1762  continue;
1763  } else {
1764  // ignore character
1765  commentstate = CS_IN_COMMENT;
1766  continue;
1767  }
1768 
1769  case CS_SEEN_SECOND_ENDING_HYPHEN :
1770  if (curChar == '>') {
1771  intoken = false;
1772  incomment = false;
1773  commentstate = CS_NOT_IN_COMMENT;
1774 
1775  if (debug & DEBUG_OTHER) {
1776  cout << "DEBUG(COMMENTS): out of comment" << endl;
1777  }
1778 
1779  continue;
1780  } else {
1781  // ignore character
1782  commentstate = CS_IN_COMMENT;
1783  continue;
1784  }
1785 
1786  default:
1787  cout << "FATAL(COMMENTS): unknown commentstate on comment end: " << commentstate << endl;
1788  exit(EXIT_BAD_NESTING);
1789  }
1790  }
1791 
1792  // Outside of tokens merge adjacent whitespace
1793  if (!intoken) {
1794  seeingSpace = isspace(curChar)!=0;
1795  if (seeingSpace) {
1796  if (inWhitespace) {
1797  continue;
1798  }
1799  // convert all whitespace to blanks
1800  curChar = ' ';
1801  }
1802  inWhitespace = seeingSpace;
1803  }
1804  // A > closes the token: tags that look well formed are transformed and handled, anything else is reported and dropped.
1805  if (intoken && curChar == '>') {
1806  intoken = false;
1807  inWhitespace = false;
1808  token.append('>');
1809  // take this isalpha if out to check for bugs in text
1810  if (isalpha(token[1]) ||
1811  (((token[1] == '/') || (token[1] == '?')) && isalpha(token[2]))) {
1812  //cout << "Handle:" << token.c_str() << endl;
1813  XMLTag t = transformBSP(token.c_str());
1814 
1815  if (!handleToken(text, t)) { // tags that handleToken does not consume are appended to the entry text
1816  text.append(t);
1817  }
1818  } else {
1819  cout << "WARNING(PARSE): malformed token: " << token << endl;
1820  }
1821  continue;
1822  }
1823  // Otherwise accumulate the character into the open token, or into the text while escaping stray < and >.
1824  if (intoken) {
1825  token.append((char) curChar);
1826  }
1827  else {
1828  switch (curChar) {
1829  case '>' : cout << "WARNING(PARSE): > should be &gt;" << endl; text.append("&gt;"); break;
1830  case '<' : cout << "WARNING(PARSE): < should be &lt;" << endl; text.append("&lt;"); break;
1831  default : text.append((char) curChar); break;
1832  }
1833  }
1834  }
1835 
1836  // Force the last entry from the text buffer.
1837  text = "";
1838  writeEntry(text, true);
1839  writeLinks();
1840 
1841 #ifdef _ICU_
1842  if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
1843  if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
1844 #endif
1845 }
#define TOP
Definition: swkey.h:68
char currentOsisID[255]
Definition: osis2mod.cpp:109
virtual void setPosition(SW_POSITION pos)
Definition: swmodule.cpp:327
SWText * module
Definition: osis2mod.cpp:105
Definition: utilxml.h:38
bool handleToken(SWBuf &text, XMLTag token)
Definition: osis2mod.cpp:617
virtual char setKey(const SWKey *ikey)
Definition: swmodule.cpp:298
SWBuf v11n
Definition: osis2mod.cpp:107
int debug
Definition: osis2mod.cpp:76
char activeOsisID[255]
Definition: osis2mod.cpp:108
int normalized
Definition: osis2mod.cpp:102
void writeEntry(SWModule *book, SWBuf keyBuffer, SWBuf entBuffer)
Definition: imp2gbs.cpp:131
const int DEBUG_OTHER
Definition: osis2mod.cpp:86
int converted
Definition: osis2mod.cpp:103
const int EXIT_BAD_NESTING
Definition: osis2mod.cpp:93
void writeLinks()
Definition: osis2mod.cpp:1323
VerseKey currentVerse
Definition: osis2mod.cpp:106
XMLTag transformBSP(XMLTag t)
Definition: osis2mod.cpp:1215
XMLTag transformBSP ( XMLTag  t)

Support normalizations necessary for a SWORD module. OSIS allows for document structure (Book, Section, Paragraph or BSP) to overlap Bible versification (Book, Chapter, Verse). Most SWORD applications need to display verses in isolation or in HTML table cells, requiring each stored entry (i.e. verses) to be well-formed xml. This routine normalizes container elements which could cross verse boundaries into milestones. For most of these OSIS elements, there is a milestone form. However, p is not milestoneable. For this reason, p is transformed into div elements with type x-p. param t the tag to transform return the transformed tag or the original one

Definition at line 1215 of file osis2mod.cpp.

1215  {
1216  static std::stack<XMLTag> bspTagStack;
1217  static int sID = 1;
1218  char buf[11];
1219  SWBuf typeAttr = t.getAttribute("type");
1220 
1221  // Support simplification transformations
1222  if (t.isEmpty()) {
1223 
1224  if (debug & DEBUG_XFORM) {
1225  cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl;
1226  }
1227 
1228  return t;
1229  }
1230 
1231  SWBuf tagName = t.getName();
1232  if (!t.isEndTag()) {
1233  // Transform <p> into <div type="x-p"> and milestone it
1234  if (tagName == "p") {
1235  t.setText("<div type=\"x-p\" />");
1236  sprintf(buf, "gen%d", sID++);
1237  t.setAttribute("sID", buf);
1238  }
1239 
1240  // Transform <tag> into <tag sID="">, where tag is a milestoneable element.
1241  // The following containers are milestoneable.
1242  // abbr, closer, div, foreign, l, lg, salute, signed, speech
1243  // Leaving out:
1244  // abbr When would this ever cross a boundary?
1245  // seg as it is used for a divineName hack
1246  // foreign so that it can be easily italicized
1247  // div type="colophon" so that it can be treated as a block
1248  else if (tagName == "chapter" ||
1249  tagName == "closer" ||
1250  (tagName == "div" && typeAttr != "colophon") ||
1251  tagName == "l" ||
1252  tagName == "lg" ||
1253  tagName == "q" ||
1254  tagName == "salute" ||
1255  tagName == "signed" ||
1256  tagName == "speech" ||
1257  tagName == "verse"
1258  ) {
1259  t.setEmpty(true);
1260  sprintf(buf, "gen%d", sID++);
1261  t.setAttribute("sID", buf);
1262  }
1263  bspTagStack.push(t);
1264 
1265  if (debug & DEBUG_XFORM) {
1266  cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl;
1267  XMLTag topToken = bspTagStack.top();
1268  cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl;
1269  }
1270  }
1271  else {
1272  if (!bspTagStack.empty()) {
1273  XMLTag topToken = bspTagStack.top();
1274 
1275  if (debug & DEBUG_XFORM) {
1276  cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl;
1277  }
1278 
1279  bspTagStack.pop();
1280  SWBuf topTypeAttr = topToken.getAttribute("type");
1281 
1282  // Look for the milestoneable container tags handled above.
1283  // Have to treat div type="colophon" differently
1284  if (tagName == "chapter" ||
1285  tagName == "closer" ||
1286  (tagName == "div" && topTypeAttr != "colophon") ||
1287  tagName == "l" ||
1288  tagName == "lg" ||
1289  tagName == "p" ||
1290  tagName == "q" ||
1291  tagName == "salute" ||
1292  tagName == "signed" ||
1293  tagName == "speech" ||
1294  tagName == "verse"
1295  ) {
1296  // make this a clone of the start tag with sID changed to eID
1297  // Note: in the case of </p> the topToken is a <div type="x-p">
1298  t = topToken;
1299  t.setAttribute("eID", t.getAttribute("sID"));
1300  t.setAttribute("sID", 0);
1301  }
1302  }
1303  else {
1304  cout << "FATAL(TAGSTACK): " << currentOsisID << ": closing tag without opening tag" << endl;
1305  }
1306  }
1307 
1308  return t;
1309 }
void setEmpty(bool value)
Definition: utilxml.h:66
const char * setAttribute(const char *attribName, const char *attribValue, int partNum=-1, char partSplit= '|')
Definition: utilxml.cpp:248
char currentOsisID[255]
Definition: osis2mod.cpp:109
const char * getName() const
Definition: utilxml.h:58
Definition: utilxml.h:38
const int DEBUG_XFORM
Definition: osis2mod.cpp:82
bool isEmpty() const
Definition: utilxml.h:60
int debug
Definition: osis2mod.cpp:76
void setText(const char *tagString)
Definition: utilxml.cpp:143
const char * getAttribute(const char *attribName, int partNum=-1, char partSplit= '|') const
Definition: utilxml.cpp:230
bool isEndTag(const char *eID=0) const
Definition: utilxml.cpp:323
void usage ( const char *  app,
const char *  error = 0,
const bool  verboseHelp = false 
)

Definition at line 1352 of file osis2mod.cpp.

1352  {
1353 
1354  if (error) fprintf(stderr, "\n%s: %s\n", app, error);
1355 
1356  fprintf(stderr, "OSIS Bible/commentary module creation tool for The SWORD Project\n");
1357  fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app);
1358  fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n");
1359  fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to\n");
1360  fprintf(stderr, "\t\t\t\t read from standard input\n");
1361  fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n");
1362  fprintf(stderr, " -z <l|z|b|x>\t\t compression type (default: none)\n");
1363  fprintf(stderr, "\t\t\t\t l - LZSS; z - ZIP; b - bzip2; x - xz\n");
1364  fprintf(stderr, " -b <2|3|4>\t\t compression block size (default: 4)\n");
1365  fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
1366  fprintf(stderr, " -l <1-9>\t\t compression level (default varies by compression type)\n");
1367  fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n");
1368  fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
1369 
1370 #ifdef _ICU_
1371  fprintf(stderr, " -e <1|2|s>\t\t convert Unicode encoding (default: 1)\n");
1372  fprintf(stderr, "\t\t\t\t 1 - UTF-8 ; 2 - UTF-16 ; s - SCSU\n");
1373  fprintf(stderr, " -N\t\t\t do not normalize to NFC\n");
1374  if (verboseHelp) {
1375  fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
1376  fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n");
1377  fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n");
1378  }
1379 #endif
1380 
1381  fprintf(stderr, " -s <2|4>\t\t bytes used to store entry size (default is 2).\n");
1382  if (verboseHelp) {
1383  fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large\n");
1384  fprintf(stderr, "\t\t\t\t entries in uncompressed modules\n");
1385  fprintf(stderr, "\t\t\t\t (2 bytes to store size equal 65535 characters)\n");
1386  }
1387  fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n");
1388  fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:");
1389 
1391  StringList av11n = vmgr->getVersificationSystems();
1392  for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) {
1393  if ((distance(av11n.begin(), loop) % 3) == 0) {
1394  fprintf(stderr, "\n\t\t\t\t %-12s", (*loop).c_str());
1395  }
1396  else {
1397  fprintf(stderr, "\t%-12s", (*loop).c_str());
1398  }
1399  }
1400  fprintf(stderr, "\n");
1401 
1402  if (verboseHelp) {
1403  fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n");
1404  fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n");
1405  fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n");
1406  fprintf(stderr, "\t\t\t\t\t0 - no debugging\n");
1407  fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n");
1408  fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n");
1409  fprintf(stderr, "\t\t\t\t\t4 - quotes, esp. Words of Christ\n");
1410  fprintf(stderr, "\t\t\t\t\t8 - titles\n");
1411  fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n");
1412  fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n");
1413  fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n");
1414  fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n");
1415  fprintf(stderr, "\t\t\t\t\t256 - internal stack\n");
1416  fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n");
1417  fprintf(stderr, "\t\t\t\t This argument can be used more than once. (Or\n");
1418  fprintf(stderr, "\t\t\t\t the flags may be added together.)\n");
1419  }
1420  fprintf(stderr, " -h \t\t\t print verbose usage text\n");
1421 
1422  fprintf(stderr, "\n");
1423  fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n");
1424  fprintf(stderr, "\n");
1425  exit(EXIT_BAD_ARG);
1426 }
const int EXIT_BAD_ARG
Definition: osis2mod.cpp:89
std::list< SWBuf > StringList
Definition: swmodule.cpp:91
const StringList getVersificationSystems() const
static VersificationMgr * getSystemVersificationMgr()
// writeEntry: accumulate verse text and write it to the module once the
// verse changes.
//
// Incoming text is collected in the global activeVerseText. When the OSIS
// reference of currentVerse differs from activeOsisID (the ID recorded on
// the previous call), the collected text is written to module under the
// key remembered from that previous call, and collection restarts with the
// new text.
//
// text   text to collect; it is cleared before returning.
// force  when true the comparison ID is replaced by "-force", which differs
//        from any recorded ID, so pending text is written out.
void writeEntry ( SWBuf text,
bool  force = false 
)

Definition at line 483 of file osis2mod.cpp.

483  {
484  char keyOsisID[255];
485 
// revision is static, but setFormatted runs on every call.
486  static SWBuf revision; revision.setFormatted("<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3769 $ (SWORD: %s)\"/>", SWVersion::currentVersion.getText());
// One flag per testament: the revision milestone is written only once for each.
487  static bool firstOT = true;
488  static bool firstNT = true;
489 
// Nothing is collected or written while outside a canonical OSIS book.
490  if (!inCanonicalOSISBook) {
491  return;
492  }
493 
// NOTE(review): unbounded strcpy into a 255-byte buffer; assumes
// getOSISRef() always fits — confirm or switch to a bounded copy.
494  strcpy(keyOsisID, currentVerse.getOSISRef());
495 
496  // set keyOsisID to anything that an osisID cannot be.
497  if (force) {
498  strcpy(keyOsisID, "-force");
499  }
500 
// lastKey is static: it carries the key assigned at the end of the previous
// call (see the assignment just before the final strcpy).
501  static VerseKey lastKey;
502  lastKey.setVersificationSystem(currentVerse.getVersificationSystem());
503  lastKey.setAutoNormalize(0);
504  lastKey.setIntros(1);
505 
// saveKey snapshots currentVerse so it can be restored after the write
// below temporarily repositions it.
506  VerseKey saveKey;
507  saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
508  saveKey.setAutoNormalize(0);
509  saveKey.setIntros(1);
510  saveKey = currentVerse;
511 
512  // If we have seen a verse and the supplied one is different then we output the collected one.
513  if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
514 
515  if (!isValidRef(lastKey, "writeEntry")) {
516  makeValidRef(lastKey);
517  }
518 
// Position on the previous verse; presumably module writes at currentVerse
// — confirm where module's key is bound.
519  currentVerse = lastKey;
520 
// NOTE(review): this listing skips source line 521. The cross-reference
// list for this function names prepareSWText(osisID, text), which is not
// called anywhere in the visible lines — presumably the call belongs here;
// confirm against the real source.
522 
523  // Put the revision into the module
524  int testmt = currentVerse.getTestament();
525  if ((testmt == 1 && firstOT) || (testmt == 2 && firstNT)) {
// t holds the real position while currentVerse is moved to
// book 0 / chapter 0 / verse 0 to store the revision milestone.
526  VerseKey t;
527  t.setVersificationSystem(currentVerse.getVersificationSystem());
528  t.setAutoNormalize(0);
529  t.setIntros(1);
530  t = currentVerse;
531  currentVerse.setBook(0);
532  currentVerse.setChapter(0);
533  currentVerse.setVerse(0);
534  module->setEntry(revision);
535  currentVerse = t;
536  switch (testmt) {
537  case 1:
538  firstOT = false;
539  break;
540  case 2:
541  firstNT = false;
542  break;
543  }
544  }
545 
546  // If the desired output encoding is non-UTF-8, convert to that encoding
547  if (outputEncoder) {
548  outputEncoder->processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
549  }
550 
551  // If the entry already exists, then append this entry to the text.
552  // This is for verses that are outside the chosen versification. They are appended to the prior verse.
553  // The space should not be needed if we retained verse tags.
554  if (module->hasEntry(&currentVerse)) {
555  module->flush();
556  SWBuf currentText = module->getRawEntry();
557  cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
558 
559  // If we have a non-UTF-8 encoding, we should decode it before concatenating, then re-encode it
560  if (outputDecoder) {
// NOTE(review): this listing skips source line 561. As shown, only
// currentText is decoded, while activeVerseText was already encoded above
// and is encoded again after the concatenation. Presumably the missing
// line decodes activeVerseText as well — confirm against the real source.
562  outputDecoder->processText(currentText, (SWKey *)2);
563  }
// The existing entry comes first, then a single space, then the new text.
564  activeVerseText = currentText + " " + activeVerseText;
565  if (outputEncoder) {
566  outputEncoder->processText(activeVerseText, (SWKey *)2);
567  }
568  }
569 
570  if (debug & DEBUG_WRITE) {
571  cout << "DEBUG(WRITE): " << activeOsisID << ":" << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
572  }
573 
574  module->setEntry(activeVerseText);
575  activeVerseText = "";
576  }
577 
578  // The following is for initial verse content and for appending interverse content.
579  if (activeVerseText.length()) {
580  activeVerseText += text;
581  }
582  else {
583  // Eliminate leading whitespace on the beginning of each verse
584  text.trimStart();
585  activeVerseText = text;
586  }
587  // text has been consumed so clear it out.
588  text = "";
589 
// Restore the caller's position and remember it, together with its ID, for
// the next call.
590  currentVerse = saveKey;
591  lastKey = currentVerse;
// NOTE(review): unbounded strcpy; both buffers are declared as char[255].
592  strcpy(activeOsisID, keyOsisID);
593 }
void makeValidRef(VerseKey &key)
Definition: osis2mod.cpp:433
virtual void setEntry(const char *inbuf, long len=-1)
Definition: swmodule.cpp:1680
SWText * module
Definition: osis2mod.cpp:105
SWFilter * outputEncoder
Definition: osis2mod.cpp:99
static bool inCanonicalOSISBook
Definition: osis2mod.cpp:117
int debug
Definition: osis2mod.cpp:76
SWBuf activeVerseText
Definition: osis2mod.cpp:111
char activeOsisID[255]
Definition: osis2mod.cpp:108
const int DEBUG_WRITE
Definition: osis2mod.cpp:77
void prepareSWText(const char *osisID, SWBuf &text)
Definition: osis2mod.cpp:200
virtual bool hasEntry(const SWKey *) const
Definition: swmodule.h:809
virtual char processText(SWBuf &text, const SWKey *key=0, const SWModule *module=0)=0
static SWVersion currentVersion
Definition: swversion.h:69
virtual void flush()
Definition: swcacher.cpp:39
bool isValidRef(const char *buf, const char *caller)
Definition: osis2mod.cpp:372
SWFilter * outputDecoder
Definition: osis2mod.cpp:100
VerseKey currentVerse
Definition: osis2mod.cpp:106
const char * getRawEntry() const
Definition: swmodule.h:500
void writeLinks ( )

Write out all links in the module. Waiting is necessary because writeEntry might ultimately append text to a verse moving it's offset in the data file. While we are minimizing it by postponing the write until we have gathered the next verse, the following scenario is happening: A module is using linked verses and has some verses that are not in the chosen versification. If the out-of-canon verse happens following a linked verse, the out-of-canon verse is appended to the prior verse. Care has to be taken that the linked verses all point to the first of the set.

Definition at line 1323 of file osis2mod.cpp.

1324 {
1325  // Link all the verses
1326  VerseKey destKey;
1327  destKey.setVersificationSystem(currentVerse.getVersificationSystem());
1328  destKey.setAutoNormalize(0);
1329  destKey.setIntros(1);
1330 
1331  VerseKey linkKey;
1332  linkKey.setVersificationSystem(currentVerse.getVersificationSystem());
1333  linkKey.setAutoNormalize(0);
1334  linkKey.setIntros(1);
1335  for (unsigned int i = 0; i < linkedVerses.size(); i++) {
1336  // The verseKeys is a list of verses
1337  // where the first is the real verse
1338  // and the others link to it.
1339  ListKey verseKeys = linkedVerses[i];
1340  verseKeys.setPosition(TOP);
1341  destKey = verseKeys.getElement();
1342  verseKeys.increment(1);
1343 
1344  while (!verseKeys.popError()) {
1345  linkKey = verseKeys.getElement();
1346  linkToEntry(linkKey, destKey);
1347  verseKeys.increment(1);
1348  }
1349  }
1350 }
#define TOP
Definition: swkey.h:68
std::vector< ListKey > linkedVerses
Definition: osis2mod.cpp:115
VerseKey currentVerse
Definition: osis2mod.cpp:106
void linkToEntry(VerseKey &linkKey, VerseKey &dest)
Definition: osis2mod.cpp:595

Variable Documentation

char activeOsisID[255]

Definition at line 108 of file osis2mod.cpp.

SWBuf activeVerseText

Definition at line 111 of file osis2mod.cpp.

int converted = 0

Definition at line 103 of file osis2mod.cpp.

ListKey currentKeyIDs = ListKey()

Definition at line 113 of file osis2mod.cpp.

char currentOsisID[255]

Definition at line 109 of file osis2mod.cpp.

VerseKey currentVerse

Definition at line 106 of file osis2mod.cpp.

int debug = 0

Definition at line 76 of file osis2mod.cpp.

const int DEBUG_INTERVERSE = 16

Definition at line 81 of file osis2mod.cpp.

const int DEBUG_OTHER = 512

Definition at line 86 of file osis2mod.cpp.

const int DEBUG_QUOTE = 4

Definition at line 79 of file osis2mod.cpp.

const int DEBUG_REF = 128

Definition at line 84 of file osis2mod.cpp.

const int DEBUG_REV11N = 64

Definition at line 83 of file osis2mod.cpp.

const int DEBUG_STACK = 256

Definition at line 85 of file osis2mod.cpp.

const int DEBUG_TITLE = 8

Definition at line 80 of file osis2mod.cpp.

const int DEBUG_VERSE = 2

Definition at line 78 of file osis2mod.cpp.

const int DEBUG_WRITE = 1

Definition at line 77 of file osis2mod.cpp.

const int DEBUG_XFORM = 32

Definition at line 82 of file osis2mod.cpp.

const int EXIT_BAD_ARG = 1

Definition at line 89 of file osis2mod.cpp.

const int EXIT_BAD_NESTING = 5

Definition at line 93 of file osis2mod.cpp.

const int EXIT_NO_CREATE = 3

Definition at line 91 of file osis2mod.cpp.

const int EXIT_NO_READ = 4

Definition at line 92 of file osis2mod.cpp.

const int EXIT_NO_WRITE = 2

Definition at line 90 of file osis2mod.cpp.

bool inCanonicalOSISBook = true
static

Definition at line 117 of file osis2mod.cpp.

std::vector<ListKey> linkedVerses

Definition at line 115 of file osis2mod.cpp.

SWText* module = 0

Definition at line 105 of file osis2mod.cpp.

bool normalize = true
static

Definition at line 118 of file osis2mod.cpp.

int normalized = 0

Definition at line 102 of file osis2mod.cpp.

SWFilter* outputDecoder = NULL

Definition at line 100 of file osis2mod.cpp.

SWFilter* outputEncoder = NULL

Definition at line 99 of file osis2mod.cpp.

SWBuf v11n = "KJV"

Definition at line 107 of file osis2mod.cpp.