utilities/osis2mod.cpp File Reference

#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#include <stack>
#include <vector>
#include <iostream>
#include <fstream>
#include <utilstr.h>
#include <swmgr.h>
#include <rawtext.h>
#include <rawtext4.h>
#include <swbuf.h>
#include <utilxml.h>
#include <listkey.h>
#include <versekey.h>
#include <ztext.h>
#include <lzsscomprs.h>
#include <zipcomprs.h>
#include <cipherfil.h>
Include dependency graph for osis2mod.cpp:

Go to the source code of this file.

Functions

int detectUTF8 (const char *txt)
bool handleToken (SWBuf &text, XMLTag token)
bool isOSISAbbrev (const char *buf)
bool isValidRef (const char *buf)
void linkToEntry (VerseKey &linkKey, VerseKey &dest)
int main (int argc, char **argv)
void makeValidRef (VerseKey &key)
void prepareSWText (const char *osisID, SWBuf &text)
void prepareSWVerseKey (SWBuf &buf)
void processOSIS (istream &infile)
XMLTag transformBSP (XMLTag t)
void usage (const char *app, const char *error=0)
void writeEntry (SWBuf &text, bool force=false)
void writeLinks ()

Variables

char activeOsisID [255]
SWBuf activeVerseText
int converted = 0
ListKey currentKeyIDs = ListKey()
char currentOsisID [255]
VerseKey currentVerse
int debug = 0
const int DEBUG_INTERVERSE = 16
const int DEBUG_OTHER = 512
const int DEBUG_QUOTE = 4
const int DEBUG_REF = 128
const int DEBUG_REV11N = 64
const int DEBUG_STACK = 256
const int DEBUG_TITLE = 8
const int DEBUG_VERSE = 2
const int DEBUG_WRITE = 1
const int DEBUG_XFORM = 32
const int EXIT_BAD_ARG = 1
const int EXIT_BAD_NESTING = 5
const int EXIT_NO_CREATE = 3
const int EXIT_NO_READ = 4
const int EXIT_NO_WRITE = 2
static bool inCanonicalOSISBook = true
std::vector< ListKey > linkedVerses
SWText * module = 0
static bool normalize = true
int normalized = 0
SWBuf v11n = "KJV"

Function Documentation

int detectUTF8 ( const char *  txt  ) 

Determine whether the string contains a valid Unicode (UTF-8) sequence. The following table gives the pattern of a valid UTF-8 character.

    Unicode Range              1st       2nd       3rd       4th
    U-00000000 - U-0000007F    0nnnnnnn
    U-00000080 - U-000007FF    110nnnnn  10nnnnnn
    U-00000800 - U-0000FFFF    1110nnnn  10nnnnnn  10nnnnnn
    U-00010000 - U-001FFFFF    11110nnn  10nnnnnn  10nnnnnn  10nnnnnn

Notes:
1. The latest UTF-8 RFC allows for a maximum of 4 bytes. Earlier versions allowed 6.
2. The number of bits set in the leading byte before the first 0 is the total number of bytes.
3. The "n" are the bits of the Unicode code point.

This routine does not check whether the code point is in range. It could.

Parameters: txt - the text to check.
Returns: 1 if all high-order characters form a valid Unicode sequence; -1 if there are no high-order characters (note: this is also a valid Unicode sequence); 0 if there are high-order characters that do not form a valid Unicode sequence.
Author: DM Smith

Definition at line 129 of file osis2mod.cpp.

/**
 * Classify a NUL-terminated byte string as 7-bit ASCII, structurally valid
 * UTF-8, or neither.
 *
 * Only the byte structure is validated: a lead byte announcing 2 to 4 bytes
 * must be followed by the right number of 10nnnnnn continuation bytes.
 * The decoded code point itself is not range-checked.
 *
 * @param txt the text to check; a null pointer is treated like an empty string
 * @return  1 if every high-order byte is part of a well-formed UTF-8 sequence,
 *         -1 if there are no high-order bytes at all (plain 7-bit ASCII),
 *          0 if some high-order byte is not part of a well-formed sequence
 */
int detectUTF8(const char *txt) {
    // Nothing to scan: no high-order bytes were seen.
    if (!txt) return -1;

    unsigned int countUTF8 = 0;

    // Work on unsigned bytes to make masking and shifting easier
    const unsigned char *p = reinterpret_cast<const unsigned char *>(txt);
    while (*p) {
        // Is the high order bit set?
        if (*p & 0x80) {
            // Count the number of leading high order bits that are set.
            // This is the total number of bytes in the encoded character.
            int count = 0;
            for (unsigned char lead = *p; lead & 0x80; lead <<= 1) {
                count++;
            }

            // Validate count:
            // Count 1: is a pattern of 10nnnnnn,
            //          which does not signal the start of a unicode character
            // Count 5 to 8: 111110nn, 1111110n, 11111110 and 11111111
            //          are not legal starts, either
            if (count < 2 || count > 4) return 0;

            // At this point we expect (count - 1) following bytes
            // of the pattern 10nnnnnn. Stop early at the terminating NUL.
            while (--count && *++p) {
                // Compare the top 2 bits against 10.
                if ((0xc0 & *p) != 0x80) return 0;
            }

            // We ran out of bytes too soon: cannot be UTF-8
            if (count) return 0;

            // We have a well-formed UTF-8 character, so count it
            countUTF8++;
        }

        // Advance to the next byte to examine.
        p++;
    }

    // At this point it is either UTF-8 or 7-bit ascii
    return countUTF8 ? 1 : -1;
}

bool handleToken ( SWBuf &  text,
XMLTag  token 
)

Definition at line 579 of file osis2mod.cpp.

00579                                             {
00580 
00581     // Everything between the begin book tag and the first begin chapter tag is inBookIntro
00582     static bool               inBookIntro     = false;
00583 
00584     // Everything between the begin chapter tag and the first begin verse tag is inChapterIntro
00585     static bool               inChapterIntro  = false;
00586 
00587     // Flags indicating whether we are processing the content of a chapter
00588     static bool               inChapter       = false;
00589 
00590     // Flags indicating whether we are processing the content of a verse
00591     static bool               inVerse         = false;
00592 
00593     // Flags indicating whether we are processing the content of to be prepended to a verse
00594     static bool               inPreVerse      = false;
00595     static int                genID           = 1;
00596 
00597     // Flag indicating whether we are in "Words of Christ"
00598     static bool               inWOC           = false;
00599     // Tag for WOC quotes within a verse
00600     static XMLTag             wocTag          = "<q who=\"Jesus\" marker=\"\">";
00601 
00602     // Flag used to indicate where useful text begins
00603     static bool               firstDiv        = false;
00604     static bool               headerEnded     = false;
00605 
00606     // Retain the sID of book, chapter and verse (commentary) divs so that we can find them again.
00607     // This relies on transformBSP.
00608     static SWBuf              sidBook         = "";
00609     static SWBuf              sidChapter      = "";
00610     static SWBuf              sidVerse        = "";
00611 
00612     // Stack of quote elements used to handle Words of Christ
00613     static std::stack<XMLTag> quoteStack;
00614 
00615     // Stack of elements used to validate that books, chapters and verses are well-formed
00616     // This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse
00617     // to be begin and end tags, too.
00618     // It is an error if books and chapters are not well formed (though not required by OSIS)
00619     // It is a warning that verses are not well formed (because some clients are not ready)
00620     static std::stack<XMLTag> tagStack;
00621 
00622     // The following are used to validate well-formedness
00623     static int                chapterDepth    = 0;
00624     static int                bookDepth       = 0;
00625     static int                verseDepth      = 0;
00626 
00627     int                       tagDepth        = tagStack.size();
00628     SWBuf                     tokenName       = token.getName();
00629     bool                      isEndTag        = token.isEndTag() || token.getAttribute("eID");
00630     SWBuf                     typeAttr        = token.getAttribute("type");
00631     SWBuf                     eidAttr         = token.getAttribute("eID");
00632 
00633     // process start tags
00634     if (!isEndTag) {
00635 
00636         // Remember non-empty start tags
00637         if (!token.isEmpty()) {
00638             tagStack.push(token);
00639 
00640             if (debug & DEBUG_STACK) {
00641                 cout << "DEBUG(STACK): " << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl;
00642             }
00643         }
00644 
00645         // throw away everything up to the first div (that is outside the header)
00646         if (!firstDiv) {
00647             if (headerEnded && (tokenName == "div")) {
00648                 if (debug & DEBUG_OTHER) {
00649                     cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl;
00650                 }
00651 
00652                 // TODO: Save off the content to use it to suggest the module's conf.
00653                 firstDiv = true;
00654                 text     = "";
00655             }
00656             else {
00657                 // Collect the content so it can be used to suggest the module's conf.
00658                 return false;
00659             }
00660         }
00661 
00662         //-- WITH osisID OR annotateRef -------------------------------------------------------------------------
00663         // Handle Book, Chapter, and Verse (or commentary equivalent)
00664         if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) {
00665 
00666             // BOOK START, <div type="book" ...>
00667             if (tokenName == "div" && typeAttr == "book") {
00668                 if (inBookIntro || inChapterIntro) {    // this one should never happen, but just in case
00669 
00670                     if (debug & DEBUG_TITLE) {
00671                         cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl;
00672                         cout << "\tinChapterIntro = " << inChapterIntro << endl;
00673                         cout << "\tinBookIntro = " << inBookIntro << endl;
00674                     }
00675 
00676                     currentVerse.setTestament(0);
00677                     currentVerse.setBook(0);
00678                     currentVerse.setChapter(0);
00679                     currentVerse.setVerse(0);
00680                     writeEntry(text);
00681                 }
00682                 currentVerse = token.getAttribute("osisID");
00683                 currentVerse.setChapter(0);
00684                 currentVerse.setVerse(0);
00685                 strcpy(currentOsisID, currentVerse.getOSISRef());
00686 
00687                 sidBook         = token.getAttribute("sID");
00688                 inChapter       = false;
00689                 inVerse         = false;
00690                 inPreVerse      = false;
00691                 inBookIntro     = true;
00692                 inChapterIntro  = false;
00693 
00694                 if (debug & DEBUG_TITLE) {
00695                     cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for book introduction" << endl;
00696                 }
00697 
00698                 bookDepth       = tagStack.size();
00699                 chapterDepth    = 0;
00700                 verseDepth      = 0;
00701 
00702                 inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID"));
00703                 if (!inCanonicalOSISBook) {
00704                     cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl;
00705                 }
00706                 else if (debug & DEBUG_OTHER) {
00707                     cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl;
00708                 }
00709 
00710                 return false;
00711             }
00712 
00713             // CHAPTER START, <chapter> or <div type="chapter" ...>
00714             if ((tokenName == "chapter") ||
00715                 (tokenName == "div" && typeAttr == "chapter")
00716             ) {
00717                 if (inBookIntro) {
00718                     if (debug & DEBUG_TITLE) {
00719                         cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl;
00720                     }
00721 
00722                     writeEntry(text);
00723                 }
00724 
00725                 currentVerse = token.getAttribute("osisID");
00726                 currentVerse.setVerse(0);
00727 
00728                 if (debug & DEBUG_OTHER) {
00729                     cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << token.getAttribute("osisID") << ")" << endl;
00730                 }
00731 
00732                 strcpy(currentOsisID, currentVerse.getOSISRef());
00733 
00734                 sidChapter      = token.getAttribute("sID");
00735                 inChapter       = true;
00736                 inVerse         = false;
00737                 inPreVerse      = false;
00738                 inBookIntro     = false;
00739                 inChapterIntro  = true;
00740 
00741                 if (debug & DEBUG_TITLE) {
00742                     cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl;
00743                 }
00744 
00745                 chapterDepth    = tagStack.size();
00746                 verseDepth      = 0;
00747 
00748                 return false;
00749             }
00750 
00751             // VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...>
00752             if ((tokenName == "verse") ||
00753                 (tokenName == "div" && token.getAttribute("annotateType"))
00754             ) {
00755                 if (debug & DEBUG_OTHER) {
00756                     cout << "DEBUG(FOUND): Entering verse" << endl;
00757                 }
00758 
00759                 if (inChapterIntro) {
00760                     if (debug & DEBUG_TITLE) {
00761                         cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl;
00762                     }
00763 
00764                     if (text.length()) {
00765                         if (debug & DEBUG_TITLE) {
00766                             cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl;
00767                         }
00768 
00769                         writeEntry(text);
00770                     }
00771                 }
00772 
00773                 // Did we have pre-verse material that needs to be marked?
00774                 if (inPreVerse) {
00775                     char genBuf[200];
00776                     sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++);
00777                     text.append(genBuf);
00778                 }
00779 
00780                 // Get osisID for verse or annotateRef for commentary
00781                 SWBuf keyVal = token.getAttribute(tokenName == "verse" ? "osisID" : "annotateRef");
00782 
00783                 // Massage the key into a form that parseVerseList can accept
00784                 prepareSWVerseKey(keyVal);
00785 
00786                 // The osisID or annotateRef can be more than a single verse
00787                 // The first or only one is the currentVerse
00788                 // Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing.
00789                 // This should never happen if the references are valid OSIS references
00790                 ListKey verseKeys = currentVerse.parseVerseList(keyVal, currentVerse, true);
00791                 int memberKeyCount = verseKeys.Count();
00792                 if (memberKeyCount) {
00793                     currentVerse = verseKeys.getElement(0);
00794                     // See if this osisID or annotateRef refers to more than one verse.
00795                     // If it does, save it until all verses have been seen.
00796                     // At that point we will output links.
00797                     // This can be done by incrementing, which will produce an error
00798                     // if there is only one verse.
00799                     verseKeys.setPosition(TOP);
00800                     verseKeys.increment(1);
00801                     if (!verseKeys.popError()) {
00802                         linkedVerses.push_back(verseKeys);
00803                     }
00804                 }
00805                 else {
00806                     cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute((tokenName == "verse") ? "osisID" : "annotateRef") << endl;
00807                 }
00808 
00809                 strcpy(currentOsisID, currentVerse.getOSISRef());
00810 
00811                 if (debug & DEBUG_OTHER) {
00812                     cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl;
00813                     cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl;
00814                 }
00815 
00816                 sidVerse        = token.getAttribute("sID");
00817                 inVerse         = true;
00818                 inPreVerse      = false;
00819                 inBookIntro     = false;
00820                 inChapterIntro  = false;
00821                 verseDepth      = tagStack.size();
00822 
00823                 // Include the token if it is not a verse
00824                 if (tokenName != "verse") {
00825                     text.append(token);
00826                 }
00827                 else if (debug & DEBUG_VERSE)
00828                 {
00829                     // transform the verse into a milestone
00830                     XMLTag t = "<milestone resp=\"v\" />";
00831                     // copy all the attributes of the verse element to the milestone
00832                     StringList attrNames = token.getAttributeNames();
00833                     for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
00834                         const char* attr = (*loop).c_str();
00835                         t.setAttribute(attr, token.getAttribute(attr));
00836                     }
00837                     text.append(t);
00838                 }
00839 
00840                 if (inWOC) {
00841                     text.append(wocTag);
00842                 }
00843                 return true;
00844             }
00845         } // done with Handle Book, Chapter, and Verse (or commentary equivalent)
00846 
00847         // Now consider everything else.
00848 
00849         // Handle WOC quotes.
00850         // Note this requires transformBSP to make them into milestones
00851         // Otherwise have to do it here
00852         if (tokenName == "q") {
00853             quoteStack.push(token);
00854 
00855             if (debug & DEBUG_QUOTE) {
00856                 cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl;
00857             }
00858 
00859             if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
00860                 inWOC = true;
00861 
00862                 // Output per verse WOC markup.
00863                 text.append(wocTag);
00864 
00865                 // Output the quotation mark if appropriate, inside the WOC.
00866                 // If there is no marker attribute, let the SWORD engine manufacture one.
00867                 // If there is a marker attribute and it has content, then output that.
00868                 // If the marker attribute is present and empty, then there is nothing to do.
00869                 // And have it within the WOC markup
00870                 if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
00871                     token.setAttribute("who", 0); // remove the who="Jesus"
00872                     text.append(token);
00873                 }
00874                 return true;
00875             }
00876             return false;
00877         }
00878 
00879         // Have we found the start of pre-verse material?
00880         // Pre-verse material follows the following rules
00881         // 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book.
00882         // 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter
00883         //    and the first verse of the chapter.
00884         //    A <div> with a type of section will be taken as surrounding verses.
00885         //    A <title> of type other than main, chapter or sub, will be taken as a title for the verse.
00886         //    Once one of these conditions is met, the division between chapter introduction and pre-verse is set.
00887         // 3) Between verses, the material is split between the prior verse and the next verse.
00888         //    Basically, while end and empty tags are found, they belong to the prior verse.
00889         //    Once a begin tag is found, it belongs to the next verse.
00890         if (!inPreVerse && !inBookIntro) {
00891             if (inChapterIntro) {
00892                 // Determine when we are no longer in a chapter heading, but in pre-verse material:
00893                 // If we see one of the following:
00894                 //  a section div
00895                 //  a title that is not main, chapter or sub or unclassified (no type attribute)
00896                 if ((tokenName == "div" && typeAttr == "section") ||
00897                     (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub")
00898                 ) {
00899                     if (debug & DEBUG_TITLE) {
00900                         cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl;
00901                     }
00902 
00903                     if (text.length()) {
00904                         if (debug & DEBUG_TITLE) {
00905                             cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl;
00906                         }
00907 
00908                         // Since we have found the boundary, we need to write out the chapter heading
00909                         writeEntry(text);
00910                     }
00911                     // And we are no longer in the chapter heading
00912                     inChapterIntro  = false;
00913                     // But rather, we are now in pre-verse material
00914                     inPreVerse      = true;
00915                 }
00916             }
00917             else if (!inVerse && inChapter) {
00918                 inPreVerse = true;
00919             }
00920 
00921             if (inPreVerse) {
00922                 char genBuf[200];
00923                 sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID);
00924                 text.append(genBuf);
00925             }
00926         }
00927 
00928         if (debug & DEBUG_INTERVERSE) {
00929             if (!inVerse && !inBookIntro && !inChapterIntro) {
00930                 cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl;
00931             }
00932         }
00933 
00934         return false;
00935     } // Done with procesing start and empty tags
00936 
00937     // Process end tags
00938     else {
00939 
00940         if (tagStack.empty()) {
00941             cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl;
00942             exit(EXIT_BAD_NESTING);
00943         }
00944 
00945         // Note: empty end tags have the eID attribute
00946         if (!token.isEmpty()) {
00947             XMLTag topToken = tagStack.top();
00948             tagDepth = tagStack.size();
00949 
00950             if (debug & DEBUG_STACK) {
00951                 cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl;
00952             }
00953 
00954             tagStack.pop();
00955 
00956             if (tokenName != topToken.getName()) {
00957                 cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
00958 //              exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here.
00959                         // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
00960                         // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
00961             }
00962         }
00963 
00964         // We haven't seen the first div outside the header so there is little to do.
00965         if (!firstDiv) {
00966             if (tokenName == "header") {
00967                 headerEnded = true;
00968 
00969                 if (debug & DEBUG_OTHER) {
00970                     cout << "DEBUG(FOUND): End of header found" << endl;
00971                 }
00972             }
00973 
00974             // Collect the content so it can be used to suggest the module's conf.
00975             return false;
00976         }
00977 
00978         // VERSE and COMMENTARY END
00979         if ((tokenName == "verse") ||
00980             (tokenName == "div" && eidAttr == sidVerse)
00981         ) {
00982 
00983             if (tagDepth != verseDepth) {
00984                 cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
00985             }
00986 
00987             // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
00988             if (inWOC) {
00989                 text.append("</q>");
00990             }
00991 
00992 
00993             // Include the token if it is not a verse
00994             if (tokenName != "verse") {
00995                 text.append(token);
00996             }
00997             else if (debug & DEBUG_VERSE)
00998             {
00999                 // transform the verse into a milestone
01000                 XMLTag t = "<milestone resp=\"v\" />";
01001                 // copy all the attributes of the verse element to the milestone
01002                 StringList attrNames = token.getAttributeNames();
01003                 for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
01004                     const char* attr = (*loop).c_str();
01005                     t.setAttribute(attr, token.getAttribute(attr));
01006                 }
01007                 text.append(t);
01008             }
01009 
01010             writeEntry(text);
01011 
01012             inVerse     = false;
01013             inPreVerse  = false;
01014             verseDepth  = 0;
01015 
01016             return true;
01017         }
01018         
01019         // Handle WOC quotes.
01020         // Note this requires transformBSP to make them into milestones
01021         // Otherwise have to manage it here
01022         if (tokenName == "q") {
01023             XMLTag topToken = quoteStack.top();
01024 
01025             if (debug & DEBUG_QUOTE) {
01026                 cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
01027             }
01028 
01029             quoteStack.pop();
01030 
01031             // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC
01032             // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
01033             if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
01034 
01035                 if (debug & DEBUG_QUOTE) {
01036                     cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
01037                 }
01038 
01039                 inWOC = false;
01040                 const char *sID = topToken.getAttribute("sID");
01041                 const char *eID = token.getAttribute("eID");
01042                 if (!sID) {
01043                     sID = "";
01044                 }
01045                 if (!eID) {
01046                     eID = "";
01047                 }
01048                 if (strcmp(sID, eID)) {
01049                     cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl;
01050                 }
01051 
01052 
01053                 // Output the quotation mark if appropriate, inside the WOC.
01054                 // If there is no marker attribute, let the SWORD engine manufacture one.
01055                 // If there is a marker attribute and it has content, then output that.
01056                 // If the marker attribute is present and empty, then there is nothing to do.
01057                 // And have it within the WOC markup
01058                 if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
01059                     token.setAttribute("who", 0); // remove the who="Jesus"
01060                     text.append(token);
01061                 }
01062 
01063                 // Now close the WOC
01064                 text.append("</q>");
01065                 return true;
01066             }
01067             return false;
01068         }
01069 
01070         // Look for the end of document, book and chapter
01071         // Also for material that goes with last entry
01072         if (!inVerse && !inBookIntro && !inChapterIntro) {
01073             // Is this the end of a chapter.
01074             if ((tokenName == "chapter") ||
01075                 (tokenName == "div" && eidAttr == sidChapter)
01076             ) {
01077                 text.append(token);
01078                 writeEntry(text);
01079                 inChapter    = false;
01080                 sidChapter   = "";
01081                 chapterDepth = 0;
01082                 verseDepth   = 0;
01083                 return true;
01084             }
01085 
01086             // Is it the end of a book
01087             if (tokenName == "div" && eidAttr == sidBook) {
01088                 text.append(token);
01089                 writeEntry(text);
01090                 bookDepth    = 0;
01091                 chapterDepth = 0;
01092                 verseDepth   = 0;
01093                 return true;
01094             }
01095 
01096             // Do not include the end of an osis document
01097             if (tokenName == "osisText" || tokenName == "osis") {
01098                 bookDepth    = 0;
01099                 chapterDepth = 0;
01100                 verseDepth   = 0;
01101                 text         = "";
01102                 return true;
01103             }
01104 
01105             // When we are not inPreVerse, the interverse tags get appended to the preceeding verse.
01106             if (!inPreVerse) {
01107                 text.append(token);
01108                 writeEntry(text);
01109 
01110                 if (debug & DEBUG_INTERVERSE) {
01111                     cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
01112                 }
01113 
01114                 return true;
01115             }
01116 
01117             if (debug & DEBUG_INTERVERSE) {
01118                 cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
01119             }
01120 
01121             return false;
01122         }
01123 
01124         return false;
01125     } // done with Processing end tags
01126 
01127     return false;
01128 }

bool isOSISAbbrev ( const char *  buf  ) 

Definition at line 98 of file osis2mod.cpp.

00098                                    {
00099     VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr();
00100     const VersificationMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem());
00101     return av11n->getBookNumberByOSISName(buf) >= 0;
00102 }

bool isValidRef ( const char *  buf  ) 

Determine whether a verse as given is valid for the versification. This is done by comparing the before and after of normalization.

Definition at line 349 of file osis2mod.cpp.

00349                                  {
00350     // Create a VerseKey that does not do auto normalization
00351     // Note: need to turn on headings so that a heading does not get normalized anyway
00352     // And set it to the reference under question
00353     VerseKey before;
00354     before.setVersificationSystem(currentVerse.getVersificationSystem());
00355     before.setAutoNormalize(false);
00356     before.setIntros(true);
00357     before.setText(buf);
00358 
00359     // If we are a heading we must bail
00360     // These will autonormalize to the last verse of the prior chapter
00361     if (!before.getTestament() || !before.getBook() || !before.getChapter() || !before.getVerse()) {
00362         return true;
00363     }
00364 
00365     // Create a VerseKey that does do auto normalization
00366     // And set it to the reference under question
00367     VerseKey after;
00368     after.setVersificationSystem(currentVerse.getVersificationSystem());
00369     after.setAutoNormalize(true);
00370     after.setText(buf);
00371 
00372     if (before == after)
00373     {
00374         return true;
00375     }
00376 
00377     // If we have gotten here the reference is not in the selected versification.
00378     // cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl;
00379     if (debug & DEBUG_REV11N) {
00380         cout << "DEBUG(V11N): " << before << " normalizes to "  << after << endl;
00381     }
00382 
00383     return false;
00384 }

void linkToEntry ( VerseKey &  linkKey,
VerseKey &  dest 
)

Definition at line 557 of file osis2mod.cpp.

00557                                                     {
00558 
00559     // Only link verses that are in the versification.
00560     if (!isValidRef(linkKey)) {
00561         return;
00562     }
00563 
00564     VerseKey saveKey;
00565     saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
00566     saveKey.setAutoNormalize(0);
00567     saveKey.setIntros(1);
00568     saveKey = currentVerse;
00569     currentVerse = linkKey;
00570 
00571     cout << "INFO(LINK): Linking " << currentVerse.getOSISRef() << " to " << dest.getOSISRef() << "\n";
00572     module->linkEntry(&dest);
00573 
00574     currentVerse = saveKey;
00575 }

int main ( int  argc,
char **  argv 
)

Definition at line 1527 of file osis2mod.cpp.

      // Program entry point: builds a SWORD module from an OSIS document.
      //
      // argv[1] is the output path of the module and argv[2] the OSIS document
      // ("-" selects std::cin). All further arguments are options handled in
      // the loop below: -a, -z, -Z, -b <2|3|4>, -N, -c <cipher_key>, -v <v11n>,
      // -s <2|4>, -C and -d <flags>.
      //
      // The process ends through exit(): EXIT_NO_CREATE when the module files
      // cannot be created, EXIT_NO_WRITE when the module is not writable,
      // EXIT_NO_READ when the input file cannot be opened, and 0 once
      // processOSIS() has consumed the input.
01527                                 {
01528 
01529     fprintf(stderr, "You are running osis2mod: $Rev: 2783 $\n");
01530 
01531     // Let's test our command line arguments
      // NOTE(review): argv[1] and argv[2] are read unconditionally right after
      // this check, so usage() is presumably expected not to return -- confirm.
01532     if (argc < 3) {
01533         usage(*argv);
01534     }
01535 
01536     // variables for arguments, holding defaults
01537     const char* program    = argv[0];
01538     const char* path       = argv[1];
01539     const char* osisDoc    = argv[2];
01540     int append             = 0;
01541     SWBuf compType         = "";
01542     bool isCommentary      = false;
01543     int iType              = 4;
01544     int entrySize          = 0;
01545     SWBuf cipherKey        = "";
01546     SWCompress *compressor = 0;
01547 
      // Options start at argv[3]. Options that take a value consume the
      // following argument via ++i.
01548     for (int i = 3; i < argc; i++) {
01549         if (!strcmp(argv[i], "-a")) {
01550             append = 1;
01551         }
01552         else if (!strcmp(argv[i], "-z")) {
              // Compression (-z / -Z) and an explicit entry size (-s) are mutually exclusive.
01553             if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
01554             if (entrySize) usage(*argv, "Cannot specify both -z and -s");
01555             compType = "ZIP";
01556         }
01557         else if (!strcmp(argv[i], "-Z")) {
01558             if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
01559             if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
01560             compType = "LZSS";
01561         }
01562         else if (!strcmp(argv[i], "-b")) {
01563             if (i+1 < argc) {
01564                 iType = atoi(argv[++i]);
                  // A value in 2..4 is accepted; anything else, or a missing value, reaches usage() below.
01565                 if ((iType >= 2) && (iType <= 4)) continue;
01566             }
01567             usage(*argv, "-b requires one of <2|3|4>");
01568         }
01569         else if (!strcmp(argv[i], "-N")) {
01570             normalize = false;
01571         }
01572         else if (!strcmp(argv[i], "-c")) {
01573             if (i+1 < argc) cipherKey = argv[++i];
01574             else usage(*argv, "-c requires <cipher_key>");
01575         }
01576         else if (!strcmp(argv[i], "-v")) {
01577             if (i+1 < argc) v11n = argv[++i];
01578             else usage(*argv, "-v requires <v11n>");
01579         }
01580         else if (!strcmp(argv[i], "-s")) {
01581             if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z");
01582             if (i+1 < argc) {
01583                 entrySize = atoi(argv[++i]);
                  // Only 2 and 4 are accepted; anything else, or a missing value, reaches usage() below.
01584                 if (entrySize == 2 || entrySize == 4) {
01585                     continue;
01586                 }
01587             }
01588             usage(*argv, "-s requires one of <2|4>");
01589         }
01590         else if (!strcmp(argv[i], "-C")) {
01591             isCommentary = true;
01592         }
01593         else if (!strcmp(argv[i], "-d")) {
              // Debug flags accumulate: each -d value is OR-ed into the global debug mask.
01594             if (i+1 < argc) debug |= atoi(argv[++i]);
01595             else usage(*argv, "-d requires <flags>");
01596         }
01597         else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
01598     }
01599 
01600     if (isCommentary) isCommentary = true;  // avoid unused warning for now
01601 
      // Instantiate the compressor selected by -z or -Z; it stays 0 for uncompressed modules.
01602     if (compType == "ZIP") {
01603 #ifndef EXCLUDEZLIB
01604         compressor = new ZipCompress();
01605 #else
01606         usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library");
01607 #endif
01608     }
01609     else if (compType == "LZSS") {
01610         compressor = new LZSSCompress();
01611     }
01612 
      // Without ICU support the UTF-8 conversion/normalization is switched off, as if -N had been given.
01613 #ifndef _ICU_
01614     if (normalize) {
01615         normalize = false;
01616         cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl;
01617     }
01618 #endif
01619 
      // NOTE(review): the value printed under the label "create" is the append flag.
01620     if (debug & DEBUG_OTHER) {
01621         cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
01622     }
01623 
01624     if (!append) {  // == 0 then create module
01625     // Try to initialize a default set of datafiles and indicies at our
01626     // datapath location passed to us from the user.
      // The driver used for creation follows the same choice as the module
      // object constructed further down: zText, RawText4 or RawText.
01627         if (compressor) {
01628             if (zText::createModule(path, iType, v11n)) {
01629                 fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
01630                 exit(EXIT_NO_CREATE);
01631             }
01632         }
01633         else if (entrySize == 4) {
01634             if (RawText4::createModule(path, v11n)) {
01635                 fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
01636                 exit(EXIT_NO_CREATE);
01637             }
01638         }
01639         else {
01640             if (RawText::createModule(path, v11n)) {
01641                 fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
01642                 exit(EXIT_NO_CREATE);
01643             }
01644         }
01645     }
01646 
01647     // Do some initialization stuff
01648     if (compressor) {
01649         // Create a compressed text module allowing very large entries
01650         // Taking defaults except for first, fourth, fifth and last argument
01651         module = new zText(
01652                 path,       // ipath
01653                 0,      // iname
01654                 0,      // idesc
01655                 iType,      // iblockType
01656                 compressor, // icomp
01657                 0,      // idisp
01658                 ENC_UNKNOWN,    // enc
01659                 DIRECTION_LTR,  // dir
01660                 FMT_UNKNOWN,    // markup
01661                 0,      // lang
01662                 v11n        // versification
01663                );
01664     }
01665     else if (entrySize == 4) {
01666         // Create a raw text module allowing very large entries
01667         // Taking defaults except for first and last argument
01668         module = new RawText4(
01669                 path,       // ipath
01670                 0,      // iname
01671                 0,      // idesc
01672                 0,      // idisp
01673                 ENC_UNKNOWN,    // encoding
01674                 DIRECTION_LTR,  // dir
01675                 FMT_UNKNOWN,    // markup
01676                 0,      // ilang
01677                 v11n        // versification
01678             );
01679     }
01680     else {
01681         // Create a raw text module allowing reasonable sized entries
01682         // Taking defaults except for first and last argument
01683         module = new RawText(
01684                 path,       // ipath
01685                 0,      // iname
01686                 0,      // idesc
01687                 0,      // idisp
01688                 ENC_UNKNOWN,    // encoding
01689                 DIRECTION_LTR,  // dir
01690                 FMT_UNKNOWN,    // markup
01691                 0,      // ilang
01692                 v11n        // versification
01693             );
01694     }
01695 
      // The cipher filter is only created when -c supplied a key; it is deleted after the module below.
01696     SWFilter *cipherFilter = 0;
01697 
01698     if (cipherKey.length()) {
01699         fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
01700         cipherFilter = new CipherFilter(cipherKey.c_str());
01701         module->addRawFilter(cipherFilter);
01702     }
01703 
01704     if (!module->isWritable()) {
01705         fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
01706         exit(EXIT_NO_WRITE);
01707     }
01708 
01709     // Either read from std::cin (aka stdin), when the argument is a '-'
01710     // or from a specified file.
01711     if (!strcmp(osisDoc, "-")) {
01712         processOSIS(cin);
01713     }
01714     else {
01715         // Let's see if we can open our input file
01716         ifstream infile(osisDoc);
01717         if (infile.fail()) {
01718             fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
01719             exit(EXIT_NO_READ);
01720         }
01721         processOSIS(infile);
01722         infile.close();
01723     }
01724 
      // Release the module first, then the filter that was attached to it.
01725     delete module;
01726     if (cipherFilter)
01727         delete cipherFilter;
01728 
01729     fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
01730     exit(0); // success
01731 }

void makeValidRef ( VerseKey &  key  ) 

This routine is used to ensure that all the text in the input is saved to the module. Assumption: The input orders all the verses for a chapter in numerical order. Thus, any verses that are not in the chosen versification (v11n) follow those that are.

The prior implementation of this adjusted the verse to the last one that is in the chosen v11n. If the chapter was extra, its content was appended to the last verse of the last chapter in the chosen v11n for that book. If there were just extra verses for a chapter, they were appended to the last verse of that chapter.

The problem with this is when an OSIS verse refers to more than one verse, e.g. osisID="Gen.1.29 Gen.1.30 Gen.1.31" (Gen.1.31 is the last verse of the chapter in the chosen v11n) and then it is followed by Gen.1.32.

This routine assumes that linking is postponed to the end so that in the example Gen.1.30-31 are not linked but rather empty. This routine will then find the last verse in the computed chapter that has content.

Alternatively, we could have done linking as we went, but then this routine would have needed to find the first entry in the link set, and elsewhere in the code, when appending to a verse, adjacent links would have needed to be checked for and adjusted.

param key the key that may need to be adjusted

Definition at line 410 of file osis2mod.cpp.

00410                                  {
00411     VerseKey saveKey;
00412     saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
00413     saveKey.setAutoNormalize(false);
00414     saveKey.setIntros(true);
00415     saveKey = currentVerse;
00416 
00417     // Since isValidRef returned false constrain the key to the nearest prior reference.
00418     // If we are past the last chapter set the reference to the last chapter
00419     int chapterMax = key.getChapterMax();
00420     if (key.getChapter() > chapterMax) {
00421         key.setChapter(chapterMax);
00422     }
00423 
00424     // Either we set the chapter to the last chapter and now need to set to the last verse in the chapter
00425     // Or the verse is beyond the end of the chapter.
00426     // In any case we need to constrain the verse to it's chapter.
00427     int verseMax   = key.getVerseMax();
00428     key.setVerse(verseMax);
00429 
00430     if (debug & DEBUG_REV11N) {
00431         cout << "DEBUG(V11N) Chapter max:" << chapterMax << ", Verse Max:" << verseMax << endl;
00432     }
00433 
00434     // There are three cases we want to handle:
00435     // In the examples we are using the KJV versification where the last verse of Matt.7 is Matt.7.29.
00436     // In each of these cases the out-of-versification, extra verse is Matt.7.30.
00437     // 1) The "extra" verse follows the last verse in the chapter.
00438     //      <verse osisID="Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
00439     //    In this case re-versify Matt.7.30 as Matt.7.29.
00440     //
00441     // 2) The "extra" verse follows a range (a set of linked verses).
00442     //      <verse osisID="Matt.7.28-Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
00443     //    In this case, re-versify Matt.7.30 as Matt.7.28, the first verse in the linked set.
00444     //    Since we are post-poning linking, we want to re-reversify to the last entry in the module.
00445     //
00446     // 3) The last verse in the chapter is not in the input. There may be other verses missing as well.
00447     //      <verse osisID="Matt.7.8">...</verse><verse osisID="Matt.7.30">...</verse>
00448     //    In this case we should re-versify Matt.7.30 as Matt.7.29.
00449     //    However, since this and 2) are ambiguous, we'll re-reversify to the last entry in the module.
00450     
00451     while (!key.popError() && !module->hasEntry(&key)) {
00452         key.decrement(1);
00453     }
00454 
00455     cout << "INFO(V11N): " << saveKey.getOSISRef()
00456          << " is not in the " << key.getVersificationSystem()
00457          << " versification. Appending content to " << key.getOSISRef() << endl;
00458 }

void prepareSWText ( const char *  osisID,
SWBuf &  text 
)

Definition at line 177 of file osis2mod.cpp.

00178 {
00179     // Always check on UTF8 and report on non-UTF8 entries
00180     int utf8State = detectUTF8(text.c_str());
00181 
00182     // Trust, but verify.
00183     if (!normalize && !utf8State) {
00184         cout << "WARNING(UTF8): " << osisID << ": Should be converted to UTF-8 (" << text << ")" << endl;
00185     }
00186 
00187 #ifdef _ICU_
00188     if (normalize) {
00189         // Don't need to normalize text that is ASCII
00190         // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
00191         if (!utf8State) {
00192             cout << "INFO(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
00193             converter.processText(text, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
00194             converted++;
00195 
00196             // Prepare for double check. This probably can be removed.
00197             // But for now we are running the check again.
00198             // This is to determine whether we need to normalize output of the conversion.
00199             utf8State = detectUTF8(text.c_str());
00200         }
00201 
00202         // Double check. This probably can be removed.
00203         if (!utf8State) {
00204             cout << "ERROR(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
00205         }
00206 
00207         if (utf8State > 0) {
00208             SWBuf before = text;
00209             normalizer.processText(text, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
00210             if (before != text) {
00211                 normalized++;
00212             }
00213         }
00214     }
00215 #endif
00216 }

void prepareSWVerseKey ( SWBuf &  buf  ) 

Definition at line 230 of file osis2mod.cpp.

00230                                    {
00231     // This routine modifies the buf in place
00232     char* s = buf.getRawData();
00233     char* p = s;
00234     bool inRange = false;
00235     while (*p) {
00236         if (inRange) {
00237             if (debug & DEBUG_REF) {
00238                 cout << "DEBUG(REF): Copy range marker:" << *p << endl;;
00239             }
00240 
00241             // Range markers are copied as is
00242             *s++ = *p++;
00243         }
00244 
00245         // Look ahead to see if we are in a work prefix
00246         // but don't look past an osisID
00247         char *n = p;
00248         while (*n && *n != ':' && *n != ' ' && *n != '-') {
00249             n++;
00250         }
00251 
00252         // We have found a work prefix
00253         if (*n == ':') {
00254             // set p to skip the work prefix
00255             p = n + 1;
00256 
00257             if (debug & DEBUG_REF) {
00258                 cout << "DEBUG(REF): Found a work prefix ";
00259                 for (char *x = s; x <= n; x++) {
00260                     cout << *x;
00261                 }
00262                 cout << endl;
00263             }
00264         }
00265 
00266         // Now we are in the meat of an osisID.
00267         // Copy it to its end but stop on a grain marker of '!'
00268         if (debug & DEBUG_REF) {
00269             cout << "DEBUG(REF): Copy osisID:";
00270         }
00271 
00272         while (*p && *p != '!' && *p != ' ' && *p != '-') {
00273             if (debug & DEBUG_REF) {
00274                 cout << *p;
00275             }
00276 
00277             *s++ = *p++;
00278         }
00279 
00280         if (debug & DEBUG_REF) {
00281             cout << endl;
00282         }
00283 
00284         // The ! and everything following until we hit
00285         // the end of the osisID is part of the grain reference
00286         if (*p == '!') {
00287             n = p;
00288             while (*n && *n != ' ' && *n != '-') {
00289                 n++;
00290             }
00291 
00292             if (debug & DEBUG_REF) {
00293                 cout << "DEBUG(REF): Found a grain suffix ";
00294                 for (char *x = p; x < n; x++) {
00295                     cout << *x;
00296                 }
00297                 cout << endl;
00298             }
00299 
00300             p = n;
00301         }
00302 
00303         // At this point we have processed an osisID
00304 
00305         // if we are not in a range and the next characer is a -
00306         // then we are entering a range
00307         inRange = !inRange && *p == '-';
00308 
00309         if (debug & DEBUG_REF) {
00310             if (inRange) {
00311                 cout << "DEBUG(REF): Found a range" << endl;
00312             }
00313         }
00314 
00315         // between ranges and stand alone osisIDs we might have whitespace
00316         if (!inRange && *p == ' ') {
00317             // skip this and subsequent spaces
00318             while (*p == ' ') {
00319                 p++;
00320             }
00321 
00322             // replacing them all with a ';'
00323             *s++ = ';';
00324 
00325             if (debug & DEBUG_REF) {
00326                 cout << "DEBUG(REF): replacing space with ;. Remaining: " << p << endl;
00327             }
00328         }
00329     }
00330 
00331     // Determine whether we have modified the buffer
00332     // We have modified the buffer if s is not sitting on the null byte of the original
00333     if (*s) {
00334         // null terminate the reference
00335         *s = '\0';
00336         // Since we modified the swbuf, we need to tell it what we have done
00337         buf.setSize(s - buf.c_str());
00338 
00339         if (debug & DEBUG_REF) {
00340             cout << "DEBUG(REF): shortended keyVal to`" << buf.c_str() << "`"<< endl;
00341         }
00342     }
00343 }

void processOSIS ( istream &  infile  ) 

Definition at line 1328 of file osis2mod.cpp.

      // Read an OSIS document from infile one character at a time and feed it
      // to the module.
      //
      // The stream is split into tokens (everything from '<' to '>') and text.
      // XML comments are skipped, newlines become blanks and, outside of
      // tokens, runs of whitespace collapse into one blank. Each well-formed
      // token goes through transformBSP() and handleToken(); a token that
      // handleToken() does not consume is appended to the text buffer.
      // At end of input the last entry is forced out with writeEntry() and
      // writeLinks() is called.
01328                                   {
      // States of the comment recognizer. The first three track the opening
      // "<!--", the rest track the closing "-->".
01329     typedef enum {
01330         CS_NOT_IN_COMMENT,      // or seen starting "<"
01331         CS_SEEN_STARTING_EXCLAMATION,
01332         CS_SEEN_STARTING_HYPHEN,
01333         CS_IN_COMMENT,
01334         CS_SEEN_ENDING_HYPHEN,
01335         CS_SEEN_SECOND_ENDING_HYPHEN,
01336         CS_SEEN_ENDING_GREATER_THAN
01337     } t_commentstate;
01338 
01339     activeOsisID[0] = '\0';
01340 
01341     strcpy(currentOsisID,"N/A");
01342 
      // currentVerse is configured once here and installed as the module's key.
01343     currentVerse.setVersificationSystem(v11n);
01344     currentVerse.setAutoNormalize(false);
01345     currentVerse.setIntros(true);   // turn on mod/testmnt/book/chap headings
01346     currentVerse.setPersist(true);
01347 
01348     module->setKey(currentVerse);
01349     module->setPosition(TOP);
01350 
01351     SWBuf token;
01352     SWBuf text;
01353     bool incomment = false;
01354     t_commentstate commentstate = CS_NOT_IN_COMMENT;
01355     bool intoken = false;
01356     bool inWhitespace = false;
01357     bool seeingSpace = false;
01358     unsigned char curChar = '\0';
01359 
01360     while (infile.good()) {
01361 
01362         int possibleChar = infile.get();
01363 
01364         // skip the character if it is bad. infile.good() will catch the problem
01365         if (possibleChar == -1) {
01366             continue;
01367         }
01368 
01369         curChar = (unsigned char) possibleChar;
01370 
01371         // All newlines are simply whitespace
01372         // Does a SWORD module actually require this?
01373         if (curChar == '\n') {
01374             curChar = ' ';
01375         }
01376 
          // A '<' outside of a token opens a new token.
01377         if (!intoken && curChar == '<') {
01378             intoken = true;
01379             token = "<";
01380             continue;
01381         }
01382 
01383         // Handle XML comments starting with "<!--", ending with "-->"
01384 
          // Comment start detection. A matching character advances the state
          // and is consumed (continue); any other character resets the state
          // and falls out of the switch to be handled as ordinary token content.
01385         if (intoken && !incomment) {
01386             switch (commentstate) {
01387                 case CS_NOT_IN_COMMENT :
01388                     if (curChar == '!') {
01389                         commentstate = CS_SEEN_STARTING_EXCLAMATION;
01390                         token.append((char) curChar);
01391                         continue;
01392                     } else {
01393                         break;
01394                     }
01395 
01396                 case CS_SEEN_STARTING_EXCLAMATION :
01397                     if (curChar == '-') {
01398                         commentstate = CS_SEEN_STARTING_HYPHEN;
01399                         token.append((char) curChar);
01400                         continue;
01401                     } else {
01402                         commentstate = CS_NOT_IN_COMMENT;
01403                         break;
01404                     }
01405 
01406                 case CS_SEEN_STARTING_HYPHEN :
01407                     if (curChar == '-') {
01408                         incomment = true;
01409                         commentstate = CS_IN_COMMENT;
01410                         token.append((char) curChar);
01411 
01412                         if (debug & DEBUG_OTHER) {
01413                             cout << "DEBUG(COMMENTS): in comment" << endl;
01414                         }
01415 
01416                         continue;
01417                     } else {
01418                         commentstate = CS_NOT_IN_COMMENT;
01419                         break;
01420                     }
01421 
01422                 default:
01423                     cout << "FATAL(COMMENTS): unknown commentstate on comment start: " << commentstate << endl;
01424                     exit(EXIT_BAD_NESTING);
01425             }
01426         }
01427 
          // Inside a comment every character is discarded; only the state
          // advances towards the closing "-->". Every branch ends the iteration.
01428         if (incomment) {
01429             switch (commentstate) {
01430                 case CS_IN_COMMENT:
01431                     if (curChar == '-') {
01432                         commentstate = CS_SEEN_ENDING_HYPHEN;
01433                         continue;
01434                     } else {
01435                         // ignore the character
01436                         continue;
01437                     }
01438 
01439                 case CS_SEEN_ENDING_HYPHEN :
01440                     if (curChar == '-') {
01441                         commentstate = CS_SEEN_SECOND_ENDING_HYPHEN;
01442                         continue;
01443                     } else {
01444                         // ignore character
01445                         commentstate = CS_IN_COMMENT;
01446                         continue;
01447                     }
01448 
01449                 case CS_SEEN_SECOND_ENDING_HYPHEN :
01450                     if (curChar == '>') {
                          // End of the comment: the token collected so far is dropped with it.
01451                         intoken = false;
01452                         incomment = false;
01453                         commentstate = CS_NOT_IN_COMMENT;
01454 
01455                         if (debug & DEBUG_OTHER) {
01456                             cout << "DEBUG(COMMENTS): out of comment" << endl;
01457                         }
01458 
01459                         continue;
01460                     } else {
01461                         // ignore character
01462                         commentstate = CS_IN_COMMENT;
01463                         continue;
01464                     }
01465 
01466                 default:
01467                     cout << "FATAL(COMMENTS): unknown commentstate on comment end: " << commentstate << endl;
01468                     exit(EXIT_BAD_NESTING);
01469             }
01470         }
01471 
01472         // Outside of tokens merge adjacent whitespace
01473         if (!intoken) {
01474             seeingSpace = isspace(curChar)!=0;
01475             if (seeingSpace) {
01476                 if (inWhitespace) {
01477                     continue;
01478                 }
01479                 // convert all whitespace to blanks
01480                 curChar = ' ';
01481             }
01482             inWhitespace = seeingSpace;
01483         }
01484 
          // A '>' closes the current token, which is then dispatched.
01485         if (intoken && curChar == '>') {
01486             intoken = false;
01487             inWhitespace = false;
01488             token.append('>');
01489             // take this isalpha if out to check for bugs in text
              // Accepted forms: "<name", "</name" and "<?name".
01490             if (isalpha(token[1]) ||
01491                 (((token[1] == '/') || (token[1] == '?')) && isalpha(token[2]))) {
01492                 //cout << "Handle:" << token.c_str() << endl;
01493                 XMLTag t = transformBSP(token.c_str());
01494 
                  // Tokens that handleToken() does not consume stay part of the text.
01495                 if (!handleToken(text, t)) {
01496                     text.append(t);
01497                 }
01498             } else {
01499                 cout << "WARNING(PARSE): malformed token: " << token << endl;
01500             }
01501             continue;
01502         }
01503 
          // Ordinary character: it extends the token or the text. A bare '<'
          // or '>' in text is written as an entity.
01504         if (intoken) {
01505             token.append((char) curChar);
01506         }
01507         else {
01508             switch (curChar) {
01509                 case '>' : text.append("&gt;"); break;
01510                 case '<' : text.append("&lt;"); break;
01511                 default  : text.append((char) curChar); break;
01512             }
01513         }
01514     }
01515 
01516     // Force the last entry from the text buffer.
01517     text = "";
01518     writeEntry(text, true);
01519     writeLinks();
01520 
01521 #ifdef _ICU_
01522     if (converted)  fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
01523     if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
01524 #endif
01525 }

XMLTag transformBSP ( XMLTag  t  ) 

Support normalizations necessary for a SWORD module. OSIS allows for document structure (Book, Section, Paragraph or BSP) to overlap Bible versification (Book, Chapter, Verse). Most SWORD applications need to display verses in isolation or in HTML table cells, requiring each stored entry (i.e. verses) to be well-formed xml. This routine normalizes container elements which could cross verse boundaries into milestones. For most of these OSIS elements, there is a milestone form. However, p is not milestoneable. For this reason, p is transformed into lb elements.

Parameters: t - the tag to transform.

Returns: the transformed tag or the original one.

Definition at line 1142 of file osis2mod.cpp.

01142                               {
01143     static std::stack<XMLTag> bspTagStack;
01144     static int sID = 1;
01145     char buf[11];
01146 
01147     // Support simplification transformations
01148     if (t.isEmpty()) {
01149 
01150         if (debug & DEBUG_XFORM) {
01151             cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl;
01152         }
01153 
01154         return t;
01155     }
01156 
01157     SWBuf tagName = t.getName();
01158     if (!t.isEndTag()) {
01159         // Transform <p> into <div type="paragraph"> and milestone it
01160         if (tagName == "p") {
01161             t.setText("<div type=\"paragraph\" />");
01162             sprintf(buf, "gen%d", sID++);
01163             t.setAttribute("sID", buf);
01164         }
01165 
01166         // Transform <tag> into <tag  sID="">, where tag is a milestoneable element.
01167         // The following containers are milestoneable.
01168         // abbr, closer, div, foreign, l, lg, salute, signed, speech
01169         // Leaving out:
01170         //   abbr   When would this ever cross a boundary?
01171         //   seg    as it is used for a divineName hack
01172         //   foreign    so that it can be easily italicized
01173         else if (tagName == "chapter" ||
01174              tagName == "closer"  ||
01175              tagName == "div"     ||
01176              tagName == "l"       ||
01177              tagName == "lg"      ||
01178              tagName == "q"       ||
01179              tagName == "salute"  ||
01180              tagName == "signed"  ||
01181              tagName == "speech"  ||
01182              tagName == "verse"
01183         ) {
01184             t.setEmpty(true);
01185             sprintf(buf, "gen%d", sID++);
01186             t.setAttribute("sID", buf);
01187         }
01188         bspTagStack.push(t);
01189 
01190         if (debug & DEBUG_XFORM) {
01191             cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl;
01192             XMLTag topToken = bspTagStack.top();
01193             cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl;
01194         }
01195     }
01196     else {
01197         if (!bspTagStack.empty()) {
01198             XMLTag topToken = bspTagStack.top();
01199 
01200             if (debug & DEBUG_XFORM) {
01201                 cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl;
01202             }
01203 
01204             bspTagStack.pop();
01205 
01206             // Look for the milestoneable container tags handled above.
01207             if (tagName == "chapter" ||
01208                 tagName == "closer"  ||
01209                 tagName == "div"     ||
01210                 tagName == "l"       ||
01211                 tagName == "lg"      ||
01212                 tagName == "p"       ||
01213                 tagName == "q"       ||
01214                 tagName == "salute"  ||
01215                 tagName == "signed"  ||
01216                 tagName == "speech"  ||
01217                 tagName == "verse"
01218             ) {
01219                 // make this a clone of the start tag with sID changed to eID
01220                 // Note: in the case of </p> the topToken is a <div type="paragraph">
01221                 t = topToken;
01222                 t.setAttribute("eID", t.getAttribute("sID"));
01223                 t.setAttribute("sID", 0);
01224             }
01225         }
01226         else {
01227             cout << "FATAL(TAGSTACK): " << currentOsisID << ": closing tag without opening tag" << endl;
01228         }
01229     }
01230 
01231     return t;
01232 }

void usage ( const char *  app,
const char *  error = 0 
)

Definition at line 1275 of file osis2mod.cpp.

01275                                                    { // Prints an optional error line plus the full usage text to stderr, then exits.
01276 
01277     if (error) fprintf(stderr, "\n%s: %s\n", app, error); // error defaults to 0; when it is null only the usage text below is printed
01278 
01279     fprintf(stderr, "OSIS Bible/commentary module creation tool for The SWORD Project\n");
01280     fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app);
01281     fprintf(stderr, "  <output/path>\t\t an existing folder that the module will be written\n");
01282     fprintf(stderr, "  <osisDoc>\t\t path to the validated OSIS document, or '-' to\n");
01283     fprintf(stderr, "\t\t\t\t read from standard input\n");
01284     fprintf(stderr, "  -a\t\t\t augment module if exists (default is to create new)\n");
01285     fprintf(stderr, "  -z\t\t\t use ZIP compression (default no compression)\n");
01286     fprintf(stderr, "  -Z\t\t\t use LZSS compression (default no compression)\n");
01287     fprintf(stderr, "  -b <2|3|4>\t\t compression block size (default 4):\n");
01288     fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
01289     fprintf(stderr, "  -c <cipher_key>\t encipher module using supplied key\n");
01290     fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
01291     fprintf(stderr, "  -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n");
01292     fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
01293     fprintf(stderr, "\t\t\t\t  and then normalize to NFC)\n");
01294     fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n");
01295     fprintf(stderr, "  -s <2|4>\t\t bytes used to store entry size (default is 2).\n");
01296     fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large\n");
01297     fprintf(stderr, "\t\t\t\t entries in uncompressed modules\n");
01298     fprintf(stderr, "\t\t\t\t (2 bytes to store size equal 65535 characters)\n");
01299     fprintf(stderr, "  -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n");
01300     fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:\n");
01301     VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr();
01302     StringList av11n = vmgr->getVersificationSystems(); // names are queried at run time rather than hard-coded in the help text
01303     for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) {
01304         fprintf(stderr, "\t\t\t\t\t%s\n", (*loop).c_str()); // one name per line, indented under the -v note
01305     }
01306     fprintf(stderr, "  -d <flags>\t\t turn on debugging (default is 0)\n");
01307     fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n");
01308     fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n"); // the values listed below match the DEBUG_* constants (DEBUG_WRITE = 1 ... DEBUG_OTHER = 512)
01309     fprintf(stderr, "\t\t\t\t\t0   - no debugging\n");
01310     fprintf(stderr, "\t\t\t\t\t1   - writes to module, very verbose\n");
01311     fprintf(stderr, "\t\t\t\t\t2   - verse start and end\n");
01312     fprintf(stderr, "\t\t\t\t\t4   - quotes, esp. Words of Christ\n");
01313     fprintf(stderr, "\t\t\t\t\t8   - titles\n");
01314     fprintf(stderr, "\t\t\t\t\t16  - inter-verse material\n");
01315     fprintf(stderr, "\t\t\t\t\t32  - BSP to BCV transformations\n");
01316     fprintf(stderr, "\t\t\t\t\t64  - v11n exceptions\n");
01317     fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n");
01318     fprintf(stderr, "\t\t\t\t\t256 - internal stack\n");
01319     fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n");
01320     fprintf(stderr, "\t\t\t\t This argument can be used more than once. (Or\n");
01321     fprintf(stderr, "\t\t\t\t the flags may be added together.)\n");
01322     fprintf(stderr, "\n");
01323     fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n");
01324     fprintf(stderr, "\n");
01325     exit(EXIT_BAD_ARG); // unconditional: this function never returns to its caller (EXIT_BAD_ARG is 1)
01326 }

void writeEntry ( SWBuf &  text,
bool  force = false 
)

Definition at line 460 of file osis2mod.cpp.

00460                                                  { // Buffers text for the current verse; once the verse key changes, writes the buffered verse into the module.
00461     char keyOsisID[255];
00462 
00463     static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 2783 $\"/>"; // importer milestone, stored once per testament (guarded by firstOT/firstNT)
00464     static bool firstOT = true;
00465     static bool firstNT = true;
00466 
00467     if (!inCanonicalOSISBook) { // outside a canonical book nothing is buffered or written; text is left untouched
00468         return;
00469     }
00470 
00471     strcpy(keyOsisID, currentVerse.getOSISRef()); // NOTE(review): unchecked copy into a 255-byte buffer — assumes OSIS refs stay shorter than that
00472 
00473     // set keyOsisID to anything that an osisID cannot be.
00474     if (force) {
00475         strcpy(keyOsisID, "-force"); // guarantees the comparison with activeOsisID below sees a difference
00476     }
00477 
00478     static VerseKey lastKey; // static: keeps the key stored at the end of the previous call (line 00553)
00479     lastKey.setVersificationSystem(currentVerse.getVersificationSystem());
00480     lastKey.setAutoNormalize(0);
00481     lastKey.setIntros(1);
00482 
00483     VerseKey saveKey; // snapshot of currentVerse, restored at line 00552
00484     saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
00485     saveKey.setAutoNormalize(0);
00486     saveKey.setIntros(1);
00487     saveKey = currentVerse;
00488 
00489     // If we have seen a verse and the supplied one is different then we output the collected one.
00490     if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) { // activeOsisID is non-empty and differs from keyOsisID
00491 
00492         if (!isValidRef(lastKey)) {
00493             makeValidRef(lastKey); // defined elsewhere in this file; presumably maps lastKey to a reference valid in the versification — verify
00494         }
00495 
00496         currentVerse = lastKey; // presumably module reads/writes at currentVerse — TODO confirm how module is keyed
00497 
00498         prepareSWText(activeOsisID, activeVerseText);
00499 
00500         // Put the revision into the module
00501         int testmt = currentVerse.getTestament(); // 1 is paired with firstOT and 2 with firstNT below
00502         if ((testmt == 1 && firstOT) || (testmt == 2 && firstNT)) {
00503             VerseKey t; // holds the flush position while currentVerse is moved to 0/0/0
00504             t.setVersificationSystem(currentVerse.getVersificationSystem());
00505             t.setAutoNormalize(0);
00506             t.setIntros(1);
00507             t = currentVerse;
00508             currentVerse.setBook(0); // book/chapter/verse 0 — presumably the testament-level intro slot (intros are enabled via setIntros(1)); verify
00509             currentVerse.setChapter(0);
00510             currentVerse.setVerse(0);
00511             module->setEntry(revision);
00512             currentVerse = t; // back to the position of the verse being flushed
00513             switch (testmt) {
00514             case 1:
00515                 firstOT = false;
00516                 break;
00517             case 2:
00518                 firstNT = false;
00519                 break;
00520             }
00521         }
00522 
00523         // If the entry already exists, then append this entry to the text.
00524         // This is for verses that are outside the chosen versification. They are appended to the prior verse.
00525         // The space should not be needed if we retained verse tags.
00526         SWBuf currentText = module->getRawEntry();
00527         if (currentText.length()) {
00528             cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
00529             activeVerseText = currentText + " " + activeVerseText; // existing entry first, then a single space, then the new text
00530         }
00531 
00532         if (debug & DEBUG_WRITE) {
00533             cout << "DEBUG(WRITE): " << activeOsisID << ":" << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
00534         }
00535 
00536         module->setEntry(activeVerseText);
00537         activeVerseText = ""; // buffer is empty again, so the incoming text below starts a fresh verse
00538     }
00539 
00540     // The following is for initial verse content and for appending interverse content.
00541     if (activeVerseText.length()) {
00542         activeVerseText += text; // appended as-is; leading whitespace is kept in this branch
00543     }
00544     else {
00545         // Eliminate leading whitespace on the beginning of each verse
00546         text.trimStart();
00547         activeVerseText = text;
00548     }
00549     // text has been consumed so clear it out.
00550     text = ""; // text is passed by reference, so the caller's buffer is emptied
00551 
00552     currentVerse = saveKey; // undo any repositioning done while flushing
00553     lastKey = currentVerse; // remembered as the write position for the flush on a later call
00554     strcpy(activeOsisID, keyOsisID); // becomes "-force" when force was set
00555 }

void writeLinks (  ) 

Write out all links in the module. Waiting is necessary because writeEntry might ultimately append text to a verse, moving its offset in the data file. While we minimize this by postponing the write until we have gathered the next verse, the following scenario can still happen: a module uses linked verses and has some verses that are not in the chosen versification. If an out-of-canon verse follows a linked verse, the out-of-canon verse is appended to the prior verse. Care has to be taken that the linked verses all point to the first of the set.

Definition at line 1246 of file osis2mod.cpp.

01247 { // Walks every list in linkedVerses and links each later key to the first key of its list.
01248     // Link all the verses
01249     VerseKey destKey; // the first key of a list: the entry the others are linked to
01250     destKey.setVersificationSystem(currentVerse.getVersificationSystem());
01251     destKey.setAutoNormalize(0);
01252     destKey.setIntros(1);
01253 
01254     VerseKey linkKey; // each subsequent key of a list, linked to destKey in turn
01255     linkKey.setVersificationSystem(currentVerse.getVersificationSystem());
01256     linkKey.setAutoNormalize(0);
01257     linkKey.setIntros(1);
01258     for (unsigned int i = 0; i < linkedVerses.size(); i++) {
01259         // The verseKeys is a list of verses
01260         // where the first is the real verse
01261         // and the others link to it.
01262         ListKey verseKeys = linkedVerses[i]; // works on a copy of the stored list
01263         verseKeys.setPosition(TOP);
01264         destKey = verseKeys.getElement();
01265         verseKeys.increment(1); // step past the destination so the loop below starts at the second key
01266 
01267         while (!verseKeys.popError()) { // presumably popError() turns true once increment(1) runs past the last element — verify against ListKey
01268             linkKey = verseKeys.getElement();
01269             verseKeys.increment(1);
01270             linkToEntry(linkKey, destKey);
01271         }
01272     }
01273 }


Variable Documentation

char activeOsisID[255]

Definition at line 86 of file osis2mod.cpp.

SWBuf activeVerseText

Definition at line 89 of file osis2mod.cpp.

int converted = 0

Definition at line 81 of file osis2mod.cpp.

ListKey currentKeyIDs = ListKey()

Definition at line 91 of file osis2mod.cpp.

char currentOsisID[255]

Definition at line 87 of file osis2mod.cpp.

VerseKey currentVerse

Definition at line 84 of file osis2mod.cpp.

int debug = 0

Definition at line 57 of file osis2mod.cpp.

const int DEBUG_INTERVERSE = 16

Definition at line 62 of file osis2mod.cpp.

const int DEBUG_OTHER = 512

Definition at line 67 of file osis2mod.cpp.

const int DEBUG_QUOTE = 4

Definition at line 60 of file osis2mod.cpp.

const int DEBUG_REF = 128

Definition at line 65 of file osis2mod.cpp.

const int DEBUG_REV11N = 64

Definition at line 64 of file osis2mod.cpp.

const int DEBUG_STACK = 256

Definition at line 66 of file osis2mod.cpp.

const int DEBUG_TITLE = 8

Definition at line 61 of file osis2mod.cpp.

const int DEBUG_VERSE = 2

Definition at line 59 of file osis2mod.cpp.

const int DEBUG_WRITE = 1

Definition at line 58 of file osis2mod.cpp.

const int DEBUG_XFORM = 32

Definition at line 63 of file osis2mod.cpp.

const int EXIT_BAD_ARG = 1

Definition at line 70 of file osis2mod.cpp.

const int EXIT_BAD_NESTING = 5

Definition at line 74 of file osis2mod.cpp.

const int EXIT_NO_CREATE = 3

Definition at line 72 of file osis2mod.cpp.

const int EXIT_NO_READ = 4

Definition at line 73 of file osis2mod.cpp.

const int EXIT_NO_WRITE = 2

Definition at line 71 of file osis2mod.cpp.

bool inCanonicalOSISBook = true [static]

Definition at line 95 of file osis2mod.cpp.

std::vector<ListKey> linkedVerses

Definition at line 93 of file osis2mod.cpp.

SWText* module = 0

Definition at line 83 of file osis2mod.cpp.

bool normalize = true [static]

Definition at line 96 of file osis2mod.cpp.

int normalized = 0

Definition at line 80 of file osis2mod.cpp.

SWBuf v11n = "KJV"

Definition at line 85 of file osis2mod.cpp.


Generated on 18 Mar 2013 for The SWORD Project by  doxygen 1.6.1