#include <ctype.h>#include <stdio.h>#include <fcntl.h>#include <errno.h>#include <stdlib.h>#include <stack>#include <vector>#include <iostream>#include <fstream>#include <utilstr.h>#include <swmgr.h>#include <rawtext.h>#include <rawtext4.h>#include <swbuf.h>#include <utilxml.h>#include <listkey.h>#include <versekey.h>#include <ztext.h>#include <lzsscomprs.h>#include <zipcomprs.h>#include <cipherfil.h>
Go to the source code of this file.
Functions | |
| int | detectUTF8 (const char *txt) |
| bool | handleToken (SWBuf &text, XMLTag token) |
| bool | isOSISAbbrev (const char *buf) |
| bool | isValidRef (const char *buf) |
| void | linkToEntry (VerseKey &linkKey, VerseKey &dest) |
| int | main (int argc, char **argv) |
| void | makeValidRef (VerseKey &key) |
| void | prepareSWText (const char *osisID, SWBuf &text) |
| void | prepareSWVerseKey (SWBuf &buf) |
| void | processOSIS (istream &infile) |
| XMLTag | transformBSP (XMLTag t) |
| void | usage (const char *app, const char *error=0) |
| void | writeEntry (SWBuf &text, bool force=false) |
| void | writeLinks () |
Variables | |
| char | activeOsisID [255] |
| SWBuf | activeVerseText |
| int | converted = 0 |
| ListKey | currentKeyIDs = ListKey() |
| char | currentOsisID [255] |
| VerseKey | currentVerse |
| int | debug = 0 |
| const int | DEBUG_INTERVERSE = 16 |
| const int | DEBUG_OTHER = 512 |
| const int | DEBUG_QUOTE = 4 |
| const int | DEBUG_REF = 128 |
| const int | DEBUG_REV11N = 64 |
| const int | DEBUG_STACK = 256 |
| const int | DEBUG_TITLE = 8 |
| const int | DEBUG_VERSE = 2 |
| const int | DEBUG_WRITE = 1 |
| const int | DEBUG_XFORM = 32 |
| const int | EXIT_BAD_ARG = 1 |
| const int | EXIT_BAD_NESTING = 5 |
| const int | EXIT_NO_CREATE = 3 |
| const int | EXIT_NO_READ = 4 |
| const int | EXIT_NO_WRITE = 2 |
| static bool | inCanonicalOSISBook = true |
| std::vector< ListKey > | linkedVerses |
| SWText * | module = 0 |
| static bool | normalize = true |
| int | normalized = 0 |
| SWBuf | v11n = "KJV" |
/**
 * Determine whether the string contains a valid unicode sequence.
 * The following table gives the pattern of a valid UTF-8 character.
 *
 *   Unicode Range              1st       2nd       3rd       4th
 *   U-00000000 - U-0000007F    0nnnnnnn
 *   U-00000080 - U-000007FF    110nnnnn  10nnnnnn
 *   U-00000800 - U-0000FFFF    1110nnnn  10nnnnnn  10nnnnnn
 *   U-00010000 - U-001FFFFF    11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
 *
 * Note:
 *   1. The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
 *   2. The number of bits of the leading byte before the first 0
 *      is the total number of bytes.
 *   3. The "n" are the bits of the unicode codepoint.
 *
 * Besides the byte patterns, this routine also checks the decoded code point:
 * a sequence is rejected when it is longer than necessary for its value
 * (an "overlong" form), when it encodes a UTF-16 surrogate (U+D800 - U+DFFF),
 * or when it lies above U+10FFFF.
 *
 * Originally defined at line 129 of osis2mod.cpp.
 *
 * @param txt the text to check; a null pointer is treated like an empty string
 * @return  1 if all high order characters form a valid unicode sequence
 *         -1 if there are no high order characters.
 *            Note: this is also a valid unicode sequence
 *          0 if there are high order characters that do not form
 *            a valid unicode sequence
 * @author DM Smith
 */
int detectUTF8(const char *txt) {
	// Nothing to examine means there are no high order characters.
	if (!txt) return -1;

	// Smallest code point that genuinely needs a sequence of the given length.
	// Indexed by the total number of bytes in the sequence (2, 3 or 4).
	static const unsigned long minCodePoint[] = { 0, 0, 0x80, 0x800, 0x10000 };

	unsigned int countUTF8 = 0;

	// Cast it to make masking and shifting easier
	const unsigned char *p = (const unsigned char *) txt;

	while (*p) {
		// Is the high order bit set?
		if (*p & 0x80) {
			// Then count the number of high order bits that are set.
			// This determines the total number of bytes
			// that are a part of the unicode character
			int count = 0;
			for (unsigned char lead = *p; lead & 0x80; lead <<= 1) {
				count++;
			}

			// Validate count:
			// Count 1: is a pattern of 10nnnnnn,
			//          which does not signal the start of a unicode character
			// Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111
			//          are not legal starts, either
			if (count < 2 || count > 4) return 0;

			// The payload of the leading byte is whatever follows the length marker.
			unsigned long codePoint = *p & (0x7f >> count);

			// At this point we expect (count - 1) following characters
			// of the pattern 10nnnnnn
			for (int following = count - 1; following > 0; following--) {
				++p;
				// The pattern of each following character must be: 10nnnnnn
				// So, compare the top 2 bits.
				// The terminating NUL fails this test as well, which covers
				// running out of bytes too soon without reading past the end.
				if ((0xc0 & *p) != 0x80) return 0;
				codePoint = (codePoint << 6) | (*p & 0x3f);
			}

			// Overlong form: the value would have fit into a shorter sequence.
			if (codePoint < minCodePoint[count]) return 0;

			// Beyond the end of the Unicode code space.
			if (codePoint > 0x10FFFF) return 0;

			// Surrogates are reserved for UTF-16 and are never valid in UTF-8.
			if (codePoint >= 0xD800 && codePoint <= 0xDFFF) return 0;

			// We have a valid UTF-8 character, so count it
			countUTF8++;
		}

		// Advance to the next character to examine.
		p++;
	}

	// At this point it is either UTF-8 or 7-bit ascii
	return countUTF8 ? 1 : -1;
}
| bool handleToken | ( | SWBuf & | text, | |
| XMLTag | token | |||
| ) |
Definition at line 579 of file osis2mod.cpp.
00579 { 00580 00581 // Everything between the begin book tag and the first begin chapter tag is inBookIntro 00582 static bool inBookIntro = false; 00583 00584 // Everything between the begin chapter tag and the first begin verse tag is inChapterIntro 00585 static bool inChapterIntro = false; 00586 00587 // Flags indicating whether we are processing the content of a chapter 00588 static bool inChapter = false; 00589 00590 // Flags indicating whether we are processing the content of a verse 00591 static bool inVerse = false; 00592 00593 // Flags indicating whether we are processing the content of to be prepended to a verse 00594 static bool inPreVerse = false; 00595 static int genID = 1; 00596 00597 // Flag indicating whether we are in "Words of Christ" 00598 static bool inWOC = false; 00599 // Tag for WOC quotes within a verse 00600 static XMLTag wocTag = "<q who=\"Jesus\" marker=\"\">"; 00601 00602 // Flag used to indicate where useful text begins 00603 static bool firstDiv = false; 00604 static bool headerEnded = false; 00605 00606 // Retain the sID of book, chapter and verse (commentary) divs so that we can find them again. 00607 // This relies on transformBSP. 00608 static SWBuf sidBook = ""; 00609 static SWBuf sidChapter = ""; 00610 static SWBuf sidVerse = ""; 00611 00612 // Stack of quote elements used to handle Words of Christ 00613 static std::stack<XMLTag> quoteStack; 00614 00615 // Stack of elements used to validate that books, chapters and verses are well-formed 00616 // This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse 00617 // to be begin and end tags, too. 
00618 // It is an error if books and chapters are not well formed (though not required by OSIS) 00619 // It is a warning that verses are not well formed (because some clients are not ready) 00620 static std::stack<XMLTag> tagStack; 00621 00622 // The following are used to validate well-formedness 00623 static int chapterDepth = 0; 00624 static int bookDepth = 0; 00625 static int verseDepth = 0; 00626 00627 int tagDepth = tagStack.size(); 00628 SWBuf tokenName = token.getName(); 00629 bool isEndTag = token.isEndTag() || token.getAttribute("eID"); 00630 SWBuf typeAttr = token.getAttribute("type"); 00631 SWBuf eidAttr = token.getAttribute("eID"); 00632 00633 // process start tags 00634 if (!isEndTag) { 00635 00636 // Remember non-empty start tags 00637 if (!token.isEmpty()) { 00638 tagStack.push(token); 00639 00640 if (debug & DEBUG_STACK) { 00641 cout << "DEBUG(STACK): " << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl; 00642 } 00643 } 00644 00645 // throw away everything up to the first div (that is outside the header) 00646 if (!firstDiv) { 00647 if (headerEnded && (tokenName == "div")) { 00648 if (debug & DEBUG_OTHER) { 00649 cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl; 00650 } 00651 00652 // TODO: Save off the content to use it to suggest the module's conf. 00653 firstDiv = true; 00654 text = ""; 00655 } 00656 else { 00657 // Collect the content so it can be used to suggest the module's conf. 
00658 return false; 00659 } 00660 } 00661 00662 //-- WITH osisID OR annotateRef ------------------------------------------------------------------------- 00663 // Handle Book, Chapter, and Verse (or commentary equivalent) 00664 if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) { 00665 00666 // BOOK START, <div type="book" ...> 00667 if (tokenName == "div" && typeAttr == "book") { 00668 if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case 00669 00670 if (debug & DEBUG_TITLE) { 00671 cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl; 00672 cout << "\tinChapterIntro = " << inChapterIntro << endl; 00673 cout << "\tinBookIntro = " << inBookIntro << endl; 00674 } 00675 00676 currentVerse.setTestament(0); 00677 currentVerse.setBook(0); 00678 currentVerse.setChapter(0); 00679 currentVerse.setVerse(0); 00680 writeEntry(text); 00681 } 00682 currentVerse = token.getAttribute("osisID"); 00683 currentVerse.setChapter(0); 00684 currentVerse.setVerse(0); 00685 strcpy(currentOsisID, currentVerse.getOSISRef()); 00686 00687 sidBook = token.getAttribute("sID"); 00688 inChapter = false; 00689 inVerse = false; 00690 inPreVerse = false; 00691 inBookIntro = true; 00692 inChapterIntro = false; 00693 00694 if (debug & DEBUG_TITLE) { 00695 cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for book introduction" << endl; 00696 } 00697 00698 bookDepth = tagStack.size(); 00699 chapterDepth = 0; 00700 verseDepth = 0; 00701 00702 inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID")); 00703 if (!inCanonicalOSISBook) { 00704 cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl; 00705 } 00706 else if (debug & DEBUG_OTHER) { 00707 cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl; 00708 } 00709 00710 return false; 00711 } 00712 00713 // CHAPTER START, <chapter> or <div type="chapter" ...> 00714 if 
((tokenName == "chapter") || 00715 (tokenName == "div" && typeAttr == "chapter") 00716 ) { 00717 if (inBookIntro) { 00718 if (debug & DEBUG_TITLE) { 00719 cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl; 00720 } 00721 00722 writeEntry(text); 00723 } 00724 00725 currentVerse = token.getAttribute("osisID"); 00726 currentVerse.setVerse(0); 00727 00728 if (debug & DEBUG_OTHER) { 00729 cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << token.getAttribute("osisID") << ")" << endl; 00730 } 00731 00732 strcpy(currentOsisID, currentVerse.getOSISRef()); 00733 00734 sidChapter = token.getAttribute("sID"); 00735 inChapter = true; 00736 inVerse = false; 00737 inPreVerse = false; 00738 inBookIntro = false; 00739 inChapterIntro = true; 00740 00741 if (debug & DEBUG_TITLE) { 00742 cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl; 00743 } 00744 00745 chapterDepth = tagStack.size(); 00746 verseDepth = 0; 00747 00748 return false; 00749 } 00750 00751 // VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...> 00752 if ((tokenName == "verse") || 00753 (tokenName == "div" && token.getAttribute("annotateType")) 00754 ) { 00755 if (debug & DEBUG_OTHER) { 00756 cout << "DEBUG(FOUND): Entering verse" << endl; 00757 } 00758 00759 if (inChapterIntro) { 00760 if (debug & DEBUG_TITLE) { 00761 cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl; 00762 } 00763 00764 if (text.length()) { 00765 if (debug & DEBUG_TITLE) { 00766 cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl; 00767 } 00768 00769 writeEntry(text); 00770 } 00771 } 00772 00773 // Did we have pre-verse material that needs to be marked? 
00774 if (inPreVerse) { 00775 char genBuf[200]; 00776 sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++); 00777 text.append(genBuf); 00778 } 00779 00780 // Get osisID for verse or annotateRef for commentary 00781 SWBuf keyVal = token.getAttribute(tokenName == "verse" ? "osisID" : "annotateRef"); 00782 00783 // Massage the key into a form that parseVerseList can accept 00784 prepareSWVerseKey(keyVal); 00785 00786 // The osisID or annotateRef can be more than a single verse 00787 // The first or only one is the currentVerse 00788 // Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing. 00789 // This should never happen if the references are valid OSIS references 00790 ListKey verseKeys = currentVerse.parseVerseList(keyVal, currentVerse, true); 00791 int memberKeyCount = verseKeys.Count(); 00792 if (memberKeyCount) { 00793 currentVerse = verseKeys.getElement(0); 00794 // See if this osisID or annotateRef refers to more than one verse. 00795 // If it does, save it until all verses have been seen. 00796 // At that point we will output links. 00797 // This can be done by incrementing, which will produce an error 00798 // if there is only one verse. 00799 verseKeys.setPosition(TOP); 00800 verseKeys.increment(1); 00801 if (!verseKeys.popError()) { 00802 linkedVerses.push_back(verseKeys); 00803 } 00804 } 00805 else { 00806 cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute((tokenName == "verse") ? 
"osisID" : "annotateRef") << endl; 00807 } 00808 00809 strcpy(currentOsisID, currentVerse.getOSISRef()); 00810 00811 if (debug & DEBUG_OTHER) { 00812 cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl; 00813 cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl; 00814 } 00815 00816 sidVerse = token.getAttribute("sID"); 00817 inVerse = true; 00818 inPreVerse = false; 00819 inBookIntro = false; 00820 inChapterIntro = false; 00821 verseDepth = tagStack.size(); 00822 00823 // Include the token if it is not a verse 00824 if (tokenName != "verse") { 00825 text.append(token); 00826 } 00827 else if (debug & DEBUG_VERSE) 00828 { 00829 // transform the verse into a milestone 00830 XMLTag t = "<milestone resp=\"v\" />"; 00831 // copy all the attributes of the verse element to the milestone 00832 StringList attrNames = token.getAttributeNames(); 00833 for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { 00834 const char* attr = (*loop).c_str(); 00835 t.setAttribute(attr, token.getAttribute(attr)); 00836 } 00837 text.append(t); 00838 } 00839 00840 if (inWOC) { 00841 text.append(wocTag); 00842 } 00843 return true; 00844 } 00845 } // done with Handle Book, Chapter, and Verse (or commentary equivalent) 00846 00847 // Now consider everything else. 00848 00849 // Handle WOC quotes. 00850 // Note this requires transformBSP to make them into milestones 00851 // Otherwise have to do it here 00852 if (tokenName == "q") { 00853 quoteStack.push(token); 00854 00855 if (debug & DEBUG_QUOTE) { 00856 cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl; 00857 } 00858 00859 if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { 00860 inWOC = true; 00861 00862 // Output per verse WOC markup. 00863 text.append(wocTag); 00864 00865 // Output the quotation mark if appropriate, inside the WOC. 
00866 // If there is no marker attribute, let the SWORD engine manufacture one. 00867 // If there is a marker attribute and it has content, then output that. 00868 // If the marker attribute is present and empty, then there is nothing to do. 00869 // And have it within the WOC markup 00870 if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { 00871 token.setAttribute("who", 0); // remove the who="Jesus" 00872 text.append(token); 00873 } 00874 return true; 00875 } 00876 return false; 00877 } 00878 00879 // Have we found the start of pre-verse material? 00880 // Pre-verse material follows the following rules 00881 // 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book. 00882 // 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter 00883 // and the first verse of the chapter. 00884 // A <div> with a type of section will be taken as surrounding verses. 00885 // A <title> of type other than main, chapter or sub, will be taken as a title for the verse. 00886 // Once one of these conditions is met, the division between chapter introduction and pre-verse is set. 00887 // 3) Between verses, the material is split between the prior verse and the next verse. 00888 // Basically, while end and empty tags are found, they belong to the prior verse. 00889 // Once a begin tag is found, it belongs to the next verse. 
00890 if (!inPreVerse && !inBookIntro) { 00891 if (inChapterIntro) { 00892 // Determine when we are no longer in a chapter heading, but in pre-verse material: 00893 // If we see one of the following: 00894 // a section div 00895 // a title that is not main, chapter or sub or unclassified (no type attribute) 00896 if ((tokenName == "div" && typeAttr == "section") || 00897 (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub") 00898 ) { 00899 if (debug & DEBUG_TITLE) { 00900 cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl; 00901 } 00902 00903 if (text.length()) { 00904 if (debug & DEBUG_TITLE) { 00905 cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl; 00906 } 00907 00908 // Since we have found the boundary, we need to write out the chapter heading 00909 writeEntry(text); 00910 } 00911 // And we are no longer in the chapter heading 00912 inChapterIntro = false; 00913 // But rather, we are now in pre-verse material 00914 inPreVerse = true; 00915 } 00916 } 00917 else if (!inVerse && inChapter) { 00918 inPreVerse = true; 00919 } 00920 00921 if (inPreVerse) { 00922 char genBuf[200]; 00923 sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID); 00924 text.append(genBuf); 00925 } 00926 } 00927 00928 if (debug & DEBUG_INTERVERSE) { 00929 if (!inVerse && !inBookIntro && !inChapterIntro) { 00930 cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl; 00931 } 00932 } 00933 00934 return false; 00935 } // Done with procesing start and empty tags 00936 00937 // Process end tags 00938 else { 00939 00940 if (tagStack.empty()) { 00941 cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl; 00942 exit(EXIT_BAD_NESTING); 00943 } 00944 00945 // Note: empty end tags have the eID attribute 00946 if (!token.isEmpty()) { 00947 XMLTag topToken 
= tagStack.top(); 00948 tagDepth = tagStack.size(); 00949 00950 if (debug & DEBUG_STACK) { 00951 cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl; 00952 } 00953 00954 tagStack.pop(); 00955 00956 if (tokenName != topToken.getName()) { 00957 cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl; 00958 // exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here. 00959 // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway. 00960 // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting. 00961 } 00962 } 00963 00964 // We haven't seen the first div outside the header so there is little to do. 00965 if (!firstDiv) { 00966 if (tokenName == "header") { 00967 headerEnded = true; 00968 00969 if (debug & DEBUG_OTHER) { 00970 cout << "DEBUG(FOUND): End of header found" << endl; 00971 } 00972 } 00973 00974 // Collect the content so it can be used to suggest the module's conf. 00975 return false; 00976 } 00977 00978 // VERSE and COMMENTARY END 00979 if ((tokenName == "verse") || 00980 (tokenName == "div" && eidAttr == sidVerse) 00981 ) { 00982 00983 if (tagDepth != verseDepth) { 00984 cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl; 00985 } 00986 00987 // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. 
00988 if (inWOC) { 00989 text.append("</q>"); 00990 } 00991 00992 00993 // Include the token if it is not a verse 00994 if (tokenName != "verse") { 00995 text.append(token); 00996 } 00997 else if (debug & DEBUG_VERSE) 00998 { 00999 // transform the verse into a milestone 01000 XMLTag t = "<milestone resp=\"v\" />"; 01001 // copy all the attributes of the verse element to the milestone 01002 StringList attrNames = token.getAttributeNames(); 01003 for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { 01004 const char* attr = (*loop).c_str(); 01005 t.setAttribute(attr, token.getAttribute(attr)); 01006 } 01007 text.append(t); 01008 } 01009 01010 writeEntry(text); 01011 01012 inVerse = false; 01013 inPreVerse = false; 01014 verseDepth = 0; 01015 01016 return true; 01017 } 01018 01019 // Handle WOC quotes. 01020 // Note this requires transformBSP to make them into milestones 01021 // Otherwise have to manage it here 01022 if (tokenName == "q") { 01023 XMLTag topToken = quoteStack.top(); 01024 01025 if (debug & DEBUG_QUOTE) { 01026 cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl; 01027 } 01028 01029 quoteStack.pop(); 01030 01031 // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC 01032 // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. 
01033 if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { 01034 01035 if (debug & DEBUG_QUOTE) { 01036 cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl; 01037 } 01038 01039 inWOC = false; 01040 const char *sID = topToken.getAttribute("sID"); 01041 const char *eID = token.getAttribute("eID"); 01042 if (!sID) { 01043 sID = ""; 01044 } 01045 if (!eID) { 01046 eID = ""; 01047 } 01048 if (strcmp(sID, eID)) { 01049 cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl; 01050 } 01051 01052 01053 // Output the quotation mark if appropriate, inside the WOC. 01054 // If there is no marker attribute, let the SWORD engine manufacture one. 01055 // If there is a marker attribute and it has content, then output that. 01056 // If the marker attribute is present and empty, then there is nothing to do. 01057 // And have it within the WOC markup 01058 if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { 01059 token.setAttribute("who", 0); // remove the who="Jesus" 01060 text.append(token); 01061 } 01062 01063 // Now close the WOC 01064 text.append("</q>"); 01065 return true; 01066 } 01067 return false; 01068 } 01069 01070 // Look for the end of document, book and chapter 01071 // Also for material that goes with last entry 01072 if (!inVerse && !inBookIntro && !inChapterIntro) { 01073 // Is this the end of a chapter. 
01074 if ((tokenName == "chapter") || 01075 (tokenName == "div" && eidAttr == sidChapter) 01076 ) { 01077 text.append(token); 01078 writeEntry(text); 01079 inChapter = false; 01080 sidChapter = ""; 01081 chapterDepth = 0; 01082 verseDepth = 0; 01083 return true; 01084 } 01085 01086 // Is it the end of a book 01087 if (tokenName == "div" && eidAttr == sidBook) { 01088 text.append(token); 01089 writeEntry(text); 01090 bookDepth = 0; 01091 chapterDepth = 0; 01092 verseDepth = 0; 01093 return true; 01094 } 01095 01096 // Do not include the end of an osis document 01097 if (tokenName == "osisText" || tokenName == "osis") { 01098 bookDepth = 0; 01099 chapterDepth = 0; 01100 verseDepth = 0; 01101 text = ""; 01102 return true; 01103 } 01104 01105 // When we are not inPreVerse, the interverse tags get appended to the preceeding verse. 01106 if (!inPreVerse) { 01107 text.append(token); 01108 writeEntry(text); 01109 01110 if (debug & DEBUG_INTERVERSE) { 01111 cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; 01112 } 01113 01114 return true; 01115 } 01116 01117 if (debug & DEBUG_INTERVERSE) { 01118 cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; 01119 } 01120 01121 return false; 01122 } 01123 01124 return false; 01125 } // done with Processing end tags 01126 01127 return false; 01128 }
| bool isOSISAbbrev | ( | const char * | buf | ) |
Definition at line 98 of file osis2mod.cpp.
00098 { 00099 VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr(); 00100 const VersificationMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem()); 00101 return av11n->getBookNumberByOSISName(buf) >= 0; 00102 }
| bool isValidRef | ( | const char * | buf | ) |
Determine whether a verse as given is valid for the versification. This is done by comparing the before and after of normalization.
Definition at line 349 of file osis2mod.cpp.
00349 { 00350 // Create a VerseKey that does not do auto normalization 00351 // Note: need to turn on headings so that a heading does not get normalized anyway 00352 // And set it to the reference under question 00353 VerseKey before; 00354 before.setVersificationSystem(currentVerse.getVersificationSystem()); 00355 before.setAutoNormalize(false); 00356 before.setIntros(true); 00357 before.setText(buf); 00358 00359 // If we are a heading we must bail 00360 // These will autonormalize to the last verse of the prior chapter 00361 if (!before.getTestament() || !before.getBook() || !before.getChapter() || !before.getVerse()) { 00362 return true; 00363 } 00364 00365 // Create a VerseKey that does do auto normalization 00366 // And set it to the reference under question 00367 VerseKey after; 00368 after.setVersificationSystem(currentVerse.getVersificationSystem()); 00369 after.setAutoNormalize(true); 00370 after.setText(buf); 00371 00372 if (before == after) 00373 { 00374 return true; 00375 } 00376 00377 // If we have gotten here the reference is not in the selected versification. 00378 // cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl; 00379 if (debug & DEBUG_REV11N) { 00380 cout << "DEBUG(V11N): " << before << " normalizes to " << after << endl; 00381 } 00382 00383 return false; 00384 }
| void linkToEntry | ( | VerseKey & | linkKey, | |
| VerseKey & | dest | |||
| ) |
Definition at line 557 of file osis2mod.cpp.
00557 { 00558 00559 // Only link verses that are in the versification. 00560 if (!isValidRef(linkKey)) { 00561 return; 00562 } 00563 00564 VerseKey saveKey; 00565 saveKey.setVersificationSystem(currentVerse.getVersificationSystem()); 00566 saveKey.setAutoNormalize(0); 00567 saveKey.setIntros(1); 00568 saveKey = currentVerse; 00569 currentVerse = linkKey; 00570 00571 cout << "INFO(LINK): Linking " << currentVerse.getOSISRef() << " to " << dest.getOSISRef() << "\n"; 00572 module->linkEntry(&dest); 00573 00574 currentVerse = saveKey; 00575 }
/**
 * Entry point of osis2mod: parse the command line, create or open the
 * module at the given path and feed the OSIS document through processOSIS().
 *
 * Positional arguments are argv[1] (module path) and argv[2] (OSIS document,
 * or "-" for standard input). The options recognized in the loop below are
 * -a, -z, -Z, -b <2|3|4>, -N, -c <cipher_key>, -v <v11n>, -s <2|4>, -C and
 * -d <flags>.
 *
 * The process leaves through exit(): EXIT_NO_CREATE, EXIT_NO_WRITE and
 * EXIT_NO_READ on the failures handled here, 0 on success.
 * NOTE(review): the code relies on usage() not returning (argv[1] and argv[2]
 * are read right after the argc check) -- confirm against usage().
 *
 * Originally defined at line 1527 of osis2mod.cpp.
 *
 * @param argc number of command line arguments
 * @param argv the command line arguments
 */
int main(int argc, char **argv) {

	fprintf(stderr, "You are running osis2mod: $Rev: 2783 $\n");

	// Let's test our command line arguments
	if (argc < 3) {
		usage(*argv);
	}

	// variables for arguments, holding defaults
	const char* program = argv[0];
	const char* path = argv[1];
	const char* osisDoc = argv[2];
	int append = 0;
	SWBuf compType = "";
	bool isCommentary = false;
	int iType = 4;
	int entrySize = 0;
	SWBuf cipherKey = "";
	SWCompress *compressor = 0;

	// Options start after the two positional arguments.
	for (int i = 3; i < argc; i++) {
		if (!strcmp(argv[i], "-a")) {
			append = 1;
		}
		else if (!strcmp(argv[i], "-z")) {
			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
			if (entrySize) usage(*argv, "Cannot specify both -z and -s");
			compType = "ZIP";
		}
		else if (!strcmp(argv[i], "-Z")) {
			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
			if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
			compType = "LZSS";
		}
		else if (!strcmp(argv[i], "-b")) {
			if (i+1 < argc) {
				iType = atoi(argv[++i]);
				// A value in range is accepted; anything else falls through to usage().
				if ((iType >= 2) && (iType <= 4)) continue;
			}
			usage(*argv, "-b requires one of <2|3|4>");
		}
		else if (!strcmp(argv[i], "-N")) {
			normalize = false;
		}
		else if (!strcmp(argv[i], "-c")) {
			if (i+1 < argc) cipherKey = argv[++i];
			else usage(*argv, "-c requires <cipher_key>");
		}
		else if (!strcmp(argv[i], "-v")) {
			if (i+1 < argc) v11n = argv[++i];
			else usage(*argv, "-v requires <v11n>");
		}
		else if (!strcmp(argv[i], "-s")) {
			if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z");
			if (i+1 < argc) {
				entrySize = atoi(argv[++i]);
				// A supported size is accepted; anything else falls through to usage().
				if (entrySize == 2 || entrySize == 4) {
					continue;
				}
			}
			usage(*argv, "-s requires one of <2|4>");
		}
		else if (!strcmp(argv[i], "-C")) {
			isCommentary = true;
		}
		else if (!strcmp(argv[i], "-d")) {
			// Debug flags accumulate: each -d is OR-ed into the current value.
			if (i+1 < argc) debug |= atoi(argv[++i]);
			else usage(*argv, "-d requires <flags>");
		}
		else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
	}

	if (isCommentary) isCommentary = true;	// avoid unused warning for now

	// Pick the compressor that matches the requested compression type, if any.
	if (compType == "ZIP") {
#ifndef EXCLUDEZLIB
		compressor = new ZipCompress();
#else
		usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library");
#endif
	}
	else if (compType == "LZSS") {
		compressor = new LZSSCompress();
	}

	// Without ICU support normalization is switched off and a warning is printed.
#ifndef _ICU_
	if (normalize) {
		normalize = false;
		cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl;
	}
#endif

	// Note: the value printed under the "create:" label is the append flag.
	if (debug & DEBUG_OTHER) {
		cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
	}

	if (!append) {		// == 0 then create module
	// Try to initialize a default set of datafiles and indices at our
	// datapath location passed to us from the user.
		if (compressor) {
			if (zText::createModule(path, iType, v11n)) {
				fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
				exit(EXIT_NO_CREATE);
			}
		}
		else if (entrySize == 4) {
			if (RawText4::createModule(path, v11n)) {
				fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
				exit(EXIT_NO_CREATE);
			}
		}
		else {
			if (RawText::createModule(path, v11n)) {
				fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
				exit(EXIT_NO_CREATE);
			}
		}
	}

	// Do some initialization stuff
	// The same three-way choice as above selects the module class to open.
	if (compressor) {
		// Create a compressed text module allowing very large entries
		// Taking defaults except for first, fourth, fifth and last argument
		module = new zText(
				path,		// ipath
				0,		// iname
				0,		// idesc
				iType,		// iblockType
				compressor,	// icomp
				0,		// idisp
				ENC_UNKNOWN,	// enc
				DIRECTION_LTR,	// dir
				FMT_UNKNOWN,	// markup
				0,		// lang
				v11n		// versification
		       );
	}
	else if (entrySize == 4) {
		// Create a raw text module allowing very large entries
		// Taking defaults except for first and last argument
		module = new RawText4(
				path,		// ipath
				0,		// iname
				0,		// idesc
				0,		// idisp
				ENC_UNKNOWN,	// encoding
				DIRECTION_LTR,	// dir
				FMT_UNKNOWN,	// markup
				0,		// ilang
				v11n		// versification
			);
	}
	else {
		// Create a raw text module allowing reasonable sized entries
		// Taking defaults except for first and last argument
		module = new RawText(
				path,		// ipath
				0,		// iname
				0,		// idesc
				0,		// idisp
				ENC_UNKNOWN,	// encoding
				DIRECTION_LTR,	// dir
				FMT_UNKNOWN,	// markup
				0,		// ilang
				v11n		// versification
			);
	}

	SWFilter *cipherFilter = 0;

	// A cipher key on the command line installs a cipher filter on the module.
	if (cipherKey.length()) {
		fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
		cipherFilter = new CipherFilter(cipherKey.c_str());
		module->addRawFilter(cipherFilter);
	}

	if (!module->isWritable()) {
		fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
		exit(EXIT_NO_WRITE);
	}

	// Either read from std::cin (aka stdin), when the argument is a '-'
	// or from a specified file.
	if (!strcmp(osisDoc, "-")) {
		processOSIS(cin);
	}
	else {
		// Let's see if we can open our input file
		ifstream infile(osisDoc);
		if (infile.fail()) {
			fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
			exit(EXIT_NO_READ);
		}
		processOSIS(infile);
		infile.close();
	}

	// The module is deleted before the filter that was added to it.
	delete module;
	if (cipherFilter)
		delete cipherFilter;

	fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
	exit(0); // success
}
| void makeValidRef | ( | VerseKey & | key | ) |
This routine is used to ensure that all the text in the input is saved to the module. Assumption: The input orders all the verses for a chapter in numerical order. Thus, any verses that are not in the chosen versification (v11n) follow those that are.
The prior implementation of this adjusted the verse to the last one that is in the chosen v11n. If the chapter were extra, then it is appended to the last verse of the last chapter in the chosen v11n for that book. If it is just extra verses for a chapter, then it is appended to the last verse of the chapter.
The problem with this is when an OSIS verse refers to more than one verse, e.g. osisID="Gen.1.29 Gen.1.30 Gen.1.31" (Gen.1.31 is the last verse of the chapter in the chosen v11n) and then it is followed by Gen.1.32.
This routine assumes that linking is postponed to the end so that in the example Gen.1.30-31 are not linked but rather empty. This routine will then find the last verse in the computed chapter that has content.
Alternatively, we could have done linking as we went, but then this routine would have needed to find the first entry in the link set. Elsewhere in the code, whenever text is appended to a verse, that verse would have needed to be checked for adjacent links, and those links would have needed to be adjusted.
param key the key that may need to be adjusted
Definition at line 410 of file osis2mod.cpp.
00410 { 00411 VerseKey saveKey; 00412 saveKey.setVersificationSystem(currentVerse.getVersificationSystem()); 00413 saveKey.setAutoNormalize(false); 00414 saveKey.setIntros(true); 00415 saveKey = currentVerse; 00416 00417 // Since isValidRef returned false constrain the key to the nearest prior reference. 00418 // If we are past the last chapter set the reference to the last chapter 00419 int chapterMax = key.getChapterMax(); 00420 if (key.getChapter() > chapterMax) { 00421 key.setChapter(chapterMax); 00422 } 00423 00424 // Either we set the chapter to the last chapter and now need to set to the last verse in the chapter 00425 // Or the verse is beyond the end of the chapter. 00426 // In any case we need to constrain the verse to it's chapter. 00427 int verseMax = key.getVerseMax(); 00428 key.setVerse(verseMax); 00429 00430 if (debug & DEBUG_REV11N) { 00431 cout << "DEBUG(V11N) Chapter max:" << chapterMax << ", Verse Max:" << verseMax << endl; 00432 } 00433 00434 // There are three cases we want to handle: 00435 // In the examples we are using the KJV versification where the last verse of Matt.7 is Matt.7.29. 00436 // In each of these cases the out-of-versification, extra verse is Matt.7.30. 00437 // 1) The "extra" verse follows the last verse in the chapter. 00438 // <verse osisID="Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse> 00439 // In this case re-versify Matt.7.30 as Matt.7.29. 00440 // 00441 // 2) The "extra" verse follows a range (a set of linked verses). 00442 // <verse osisID="Matt.7.28-Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse> 00443 // In this case, re-versify Matt.7.30 as Matt.7.28, the first verse in the linked set. 00444 // Since we are post-poning linking, we want to re-reversify to the last entry in the module. 00445 // 00446 // 3) The last verse in the chapter is not in the input. There may be other verses missing as well. 
00447 // <verse osisID="Matt.7.8">...</verse><verse osisID="Matt.7.30">...</verse> 00448 // In this case we should re-versify Matt.7.30 as Matt.7.29. 00449 // However, since this and 2) are ambiguous, we'll re-reversify to the last entry in the module. 00450 00451 while (!key.popError() && !module->hasEntry(&key)) { 00452 key.decrement(1); 00453 } 00454 00455 cout << "INFO(V11N): " << saveKey.getOSISRef() 00456 << " is not in the " << key.getVersificationSystem() 00457 << " versification. Appending content to " << key.getOSISRef() << endl; 00458 }
| void prepareSWText | ( | const char * | osisID, | |
| SWBuf & | text | |||
| ) |
Definition at line 177 of file osis2mod.cpp.
00178 { 00179 // Always check on UTF8 and report on non-UTF8 entries 00180 int utf8State = detectUTF8(text.c_str()); 00181 00182 // Trust, but verify. 00183 if (!normalize && !utf8State) { 00184 cout << "WARNING(UTF8): " << osisID << ": Should be converted to UTF-8 (" << text << ")" << endl; 00185 } 00186 00187 #ifdef _ICU_ 00188 if (normalize) { 00189 // Don't need to normalize text that is ASCII 00190 // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8 00191 if (!utf8State) { 00192 cout << "INFO(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl; 00193 converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks 00194 converted++; 00195 00196 // Prepare for double check. This probably can be removed. 00197 // But for now we are running the check again. 00198 // This is to determine whether we need to normalize output of the conversion. 00199 utf8State = detectUTF8(text.c_str()); 00200 } 00201 00202 // Double check. This probably can be removed. 00203 if (!utf8State) { 00204 cout << "ERROR(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl; 00205 } 00206 00207 if (utf8State > 0) { 00208 SWBuf before = text; 00209 normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks 00210 if (before != text) { 00211 normalized++; 00212 } 00213 } 00214 } 00215 #endif 00216 }
| void prepareSWVerseKey | ( | SWBuf & | buf | ) |
Definition at line 230 of file osis2mod.cpp.
00230 { 00231 // This routine modifies the buf in place 00232 char* s = buf.getRawData(); 00233 char* p = s; 00234 bool inRange = false; 00235 while (*p) { 00236 if (inRange) { 00237 if (debug & DEBUG_REF) { 00238 cout << "DEBUG(REF): Copy range marker:" << *p << endl;; 00239 } 00240 00241 // Range markers are copied as is 00242 *s++ = *p++; 00243 } 00244 00245 // Look ahead to see if we are in a work prefix 00246 // but don't look past an osisID 00247 char *n = p; 00248 while (*n && *n != ':' && *n != ' ' && *n != '-') { 00249 n++; 00250 } 00251 00252 // We have found a work prefix 00253 if (*n == ':') { 00254 // set p to skip the work prefix 00255 p = n + 1; 00256 00257 if (debug & DEBUG_REF) { 00258 cout << "DEBUG(REF): Found a work prefix "; 00259 for (char *x = s; x <= n; x++) { 00260 cout << *x; 00261 } 00262 cout << endl; 00263 } 00264 } 00265 00266 // Now we are in the meat of an osisID. 00267 // Copy it to its end but stop on a grain marker of '!' 00268 if (debug & DEBUG_REF) { 00269 cout << "DEBUG(REF): Copy osisID:"; 00270 } 00271 00272 while (*p && *p != '!' && *p != ' ' && *p != '-') { 00273 if (debug & DEBUG_REF) { 00274 cout << *p; 00275 } 00276 00277 *s++ = *p++; 00278 } 00279 00280 if (debug & DEBUG_REF) { 00281 cout << endl; 00282 } 00283 00284 // The ! 
and everything following until we hit 00285 // the end of the osisID is part of the grain reference 00286 if (*p == '!') { 00287 n = p; 00288 while (*n && *n != ' ' && *n != '-') { 00289 n++; 00290 } 00291 00292 if (debug & DEBUG_REF) { 00293 cout << "DEBUG(REF): Found a grain suffix "; 00294 for (char *x = p; x < n; x++) { 00295 cout << *x; 00296 } 00297 cout << endl; 00298 } 00299 00300 p = n; 00301 } 00302 00303 // At this point we have processed an osisID 00304 00305 // if we are not in a range and the next characer is a - 00306 // then we are entering a range 00307 inRange = !inRange && *p == '-'; 00308 00309 if (debug & DEBUG_REF) { 00310 if (inRange) { 00311 cout << "DEBUG(REF): Found a range" << endl; 00312 } 00313 } 00314 00315 // between ranges and stand alone osisIDs we might have whitespace 00316 if (!inRange && *p == ' ') { 00317 // skip this and subsequent spaces 00318 while (*p == ' ') { 00319 p++; 00320 } 00321 00322 // replacing them all with a ';' 00323 *s++ = ';'; 00324 00325 if (debug & DEBUG_REF) { 00326 cout << "DEBUG(REF): replacing space with ;. Remaining: " << p << endl; 00327 } 00328 } 00329 } 00330 00331 // Determine whether we have modified the buffer 00332 // We have modified the buffer if s is not sitting on the null byte of the original 00333 if (*s) { 00334 // null terminate the reference 00335 *s = '\0'; 00336 // Since we modified the swbuf, we need to tell it what we have done 00337 buf.setSize(s - buf.c_str()); 00338 00339 if (debug & DEBUG_REF) { 00340 cout << "DEBUG(REF): shortended keyVal to`" << buf.c_str() << "`"<< endl; 00341 } 00342 } 00343 }
| void processOSIS | ( | istream & | infile | ) |
Definition at line 1328 of file osis2mod.cpp.
01328 { 01329 typedef enum { 01330 CS_NOT_IN_COMMENT, // or seen starting "<" 01331 CS_SEEN_STARTING_EXCLAMATION, 01332 CS_SEEN_STARTING_HYPHEN, 01333 CS_IN_COMMENT, 01334 CS_SEEN_ENDING_HYPHEN, 01335 CS_SEEN_SECOND_ENDING_HYPHEN, 01336 CS_SEEN_ENDING_GREATER_THAN 01337 } t_commentstate; 01338 01339 activeOsisID[0] = '\0'; 01340 01341 strcpy(currentOsisID,"N/A"); 01342 01343 currentVerse.setVersificationSystem(v11n); 01344 currentVerse.setAutoNormalize(false); 01345 currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings 01346 currentVerse.setPersist(true); 01347 01348 module->setKey(currentVerse); 01349 module->setPosition(TOP); 01350 01351 SWBuf token; 01352 SWBuf text; 01353 bool incomment = false; 01354 t_commentstate commentstate = CS_NOT_IN_COMMENT; 01355 bool intoken = false; 01356 bool inWhitespace = false; 01357 bool seeingSpace = false; 01358 unsigned char curChar = '\0'; 01359 01360 while (infile.good()) { 01361 01362 int possibleChar = infile.get(); 01363 01364 // skip the character if it is bad. infile.good() will catch the problem 01365 if (possibleChar == -1) { 01366 continue; 01367 } 01368 01369 curChar = (unsigned char) possibleChar; 01370 01371 // All newlines are simply whitespace 01372 // Does a SWORD module actually require this? 
01373 if (curChar == '\n') { 01374 curChar = ' '; 01375 } 01376 01377 if (!intoken && curChar == '<') { 01378 intoken = true; 01379 token = "<"; 01380 continue; 01381 } 01382 01383 // Handle XML comments starting with "<!--", ending with "-->" 01384 01385 if (intoken && !incomment) { 01386 switch (commentstate) { 01387 case CS_NOT_IN_COMMENT : 01388 if (curChar == '!') { 01389 commentstate = CS_SEEN_STARTING_EXCLAMATION; 01390 token.append((char) curChar); 01391 continue; 01392 } else { 01393 break; 01394 } 01395 01396 case CS_SEEN_STARTING_EXCLAMATION : 01397 if (curChar == '-') { 01398 commentstate = CS_SEEN_STARTING_HYPHEN; 01399 token.append((char) curChar); 01400 continue; 01401 } else { 01402 commentstate = CS_NOT_IN_COMMENT; 01403 break; 01404 } 01405 01406 case CS_SEEN_STARTING_HYPHEN : 01407 if (curChar == '-') { 01408 incomment = true; 01409 commentstate = CS_IN_COMMENT; 01410 token.append((char) curChar); 01411 01412 if (debug & DEBUG_OTHER) { 01413 cout << "DEBUG(COMMENTS): in comment" << endl; 01414 } 01415 01416 continue; 01417 } else { 01418 commentstate = CS_NOT_IN_COMMENT; 01419 break; 01420 } 01421 01422 default: 01423 cout << "FATAL(COMMENTS): unknown commentstate on comment start: " << commentstate << endl; 01424 exit(EXIT_BAD_NESTING); 01425 } 01426 } 01427 01428 if (incomment) { 01429 switch (commentstate) { 01430 case CS_IN_COMMENT: 01431 if (curChar == '-') { 01432 commentstate = CS_SEEN_ENDING_HYPHEN; 01433 continue; 01434 } else { 01435 // ignore the character 01436 continue; 01437 } 01438 01439 case CS_SEEN_ENDING_HYPHEN : 01440 if (curChar == '-') { 01441 commentstate = CS_SEEN_SECOND_ENDING_HYPHEN; 01442 continue; 01443 } else { 01444 // ignore character 01445 commentstate = CS_IN_COMMENT; 01446 continue; 01447 } 01448 01449 case CS_SEEN_SECOND_ENDING_HYPHEN : 01450 if (curChar == '>') { 01451 intoken = false; 01452 incomment = false; 01453 commentstate = CS_NOT_IN_COMMENT; 01454 01455 if (debug & DEBUG_OTHER) { 01456 cout << 
"DEBUG(COMMENTS): out of comment" << endl; 01457 } 01458 01459 continue; 01460 } else { 01461 // ignore character 01462 commentstate = CS_IN_COMMENT; 01463 continue; 01464 } 01465 01466 default: 01467 cout << "FATAL(COMMENTS): unknown commentstate on comment end: " << commentstate << endl; 01468 exit(EXIT_BAD_NESTING); 01469 } 01470 } 01471 01472 // Outside of tokens merge adjacent whitespace 01473 if (!intoken) { 01474 seeingSpace = isspace(curChar)!=0; 01475 if (seeingSpace) { 01476 if (inWhitespace) { 01477 continue; 01478 } 01479 // convert all whitespace to blanks 01480 curChar = ' '; 01481 } 01482 inWhitespace = seeingSpace; 01483 } 01484 01485 if (intoken && curChar == '>') { 01486 intoken = false; 01487 inWhitespace = false; 01488 token.append('>'); 01489 // take this isalpha if out to check for bugs in text 01490 if (isalpha(token[1]) || 01491 (((token[1] == '/') || (token[1] == '?')) && isalpha(token[2]))) { 01492 //cout << "Handle:" << token.c_str() << endl; 01493 XMLTag t = transformBSP(token.c_str()); 01494 01495 if (!handleToken(text, t)) { 01496 text.append(t); 01497 } 01498 } else { 01499 cout << "WARNING(PARSE): malformed token: " << token << endl; 01500 } 01501 continue; 01502 } 01503 01504 if (intoken) { 01505 token.append((char) curChar); 01506 } 01507 else { 01508 switch (curChar) { 01509 case '>' : text.append(">"); break; 01510 case '<' : text.append("<"); break; 01511 default : text.append((char) curChar); break; 01512 } 01513 } 01514 } 01515 01516 // Force the last entry from the text buffer. 01517 text = ""; 01518 writeEntry(text, true); 01519 writeLinks(); 01520 01521 #ifdef _ICU_ 01522 if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted); 01523 if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized); 01524 #endif 01525 }
Support normalizations necessary for a SWORD module. OSIS allows for document structure (Book, Section, Paragraph or BSP) to overlap Bible versification (Book, Chapter, Verse). Most SWORD applications need to display verses in isolation or in HTML table cells, requiring each stored entry (i.e. verses) to be well-formed xml. This routine normalizes container elements which could cross verse boundaries into milestones. For most of these OSIS elements, there is a milestone form. However, p is not milestoneable. For this reason, p is transformed into lb elements. param t the tag to transform return the transformed tag or the original one
Definition at line 1142 of file osis2mod.cpp.
01142 { 01143 static std::stack<XMLTag> bspTagStack; 01144 static int sID = 1; 01145 char buf[11]; 01146 01147 // Support simplification transformations 01148 if (t.isEmpty()) { 01149 01150 if (debug & DEBUG_XFORM) { 01151 cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl; 01152 } 01153 01154 return t; 01155 } 01156 01157 SWBuf tagName = t.getName(); 01158 if (!t.isEndTag()) { 01159 // Transform <p> into <div type="paragraph"> and milestone it 01160 if (tagName == "p") { 01161 t.setText("<div type=\"paragraph\" />"); 01162 sprintf(buf, "gen%d", sID++); 01163 t.setAttribute("sID", buf); 01164 } 01165 01166 // Transform <tag> into <tag sID="">, where tag is a milestoneable element. 01167 // The following containers are milestoneable. 01168 // abbr, closer, div, foreign, l, lg, salute, signed, speech 01169 // Leaving out: 01170 // abbr When would this ever cross a boundary? 01171 // seg as it is used for a divineName hack 01172 // foreign so that it can be easily italicized 01173 else if (tagName == "chapter" || 01174 tagName == "closer" || 01175 tagName == "div" || 01176 tagName == "l" || 01177 tagName == "lg" || 01178 tagName == "q" || 01179 tagName == "salute" || 01180 tagName == "signed" || 01181 tagName == "speech" || 01182 tagName == "verse" 01183 ) { 01184 t.setEmpty(true); 01185 sprintf(buf, "gen%d", sID++); 01186 t.setAttribute("sID", buf); 01187 } 01188 bspTagStack.push(t); 01189 01190 if (debug & DEBUG_XFORM) { 01191 cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl; 01192 XMLTag topToken = bspTagStack.top(); 01193 cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl; 01194 } 01195 } 01196 else { 01197 if (!bspTagStack.empty()) { 01198 XMLTag topToken = bspTagStack.top(); 01199 01200 if (debug & DEBUG_XFORM) { 01201 cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << 
bspTagStack.size() << ") " << topToken << endl; 01202 } 01203 01204 bspTagStack.pop(); 01205 01206 // Look for the milestoneable container tags handled above. 01207 if (tagName == "chapter" || 01208 tagName == "closer" || 01209 tagName == "div" || 01210 tagName == "l" || 01211 tagName == "lg" || 01212 tagName == "p" || 01213 tagName == "q" || 01214 tagName == "salute" || 01215 tagName == "signed" || 01216 tagName == "speech" || 01217 tagName == "verse" 01218 ) { 01219 // make this a clone of the start tag with sID changed to eID 01220 // Note: in the case of </p> the topToken is a <div type="paragraph"> 01221 t = topToken; 01222 t.setAttribute("eID", t.getAttribute("sID")); 01223 t.setAttribute("sID", 0); 01224 } 01225 } 01226 else { 01227 cout << "FATAL(TAGSTACK): " << currentOsisID << ": closing tag without opening tag" << endl; 01228 } 01229 } 01230 01231 return t; 01232 }
| void usage | ( | const char * | app, | |
| const char * | error = 0 | |||
| ) |
Definition at line 1275 of file osis2mod.cpp.
01275 { 01276 01277 if (error) fprintf(stderr, "\n%s: %s\n", app, error); 01278 01279 fprintf(stderr, "OSIS Bible/commentary module creation tool for The SWORD Project\n"); 01280 fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app); 01281 fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n"); 01282 fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to\n"); 01283 fprintf(stderr, "\t\t\t\t read from standard input\n"); 01284 fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n"); 01285 fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); 01286 fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); 01287 fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n"); 01288 fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n"); 01289 fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n"); 01290 fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); 01291 fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n"); 01292 fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n"); 01293 fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n"); 01294 fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n"); 01295 fprintf(stderr, " -s <2|4>\t\t bytes used to store entry size (default is 2).\n"); 01296 fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large\n"); 01297 fprintf(stderr, "\t\t\t\t entries in uncompressed modules\n"); 01298 fprintf(stderr, "\t\t\t\t (2 bytes to store size equal 65535 characters)\n"); 01299 fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n"); 01300 fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:\n"); 01301 VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr(); 01302 StringList av11n = 
vmgr->getVersificationSystems(); 01303 for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) { 01304 fprintf(stderr, "\t\t\t\t\t%s\n", (*loop).c_str()); 01305 } 01306 fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n"); 01307 fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n"); 01308 fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n"); 01309 fprintf(stderr, "\t\t\t\t\t0 - no debugging\n"); 01310 fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n"); 01311 fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n"); 01312 fprintf(stderr, "\t\t\t\t\t4 - quotes, esp. Words of Christ\n"); 01313 fprintf(stderr, "\t\t\t\t\t8 - titles\n"); 01314 fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n"); 01315 fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n"); 01316 fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n"); 01317 fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n"); 01318 fprintf(stderr, "\t\t\t\t\t256 - internal stack\n"); 01319 fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n"); 01320 fprintf(stderr, "\t\t\t\t This argument can be used more than once. (Or\n"); 01321 fprintf(stderr, "\t\t\t\t the flags may be added together.)\n"); 01322 fprintf(stderr, "\n"); 01323 fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n"); 01324 fprintf(stderr, "\n"); 01325 exit(EXIT_BAD_ARG); 01326 }
| void writeEntry | ( | SWBuf & | text, | |
| bool | force = false | |||
| ) |
Definition at line 460 of file osis2mod.cpp.
00460 { 00461 char keyOsisID[255]; 00462 00463 static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 2783 $\"/>"; 00464 static bool firstOT = true; 00465 static bool firstNT = true; 00466 00467 if (!inCanonicalOSISBook) { 00468 return; 00469 } 00470 00471 strcpy(keyOsisID, currentVerse.getOSISRef()); 00472 00473 // set keyOsisID to anything that an osisID cannot be. 00474 if (force) { 00475 strcpy(keyOsisID, "-force"); 00476 } 00477 00478 static VerseKey lastKey; 00479 lastKey.setVersificationSystem(currentVerse.getVersificationSystem()); 00480 lastKey.setAutoNormalize(0); 00481 lastKey.setIntros(1); 00482 00483 VerseKey saveKey; 00484 saveKey.setVersificationSystem(currentVerse.getVersificationSystem()); 00485 saveKey.setAutoNormalize(0); 00486 saveKey.setIntros(1); 00487 saveKey = currentVerse; 00488 00489 // If we have seen a verse and the supplied one is different then we output the collected one. 00490 if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) { 00491 00492 if (!isValidRef(lastKey)) { 00493 makeValidRef(lastKey); 00494 } 00495 00496 currentVerse = lastKey; 00497 00498 prepareSWText(activeOsisID, activeVerseText); 00499 00500 // Put the revision into the module 00501 int testmt = currentVerse.getTestament(); 00502 if ((testmt == 1 && firstOT) || (testmt == 2 && firstNT)) { 00503 VerseKey t; 00504 t.setVersificationSystem(currentVerse.getVersificationSystem()); 00505 t.setAutoNormalize(0); 00506 t.setIntros(1); 00507 t = currentVerse; 00508 currentVerse.setBook(0); 00509 currentVerse.setChapter(0); 00510 currentVerse.setVerse(0); 00511 module->setEntry(revision); 00512 currentVerse = t; 00513 switch (testmt) { 00514 case 1: 00515 firstOT = false; 00516 break; 00517 case 2: 00518 firstNT = false; 00519 break; 00520 } 00521 } 00522 00523 // If the entry already exists, then append this entry to the text. 00524 // This is for verses that are outside the chosen versification. They are appended to the prior verse. 
00525 // The space should not be needed if we retained verse tags. 00526 SWBuf currentText = module->getRawEntry(); 00527 if (currentText.length()) { 00528 cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl; 00529 activeVerseText = currentText + " " + activeVerseText; 00530 } 00531 00532 if (debug & DEBUG_WRITE) { 00533 cout << "DEBUG(WRITE): " << activeOsisID << ":" << currentVerse.getOSISRef() << ": " << activeVerseText << endl; 00534 } 00535 00536 module->setEntry(activeVerseText); 00537 activeVerseText = ""; 00538 } 00539 00540 // The following is for initial verse content and for appending interverse content. 00541 if (activeVerseText.length()) { 00542 activeVerseText += text; 00543 } 00544 else { 00545 // Eliminate leading whitespace on the beginning of each verse 00546 text.trimStart(); 00547 activeVerseText = text; 00548 } 00549 // text has been consumed so clear it out. 00550 text = ""; 00551 00552 currentVerse = saveKey; 00553 lastKey = currentVerse; 00554 strcpy(activeOsisID, keyOsisID); 00555 }
| void writeLinks | ( | ) |
Write out all links in the module. Waiting is necessary because writeEntry might ultimately append text to a verse, moving its offset in the data file. While we are minimizing it by postponing the write until we have gathered the next verse, the following scenario is happening: A module is using linked verses and has some verses that are not in the chosen versification. If the out-of-canon verse happens following a linked verse, the out-of-canon verse is appended to the prior verse. Care has to be taken that the linked verses all point to the first of the set.
Definition at line 1246 of file osis2mod.cpp.
01247 { 01248 // Link all the verses 01249 VerseKey destKey; 01250 destKey.setVersificationSystem(currentVerse.getVersificationSystem()); 01251 destKey.setAutoNormalize(0); 01252 destKey.setIntros(1); 01253 01254 VerseKey linkKey; 01255 linkKey.setVersificationSystem(currentVerse.getVersificationSystem()); 01256 linkKey.setAutoNormalize(0); 01257 linkKey.setIntros(1); 01258 for (unsigned int i = 0; i < linkedVerses.size(); i++) { 01259 // The verseKeys is a list of verses 01260 // where the first is the real verse 01261 // and the others link to it. 01262 ListKey verseKeys = linkedVerses[i]; 01263 verseKeys.setPosition(TOP); 01264 destKey = verseKeys.getElement(); 01265 verseKeys.increment(1); 01266 01267 while (!verseKeys.popError()) { 01268 linkKey = verseKeys.getElement(); 01269 verseKeys.increment(1); 01270 linkToEntry(linkKey, destKey); 01271 } 01272 } 01273 }
| char activeOsisID[255] |
Definition at line 86 of file osis2mod.cpp.
| SWBuf activeVerseText |
Definition at line 89 of file osis2mod.cpp.
| int converted = 0 |
Definition at line 81 of file osis2mod.cpp.
| ListKey currentKeyIDs = ListKey() |
Definition at line 91 of file osis2mod.cpp.
| char currentOsisID[255] |
Definition at line 87 of file osis2mod.cpp.
| VerseKey currentVerse |
Definition at line 84 of file osis2mod.cpp.
| int debug = 0 |
Definition at line 57 of file osis2mod.cpp.
| const int DEBUG_INTERVERSE = 16 |
Definition at line 62 of file osis2mod.cpp.
| const int DEBUG_OTHER = 512 |
Definition at line 67 of file osis2mod.cpp.
| const int DEBUG_QUOTE = 4 |
Definition at line 60 of file osis2mod.cpp.
| const int DEBUG_REF = 128 |
Definition at line 65 of file osis2mod.cpp.
| const int DEBUG_REV11N = 64 |
Definition at line 64 of file osis2mod.cpp.
| const int DEBUG_STACK = 256 |
Definition at line 66 of file osis2mod.cpp.
| const int DEBUG_TITLE = 8 |
Definition at line 61 of file osis2mod.cpp.
| const int DEBUG_VERSE = 2 |
Definition at line 59 of file osis2mod.cpp.
| const int DEBUG_WRITE = 1 |
Definition at line 58 of file osis2mod.cpp.
| const int DEBUG_XFORM = 32 |
Definition at line 63 of file osis2mod.cpp.
| const int EXIT_BAD_ARG = 1 |
Definition at line 70 of file osis2mod.cpp.
| const int EXIT_BAD_NESTING = 5 |
Definition at line 74 of file osis2mod.cpp.
| const int EXIT_NO_CREATE = 3 |
Definition at line 72 of file osis2mod.cpp.
| const int EXIT_NO_READ = 4 |
Definition at line 73 of file osis2mod.cpp.
| const int EXIT_NO_WRITE = 2 |
Definition at line 71 of file osis2mod.cpp.
bool inCanonicalOSISBook = true [static] |
Definition at line 95 of file osis2mod.cpp.
| std::vector<ListKey> linkedVerses |
Definition at line 93 of file osis2mod.cpp.
Definition at line 83 of file osis2mod.cpp.
bool normalize = true [static] |
Definition at line 96 of file osis2mod.cpp.
| int normalized = 0 |
Definition at line 80 of file osis2mod.cpp.
| SWBuf v11n = "KJV" |
Definition at line 85 of file osis2mod.cpp.
1.6.1