[sword-svn] r360 - in trunk/modules: . python

chrislit at crosswire.org chrislit at crosswire.org
Sat Aug 4 04:10:28 MST 2012


Author: chrislit
Date: 2012-08-04 04:10:27 -0700 (Sat, 04 Aug 2012)
New Revision: 360

Added:
   trunk/modules/python/
   trunk/modules/python/usfm2osis.py
   trunk/modules/python/usfmtags.py
Log:
Initial commits of usfmtags.py & usfm2osis.py


Added: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py	                        (rev 0)
+++ trunk/modules/python/usfm2osis.py	2012-08-04 11:10:27 UTC (rev 360)
@@ -0,0 +1,1018 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Subversion keyword strings; they are reduced to the bare date and revision
+# number further down, after the imports.
+date = '$Date: 2012-03-09 01:23:40 -0800 (Fri, 09 Mar 2012) $'
+rev = '$Rev: 355 $'
+
+# Versions of the input and output specifications this script targets, and
+# the version of the script itself.
+USFMversion = '2.35'  # http://ubs-icap.org/chm/usfm/2.35/index.html
+OSISversion = '2.1.1' # http://www.bibletechnologies.net/osisCore.2.1.1.xsd
+scriptVersion = '0.5'
+
+
+# usfm2osis.py
+# Copyright 2012 by the CrossWire Bible Society <http://www.crosswire.org/>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# The full text of the GNU General Public License is available at:
+# <http://www.gnu.org/licenses/gpl-3.0.txt>.
+
+
+### Guidelines & objectives:
+# Target Python 2.7+ (but not 3)
+# Use no non-default libraries (this may change in the future)
+# Don't use SWORD bindings (this will probably change to allow *optional* use of bindings, if installed)
+# Achieve full coverage of USFM according to UBS spec:
+#      <http://paratext.ubs-translations.org/about/usfm>
+# Employ best-practice conformant OSIS
+# Employ modularity (functions rather than a big long script)
+# Employ the same command-line syntax as usfm2osis.pl
+# Use & abuse Unicode tags (http://unicode.org/charts/PDF/UE0000.pdf) to simplify Regex processing
+
+### Roadmap:
+# 0.5 initial commit, including full coverage of core USFM tags
+# 0.6 test suite incorporating all USFM examples from UBS ICAP and other complex cases
+# 0.x more clean-up & re-ordering to correctly encapsulate milestones within appropriate containers
+# 0.x clean-up code: make fully OO? docstrings?
+# 1.0 feature complete for release & production use
+# 1.x xreffix.pl-functionality (osisParse(ref)), requiring SWORD bindings
+# 1.x SWORD-mode output?
+# 1.x IMP output?
+# 1.x SWORD module output?, requiring SWORD bindings
+
+
+import sys, codecs, re
+from encodings.aliases import aliases
+import multiprocessing, Queue
+
+# Drop the '$' delimiters from the keyword strings and slice out the values:
+# after stripping, 'Date: YYYY-MM-DD ...' holds the ISO date at [6:16] and
+# 'Rev: NNN' holds the revision number from index 5 onwards.
+date = date.replace('$', '').strip()[6:16]
+rev = rev.replace('$', '').strip()[5:]
+
+# Maps three-character USFM book codes (as found after \id) to the book
+# identifiers written into the osisID of the generated OSIS.
+bookDict = {
+    ### Known USFM Book codes from Paratext
+    # OT
+    'GEN':'Gen', 'EXO':'Exod', 'LEV':'Lev', 'NUM':'Num', 'DEU':'Deut', 'JOS':'Josh', 'JDG':'Judg', 'RUT':'Ruth',
+    '1SA':'1Sam', '2SA':'2Sam', '1KI':'1Kgs', '2KI':'2Kgs', '1CH':'1Chr', '2CH':'2Chr', 'EZR':'Ezra', 'NEH':'Neh',
+    'EST':'Esth', 'JOB':'Job', 'PSA':'Ps', 'PRO':'Prov', 'ECC':'Eccl', 'SNG':'Song', 'ISA':'Isa', 'JER':'Jer',
+    'LAM':'Lam', 'EZK':'Ezek', 'DAN':'Dan', 'HOS':'Hos', 'JOL':'Joel', 'AMO':'Amos', 'OBA':'Obad', 'JON':'Jonah',
+    'MIC':'Mic', 'NAM':'Nah', 'HAB':'Hab', 'ZEP':'Zeph', 'HAG':'Hag', 'ZEC':'Zech', 'MAL':'Mal',
+    # NT
+    'MAT':'Matt', 'MRK':'Mark', 'LUK':'Luke', 'JHN':'John', 'ACT':'Acts', 'ROM':'Rom', '1CO':'1Cor', '2CO':'2Cor',
+    'GAL':'Gal', 'EPH':'Eph', 'PHP':'Phil', 'COL':'Col', '1TH':'1Thess', '2TH':'2Thess', '1TI':'1Tim', '2TI':'2Tim',
+    'TIT':'Titus', 'PHM':'Phlm', 'HEB':'Heb', 'JAS':'Jas', '1PE':'1Pet', '2PE':'2Pet', '1JN':'1John', '2JN':'2John',
+    '3JN':'3John', 'JUD':'Jude', 'REV':'Rev',
+    # DC - Catholic
+    'TOB':'Tob', 'JDT':'Jdt', 'ESG':'EsthGr', 'WIS':'Wis', 'SIR':'Sir', 'BAR':'Bar', 'LJE':'EpJer', 'S3Y':'PrAzar',
+    'SUS':'Sus', 'BEL':'Bel', '1MA':'1Macc', '2MA':'2Macc',
+    # DC - Eastern Orthodox
+    '3MA':'3Macc', '4MA':'4Macc', '1ES':'1Esd', '2ES':'2Esd', 'MAN':'PrMan', 'PS2':'Ps151',
+    # Rahlfs' LXX
+    'ODA':'Odes', 'PSS':'PssSol', 'JSA':'JoshA', 'JDB':'JudgB', 'TBS':'TobS', 'SST':'SusTh', 'DNT':'DanTh',
+    'BLT':'BelTh',
+    # Esdrae
+    '4ES':'4Ezra', '5ES':'5Ezra', '6ES':'6Ezra',
+    # Additional non-biblical books
+    'XXA':'XXA', 'XXB':'XXB', 'XXC':'XXC', 'XXD':'XXD', 'XXE':'XXE', 'XXF':'XXF', 'XXG':'XXG',
+    ###
+
+    ### Proposed Additions <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
+    # Inconsistency with Esther
+    'DAG':'DanGr',
+    # Alternate Psalms
+    'PSB':'Ps',
+    # Ethiopic
+    'JUB':'Jub', 'ENO':'1En', 'REP':'Reproof', # == Tegsas
+    '1MQ':'1Meq', '2MQ':'2Meq', '3MQ':'3Meq', '4BA':'4Bar',
+    # Syriac
+    '2BA':'2Bar', 'LBA':'EpBar', 'PS3':'5ApocSyrPss',
+    # Vulgate
+    'LAO':'EpLao', 'PSO':'PrSol', 'PJE':'PrJer',
+    # Armenian
+    'WSI':'WSir', 'COP':'CorCorr', '3CO':'3Cor', 'EUT':'PrEut', 'DOJ':'DJohn',
+    # Apostolic Fathers
+    # NOTE(review): 'LBA' is already used under "Syriac" above for 'EpBar'.
+    # In a dict literal the later entry wins, so 'LBA' resolves to 'Barn' and
+    # the 'EpBar' mapping is silently lost -- confirm which code is intended.
+    '1CL':'1Clem', '2CL':'2Clem', 'SHE':'Herm', 'LBA':'Barn', 'DID':'Did',
+    ###
+
+    # Proposed replacements <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
+    'ODE':'Odes', 'EZA':'4Ezra', '5EZ':'5Ezra', '6EZ':'6Ezra',
+
+    # Additional biblical books
+    'ADE':'AddEsth',
+
+    # Peripheral books
+    'FRT':'FRONT', 'INT':'INTRODUCTION', 'BAK':'BACK', 'CNC':'CONCORDANCE', 'GLO':'GLOSSARY',
+    'TDX':'INDEX', 'NDX':'GAZETTEER', 'OTH':'X-OTHER'
+    }
+
+# Values of bookDict that denote peripheral material rather than Bible books.
+specialBooks = ['FRONT', 'INTRODUCTION', 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER']
+
+# Maps the title given after a \periph marker to the OSIS <div> type used for
+# it.  The keys must match the marker text exactly, so the spelling matters:
+# 'Foreword' was previously misspelled 'Foreward' and therefore never matched.
+peripherals = {
+    'Title Page':'titlePage', 'Half Title Page':'x-halfTitlePage', 'Promotional Page':'x-promotionalPage',
+    'Imprimatur':'imprimatur', 'Publication Data':'publicationData', 'Foreword':'x-foreword', 'Preface':'preface',
+    'Table of Contents':'tableofContents', 'Alphabetical Contents':'x-alphabeticalContents',
+    'Table of Abbreviations':'x-tableofAbbreviations', 'Chronology':'x-chronology',
+    'Weights and Measures':'x-weightsAndMeasures', 'Map Index':'x-mapIndex',
+    'NT Quotes from LXX':'x-ntQuotesFromLXX'
+    }
+
+# Maps the title of an introduction-type \periph section to the suffix of the
+# subType written on its <div type="introduction">.
+introPeripherals = {
+    'Bible Introduction':'bible', 'Old Testament Introduction':'oldTestament',
+    'Pentateuch Introduction':'pentateuch', 'History Introduction':'history', 'Poetry Introduction':'poetry',
+    'Prophecy Introduction':'prophecy', 'New Testament Introduction':'newTestament',
+    'Gospels Introduction':'gospels', 'Acts Introduction':'acts', 'Epistles Introduction':'epistles',
+    'Revelation Introduction':'revelation', 'Deuterocanon Introduction':'deuterocanon'
+    }
+
+# Two-way mapping between OSIS book identifiers and the localized book names
+# taken from \toc3; filled in while the files are converted.
+osis2locBk = dict()
+loc2osisBk = dict()
+# Module-wide verbosity flag; bool() makes it False initially.
+verbose = bool()
+
+"""
+BEGIN PSF-licensed segment
+"""
+"""
+keynat from http://code.activestate.com/recipes/285264-natural-string-sorting/
+"""
+def keynat(string):
+    r'''Build a natural-order sort key for use with sort() and sorted().
+
+    Each run of consecutive digits is folded into one integer and every
+    other character is lower-cased, so numbers compare by value and letters
+    compare case-insensitively.  No regular expressions or exceptions are
+    involved.
+
+    >>> items = ('Z', 'a', '10th', '1st', '9')
+    >>> sorted(items)
+    ['10th', '1st', '9', 'Z', 'a']
+    >>> sorted(items, key=keynat)
+    ['1st', '9', '10th', 'a', 'Z']
+    '''
+    key = []
+    for ch in string:
+        if not ch.isdigit():
+            key.append(ch.lower())
+            continue
+        digit = int(ch)
+        if key and isinstance(key[-1], int):
+            # still inside a run of digits: extend the number being built
+            key[-1] = key[-1] * 10 + digit
+        else:
+            key.append(digit)
+    return key
+"""
+END PSF-licensed segment
+"""
+
+def convertToOSIS(sFile):
+    global encoding
+    global relaxedConformance
+
+    verbosePrint('Processing: ' + sFile)
+
+    def cvtPreprocess(osis, relaxedConformance):
+        """Normalize raw USFM text before any marker is converted.
+
+        osis -- the USFM text of one file
+        relaxedConformance -- not used by this step; accepted so that it has
+            the same signature as the other cvt* steps
+
+        Returns the text with line endings normalized to LF, continuation
+        lines joined onto the preceding line, trailing whitespace removed and
+        the XML special characters &, < and > escaped.
+        """
+        # convert CR (and CRLF) to LF.  These must be real control characters:
+        # with raw strings the call rewrote the two-character sequence
+        # backslash + 'r', corrupting the \r, \rem and \rq markers.  It is
+        # done first so the line-based rules below see LF line endings.
+        osis = osis.replace('\r\n', '\n').replace('\r', '\n')
+        # lines should never start with non-tags
+        osis = re.sub(r'\n\s*([^\\\s])', r' \1', osis)  # TODO: test this
+        # lines should never end with whitespace (other than \n)
+        osis = re.sub(r'\s+\n', r'\n', osis)
+        # XML-encode as necessary ('&' first, so the entities produced for
+        # '<' and '>' are not escaped a second time)
+        osis = osis.replace('&', '&amp;')
+        osis = osis.replace('<', '&lt;')
+        osis = osis.replace('>', '&gt;')
+
+        return osis
+
+
+    def cvtIdentification(osis, relaxedConformance):
+        r"""Convert the USFM identification markers.
+
+        supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.  As a side effect, when both \id and
+        \toc3 are present, the localized book name from \toc3 is recorded in
+        the module-level osis2locBk / loc2osisBk maps.
+        Raises KeyError if the \id book code is not a key of bookDict.
+        """
+        global loc2osisBk, osis2locBk
+        # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
+        # This has to happen before the \id marker is replaced; afterwards
+        # there is no \id left to find and the maps would never be filled.
+        osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
+        if osisBook:
+            osisBook = bookDict[osisBook.group(1)]
+
+        # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)  ###TESTED###
+        # NB: '</div type="book">' is a placeholder that later steps use as a terminator
+        osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\\n]*?)\n(.*)(?=\\id|$)', lambda m: u'󠁂<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') +  m.group(3) + u'</div type="book">󠁂\n', osis, flags=re.DOTALL)
+
+        # \ide_<ENCODING>  ###TESTED###
+        osis = re.sub(r'\\ide\b.*\n', r'', osis) # delete, since this was handled above
+
+        # \sts_<STATUS CODE>
+        osis = re.sub(r'\\sts\b\s+(.+)\s*\n', r'<milestone type="x-sts" n="\1"/>\n', osis)
+
+        # \rem_text...  ###TESTED###
+        osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis)
+
+        # \h#_text...  ###TESTED###
+        osis = re.sub(r'\\h\b\s+(.+)\s*\n', r'<title type="runningHead">\1</title>\n', osis)
+
+        # \toc1_text...
+        osis = re.sub(r'\\toc1\b\s+(.+)\s*\n', r'<milestone type="x-toc1" n="\1"/>\n', osis)
+
+        # \toc2_text...
+        osis = re.sub(r'\\toc2\b\s+(.+)\s*\n', r'<milestone type="x-toc2" n="\1"/>\n', osis)
+
+        # \toc3_text...
+        locBook = re.search(r'\\toc3\b\s+(.+)\s*\n', osis)
+        if locBook:
+            locBook = locBook.group(1)
+            if osisBook:
+                osis2locBk[osisBook]=locBook
+                loc2osisBk[locBook]=osisBook
+        # a plain template string is required here: the return value of a
+        # replacement function is inserted literally, so '\1' and '\n' coming
+        # from a lambda would not be expanded
+        osis = re.sub(r'\\toc3\b\s+(.+)\s*\n', r'<milestone type="x-toc3" n="\1"/>\n', osis)
+
+        return osis
+
+
+    def cvtIntroductions(osis, relaxedConformance):
+        r"""Wrap introduction material in a <div> and map its markers.
+
+        supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
+        NB: tags are 'supported' to the degree that their non-introduction equivalents are supported
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the text with each run of introduction markers enclosed in
+        <div type="introduction"> and every \iXX marker renamed to \XX, so
+        that the later steps convert it like its non-introduction equivalent.
+        """
+        # \imt#
+        # \is#  ###TESTED###
+        # \ip  ###TESTED###
+        # \ipi
+        # \im
+        # \imi
+        # \ipq
+        # \imq
+        # \ipr
+        # \iq#
+        # \ib
+        # \ili
+        # \iot
+        # \io#
+        # \ior...\ior*
+        # \iex
+        # \iqt...\iqt*
+        # \imte
+        # \ie
+        # Marker names (without the leading 'i'), shared by both rules below.
+        # 's\d+' was missing, so numbered headings such as \is1 were neither
+        # wrapped nor mapped.
+        # NOTE(review): 'd\d+' looks like a typo for 's\d+' and the '*' in
+        # 'or*' / 'qt*' is an unescaped quantifier; both are kept as they were
+        # because 'or' and 'qt' already cover the closing markers.
+        introElements = [r'mt', r'mt\d+', r's', r's\d+', r'd\d+', r'p', r'pi', r'm', r'mi', r'pq', r'mq', r'pr', r'q', r'q\d+', r'b', r'li', r'ot', r'o', r'o\d+', r'or', r'or*', r'ex', r'qt', r'qt*', r'mte', r'e']
+        # encapsulate introduction elements in a <div>
+        osis = re.sub(r'(\\i(' + '|'.join(introElements) + r')\b.+?)(?=\n\\(c|s|m|p|d))', u'<div type="introduction">'+r'\1'+u'</div>\n', osis, flags=re.DOTALL)
+        # map all introduction elements to their non-introduction equivalents
+        for e in introElements:
+            osis = re.sub(r'\\i('+e+r')\b', r'\\\1', osis)
+        return osis
+
+
+    def cvtTitles(osis, relaxedConformance):
+        r"""Convert titles, headings and labels.
+
+        supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp
+
+        osis -- the text being converted
+        relaxedConformance -- when true, the non-standard \ss and \sss
+            markers are accepted as \s2 and \s3
+
+        Returns the converted text.  Section-type headings open a <div> that
+        is closed just before the next invisible marker character of the same
+        or a higher level.
+        """
+        # NB: the closing rules below must pass re.DOTALL as flags=...; as the
+        # fourth positional argument it is taken as count (16), which limited
+        # each rule to the first 16 sections.
+        # \ms#_text...  ###TESTED###  ##NB: supports only \ms1 to \ms3
+        osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'󠀰<div type="majorSection"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(󠀰[^󠀰󠁂]+)', r'\1'+u'</div>󠀰\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'󠀱<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(󠀱[^󠀰󠁂󠀱]+)', r'\1'+u'</div>󠀱\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'󠀲<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(󠀲[^󠀰󠁂󠀱󠀲]+)', r'\1'+u'</div>󠀲\n', osis, flags=re.DOTALL)
+
+        # \mr_text...
+        osis = re.sub(r'\\mr\s+(.+)', u'󠁄<title type="scope"><reference>'+r'\1</reference></title>', osis)
+
+        # \s#_text...  ###TESTED###  ##NB: supports only \s1 to \s3
+        osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'󠀳<div type="section"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(󠀳[^󠀰󠁂󠀱󠀲󠀳]+)', r'\1'+u'</div>󠀳\n', osis, flags=re.DOTALL)
+        if relaxedConformance:
+            osis = re.sub(r'\\ss\s+', r'\\s2 ', osis)
+            osis = re.sub(r'\\sss\s+', r'\\s3 ', osis)
+        osis = re.sub(r'\\s2\s+(.+)', lambda m: u'󠀴<div type="subsection"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(󠀴[^󠀰󠁂󠀱󠀲󠀳󠀴]+)', r'\1'+u'</div>󠀴\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\s3\s+(.+)', lambda m: u'󠀵<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(󠀵[^󠀰󠁂󠀱󠀲󠀳󠀴󠀵]+)', r'\1'+u'</div>󠀵\n', osis, flags=re.DOTALL)
+
+        # \sr_text...
+        osis = re.sub(r'\\sr\s+(.+)', u'󠁄<title type="scope"><reference>'+r'\1</reference></title>', osis)
+        # \r_text...
+        osis = re.sub(r'\\r\s+(.+)', u'󠁄<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis)
+        # \rq_text...\rq*
+        osis = re.sub(r'\\rq\s+(.+?)\\rq\*', u'<reference type="source">'+r'\1</reference>', osis, flags=re.DOTALL)
+
+        # \d_text...  ###TESTED###
+        osis = re.sub(r'\\d\s+(.+)', u'󠁄<title canonical="true" type="psalm">'+r'\1</title>', osis)
+
+        # \sp_text...  ###TESTED###
+        osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis)
+
+        # \mt#_text...  ###TESTED###
+        osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main">' + m.group(2) + r'</title>', osis)
+        # \mte#_text...
+        osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-end">' + m.group(2) + r'</title>', osis)
+
+        return osis
+
+
+    def cvtChaptersAndVerses(osis, relaxedConformance):
+        r"""Convert chapter and verse markers to OSIS milestones.
+
+        supported: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp*
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.  The generated identifiers contain the
+        literal placeholders $BOOK$ and $CHAP$; nothing in this function
+        fills them in.
+        """
+        # \c_#  ###TESTED###
+        osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: u'󠁃<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) +  u'<chapter eID="$BOOK$.' + m.group(1) + u'"/>󠁰\n', osis, flags=re.DOTALL)
+
+        # \cp_#
+        # \ca_#\ca*
+        def replaceChapterNumber(matchObj):
+            # Apply \cp (published chapter) and \ca (alternate chapter) to one
+            # chapter's milestones and remove the markers from the text.
+            ctext = matchObj.group(1)
+            cp = re.search(r'\\cp\s+(.+?)(?=(\\|\s))', ctext)
+            if cp:
+                # \cp has no closing marker: remove only the marker and its
+                # value (the old pattern ended in an unescaped '\\cp*' and so
+                # deleted everything up to the next \c... marker)
+                ctext = re.sub(r'\\cp\s+.+?(?=(\\|\s))', '', ctext, flags=re.DOTALL)
+                cp = cp.group(1)
+                # replace the chapter part of every identifier with the \cp
+                # value (this used 'ca' before it was assigned)
+                ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', lambda m: '"$BOOK$.' + cp + '"', ctext)
+            ca = re.search(r'\\ca\s+(.+?)\\ca\*', ctext)
+            if ca:
+                # the '*' of the closing marker must be escaped, otherwise it
+                # is left behind in the text
+                ctext = re.sub(r'\\ca\s+(.+?)\\ca\*', '', ctext, flags=re.DOTALL)
+                ca = ca.group(1)
+                # add the alternate number as a second value of the osisID
+                ctext = re.sub(r'(osisID="\$BOOK\$\.[^"\.]+)"', lambda m: m.group(1) + ' $BOOK$.' + ca + '"', ctext)
+            return ctext
+        osis = re.sub(r'(<chapter [^<]+sID[^<]+/>.+?<chapter eID[^>]+/>)', replaceChapterNumber, osis, flags=re.DOTALL)
+
+        # \cl_
+        osis = re.sub(r'\\cl\s+(.+)', u'󠁄<title>'+r'\1</title>', osis)
+
+        # \cd_#   <--This # seems to be an error
+        osis = re.sub(r'\\cd\b\s+(.+)', u'󠁄<title type="x-description">'+r'\1</title>', osis)
+
+        # \v_#  ###TESTED###
+        osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'󠁖<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + r'" sID="$BOOK$.$CHAP$.' + m.group(1) + r'"/>' + m.group(2) +  r'<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>󠁖\n', osis, flags=re.DOTALL)
+
+        # \vp_#\vp*
+        # \va_#\va*
+        def replaceVerseNumber(matchObj):
+            # Same as replaceChapterNumber, for \vp (published verse) and
+            # \va (alternate verse) within one verse.
+            vtext = matchObj.group(1)
+            vp = re.search(r'\\vp\s+(.+?)\\vp\*', vtext)
+            if vp:
+                vtext = re.sub(r'\\vp\s+(.+?)\\vp\*', '', vtext, flags=re.DOTALL)
+                vp = vp.group(1)
+                # (this used 'va' before it was assigned)
+                vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', lambda m: '"$BOOK$.$CHAP$.' + vp + '"', vtext)
+            va = re.search(r'\\va\s+(.+?)\\va\*', vtext)
+            if va:
+                vtext = re.sub(r'\\va\s+(.+?)\\va\*', '', vtext, flags=re.DOTALL)
+                va = va.group(1)
+                vtext = re.sub(r'(osisID="\$BOOK\$\.\$CHAP\$\.[^"\.]+)"', lambda m: m.group(1) + ' $BOOK$.$CHAP$.' + va + '"', vtext)
+            return vtext
+        osis = re.sub(r'(<verse [^<]+sID[^<]+/>.+?<verse eID[^>]+/>)', replaceVerseNumber, osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def cvtParagraphs(osis, relaxedConformance):
+        r"""Convert paragraph, closer and list markers.
+
+        supported: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.  A paragraph runs until the next
+        paragraph-type marker, chapter end, <div> boundary or already
+        converted <p>/<closer>.
+        """
+        # \p(_text...)  ###TESTED###
+        osis = re.sub(r'\\p\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'󠁰<p>\n' + m.group(1) + u'󠁰</p>\n', osis, flags=re.DOTALL)
+
+        # \pc(_text...)
+        # \pr(_text...)
+        # \m(_text...)  ###TESTED###
+        # \pmo(_text...)
+        # \pm(_text...)
+        # \pmc(_text...)
+        # \pmr_text...          # deprecated: map to same as \pr
+        # \pi#(_Sample text...)
+        # \mi(_text...)
+        # \nb  ###TESTED###
+        pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak'}
+        osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'󠁰<p type="' + pType[m.group(1)]  + '">\n' + m.group(2) + u'󠁰</p>\n', osis, flags=re.DOTALL)
+
+        # \cls_text...
+        # (this rule matched '\\m' before, which the rule above had already
+        # consumed, so \cls was never converted)
+        osis = re.sub(r'\\cls\s+(.+?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'󠁰<closer>' + m.group(1) + u'󠁰</closer>\n', osis, flags=re.DOTALL)
+
+        # \ph#(_text...)
+        # \li#(_text...)  ###TESTED###
+        # \ph is handled by renaming it to \li
+        osis = re.sub(r'\\ph\b\s*', r'\\li ', osis)
+        osis = re.sub(r'\\ph(\d+)\b\s*', r'\\li\1 ', osis)
+        osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u'󠁂󠁃󠁰󠁄'+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\li(\d+)\b\s*(.*?)(?=(['+u'󠁂󠁃󠁰󠁄'+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
+        osis = osis.replace('\n</item>', '</item>\n')
+        # wrap each uninterrupted run of items in a <list>
+        osis = re.sub(u'(<item [^󠁂󠁃󠁰󠁄]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL)
+
+        # \b  ###TESTED###
+        osis = re.sub(r'\\b\b\s?', r'<lb type="p"/>', osis)
+
+        return osis
+
+
+    def cvtPoetry(osis, relaxedConformance):
+        r"""Convert poetry markers to <l> lines grouped in <lg>.
+
+        supported: \q#, \qr, \qc, \qs...\qs*, \qa, \qac...\qac*, \qm#, \b
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.
+        Raises KeyError for a \qm level that has no entry in qType
+        (anything above \qm5).
+        """
+        # \qs_(Selah)\qs*
+        osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)
+
+        # \q#(_text...)  ###TESTED###
+        osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u'󠁂󠁃󠁰󠁄'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\q(\d+)\b\s*(.*?)(?=(['+u'󠁂󠁃󠁰󠁄'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
+
+        # \qr_text...
+        # \qc_text...
+        # \qm#(_text...)
+        # NB: each qType value deliberately contains a '"' so that a level
+        # attribute can be emitted after the type attribute
+        qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
+        # 'qm\d*' (not 'qm\d+') so that the un-numbered \qm, which has its
+        # own qType entry, is converted as well
+        osis = re.sub(r'\\(qr|qc|qm\d*)\b\s*(.*?)(?=(['+u'󠁂󠁃󠁰󠁄'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: r'<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+
+        osis = osis.replace('\n</l>', '</l>\n')
+        # wrap each uninterrupted run of lines in a line group
+        osis = re.sub(u'(<l [^󠁂󠁃󠁰󠁄]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
+
+        # \b  ###TESTED###
+        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+
+        # \qa_text...
+        osis = re.sub(r'\\qa\s+(.+)', u'󠁄<title type="acrostic">'+r'\1</title>', osis)
+
+        # \qac_text...\qac*
+        osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'<hi type="acrostic">\1</hi>', osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def cvtTables(osis, relaxedConformance):
+        r"""Convert table markers to <row> and <cell> elements.
+
+        supported: \tr, \th#, \thr#, \tc#, \tcr#
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.  The column number after \th, \thr, \tc
+        and \tcr is matched but not carried into the output.
+        """
+        # \tr_
+        osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+u'󠁂󠁃󠁰󠁄'+r']|\\tr\s|<lb\b|<title\b))', r'<row>\1</row>', osis, flags=re.DOTALL)
+
+        # \th#_text...
+        # \thr#_text...
+        # \tc#_text...
+        # \tcr#_text...
+        # attribute text for each cell marker ('tcr' was missing the closing
+        # quote, which produced malformed XML)
+        tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right"'}
+        osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: r'<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def processNote(note):
+        r"""Convert the footnote-content markers inside one <note> element.
+
+        note -- the text of a single note, from <note ...> to </note>
+
+        Returns the note on one line with \fdc, \fq, \fqa, \ft, \fr, \fk,
+        \fl, \fp and \fv converted.  relaxedConformance is read from the
+        enclosing scope; when it is true, stray \ft*, \fq* and \fqa* closing
+        markers are removed as well.
+
+        An invisible marker character is written in front of every converted
+        element so that the lookaheads of the following rules can stop at it;
+        all of these markers are stripped again before returning.
+        """
+        # a note is handled as a single line
+        note = note.replace('\n', ' ')
+
+        # \fdc_refs...\fdc*
+        note = re.sub(r'\\fdc\b\s(.+?)\\fdc\b\*', r'<seg editions="dc">\1</seg>', note)
+
+        # \fq_  ###TESTED###
+        note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+u'󠁆))', u'󠁆'+r'<catchWord>\1</catchWord>', note)
+
+        # \fqa_  ###TESTED###
+        note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'󠁆))', u'󠁆'+r'<rdg type="alternate">\1</rdg>', note)
+
+        # \ft_  ###TESTED###
+        # the marker is simply dropped; its text stays in place
+        note = re.sub(r'\\ft\s', r'', note)
+
+        # \fr_##SEP##
+        note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'󠁆))', u'󠁆'+r'<reference>\1</reference>', note)
+
+        # \fk_
+        note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'󠁆))', u'󠁆'+r'<catchWord>\1</catchWord>', note)
+
+        # \fl_
+        note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+u'󠁆))', u'󠁆'+r'<label>\1</label>', note)
+
+        # \fp_
+        note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'<p>\1</p>', note)
+        # when a \fp paragraph was produced, also wrap the text between the
+        # opening <note> tag and the first <p> in a paragraph of its own
+        note = re.sub(r'(<note\b[^>]*?>)(.*?)<p>', r'\1<p>\2</p><p>', note)
+
+        # \fv_
+        note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'󠁆))', u'󠁆'+r'<hi type="super">\1</hi>', note)
+
+        if relaxedConformance:
+            note = note.replace(r'\ft*', r'')
+            note = note.replace(r'\fq*', r'')
+            note = note.replace(r'\fqa*', r'')
+
+        # remove the invisible terminator markers
+        note = note.replace(u'󠁆', '')
+        return note
+
+
+    def cvtFootnotes(osis, relaxedConformance):
+        r"""Convert footnotes and endnotes to <note> elements.
+
+        supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm*
+
+        osis -- the text being converted
+        relaxedConformance -- not used directly by this step
+
+        Returns the converted text.  The content of every <note> element in
+        the text is passed through processNote().
+        """
+        def noteStart(caller, placement):
+            # Build the opening <note> tag.  A caller of '-' gives an empty n
+            # attribute, '+' gives no n attribute and anything else is copied
+            # into n.  The caller group of the \f rule is optional, so it can
+            # be None; that used to raise TypeError and is now treated like '+'.
+            if caller == u'-':
+                n = ' n=""'
+            elif caller is None or caller == '+':
+                n = ''
+            else:
+                n = ' n="' + caller + '"'
+            return r'<note' + n + ' placement="' + placement + '">'
+
+        # \f_+_...\f*  ###TESTED###
+        osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: noteStart(m.group(1), 'foot') + m.group(2) + u'󠁆</note>', osis, flags=re.DOTALL)
+
+        # \fe_+_...\fe*
+        osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: noteStart(m.group(1), 'end') + m.group(2) + u'󠁆</note>', osis, flags=re.DOTALL)
+
+        osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
+
+        # \fm_...\fm*
+        osis = re.sub(r'\\fm\b\s(.+?)\\fm\*', r'<hi type="super">\1</hi>', osis)
+
+        return osis
+
+
+    def processXref(note):
+        r"""Convert the cross-reference markers inside one <note> element.
+
+        note -- the text of a single cross-reference note, from <note ...>
+            to </note>
+
+        Returns the note on one line with \xot, \xnt, \xdc, \xq, \xt, \xo
+        and \xk converted.  relaxedConformance is read from the enclosing
+        scope; when it is true, stray \xt* and \xq* closing markers are
+        removed as well.
+
+        An invisible marker character is written in front of every converted
+        element so that the lookaheads of the following rules can stop at it;
+        all of these markers are stripped again before returning.
+        """
+        # a note is handled as a single line
+        note = note.replace('\n', ' ')
+
+        # \xot_refs...\xot*
+        note = re.sub(r'\\xot\b\s(.+?)\\xot\b\*', u'󠁆'+r'<seg editions="ot">\1</seg>', note)
+
+        # \xnt_refs...\xnt*
+        note = re.sub(r'\\xnt\b\s(.+?)\\xnt\b\*', u'󠁆'+r'<seg editions="nt">\1</seg>', note)
+
+        # \xdc_refs...\xdc*
+        note = re.sub(r'\\xdc\b\s(.+?)\\xdc\b\*', u'󠁆'+r'<seg editions="dc">\1</seg>', note)
+
+        # \xq_
+        note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'󠁆))', u'󠁆'+r'<catchWord>\1</catchWord>', note)
+
+        # \xt_  ###TESTED###
+        # the marker is simply dropped; its text stays in place
+        note = re.sub(r'\\xt\s', r'', note)
+
+        # \xo_##SEP##
+        note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'󠁆))', u'󠁆'+r'<reference>\1</reference>', note)
+
+        # \xk_
+        note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'󠁆))', u'󠁆'+r'<catchWord>\1</catchWord>', note)
+
+        if relaxedConformance:
+            note = note.replace(r'\xt*', r'')
+            note = note.replace(r'\xq*', r'')
+
+        # remove the invisible terminator markers
+        note = note.replace(u'󠁆', '')
+        return note
+
+
+    def cvtCrossReferences(osis, relaxedConformance):
+        """
+        Cross References
+        supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc*
+
+        osis -- the text being converted
+        relaxedConformance -- not used directly by this step
+
+        Returns the text with every cross-reference turned into a
+        <note type="crossReference"> whose content has been passed through
+        processXref().
+        """
+        # \x_+_...\x*  ###TESTED###
+        # a caller of '-' gives an empty n attribute, '+' gives no n
+        # attribute, anything else is copied into n
+        osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference>󠁆</note>', osis, flags=re.DOTALL)
+
+        osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
+
+        return osis
+
+
+        # NOTE(review): the string literal below is indented into this
+        # function and follows its return statement, so it is never executed;
+        # it reads like a section heading for the functions that follow.
+        """
+        Special Text and Character Styles
+        """
+    def cvtSpecialText(osis, relaxedConformance):
+        r"""Convert the special-text character markers.
+
+        supported: \add...\add*, \bk...\bk*, \dc...\dc*, \k...\k*, \lit, \nd...\nd*, \ord...\ord*, \pn...\pn*, \qt...\qt*, \sig...\sig*, \sls...\sls*, \tl...\tl*, \wj...\wj*
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.
+        """
+        # \add_...\add*  ###TESTED###
+        osis = re.sub(r'\\add\s+(.+?)\\add\*', r'<transChange type="added">\1</transChange>', osis, flags=re.DOTALL)
+
+        # \wj_...\wj*  ###TESTED###
+        osis = re.sub(r'\\wj\s+(.+?)\\wj\*', r'<q who="Jesus" marker="">\1</q>', osis, flags=re.DOTALL)
+
+        # \nd_...\nd*
+        osis = re.sub(r'\\nd\s+(.+?)\\nd\*', r'<divineName>\1</divineName>', osis, flags=re.DOTALL)
+
+        # \pn_...\pn*
+        osis = re.sub(r'\\pn\s+(.+?)\\pn\*', r'<name>\1</name>', osis, flags=re.DOTALL)
+
+        # \qt_...\qt*
+        osis = re.sub(r'\\qt\s+(.+?)\\qt\*', r'<seg type="otPassage">\1</seg>', osis, flags=re.DOTALL)
+
+        # \sig_...\sig*
+        osis = re.sub(r'\\sig\s+(.+?)\\sig\*', r'<signed>\1</signed>', osis, flags=re.DOTALL)
+
+        # \ord_...\ord*
+        osis = re.sub(r'\\ord\s+(.+?)\\ord\*', r'<hi type="super">\1</hi>', osis, flags=re.DOTALL) # semantic incongruity (ordinal -> superscript)
+
+        # \tl_...\tl*
+        osis = re.sub(r'\\tl\s+(.+?)\\tl\*', r'<foreign>\1</foreign>', osis, flags=re.DOTALL)
+
+        # \bk_...\bk*  ###TESTED###
+        osis = re.sub(r'\\bk\s+(.+?)\\bk\*', r'<name type="x-workTitle">\1</name>', osis, flags=re.DOTALL)
+
+        # \k_...\k*  ###TESTED###
+        osis = re.sub(r'\\k\s+(.+?)\\k\*', r'<seg type="keyword">\1</seg>', osis, flags=re.DOTALL)
+
+        # \lit
+        osis = re.sub(r'\\lit\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'󠁰<p type="x-liturgical">\n' + m.group(1) + u'󠁰</p>\n', osis, flags=re.DOTALL)
+
+        # \dc_...\dc*  #### TODO: Find an example---should this really be transChange?
+        osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'<transChange type="added" editions="dc">\1</transChange>', osis, flags=re.DOTALL)
+
+        # \sls_...\sls*
+        # (the template used '/1' instead of the group reference '\1', so the
+        # marked text was replaced by the literal characters "/1")
+        osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>\1</foreign>', osis, flags=re.DOTALL)  # find a better mapping than <foreign>?
+
+        return osis
+
+
+    def cvtCharacterStyling(osis, relaxedConformance):
+        r"""Convert the character-styling markers to <hi> elements.
+
+        supported: \em...\em*, \bd...\bd*, \it...\it*, \bdit...\bdit*, \no...\no*, \sc...\sc*
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.
+        """
+        # (marker name, replacement template), applied in this order
+        styles = (
+            ('em',   r'<hi type="emphasis">\1</hi>'),                       # \em_...\em*
+            ('bd',   r'<hi type="bold">\1</hi>'),                           # \bd_...\bd*
+            ('it',   r'<hi type="italic">\1</hi>'),                         # \it_...\it*  ###TESTED###
+            ('bdit', r'<hi type="bold"><hi type="italic">\1</hi></hi>'),    # \bdit_...\bdit*
+            ('no',   r'<hi type="normal">\1</hi>'),                         # \no_...\no*
+            ('sc',   r'<hi type="small-caps">\1</hi>'),                     # \sc_...\sc*
+            )
+        for marker, template in styles:
+            # opening marker, whitespace, shortest possible content, closing marker
+            pattern = r'\\' + marker + r'\s+(.+?)\\' + marker + r'\*'
+            osis = re.sub(pattern, template, osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def cvtSpacingAndBreaks(osis, relaxedConformance):
+        r"""Convert the spacing and break markers.
+
+        supported: ~, //, \pb
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the text with '~' replaced by a no-break space, '//' removed
+        and \pb replaced by a page-break milestone.
+        """
+        # ~
+        # U+00A0 NO-BREAK SPACE.  The escape needs a unicode literal and four
+        # hex digits; '\uA0' in a Python 2 byte string is just the four
+        # characters backslash, 'u', 'A', '0'.
+        osis = osis.replace('~', u'\u00A0')
+
+        # //
+        osis = osis.replace('//', '')
+
+        # \pb
+        osis = re.sub(r'\\pb\s*', '<milestone type="pb"/>\n', osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def cvtSpecialFeatures(osis, relaxedConformance):
+        r"""Convert figures, index entries and word-list markers.
+
+        handled: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh*
+
+        osis -- the text being converted
+        relaxedConformance -- not used by this step
+
+        Returns the converted text.
+        """
+        # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig*
+        def makeFigure(matchObject):
+            # Build a <figure> element from the seven '|'-separated fields.
+            # A match object cannot be unpacked directly and the pattern has
+            # no named groups, so the fields are taken from groups().
+            fig_desc, fig_file, fig_size, fig_loc, fig_copy, fig_cap, fig_ref = [field.strip() for field in matchObject.groups()]
+            figure = '<figure'
+            if fig_file:
+                figure += ' src="' + fig_file + '"'
+            if fig_size:
+                figure += ' size="' + fig_size + '"'
+            if fig_copy:
+                figure += ' rights="' + fig_copy + '"'
+            # TODO: implement parsing in osisParse(Bible reference string), then:
+            # if fig_ref:
+            #     figure += ' annotateRef="' + osisParse(fig_ref) + '"'
+            figure += '>\n'
+            if fig_cap:
+                figure += '<caption>' + fig_cap + '</caption>\n'
+            if fig_ref:
+                figure += '<reference>' + fig_ref + '</reference>\n'
+            # description and location have no element of their own here and
+            # are kept as comments
+            if fig_desc:
+                figure += '<!-- fig DESC - ' + fig_desc + ' -->\n'
+            if fig_loc:
+                figure += '<!-- fig LOC - ' + fig_loc + ' -->\n'
+            figure += '</figure>'
+            return figure
+        osis = re.sub(r'\\fig\b\s+([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\\]*)\s*\\fig\*', makeFigure, osis)
+
+        # \ndx_...\ndx*
+        osis = re.sub(r'\\ndx\s+(.+?)(\s*)\\ndx\*', r'\1<index index="Index" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+        # \pro_...\pro*
+        # the pronunciation becomes the xlit attribute of the preceding word
+        osis = re.sub(r'([^\s]+)(\s*)\\pro\s+(.+?)(\s*)\\pro\*', r'<w xlit="\3">\1</w>\2\4', osis, flags=re.DOTALL)
+
+        # \w_...\w*
+        # (index name was misspelled "Glossay")
+        osis = re.sub(r'\\w\s+(.+?)(\s*)\\w\*', r'\1<index index="Glossary" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+        # \wg_...\wg*
+        osis = re.sub(r'\\wg\s+(.+?)(\s*)\\wg\*', r'\1<index index="Greek" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+        # \wh_...\wh*
+        osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1<index index="Hebrew" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def cvtPeripherals(osis, relaxedConformance):
+        r"""
+        Peripherals
+        marker rewritten here: \periph (listed as unsupported in the original notes)
+
+        Each \periph line and the content that follows it, up to the next \periph
+        or the book-closing marker, is wrapped in a <div>.
+
+        osis -- the text being converted; returned with the \periph sections wrapped.
+        relaxedConformance -- accepted for a uniform converter signature; not consulted here.
+        """
+        # \periph
+        def tagPeriph(matchObject):
+            # A match object is not iterable, and the regex has a third group
+            # inside its lookahead, so the two wanted groups are selected by number.
+            periphType,contents = matchObject.group(1, 2)
+            periph = '<div type="'
+            if periphType in peripherals:
+                periph += peripherals[periphType]
+            elif periphType in introPeripherals:
+                periph += 'introduction" subType="x-' + introPeripherals[periphType]
+            else:
+                periph += 'x-unknown'
+            periph += '">\n' +  contents + '</div>\n'
+            return periph
+        osis = re.sub(r'\\periph\s+([^\n]+)\s*\n(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
+
+        return osis
+
+
+    def cvtStudyBibleContent(osis, relaxedConformance):
+        r"""
+        Study Bible Content
+        supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat
+
+        osis -- the text being converted; returned with the markers above replaced.
+        relaxedConformance -- accepted for a uniform converter signature; not consulted here.
+
+        Some of the replacement strings contain invisible marker characters.
+        """
+        # \ef...\ef*
+        # The caller token after the marker decides the n attribute: '-' gives n="", '+' gives none, anything else is used verbatim.
+        osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'󠁆</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
+
+        # \ex...\ex*
+        osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference>󠁆</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
+
+        # \esb...\esbe  ### TODO: this likely needs to go much earlier in the process
+        # The backreference is kept in a raw string so that \1 is not read as chr(1);
+        # the marker characters are unicode literals, like the text they are merged into.
+        osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', u'󠀰<div type="x-sidebar">' + r'\1' + u'</div>󠀰\n', osis, flags=re.DOTALL)
+
+        # \cat_<TAG>\cat*
+        osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis)
+
+        return osis
+
+
+    def cvtPrivateUseExtensions(osis, relaxedConformance):
+        r"""
+        \z namespace (private-use extensions)
+        supported: \z<Extension>
+        The meaning of these markers cannot be known here, so each one is kept
+        as a <milestone/> whose type is "x-z-" followed by the extension name.
+
+        osis -- the text being converted; returned with the markers replaced.
+        relaxedConformance -- not consulted here.
+        """
+        # \z followed by every non-whitespace character up to the next space
+        privateUseMarker = re.compile(r'\\z([^\s]+)')
+        return privateUseMarker.sub(r'<milestone type="x-z-\1"/>', osis)
+
+
+    def processOsisIDs(osis):
+        """
+        Expand verse ranges and series into one placeholder ID per verse, then
+        replace the $BOOK$ and $CHAP$ placeholders with the values found in each
+        book div and chapter tag.
+
+        The text is split on invisible marker characters into book chunks and,
+        within a book, chapter chunks; the markers are not re-inserted.
+        """
+        def placeholderIDs(verseNumbers):
+            # one '$BOOK$.$CHAP$.<n>' token per verse, space separated
+            return ' '.join(['$BOOK$.$CHAP$.'+str(n) for n in verseNumbers])
+
+        # expand verse ranges, e.g. 1-3
+        def expandRange(vRange):
+            bounds = re.findall(r'\d+', vRange)
+            return placeholderIDs(range(int(bounds[0]), int(bounds[1])+1))
+        osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+-\d+)"', lambda m: expandRange(m.group(1))+'"', osis)
+
+        # expand verse series, e.g. 5,7
+        def expandSeries(vSeries):
+            return placeholderIDs(re.findall(r'\d+', vSeries))
+        osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis)
+
+
+        # fill in book & chapter values
+        filledBooks = []
+        for bookChunk in osis.split(u'󠁂'):
+            bookMatch = re.search(r'<div type="book" osisID="([^"]+?)"', bookChunk)
+            if bookMatch:
+                bookChunk = bookChunk.replace('$BOOK$', bookMatch.group(1))
+                filledChapters = []
+                for chapChunk in bookChunk.split(u'󠁃'):
+                    chapMatch = re.search(r'<chapter osisID="[^\."]+\.([^"]+)', chapChunk)
+                    if chapMatch:
+                        chapChunk = chapChunk.replace('$CHAP$', chapMatch.group(1))
+                    filledChapters.append(chapChunk)
+                bookChunk = ''.join(filledChapters)
+            filledBooks.append(bookChunk)
+        return ''.join(filledBooks)
+
+
+    def osisReorderAndCleanup(osis):
+        """
+        Final pass over the converted text: move verse/chapter end milestones
+        relative to neighbouring tags, drop attributes from end tags, delete the
+        invisible marker characters, and tidy spaces and blank lines.
+
+        The substitutions depend on one another, so their order must be kept.
+        Several patterns below contain invisible marker characters.
+        """
+        # assorted re-orderings
+        osis = re.sub(u'(󠁰<chapter eID=.+?\n)(<verse eID=.+?>󠁖)\n?', r'\2\n\1', osis)
+        osis = re.sub(u'([󠀰󠀱󠀲]</div>)([^󠀰󠀱󠀲]*<chapter eID.+?>)', r'\2\1', osis)
+        osis = re.sub(u'(󠁰</p>\n?󠁰<p>)\n?(<verse eID=.+?>󠁖)\n?', r'\2\n\1\n', osis)
+        osis = re.sub(u'\n(<verse eID=.+?>󠁖)', r'\1\n', osis)
+        osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[󠁖\n]*<verse osisID=.+?>)', r'\2\1', osis)
+
+        # delete attributes from end tags (since they are invalid)
+        osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
+        osis = osis.replace(r'<lb type="p"/>', r'<lb/>')
+        # delete Unicode tags
+        for c in u'󠁂󠁃󠁖󠁰󠁄󠀰󠀱󠀲󠀳󠀴󠀵':
+            osis = osis.replace(c, '')
+
+        for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']:
+            # drop spaces in front of a closing tag
+            osis = re.sub(r' +</'+endBlock+r'>', r'</'+endBlock+r'>', osis)
+            # drop spaces in front of an eID milestone; the tag must stay an
+            # opening-style '<name eID=.../>' (a '</' here would be malformed XML)
+            osis = re.sub(r' +<'+endBlock+r'( eID=[^/>]+/>)', r'<'+endBlock+r'\1', osis)
+        # move spaces from before a run of closing tags to after it
+        osis = re.sub(r' +((</[^>]+>)+) *', r'\1 ', osis)
+
+        # strip extra spaces & newlines
+        osis = re.sub(r'  +', r' ', osis)
+        osis = re.sub(r' ?\n\n+', r'\n', osis)
+        return osis
+
+
+    ### Processing starts here
+    if encoding:
+        osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
+    else:
+        encoding = 'utf-8'
+        osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
+        # \ide_<ENCODING>
+        encoding = re.search(r'\\ide\s+(.+)\n', osis)
+        if encoding:
+            encoding = encoding.group(1).lower()
+            if encoding != 'utf-8':
+                if encoding in aliases:
+                    osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
+                else:
+                    print('Encoding unknown, processing as UTF-8.')
+
+
+    # call individual conversion processors in series
+    osis = cvtPreprocess(osis, relaxedConformance)
+    osis = cvtIdentification(osis, relaxedConformance)
+    osis = cvtIntroductions(osis, relaxedConformance)
+    osis = cvtTitles(osis, relaxedConformance)
+    osis = cvtChaptersAndVerses(osis, relaxedConformance)
+    osis = cvtParagraphs(osis, relaxedConformance)
+    osis = cvtPoetry(osis, relaxedConformance)
+    osis = cvtTables(osis, relaxedConformance)
+    osis = cvtFootnotes(osis, relaxedConformance)
+    osis = cvtCrossReferences(osis, relaxedConformance)
+    osis = cvtSpecialText(osis, relaxedConformance)
+    osis = cvtCharacterStyling(osis, relaxedConformance)
+    osis = cvtSpacingAndBreaks(osis, relaxedConformance)
+    osis = cvtSpecialFeatures(osis, relaxedConformance)
+    osis = cvtPeripherals(osis, relaxedConformance)
+    osis = cvtStudyBibleContent(osis, relaxedConformance)
+    osis = cvtPrivateUseExtensions(osis, relaxedConformance)
+
+    osis = processOsisIDs(osis)
+    osis = osisReorderAndCleanup(osis)
+
+    # change type on special books
+    for sb in specialBooks:
+        osis = osis.replace('<div type="book" osisID="' + sb  + '">', '<div type="' + sb.lower() + '">')
+
+    return osis
+
+
+
+def writeOSISHeader(oFile, workID, lang='en'):
+    """Write the XML declaration, the opening <osis>/<osisText> tags and a
+    minimal <header> for workID to oFile in a single write() call.
+
+    oFile  -- object with a write() method
+    workID -- used for both osisIDWork and the <work osisWork> attribute
+    lang   -- value of xml:lang on <osisText>
+
+    workID and lang are inserted as given, without XML escaping.
+    """
+    osisNamespace = 'http://www.bibletechnologies.net/2003/OSIS/namespace'
+    schemaURL = 'http://www.bibletechnologies.net/osisCore.' + OSISversion + '.xsd'
+    headerLines = [
+        '<?xml version="1.0" encoding="UTF-8"?>',
+        '<osis xmlns="' + osisNamespace + '" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="' + osisNamespace + ' ' + schemaURL + '">',
+        '<osisText osisRefWork="Bible" xml:lang="' + lang + '" osisIDWork="' + workID + '">',
+        '<header>',
+        '<work osisWork="' + workID + '"/>',
+        '</header>',
+    ]
+    oFile.write('\n'.join(headerLines) + '\n')
+
+def writeOSISFooter(oFile):
+    """Write the closing </osisText> and </osis> tags to oFile in one write() call."""
+    footer = '</osisText>\n</osis>\n'
+    oFile.write(footer)
+
+def verbosePrint(text):
+    """Print text, but only when the module-level verbose flag is true."""
+    if not verbose:
+        return
+    print(text)
+
+def printUnhandled():
+    """Placeholder: performs no work and returns None."""
+    return None
+
+
+def printUsage():
+    """Print the command-line help, then end the program with exit().
+
+    The list of supported encodings goes through verbosePrint(), so it is
+    shown only when the verbose flag is set.
+    """
+    usageLines = (
+        'usfm2osis.py -- USFM ' + USFMversion + ' to OSIS ' + OSISversion + ' converter version ' + scriptVersion,
+        '                Revision: ' + rev + ' (' + date + ')',
+        '',
+        'Usage: usfm2osis.py <osisWork> [OPTION] ...  <USFM filename|wildcard> ...',
+        '',
+        '  -e ENCODING      input encoding override (default is to read the USFM file\'s',
+        '                     \\ide value or assume UTF-8 encoding in its absence)',
+        '  -h, --help       print this usage information',
+        '  -o FILENAME      output filename (default is: <osisWork>.osis.xml)',
+        '  -r               enable relaxed markup processing (for non-standard USFM)',
+        '  -v               verbose feedback',
+        '',
+        'As an example, if you want to generate the osisWork <Bible.KJV> and your USFM',
+        '  are located in the ./KJV folder, enter:',
+        '    python usfm2osis.py Bible.KJV ./KJV/*.usfm',
+    )
+    for usageLine in usageLines:
+        print(usageLine)
+    verbosePrint('')
+    verbosePrint('Supported encodings: ' + ', '.join(aliases))
+    exit()
+
+class Worker(multiprocessing.Process):
+    """Process that takes file names off work_queue, converts each one with
+    convertToOSIS() and puts (file name, OSIS text) tuples on result_queue.
+    """
+    def __init__(self, work_queue, result_queue):
+        super(Worker, self).__init__()
+
+        # queues shared with the parent process
+        self.work_queue = work_queue
+        self.result_queue = result_queue
+        # checked before each job; nothing in this class sets it to True
+        self.kill_received = False
+
+    def run(self):
+        """Convert jobs until the kill flag is set or the work queue reports empty."""
+        while True:
+            if self.kill_received:
+                return
+
+            # non-blocking fetch: an empty queue ends this worker
+            try:
+                job = self.work_queue.get_nowait()
+            except Queue.Empty:
+                return
+
+            # convert, then hand the result back keyed by its job name
+            self.result_queue.put((job, convertToOSIS(job)))
+
+
+if __name__ == "__main__":
+    # Command-line driver: parse the options, convert every USFM file in
+    # parallel worker processes, then write one combined OSIS document.
+    # encoding and relaxedConformance are assigned at module level here, so no
+    # `global` statement is needed.
+
+    num_processes = multiprocessing.cpu_count()
+    num_jobs = num_processes
+
+    encoding = ''
+    relaxedConformance = False
+    inputFilesIdx = 2 # This marks the point in the sys.argv array, after which all values represent USFM files to be converted.
+
+    if '-v' in sys.argv:
+        verbose = True
+        inputFilesIdx += 1
+    else:
+        verbose = False
+
+    if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
+        printUsage()
+    else:
+        OSISwork = sys.argv[1]
+
+        if '-o' in sys.argv:
+            i = sys.argv.index('-o')+1
+            if len(sys.argv) < i+1:
+                printUsage()
+            OSISfileName = sys.argv[i]
+            inputFilesIdx += 2 # increment 2, reflecting 2 args for -o
+        else:
+            OSISfileName = OSISwork + '.osis.xml'
+
+        if '-e' in sys.argv:
+            i = sys.argv.index('-e')+1
+            if len(sys.argv) < i+1:
+                printUsage()
+            encoding = sys.argv[i]
+            inputFilesIdx += 2 # increment 2, reflecting 2 args for -e
+
+        if '-r' in sys.argv:
+            relaxedConformance = True
+            inputFilesIdx += 1
+
+        # NOTE(review): this assumes every option precedes the file names on the command line
+        usfmDocList = sys.argv[inputFilesIdx:]
+
+        OSISfile = codecs.open(OSISfileName, 'w', 'utf-8')
+        try:
+            writeOSISHeader(OSISfile, OSISwork)
+
+
+            # run
+            # load up work queue
+            work_queue = multiprocessing.Queue()
+            for job in sorted(usfmDocList, key=keynat):
+                work_queue.put(job)
+
+            # create a queue to pass to workers to store the results
+            result_queue = multiprocessing.Queue()
+
+            # spawn workers
+            workers = []
+            for i in range(num_processes):
+                worker = Worker(work_queue, result_queue)
+                worker.start()
+                workers.append(worker)
+
+            # collect the results off the queue
+            osisSegment = dict()
+            for i in usfmDocList:
+                k,v=result_queue.get()
+                osisSegment[k]=v
+
+            # every result has been received, so the workers have nothing left
+            # to flush and can be joined without blocking on the queue
+            for worker in workers:
+                worker.join()
+
+            unhandledTags = set()
+            for doc in sorted(usfmDocList, key=keynat):
+                # anything still starting with a backslash was not converted
+                unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
+                OSISfile.write(osisSegment[doc])
+            writeOSISFooter(OSISfile)
+        finally:
+            # make sure the output is flushed and the handle released
+            OSISfile.close()
+
+        if unhandledTags:
+            if verbose:
+                print('')
+            print('Unhandled USFM tags: ' + ', '.join(sorted(unhandledTags)) + ' (' + str(len(unhandledTags)) + ' total)')
+            if not relaxedConformance:
+                print('Consider using the -r option for relaxed markup processing.')


Property changes on: trunk/modules/python/usfm2osis.py
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/modules/python/usfmtags.py
===================================================================
--- trunk/modules/python/usfmtags.py	                        (rev 0)
+++ trunk/modules/python/usfmtags.py	2012-08-04 11:10:27 UTC (rev 360)
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+date = '$Date: 2012-03-09 01:23:40 -0800 (Fri, 09 Mar 2012) $'
+rev = '$Rev: 355 $'
+
+USFMversion = '2.35'  # http://ubs-icap.org/chm/usfm/2.35/index.html
+
+# usfmtags.py version 1.0
+# Copyright 2012 by the CrossWire Bible Society <http://www.crosswire.org/>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# The full text of the GNU General Public License is available at:
+# <http://www.gnu.org/licenses/gpl-3.0.txt>.
+
+import re, sys, codecs
+
+date = date.replace('$', '').strip()[6:16]
+rev = rev.replace('$', '').strip()[5:]
+
+simpleTags = (['\\id', '\\ide', '\\sts', '\\rem', '\\h', '\\toc1', '\\toc2', '\\toc3', '\\ip', '\\ipi', '\\im', '\\imi', '\\ipq', '\\imq', '\\ipr', '\\ib', '\\ili', '\\iot', '\\ior', '\\ior*', '\\iex', '\\iqt', '\\iqt*', '\\imte', '\\ie', '\\mr', '\\sr', '\\r', '\\rq', '\\rq*', '\\d', '\\sp', '\\c', '\\ca', '\\ca*', '\\cl', '\\cp', '\\cd', '\\v', '\\va', '\\va*', '\\vp', '\\vp*', '\\p', '\\m', '\\pmo', '\\pm', '\\pmc', '\\pmr', '\\mi', '\\nb', '\\cls', '\\pc', '\\pr', '\\b', '\\qr', '\\qc', '\\qs', '\\qs*', '\\qa', '\\qac', '\\qac*', '\\tr', '\\f', '\\f*', '\\fe', '\\fe*', '\\fr', '\\fk', '\\fq', '\\fqa', '\\fl', '\\fp', '\\fv', '\\ft', '\\fdc', '\\fdc*', '\\fm', '\\fm*', '\\x', '\\x*', '\\xo', '\\xk', '\\xq', '\\xt', '\\xot', '\\xot*', '\\xnt', '\\xnt*', '\\xdc', '\\xdc*', '\\add', '\\add*', '\\bk', '\\bk*', '\\dc', '\\dc*', '\\k', '\\k*', '\\lit', '\\nd', '\\nd*', '\\ord', '\\ord*', '\\pn', '\\pn*', '\\qt', '\\qt*', '\\sig', '\\sig*', '\\sls', '\\sls*', '\\tl', '\\tl*', '\\wj', '\\wj*', '\\em', '\\em*', '\\bd', '\\bd*', '\\it', '\\it*', '\\bdit', '\\bdit*', '\\no', '\\no*', '\\sc', '\\sc*', '\\pb', '\\fig', '\\fig*', '\\ndx', '\\ndx*', '\\pro', '\\pro*', '\\w', '\\w*', '\\wg', '\\wg*', '\\wh', '\\wh*', '\\periph', '\\ef', '\\ef*', '\\ex', '\\ex*', '\\esb', '\\esbe', '\\cat', '\\z'])
+digitTags = set(['\\imt', '\\is', '\\iq', '\\io', '\\mt', '\\mte', '\\ms', '\\s', '\\pi', '\\li', '\\ph', '\\q', '\\qm', '\\th', '\\thr', '\\tc', '\\tcr'])
+
+def main(argv):
+    """Scan the USFM files named in argv[1:] and print two sorted tag lists:
+    tags found in simpleTags or digitTags, and tags found in neither.
+
+    argv -- full argument vector; argv[0] is ignored. With -h, --help or no
+            file arguments, printUsage() is called instead.
+    Files are read as UTF-8.
+    """
+    tagSet = set()
+    knownSet = set()
+    unknownSet = set()
+
+    if '-h' in argv or '--help' in argv or len(argv) < 2:
+        printUsage()
+    else:
+        for doc in argv[1:]:
+            # close each input as soon as it has been read
+            with codecs.open(doc, 'r', 'utf-8') as usfmFile:
+                text = usfmFile.read()
+            tagSet.update(re.findall(r'(\\[a-zA-Z0-9]+\b\*?)', text))
+
+        for tag in tagSet:
+            # numbered tags such as \q2 are looked up without their trailing digits
+            if tag in simpleTags or tag.rstrip('1234567890') in digitTags:
+                knownSet.add(tag)
+            else:
+                unknownSet.add(tag)
+
+        print('Known USFM Tags: ' + ', '.join(sorted(knownSet)))
+        print('Unrecognized USFM Tags: ' + ', '.join(sorted(unknownSet)))
+        
+
+
+def printUsage():
+    """Print the usfmtags.py help text, then end the program with exit()."""
+    usageLines = [
+        'usfmtags.py <USFM filenames|wildcard>',
+        ' Revision: ' + rev + ' (' + date + ')',
+        '',
+        ' This utility will scan USFM files and print two lists of all unique tags in them.',
+        ' The first list identifies all valid tags, identified in the USFM ' + USFMversion + ' spec.',
+        ' The second list identifies tags unknown to that spec.',
+    ]
+    for usageLine in usageLines:
+        print(usageLine)
+    exit()
+
+if __name__ == "__main__":
+    main(sys.argv)




More information about the sword-cvs mailing list