[sword-svn] r375 - trunk/modules/python

chrislit at crosswire.org chrislit at crosswire.org
Fri Aug 10 10:09:14 MST 2012


Author: chrislit
Date: 2012-08-10 10:09:14 -0700 (Fri, 10 Aug 2012)
New Revision: 375

Modified:
   trunk/modules/python/usfm2osis.py
Log:
cleaned up spacing in output
fixed output validation errors due to addition of intro tags


Modified: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py	2012-08-10 12:55:16 UTC (rev 374)
+++ trunk/modules/python/usfm2osis.py	2012-08-10 17:09:14 UTC (rev 375)
@@ -35,7 +35,7 @@
 # Employ best-practice conformant OSIS
 # Employ modularity (functions rather than a big long script)
 # Employ the same command-line syntax as usfm2osis.pl
-# Use & abuse Unicode tags (http://unicode.org/charts/PDF/UE0000.pdf) to simplify Regex processing
+# Use non-characters for milestoning
 
 ### Roadmap:
 # 0.5 initial commit, including full coverage of core USFM tags
@@ -49,8 +49,8 @@
 # 1.x SWORD module output?, requiring SWORD bindings
 
 ### Key to non-characters:
-# Used   : ﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟
-# Unused : ﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯
+# Used   : ﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦
+# Unused : ﷧﷨﷩﷪﷫﷬﷭﷮﷯
 # ﷐ book
 # ﷑ chapter
 # ﷒ verse
@@ -67,6 +67,13 @@
 # ﷝ s4
 # ﷞ s5
 # ﷟ notes
+# ﷠ intro-list
+# ﷡ intro-outline
+# ﷢ is1
+# ﷣ is2
+# ﷤ is3
+# ﷥ is4
+# ﷦ is5
 
 import sys, codecs, re
 from encodings.aliases import aliases
@@ -325,7 +332,7 @@
         # \rem_text...
         osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis)
         
-        # \restore: unpublished, seek example
+        # \restore_text...
         if relaxedConformance:
             osis = re.sub(r'\\restore\b\s+(.+)', r'<!-- restore - \1 -->', osis)
 
@@ -359,20 +366,23 @@
         # \imt#_text...
         osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction">' + m.group(2) + '</title>', osis)
 
+        # \imte#_text...
+        osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis)
+
         # \is#_text...
         osis = re.sub(r'\\is1?\s+(.+)', lambda m: u'﷚<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷚[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, re.DOTALL)
-        osis = re.sub(r'\\is2\s+(.+)', lambda m: u'﷛<div type="subsection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷛[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'</div>﷛\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷚[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\is2\s+(.+)', lambda m: u'﷛<div type="subSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(﷛[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'</div>﷛\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\is3\s+(.+)', lambda m: u'﷜<div type="x-subSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷜[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'</div>﷜\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷜[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'</div>﷜\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\is4\s+(.+)', lambda m: u'﷝<div type="x-subSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷝[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'</div>﷝\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷝[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'</div>﷝\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\is5\s+(.+)', lambda m: u'﷞<div type="x-subSubSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷞[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷞[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, flags=re.DOTALL)
 
         # \ip_text...
-        osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p subType="x-introduction">\n' + m.group(1) + u'﷓</p>\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p subType="x-introduction">\n' + m.group(1) + u'﷓</p>\n', osis, flags=re.DOTALL)
 
         # \ipi_text...
         # \im_text...
@@ -381,11 +391,11 @@
         # \imq_text...
         # \ipr_text...
         pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'}
-        osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p type="' + pType[m.group(1)]  + '" subType="x-introduction">\n' + m.group(2) + u'﷓</p>\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p type="' + pType[m.group(1)]  + '" subType="x-introduction">\n' + m.group(2) + u'﷓</p>\n', osis, flags=re.DOTALL)
 
         # \iq#_text...
-        osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL)
-        osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)
 
         # \ib
         osis = re.sub(r'\\ib\b\s?', '<lb type="x-p"/>', osis)
@@ -394,18 +404,18 @@
         osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
 
         # \ili#_text...
-        osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL)
-        osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1" subType="x-introduction">\2</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-1" subType="x-introduction">﷠\1﷠</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-\1" subType="x-introduction">﷠\2﷠</item>', osis, flags=re.DOTALL)
         osis = osis.replace('\n</item>', '</item>\n')
-        osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL)
+        osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', ur'﷓<list>\1</list>﷓', osis, flags=re.DOTALL)
 
         # \iot_text...
         # \io#_text...(references range)
-        osis = re.sub(r'\\io\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL)
-        osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1" subType="x-introduction">\2</item>', osis, flags=re.DOTALL)
-        osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="head">\1</item type="head">', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\io\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-1" subType="x-introduction">﷡\1﷡</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-\1" subType="x-introduction">﷡\2﷡</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', ur'<item type="head">﷡\1﷡</item type="head">', osis, flags=re.DOTALL)
         osis = osis.replace('\n</item>', '</item>\n')
-        osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', r'<div type="outline"><list>\1</list></div>', osis, flags=re.DOTALL)
+        osis = re.sub(u'(<item [^﷐﷑﷓﷔﷠]+</item>)', ur'﷓<div type="outline"><list>\1</list></div>﷓', osis, flags=re.DOTALL)
         osis = re.sub('item type="head"', 'head', osis)
 
         # \ior_text...\ior*
@@ -417,9 +427,6 @@
         # \iqt_text...\iqt*
         osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'<q subType="x-introduction">\1</q>', osis, flags=re.DOTALL)
 
-        # \imte#_text...
-        osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis)
-
         # \ie
         osis = re.sub(r'\\ie\b\s*', '<milestone type="x-usfm-ie"/>', osis)
 
@@ -433,33 +440,33 @@
         """
         # \ms#_text...
         osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'﷕<div type="majorSection"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷕[^﷕﷐]+)', r'\1'+u'</div>﷕\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷕[^﷕﷐]+)', r'\1'+u'</div>﷕\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'﷖<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷖[^﷕﷐﷖]+)', r'\1'+u'</div>﷖\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷖[^﷕﷐﷖]+)', r'\1'+u'</div>﷖\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'﷗<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷗[^﷕﷐﷖﷗]+)', r'\1'+u'</div>﷗\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷗[^﷕﷐﷖﷗]+)', r'\1'+u'</div>﷗\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\ms4\s+(.+)', lambda m: u'﷘<div type="majorSection" n="4"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷘[^﷕﷐﷖﷗﷘]+)', r'\1'+u'</div>﷘\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷘[^﷕﷐﷖﷗﷘]+)', r'\1'+u'</div>﷘\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\ms5\s+(.+)', lambda m: u'﷙<div type="majorSection" n="5"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷙[^﷕﷐﷖﷗﷘﷙]+)', r'\1'+u'</div>﷙\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷙[^﷕﷐﷖﷗﷘﷙]+)', r'\1'+u'</div>﷙\n', osis, flags=re.DOTALL)
 
         # \mr_text...
         osis = re.sub(r'\\mr\s+(.+)', u'﷔<title type="scope"><reference>'+r'\1</reference></title>', osis)
 
         # \s#_text...
         osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'﷚<div type="section"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷚[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷚<div type="section">[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, flags=re.DOTALL)
         if relaxedConformance:
             osis = re.sub(r'\\ss\s+', r'\\s2 ', osis)
             osis = re.sub(r'\\sss\s+', r'\\s3 ', osis)
-        osis = re.sub(r'\\s2\s+(.+)', lambda m: u'﷛<div type="subsection"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷛[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'</div>﷛\n', osis, re.DOTALL)
+        osis = re.sub(r'\\s2\s+(.+)', lambda m: u'﷛<div type="subSection"><title>' + m.group(1) + '</title>', osis)
+        osis = re.sub(u'(﷛<div type="subSection">[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'</div>﷛\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\s3\s+(.+)', lambda m: u'﷜<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷜[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'</div>﷜\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷜<div type="x-subSubSection">[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'</div>﷜\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\s4\s+(.+)', lambda m: u'﷝<div type="x-subSubSubSection"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷝[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'</div>﷝\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷝<div type="x-subSubSubSection">[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'</div>﷝\n', osis, flags=re.DOTALL)
         osis = re.sub(r'\\s5\s+(.+)', lambda m: u'﷞<div type="x-subSubSubSubSection"><title>' + m.group(1) + '</title>', osis)
-        osis = re.sub(u'(﷞[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, re.DOTALL)
+        osis = re.sub(u'(﷞<div type="x-subSubSubSubSection">[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, flags=re.DOTALL)
 
         # \sr_text...
         osis = re.sub(r'\\sr\s+(.+)', ur'﷔<title type="scope"><reference>\1</reference></title>', osis)
@@ -564,10 +571,10 @@
         # \li#(_text...)
         osis = re.sub(r'\\ph\b\s*', r'\\li ', osis)
         osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis)
-        osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
-        osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
         osis = osis.replace('\n</item>', '</item>\n')
-        osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL)
+        osis = re.sub(u'(<item [^﷐﷑﷓﷔﷠﷡]+</item>)', ur'﷓<list>\1</list>﷓', osis, flags=re.DOTALL)
 
         # \b
         osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)
@@ -591,14 +598,14 @@
         osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)
 
         # \q#(_text...)
-        osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
-        osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
 
         # \qr_text...
         # \qc_text...
         # \qm#(_text...)
         qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
-        osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
 
         osis = osis.replace('\n</l>', '</l>\n')
         osis = re.sub(u'(<l [^﷐﷑﷓﷔]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
@@ -649,7 +656,7 @@
         note = re.sub(r'\\ft\s', '', note)
 
         # \fr_##SEP##
-        note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<reference>\1</reference>', note)
+        note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<reference type="annotateRef">\1</reference>', note)
 
         # \fk_
         note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
@@ -712,21 +719,20 @@
         # \xq_
         note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
 
-        # \xt_
-        note = re.sub(r'\\xt\s', '', note)
+        # \xo_##SEP##
+        note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<reference type="annotateRef">\1</reference>', note)
 
+        # \xk_
+        note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
+
+        # \xt_  # This isn't guaranteed to be *the* reference, but it's a good guess.
+        note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<reference>\1</reference>', note)
+        
         if relaxedConformance:
             # TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
             # TODO: \xtSeeAlso...\xtSeeAlso*: Concordance and Names Index markup for an additional entry target reference.
             pass
 
-
-        # \xo_##SEP##
-        note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<reference>\1</reference>', note)
-
-        # \xk_
-        note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
-
         if relaxedConformance:
             note = note.replace(r'\xq*', '')
             note = note.replace(r'\xt*', '')
@@ -743,7 +749,7 @@
         supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc*
         """
         # \x_+_...\x*
-        osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference>﷟</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
 
         osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
 
@@ -871,7 +877,7 @@
             if fig_cap:
                 figure += '<caption>' + fig_cap + '</caption>\n'
             if fig_ref:
-                figure += '<reference>' + fig_ref + '</reference>\n'
+                figure += '<reference type="annotateRef">' + fig_ref + '</reference>\n'
             if fig_desc:
                 figure += '<!-- fig DESC - ' + fig_desc + ' -->\n'
             if fig_loc:
@@ -1019,12 +1025,12 @@
         osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
         osis = osis.replace('<lb type="x-p"/>', '<lb/>')
         # delete the non-character milestone markers (see "Key to non-characters")
-        for c in u'﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟':
+        for c in u'﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯':
             osis = osis.replace(c, '')
 
-        for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']:
-            osis = re.sub(' +</'+endBlock+'>', '</'+endBlock+r'>', osis)
-            osis = re.sub(' +<'+endBlock+'( eID=[^/>]+/>)', '</'+endBlock+r'\1', osis)
+        for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse', 'head', 'title', 'item', 'list']:
+            osis = re.sub('\s+</'+endBlock+'>', '</'+endBlock+r'>\n', osis)
+            osis = re.sub('\s+<'+endBlock+'( eID=[^/>]+/>)', '<'+endBlock+r'\1'+'\n', osis)
         osis = re.sub(' +((</[^>]+>)+) *', r'\1 ', osis)
 
         # strip extra spaces & newlines




More information about the sword-cvs mailing list