[jsword-svn] r1934 - in trunk: common/src/main/java/org/crosswire/common/xml jsword/src/main/java/org/crosswire/jsword/book/filter/thml

dmsmith at crosswire.org dmsmith at crosswire.org
Tue Feb 24 04:39:08 MST 2009


Author: dmsmith
Date: 2009-02-24 04:39:08 -0700 (Tue, 24 Feb 2009)
New Revision: 1934

Modified:
   trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java
   trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java
Log:
Allow ThML to contain <br>, <img> and <hr>

Modified: trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java
===================================================================
--- trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java	2009-02-24 03:13:27 UTC (rev 1933)
+++ trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java	2009-02-24 11:39:08 UTC (rev 1934)
@@ -253,6 +253,23 @@
     }
 
     /**
+     * Closes the empty HTML tags &lt;br&gt;, &lt;hr&gt; and &lt;img&gt; that
+     * were left open, since an unclosed tag makes XML parsing fail.
+     *
+     * @param broken the string to be cleaned, may be null
+     * @return the cleaned string, or null if the input was null
+     */
+    public static String closeEmptyTags(String broken)
+    {
+        if (broken != null)
+        {
+            return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");  //$NON-NLS-1$
+        }
+
+        return null;
+    }
+
+    /**
      * XML parse failed, so we can try getting rid of all the tags and having
      * another go. We define a tag to start at a &lt; and end at the end of the
      * next word (where a word is what comes in between spaces) that does not
@@ -489,4 +506,9 @@
      * Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
      */
     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$
+   
+    /**
+     * Pattern that matches open &lt;br&gt;, &lt;hr&gt; and &lt;img&gt; tags. The word boundary after the name keeps longer element names (e.g. &lt;break&gt;) from matching.
+     */
+    private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)\\b([^>]*)(?<!/)>"); //$NON-NLS-1$
 }

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java	2009-02-24 03:13:27 UTC (rev 1933)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java	2009-02-24 11:39:08 UTC (rev 1934)
@@ -124,6 +124,11 @@
         String clean = XMLUtil.cleanAllCharacters(plain);
         Element ele = parse(book, key, clean, "cleaning text"); //$NON-NLS-1$
 
+        if (ele == null) 
+        {
+            ele = parse(book, key, XMLUtil.closeEmptyTags(clean), "closing empty tags"); //$NON-NLS-1$
+        }
+
         if (ele == null)
         {
             ele = cleanTags(book, key, clean);




More information about the jsword-svn mailing list