### Eclipse Workspace Patch 1.0
#P common
Index: src/main/java/org/crosswire/common/xml/XMLUtil.java
===================================================================
--- src/main/java/org/crosswire/common/xml/XMLUtil.java (revision 1927)
+++ src/main/java/org/crosswire/common/xml/XMLUtil.java (working copy)
@@ -253,6 +253,23 @@
     }
 
     /**
+     * Common HTML tags such as <br>,<hr> and <img> may be
+     * left open causing XML parsing to fail. This method closes these tags.
+     *
+     * @param broken the string to be cleaned
+     * @return the cleaned string
+     */
+    public static String closeEmptyTags(String broken)
+    {
+        if (broken == null)
+        {
+            return null;
+        }
+
+        return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>"); //$NON-NLS-1$
+    }
+
+    /**
      * XML parse failed, so we can try getting rid of all the tags and having
      * another go. We define a tag to start at a < and end at the end of the
      * next word (where a word is what comes in between spaces) that does not
@@ -489,4 +506,9 @@
      * Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
      */
     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$
+
+    /**
+     * Pattern that matches open <br>,<hr> and <img> tags, i.e. ones whose closing bracket is not already preceded by a slash.
+     */
+    private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>"); //$NON-NLS-1$
 }
#P jsword
Index: src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java
===================================================================
--- src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java (revision 1927)
+++ src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java (working copy)
@@ -123,6 +123,11 @@
         // So just try to strip out all XML looking things
         String clean = XMLUtil.cleanAllCharacters(plain);
         Element ele = parse(book, key, clean, "cleaning text"); //$NON-NLS-1$
+
+        if (ele == null)
+        {
+            ele = parse(book, key, XMLUtil.closeEmptyTags(clean), "closing empty tags"); //$NON-NLS-1$
+        }
 
         if (ele == null)
         {