1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005-2013
18   *     The copyright to this program is held by its authors.
19   *
20   */
21  package org.crosswire.common.xml;
22  
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Set;
30  import java.util.regex.Matcher;
31  import java.util.regex.Pattern;
32  
33  import org.crosswire.common.util.FileUtil;
34  import org.crosswire.common.util.PropertyMap;
35  import org.crosswire.common.util.ResourceUtil;
36  import org.jdom2.Document;
37  import org.jdom2.JDOMException;
38  import org.jdom2.input.SAXBuilder;
39  import org.jdom2.input.sax.XMLReaders;
40  import org.slf4j.Logger;
41  import org.slf4j.LoggerFactory;
42  import org.xml.sax.Attributes;
43  import org.xml.sax.ContentHandler;
44  import org.xml.sax.SAXException;
45  
46  /**
47   * Utilities for working with SAX XML parsing.
48   * 
49   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
50   * @author Joe Walker
51   * @author DM Smith
52   */
53  public final class XMLUtil {
54      /**
55       * Prevent instantiation
56       */
57      private XMLUtil() {
58      }
59  
60      /**
61       * Get and load an XML file from the classpath and a few other places into a
62       * JDOM Document object.
63       * 
64       * @param subject
65       *            The name of the desired resource (without any extension)
66       * @return The requested resource
67       * @throws IOException
68       *             if there is a problem reading the file
69       * @throws JDOMException
70       *             If the resource is not valid XML
71       */
72      public static Document getDocument(String subject) throws JDOMException, IOException {
73          String resource = subject + FileUtil.EXTENSION_XML;
74          InputStream in = ResourceUtil.getResourceAsStream(resource);
75  
76          log.debug("Loading {}.xml from classpath: [OK]", subject);
77          // With JDom 1.x this passed true
78          SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
79          return builder.build(in);
80      }
81  
82      /**
83       * Serialize a SAXEventProvider into an XML String
84       * 
85       * @param provider
86       *            The source of SAX events
87       * @return a serialized string
88       * @throws SAXException 
89       */
90      public static String writeToString(SAXEventProvider provider) throws SAXException {
91          ContentHandler ser = new PrettySerializingContentHandler();
92          provider.provideSAXEvents(ser);
93          return ser.toString();
94      }
95  
96      /**
97       * Get the full name of the attribute, including the namespace if any.
98       * 
99       * @param attrs
100      *            the collection of attributes
101      * @param index
102      *            the index of the desired attribute
103      * @return the requested attribute
104      */
105     public static String getAttributeName(Attributes attrs, int index) {
106         String qName = attrs.getQName(index);
107         if (qName != null) {
108             return qName;
109         }
110         return attrs.getLocalName(index);
111     }
112 
113     /**
114      * Show the attributes of an element as debug
115      * @param attrs 
116      */
117     public static void debugSAXAttributes(Attributes attrs) {
118         for (int i = 0; i < attrs.getLength(); i++) {
119             log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
120         }
121     }
122 
123     /**
124      * Normalizes the given string
125      * @param s 
126      * @return the escaped string
127      */
128     public static String escape(String s) {
129         if (s == null) {
130             return s;
131         }
132         int len = s.length();
133         StringBuilder str = new StringBuilder(len);
134 
135         for (int i = 0; i < len; i++) {
136             char ch = s.charAt(i);
137             switch (ch) {
138             case '<':
139                 str.append("&lt;");
140                 break;
141 
142             case '>':
143                 str.append("&gt;");
144                 break;
145 
146             case '&':
147                 str.append("&amp;");
148                 break;
149 
150             case '"':
151                 str.append("&quot;");
152                 break;
153 
154             default:
155                 str.append(ch);
156             }
157         }
158 
159         return str.toString();
160     }
161 
162     /**
163      * For each entity in the input that is not allowed in XML, replace the
164      * entity with its unicode equivalent or remove it. For each instance of a
165      * bare &, replace it with &amp;<br>
166      * XML only allows 4 entities: &amp;amp;, &amp;quot;, &amp;lt; and &amp;gt;.
167      * 
168      * @param broken
169      *            the string to handle entities
170      * @return the string with entities appropriately fixed up
171      */
172     public static String cleanAllEntities(String broken) {
173         if (broken == null) {
174             return null;
175         }
176 
177         String working = broken;
178         int cleanfrom = 0;
179 
180         while (true) {
181             int amp = working.indexOf('&', cleanfrom);
182 
183             // If there are no more amps then we are done
184             if (amp == -1) {
185                 break;
186             }
187 
188             // Skip references of the kind &#ddd;
189             if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
190                 cleanfrom = working.indexOf(';', amp) + 1;
191                 continue;
192             }
193 
194             int i = amp + 1;
195             while (true) {
196                 // if we are at the end of the string then just escape the '&';
197                 if (i >= working.length()) {
198                     // String entity = working.substring(amp);
199                     // String replace = guessEntity(entity);
200                     // DataPolice.report("replacing unterminated entity: '" +
201                     // entity + "' with: '" + replace + "'");
202 
203                     return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
204                 }
205 
206                 // if we have come to a ; then we have an entity
207                 // If it is something that xml can't handle then replace it.
208                 char c = working.charAt(i);
209                 if (c == ';') {
210                     String entity = working.substring(amp, i + 1);
211                     String replace = handleEntity(entity);
212                     // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
213 
214                     working = working.substring(0, amp) + replace + working.substring(i + 1);
215                     break;
216                 }
217 
218                 // Did we end an entity without finding a closing ;
219                 // Then treat it as an '&' that needs to be replaced with &amp;
220                 if (!Character.isLetterOrDigit(c)) {
221                     // String entity = working.substring(amp, i);
222                     // String replace = "&amp;" + working.substring(amp + 1, i);
223                     // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
224 
225                     working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
226                     amp = i + 4; // account for the 4 extra characters
227                     break;
228                 }
229 
230                 i++;
231             }
232 
233             cleanfrom = amp + 1;
234         }
235 
236         return working;
237     }
238 
239     /**
240      * Remove all invalid characters in the input, replacing them with a space. XML has stringent
241      * requirements as to which characters are or are not allowed. The set of
242      * allowable characters are:<br>
243      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br>
244      * Note: Java handles to ￿
245      * 
246      * @param broken
247      *            the string to be cleaned
248      * @return the cleaned string
249      */
250     public static String cleanAllCharacters(String broken) {
251         return invalidCharacterPattern.matcher(broken).replaceAll(" ");
252     }
253 
254     /**
255      * Strip all closing tags from the end of the XML fragment, and then
256      * re-close all tags that are open at the end of the string.
257      * 
258      * @param broken
259      *            the string to be cleaned.
260      * @return cleaned string, or {@code null} if the string could not be
261      *         cleaned due to more broken XML
262      */
263     public static String recloseTags(String broken) {
264         String result = broken;
265         // remove closing tags from the end
266         while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) {
267             result = result.substring(0, result.lastIndexOf('<'));
268         }
269         // close tags again
270         List<String> openTags = new ArrayList<String>();
271         Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
272         boolean lTagFound = false;
273         boolean lgTagFound = false;
274         while (m.find()) {
275             String match = m.group();
276             if (match.startsWith("</")) {
277                 if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
278                     return recloseTags("<l>" + broken);
279                 }
280                 if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
281                     return recloseTags("<lg>" + broken);
282                 }
283                 if (openTags.size() == 0) {
284                     return null;
285                 }
286                 String lastTag = openTags.remove(openTags.size() - 1);
287                 if (!match.equals("</" + lastTag)) {
288                     return null;
289                 }
290             } else {
291                 int closePos = result.indexOf('>', m.end());
292                 if (closePos == -1) {
293                     return null;
294                 }
295                 while (Character.isWhitespace(result.charAt(closePos - 1))) {
296                     --closePos;
297                 }
298                 if (result.charAt(closePos - 1) != '/') {
299                     if ("<l".equals(match)) {
300                         lTagFound = true;
301                     }
302                     if ("<lg".equals(match)) {
303                         lgTagFound = true;
304                     }
305                     openTags.add(match.substring(1));
306                 }
307             }
308         }
309         Collections.reverse(openTags);
310         for (String openTag : openTags) {
311             result += "</" + openTag + ">";
312         }
313         return result;
314     }
315 
316     /**
317      * Common HTML tags such as &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; may be
318      * left open causing XML parsing to fail. This method closes these tags.
319      * 
320      * @param broken
321      *            the string to be cleaned
322      * @return the cleaned string
323      */
324     public static String closeEmptyTags(String broken) {
325         if (broken == null) {
326             return null;
327         }
328 
329         return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
330     }
331 
332     /**
333      * XML parse failed, so we can try getting rid of all the tags and having
334      * another go. We define a tag to start at a &lt; and end at the end of the
335      * next word (where a word is what comes in between spaces) that does not
336      * contain an = sign, or at a >, whichever is earlier.
337      * @param broken 
338      * @return the string without any tags
339      */
340     public static String cleanAllTags(String broken) {
341         if (broken == null) {
342             return null;
343         }
344 
345         String working = broken;
346 
347         allTags: while (true) {
348             int lt = working.indexOf('<');
349 
350             // If there are no more amps then we are done
351             if (lt == -1) {
352                 break allTags;
353             }
354 
355             // loop to find the end of this tag
356             int i = lt;
357             int startattr = -1;
358 
359             singletag: while (true) {
360                 i++;
361 
362                 // the tag can't exist past the end of the string
363                 if (i >= working.length()) {
364                     // go back one so we can safely chop
365                     i--;
366                     break singletag;
367                 }
368 
369                 char c = working.charAt(i);
370 
371                 // normal end of tag
372                 if (c == '>') {
373                     break singletag;
374                 }
375 
376                 // we declare end-of-tag if this 'word' is not an attribute
377                 if (c == ' ') {
378                     if (startattr == -1) {
379                         // NOTE(joe): should we skip over consecutive spaces?
380                         startattr = i;
381                     } else {
382                         // so we've already had a space indicating start of
383                         // attribute, so this must be the beginning of the next
384                         // NOTE(joe): no - spaces can exist in attr values
385                         String value = working.substring(startattr, i);
386                         if (value.indexOf('=') == -1) {
387                             // this 'attribute' does not contain an equals so
388                             // we call it a word and end the parse
389                             break singletag;
390                         }
391                     }
392                 }
393             }
394 
395             // So we have the end of the tag, delete it, but leave a space in it's place
396             // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
397             working = working.substring(0, lt) + " " + working.substring(i + 1);
398         }
399 
400         return working;
401     }
402 
403     /**
404      * Replace entity with its unicode equivalent, if it is not a valid XML
405      * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
406      * &amp;quot;, &amp;lt; and &amp;gt;.
407      * 
408      * @param entity
409      *            the entity to be replaced
410      * @return the substitution for the entity, either itself, the unicode
411      *         equivalent or an empty string.
412      */
413     private static String handleEntity(String entity) {
414         if (goodEntities.contains(entity)) {
415             return entity;
416         }
417 
418         String replace = badEntities.get(entity);
419         if (replace != null) {
420             return replace;
421         }
422 
423         // replace unknown entities with a space
424         return " ";
425     }
426 
427     // Map entities to their unicode equivalent
428     private static Set<String> goodEntities = new HashSet<String>();
429     private static PropertyMap badEntities = new PropertyMap();
430     static {
431         // pre-defined XML entities
432         goodEntities.add("&quot;"); // quotation mark
433         goodEntities.add("&amp;"); // ampersand
434         goodEntities.add("&lt;"); // less-than sign
435         goodEntities.add("&gt;"); // greater-than sign
436 
437         // misc entities
438         badEntities.put("&euro;", "\u20AC"); // euro
439         badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
440         badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
441 
442         // Latin 1 entities
443         badEntities.put("&nbsp;", "\u00A0"); // no-break space
444         badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
445         badEntities.put("&cent;", "\u00A2"); // cent sign
446         badEntities.put("&pound;", "\u00A3"); // pound sign
447         badEntities.put("&curren;", "\u00A4"); // currency sign
448         badEntities.put("&yen;", "\u00A5"); // yen sign
449         badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
450         badEntities.put("&sect;", "\u00A7"); // section sign
451         badEntities.put("&uml;", "\u00A8"); // diaeresis
452         badEntities.put("&copy;", "\u00A9"); // copyright sign
453         badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
454         badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
455         badEntities.put("&not;", "\u00AC"); // not sign
456         badEntities.put("&shy;", "\u00AD"); // soft hyphen
457         badEntities.put("&reg;", "\u00AE"); // registered sign
458         badEntities.put("&macr;", "\u00AF"); // macron
459         badEntities.put("&deg;", "\u00B0"); // degree sign
460         badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
461         badEntities.put("&sup2;", "\u00B2"); // superscript two
462         badEntities.put("&sup3;", "\u00B3"); // superscript three
463         badEntities.put("&acute;", "\u00B4"); // acute accent
464         badEntities.put("&micro;", "\u00B5"); // micro sign
465         badEntities.put("&para;", "\u00B6"); // pilcrow sign
466         badEntities.put("&middot;", "\u00B7"); // middle dot
467         badEntities.put("&cedil;", "\u00B8"); // cedilla
468         badEntities.put("&sup1;", "\u00B9"); // superscript one
469         badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
470         badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
471         badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
472         badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
473         badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
474         badEntities.put("&iquest;", "\u00BF"); // inverted question mark
475         badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
476         badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
477         badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
478         badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
479         badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
480         badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
481         badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
482         badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
483         badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
484         badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
485         badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
486         badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
487         badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
488         badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
489         badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
490         badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
491         badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
492         badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
493         badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
494         badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
495         badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
496         badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
497         badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
498         badEntities.put("&times;", "\u00D7"); // multiplication sign
499         badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
500         badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
501         badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
502         badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
503         badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
504         badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
505         badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
506         badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
507         badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
508         badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
509         badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
510         badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
511         badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
512         badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
513         badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
514         badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
515         badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
516         badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
517         badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
518         badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
519         badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
520         badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
521         badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
522         badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
523         badEntities.put("&eth;", "\u00F0"); // latin small letter eth
524         badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
525         badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
526         badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
527         badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
528         badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
529         badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
530         badEntities.put("&divide;", "\u00F7"); // division sign
531         badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
532         badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
533         badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
534         badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
535         badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
536         badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
537         badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
538         badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
539     }
540 
541     /**
542      * Pattern for numeric entities.
543      */
544     private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
545 
546     /**
547      * Pattern that negates the allowable XML 4 byte unicode characters. Valid
548      * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
549      * [#x10000-#x10FFFF]
550      */
551     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
552 
553     /**
554      * Pattern that matches open &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; tags.
555      */
556     private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
557 
558     /**
559      * The log stream
560      */
561     private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
562 }
563