1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005-2013
 *     The copyright to this program is held by its authors.
19   *
20   */
21  package org.crosswire.common.xml;
22  
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.util.HashSet;
26  import java.util.Set;
27  import java.util.regex.Pattern;
28  
29  import org.crosswire.common.util.FileUtil;
30  import org.crosswire.common.util.PropertyMap;
31  import org.crosswire.common.util.ResourceUtil;
32  import org.jdom2.Document;
33  import org.jdom2.JDOMException;
34  import org.jdom2.input.SAXBuilder;
35  import org.jdom2.input.sax.XMLReaders;
36  import org.slf4j.Logger;
37  import org.slf4j.LoggerFactory;
38  import org.xml.sax.Attributes;
39  import org.xml.sax.ContentHandler;
40  import org.xml.sax.SAXException;
41  
42  /**
43   * Utilities for working with SAX XML parsing.
44   * 
45   * @see gnu.lgpl.License for license details.<br>
 *      The copyright to this program is held by its authors.
47   * @author Joe Walker [joe at eireneh dot com]
48   * @author DM Smith
49   */
50  public final class XMLUtil {
    /**
     * Prevent instantiation: this is a utility class and every member it
     * exposes is static.
     */
    private XMLUtil() {
    }
56  
57      /**
58       * Get and load an XML file from the classpath and a few other places into a
59       * JDOM Document object.
60       * 
61       * @param subject
62       *            The name of the desired resource (without any extension)
63       * @return The requested resource
64       * @throws IOException
65       *             if there is a problem reading the file
66       * @throws JDOMException
67       *             If the resource is not valid XML
68       */
69      public static Document getDocument(String subject) throws JDOMException, IOException {
70          String resource = subject + FileUtil.EXTENSION_XML;
71          InputStream in = ResourceUtil.getResourceAsStream(resource);
72  
73          log.debug("Loading {}.xml from classpath: [OK]", subject);
74          // With JDom 1.x this passed true
75          SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
76          return builder.build(in);
77      }
78  
79      /**
80       * Serialize a SAXEventProvider into an XML String
81       * 
82       * @param provider
83       *            The source of SAX events
84       * @return a serialized string
85       */
86      public static String writeToString(SAXEventProvider provider) throws SAXException {
87          ContentHandler ser = new PrettySerializingContentHandler();
88          provider.provideSAXEvents(ser);
89          return ser.toString();
90      }
91  
92      /**
93       * Get the full name of the attribute, including the namespace if any.
94       * 
95       * @param attrs
96       *            the collection of attributes
97       * @param index
98       *            the index of the desired attribute
99       * @return the requested attribute
100      */
101     public static String getAttributeName(Attributes attrs, int index) {
102         String qName = attrs.getQName(index);
103         if (qName != null) {
104             return qName;
105         }
106         return attrs.getLocalName(index);
107     }
108 
109     /**
110      * Show the attributes of an element as debug
111      */
112     public static void debugSAXAttributes(Attributes attrs) {
113         for (int i = 0; i < attrs.getLength(); i++) {
114             log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
115         }
116     }
117 
118     /**
119      * Normalizes the given string
120      */
121     public static String escape(String s) {
122         if (s == null) {
123             return s;
124         }
125         int len = s.length();
126         StringBuilder str = new StringBuilder(len);
127 
128         for (int i = 0; i < len; i++) {
129             char ch = s.charAt(i);
130             switch (ch) {
131             case '<':
132                 str.append("&lt;");
133                 break;
134 
135             case '>':
136                 str.append("&gt;");
137                 break;
138 
139             case '&':
140                 str.append("&amp;");
141                 break;
142 
143             case '"':
144                 str.append("&quot;");
145                 break;
146 
147             default:
148                 str.append(ch);
149             }
150         }
151 
152         return str.toString();
153     }
154 
155     /**
156      * For each entity in the input that is not allowed in XML, replace the
157      * entity with its unicode equivalent or remove it. For each instance of a
158      * bare &, replace it with &amp;<br/>
159      * XML only allows 4 entities: &amp;amp;, &amp;quot;, &amp;lt; and &amp;gt;.
160      * 
161      * @param broken
162      *            the string to handle entities
163      * @return the string with entities appropriately fixed up
164      */
165     public static String cleanAllEntities(String broken) {
166         if (broken == null) {
167             return null;
168         }
169 
170         String working = broken;
171         int cleanfrom = 0;
172 
173         while (true) {
174             int amp = working.indexOf('&', cleanfrom);
175 
176             // If there are no more amps then we are done
177             if (amp == -1) {
178                 break;
179             }
180 
181             // Skip references of the kind &#ddd;
182             if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
183                 cleanfrom = working.indexOf(';', amp) + 1;
184                 continue;
185             }
186 
187             int i = amp + 1;
188             while (true) {
189                 // if we are at the end of the string then just escape the '&';
190                 if (i >= working.length()) {
191                     // String entity = working.substring(amp);
192                     // String replace = guessEntity(entity);
193                     // DataPolice.report("replacing unterminated entity: '" +
194                     // entity + "' with: '" + replace + "'");
195 
196                     return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
197                 }
198 
199                 // if we have come to a ; then we have an entity
200                 // If it is something that xml can't handle then replace it.
201                 char c = working.charAt(i);
202                 if (c == ';') {
203                     String entity = working.substring(amp, i + 1);
204                     String replace = handleEntity(entity);
205                     // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
206 
207                     working = working.substring(0, amp) + replace + working.substring(i + 1);
208                     break;
209                 }
210 
211                 // Did we end an entity without finding a closing ;
212                 // Then treat it as an '&' that needs to be replaced with &amp;
213                 if (!Character.isLetterOrDigit(c)) {
214                     // String entity = working.substring(amp, i);
215                     // String replace = "&amp;" + working.substring(amp + 1, i);
216                     // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
217 
218                     working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
219                     amp = i + 4; // account for the 4 extra characters
220                     break;
221                 }
222 
223                 i++;
224             }
225 
226             cleanfrom = amp + 1;
227         }
228 
229         return working;
230     }
231 
232     /**
233      * Remove all invalid characters in the input, replacing them with a space. XML has stringent
234      * requirements as to which characters are or are not allowed. The set of
235      * allowable characters are:<br />
236      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br/>
237      * Note: Java handles to ￿
238      * 
239      * @param broken
240      *            the string to be cleaned
241      * @return the cleaned string
242      */
243     public static String cleanAllCharacters(String broken) {
244         return invalidCharacterPattern.matcher(broken).replaceAll(" ");
245     }
246 
247     /**
248      * Common HTML tags such as &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; may be
249      * left open causing XML parsing to fail. This method closes these tags.
250      * 
251      * @param broken
252      *            the string to be cleaned
253      * @return the cleaned string
254      */
255     public static String closeEmptyTags(String broken) {
256         if (broken == null) {
257             return null;
258         }
259 
260         return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
261     }
262 
263     /**
264      * XML parse failed, so we can try getting rid of all the tags and having
265      * another go. We define a tag to start at a &lt; and end at the end of the
266      * next word (where a word is what comes in between spaces) that does not
267      * contain an = sign, or at a >, whichever is earlier.
268      */
269     public static String cleanAllTags(String broken) {
270         if (broken == null) {
271             return null;
272         }
273 
274         String working = broken;
275 
276         allTags: while (true) {
277             int lt = working.indexOf('<');
278 
279             // If there are no more amps then we are done
280             if (lt == -1) {
281                 break allTags;
282             }
283 
284             // loop to find the end of this tag
285             int i = lt;
286             int startattr = -1;
287 
288             singletag: while (true) {
289                 i++;
290 
291                 // the tag can't exist past the end of the string
292                 if (i >= working.length()) {
293                     // go back one so we can safely chop
294                     i--;
295                     break singletag;
296                 }
297 
298                 char c = working.charAt(i);
299 
300                 // normal end of tag
301                 if (c == '>') {
302                     break singletag;
303                 }
304 
305                 // we declare end-of-tag if this 'word' is not an attribute
306                 if (c == ' ') {
307                     if (startattr == -1) {
308                         // NOTE(joe): should we skip over consecutive spaces?
309                         startattr = i;
310                     } else {
311                         // so we've already had a space indicating start of
312                         // attribute, so this must be the beginning of the next
313                         // NOTE(joe): no - spaces can exist in attr values
314                         String value = working.substring(startattr, i);
315                         if (value.indexOf('=') == -1) {
316                             // this 'attribute' does not contain an equals so
317                             // we call it a word and end the parse
318                             break singletag;
319                         }
320                     }
321                 }
322             }
323 
324             // So we have the end of the tag, delete it, but leave a space in it's place
325             // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
326             working = working.substring(0, lt) + " " + working.substring(i + 1);
327         }
328 
329         return working;
330     }
331 
332     /**
333      * Replace entity with its unicode equivalent, if it is not a valid XML
334      * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
335      * &amp;quot;, &amp;lt; and &amp;gt;.
336      * 
337      * @param entity
338      *            the entity to be replaced
339      * @return the substitution for the entity, either itself, the unicode
340      *         equivalent or an empty string.
341      */
342     private static String handleEntity(String entity) {
343         if (goodEntities.contains(entity)) {
344             return entity;
345         }
346 
347         String replace = badEntities.get(entity);
348         if (replace != null) {
349             return replace;
350         }
351 
352         // replace unknown entities with a space
353         return " ";
354     }
355 
356     // Map entities to their unicode equivalent
357     private static Set<String> goodEntities = new HashSet<String>();
358     private static PropertyMap badEntities = new PropertyMap();
359     static {
360         // pre-defined XML entities
361         goodEntities.add("&quot;"); // quotation mark
362         goodEntities.add("&amp;"); // ampersand
363         goodEntities.add("&lt;"); // less-than sign
364         goodEntities.add("&gt;"); // greater-than sign
365 
366         // misc entities
367         badEntities.put("&euro;", "\u20AC"); // euro
368         badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
369         badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
370 
371         // Latin 1 entities
372         badEntities.put("&nbsp;", "\u00A0"); // no-break space
373         badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
374         badEntities.put("&cent;", "\u00A2"); // cent sign
375         badEntities.put("&pound;", "\u00A3"); // pound sign
376         badEntities.put("&curren;", "\u00A4"); // currency sign
377         badEntities.put("&yen;", "\u00A5"); // yen sign
378         badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
379         badEntities.put("&sect;", "\u00A7"); // section sign
380         badEntities.put("&uml;", "\u00A8"); // diaeresis
381         badEntities.put("&copy;", "\u00A9"); // copyright sign
382         badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
383         badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
384         badEntities.put("&not;", "\u00AC"); // not sign
385         badEntities.put("&shy;", "\u00AD"); // soft hyphen
386         badEntities.put("&reg;", "\u00AE"); // registered sign
387         badEntities.put("&macr;", "\u00AF"); // macron
388         badEntities.put("&deg;", "\u00B0"); // degree sign
389         badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
390         badEntities.put("&sup2;", "\u00B2"); // superscript two
391         badEntities.put("&sup3;", "\u00B3"); // superscript three
392         badEntities.put("&acute;", "\u00B4"); // acute accent
393         badEntities.put("&micro;", "\u00B5"); // micro sign
394         badEntities.put("&para;", "\u00B6"); // pilcrow sign
395         badEntities.put("&middot;", "\u00B7"); // middle dot
396         badEntities.put("&cedil;", "\u00B8"); // cedilla
397         badEntities.put("&sup1;", "\u00B9"); // superscript one
398         badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
399         badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
400         badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
401         badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
402         badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
403         badEntities.put("&iquest;", "\u00BF"); // inverted question mark
404         badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
405         badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
406         badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
407         badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
408         badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
409         badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
410         badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
411         badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
412         badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
413         badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
414         badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
415         badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
416         badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
417         badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
418         badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
419         badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
420         badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
421         badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
422         badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
423         badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
424         badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
425         badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
426         badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
427         badEntities.put("&times;", "\u00D7"); // multiplication sign
428         badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
429         badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
430         badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
431         badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
432         badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
433         badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
434         badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
435         badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
436         badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
437         badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
438         badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
439         badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
440         badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
441         badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
442         badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
443         badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
444         badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
445         badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
446         badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
447         badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
448         badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
449         badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
450         badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
451         badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
452         badEntities.put("&eth;", "\u00F0"); // latin small letter eth
453         badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
454         badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
455         badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
456         badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
457         badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
458         badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
459         badEntities.put("&divide;", "\u00F7"); // division sign
460         badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
461         badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
462         badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
463         badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
464         badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
465         badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
466         badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
467         badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
468     }
469 
470     /**
471      * Pattern for numeric entities.
472      */
473     private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
474 
475     /**
476      * Pattern that negates the allowable XML 4 byte unicode characters. Valid
477      * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
478      * [#x10000-#x10FFFF]
479      */
480     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
481 
482     /**
483      * Pattern that matches open &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; tags.
484      */
485     private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
486 
    /**
     * The SLF4J logger for this class, used for its debug output.
     */
    private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
491 }
492