1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
 *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id: XMLUtil.java 2112 2011-03-11 01:29:39Z dmsmith $
21   */
22  package org.crosswire.common.xml;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.util.HashSet;
27  import java.util.Set;
28  import java.util.regex.Pattern;
29  
30  import org.crosswire.common.util.FileUtil;
31  import org.crosswire.common.util.Logger;
32  import org.crosswire.common.util.PropertyMap;
33  import org.crosswire.common.util.ResourceUtil;
34  import org.jdom.Document;
35  import org.jdom.JDOMException;
36  import org.jdom.input.SAXBuilder;
37  import org.xml.sax.Attributes;
38  import org.xml.sax.ContentHandler;
39  import org.xml.sax.SAXException;
40  
41  /**
42   * Utilities for working with SAX XML parsing.
43   * 
44   * @see gnu.lgpl.License for license details.<br>
 *      The copyright to this program is held by its authors.
46   * @author Joe Walker [joe at eireneh dot com]
47   * @author DM Smith [dmsmith555 at yahoo dot com]
48   */
49  public final class XMLUtil {
    /**
     * Prevent instantiation: this class is final and only offers static
     * helper methods, so there is never a reason to create an instance.
     */
    private XMLUtil() {
    }
55  
56      /**
57       * Get and load an XML file from the classpath and a few other places into a
58       * JDOM Document object.
59       * 
60       * @param subject
61       *            The name of the desired resource (without any extension)
62       * @return The requested resource
63       * @throws IOException
64       *             if there is a problem reading the file
65       * @throws JDOMException
66       *             If the resource is not valid XML
67       */
68      public static Document getDocument(String subject) throws JDOMException, IOException {
69          String resource = subject + FileUtil.EXTENSION_XML;
70          InputStream in = ResourceUtil.getResourceAsStream(resource);
71  
72          log.debug("Loading " + subject + ".xml from classpath: [OK]");
73          SAXBuilder builder = new SAXBuilder(true);
74          return builder.build(in);
75      }
76  
77      /**
78       * Serialize a SAXEventProvider into an XML String
79       * 
80       * @param provider
81       *            The source of SAX events
82       * @return a serialized string
83       */
84      public static String writeToString(SAXEventProvider provider) throws SAXException {
85          ContentHandler ser = new PrettySerializingContentHandler();
86          provider.provideSAXEvents(ser);
87          return ser.toString();
88      }
89  
90      /**
91       * Get the full name of the attribute, including the namespace if any.
92       * 
93       * @param attrs
94       *            the collection of attributes
95       * @param index
96       *            the index of the desired attribute
97       * @return the requested attribute
98       */
99      public static String getAttributeName(Attributes attrs, int index) {
100         String qName = attrs.getQName(index);
101         if (qName != null) {
102             return qName;
103         }
104         return attrs.getLocalName(index);
105     }
106 
107     /**
108      * Show the attributes of an element as debug
109      */
110     public static void debugSAXAttributes(Attributes attrs) {
111         for (int i = 0; i < attrs.getLength(); i++) {
112             log.debug("attr[" + i + "]: " + attrs.getQName(i) + '=' + attrs.getValue(i));
113         }
114     }
115 
116     /**
117      * Normalizes the given string
118      */
119     public static String escape(String s) {
120         if (s == null) {
121             return s;
122         }
123         int len = s.length();
124         StringBuilder str = new StringBuilder(len);
125 
126         for (int i = 0; i < len; i++) {
127             char ch = s.charAt(i);
128             switch (ch) {
129             case '<':
130                 str.append("&lt;");
131                 break;
132 
133             case '>':
134                 str.append("&gt;");
135                 break;
136 
137             case '&':
138                 str.append("&amp;");
139                 break;
140 
141             case '"':
142                 str.append("&quot;");
143                 break;
144 
145             default:
146                 str.append(ch);
147             }
148         }
149 
150         return str.toString();
151     }
152 
153     /**
154      * For each entity in the input that is not allowed in XML, replace the
155      * entity with its unicode equivalent or remove it. For each instance of a
156      * bare &, replace it with &amp;<br/>
157      * XML only allows 4 entities: &amp;amp;, &amp;quot;, &amp;lt; and &amp;gt;.
158      * 
159      * @param broken
160      *            the string to handle entities
161      * @return the string with entities appropriately fixed up
162      */
163     public static String cleanAllEntities(String broken) {
164         if (broken == null) {
165             return null;
166         }
167 
168         String working = broken;
169         int cleanfrom = 0;
170 
171         while (true) {
172             int amp = working.indexOf('&', cleanfrom);
173 
174             // If there are no more amps then we are done
175             if (amp == -1) {
176                 break;
177             }
178 
179             // Skip references of the kind &#ddd;
180             if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
181                 cleanfrom = working.indexOf(';', amp) + 1;
182                 continue;
183             }
184 
185             int i = amp + 1;
186             while (true) {
187                 // if we are at the end of the string then just escape the '&';
188                 if (i >= working.length()) {
189                     // String entity = working.substring(amp);
190                     // String replace = guessEntity(entity);
191                     // DataPolice.report("replacing unterminated entity: '" +
192                     // entity + "' with: '" + replace + "'");
193 
194                     return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
195                 }
196 
197                 // if we have come to a ; then we have an entity
198                 // If it is something that xml can't handle then replace it.
199                 char c = working.charAt(i);
200                 if (c == ';') {
201                     String entity = working.substring(amp, i + 1);
202                     String replace = handleEntity(entity);
203                     // log.warn("replacing entity: '" + entity + "' with: '" +
204                     // replace + "'");
205 
206                     working = working.substring(0, amp) + replace + working.substring(i + 1);
207                     break;
208                 }
209 
210                 // Did we end an entity without finding a closing ;
211                 // Then treat it as an '&' that needs to be replaced with &amp;
212                 if (!Character.isLetterOrDigit(c)) {
213                     // String entity = working.substring(amp, i);
214                     // String replace = "&amp;" + working.substring(amp + 1, i);
215                     // log.warn("replacing invalid entity: '" + entity +
216                     // "' with: '" + replace + "': " + broken);
217 
218                     working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
219                     amp = i + 4; // account for the 4 extra characters
220                     break;
221                 }
222 
223                 i++;
224             }
225 
226             cleanfrom = amp + 1;
227         }
228 
229         return working;
230     }
231 
232     /**
233      * Remove all invalid characters in the input, replacing them with a space. XML has stringent
234      * requirements as to which characters are or are not allowed. The set of
235      * allowable characters are:<br />
236      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br/>
237      * Note: Java handles to ?
238      * 
239      * @param broken
240      *            the string to be cleaned
241      * @return the cleaned string
242      */
243     public static String cleanAllCharacters(String broken) {
244         return invalidCharacterPattern.matcher(broken).replaceAll(" ");
245     }
246 
247     /**
248      * Common HTML tags such as &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; may be
249      * left open causing XML parsing to fail. This method closes these tags.
250      * 
251      * @param broken
252      *            the string to be cleaned
253      * @return the cleaned string
254      */
255     public static String closeEmptyTags(String broken) {
256         if (broken == null) {
257             return null;
258         }
259 
260         return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
261     }
262 
263     /**
264      * XML parse failed, so we can try getting rid of all the tags and having
265      * another go. We define a tag to start at a &lt; and end at the end of the
266      * next word (where a word is what comes in between spaces) that does not
267      * contain an = sign, or at a >, whichever is earlier.
268      */
269     public static String cleanAllTags(String broken) {
270         if (broken == null) {
271             return null;
272         }
273 
274         String working = broken;
275 
276         allTags: while (true) {
277             int lt = working.indexOf('<');
278 
279             // If there are no more amps then we are done
280             if (lt == -1) {
281                 break allTags;
282             }
283 
284             // loop to find the end of this tag
285             int i = lt;
286             int startattr = -1;
287 
288             singletag: while (true) {
289                 i++;
290 
291                 // the tag can't exist past the end of the string
292                 if (i >= working.length()) {
293                     // go back one so we can safely chop
294                     i--;
295                     break singletag;
296                 }
297 
298                 char c = working.charAt(i);
299 
300                 // normal end of tag
301                 if (c == '>') {
302                     break singletag;
303                 }
304 
305                 // we declare end-of-tag if this 'word' is not an attribute
306                 if (c == ' ') {
307                     if (startattr == -1) {
308                         // NOTE(joe): should we skip over consecutive spaces?
309                         startattr = i;
310                     } else {
311                         // so we've already had a space indicating start of
312                         // attribute, so this must be the beginning of the next
313                         // NOTE(joe): no - spaces can exist in attr values
314                         String value = working.substring(startattr, i);
315                         if (value.indexOf('=') == -1) {
316                             // this 'attribute' does not contain an equals so
317                             // we call it a word and end the parse
318                             break singletag;
319                         }
320                     }
321                 }
322             }
323 
324             // So we have the end of the tag, delete it, but leave a space in it's place
325             // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
326             working = working.substring(0, lt) + " " + working.substring(i + 1);
327         }
328 
329         return working;
330     }
331 
332     /**
333      * Replace entity with its unicode equivalent, if it is not a valid XML
334      * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
335      * &amp;quot;, &amp;lt; and &amp;gt;.
336      * 
337      * @param entity
338      *            the entity to be replaced
339      * @return the substitution for the entity, either itself, the unicode
340      *         equivalent or an empty string.
341      */
342     private static String handleEntity(String entity) {
343         if (goodEntities.contains(entity)) {
344             return entity;
345         }
346 
347         String replace = badEntities.get(entity);
348         if (replace != null) {
349             return replace;
350         }
351 
352         // replace unknown entities with a space
353         return " ";
354     }
355 
356     // Map entities to their unicode equivalent
357     private static Set<String> goodEntities = new HashSet<String>();
358     private static PropertyMap badEntities = new PropertyMap();
359     static {
360         // pre-defined XML entities
361         goodEntities.add("&quot;"); // quotation mark
362         goodEntities.add("&amp;"); // ampersand
363         goodEntities.add("&lt;"); // less-than sign
364         goodEntities.add("&gt;"); // greater-than sign
365 
366         // misc entities
367         badEntities.put("&euro;", "\u20AC"); // euro
368         badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
369         badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
370 
371         // Latin 1 entities
372         badEntities.put("&nbsp;", "\u00A0"); // no-break space
373         badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
374         badEntities.put("&cent;", "\u00A2"); // cent sign
375         badEntities.put("&pound;", "\u00A3"); // pound sign
376         badEntities.put("&curren;", "\u00A4"); // currency sign
377         badEntities.put("&yen;", "\u00A5"); // yen sign
378         badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
379         badEntities.put("&sect;", "\u00A7"); // section sign
380         badEntities.put("&uml;", "\u00A8"); // diaeresis
381         badEntities.put("&copy;", "\u00A9"); // copyright sign
382         badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
383         badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
384         badEntities.put("&not;", "\u00AC"); // not sign
385         badEntities.put("&shy;", "\u00AD"); // soft hyphen
386         badEntities.put("&reg;", "\u00AE"); // registered sign
387         badEntities.put("&macr;", "\u00AF"); // macron
388         badEntities.put("&deg;", "\u00B0"); // degree sign
389         badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
390         badEntities.put("&sup2;", "\u00B2"); // superscript two
391         badEntities.put("&sup3;", "\u00B3"); // superscript three
392         badEntities.put("&acute;", "\u00B4"); // acute accent
393         badEntities.put("&micro;", "\u00B5"); // micro sign
394         badEntities.put("&para;", "\u00B6"); // pilcrow sign
395         badEntities.put("&middot;", "\u00B7"); // middle dot
396         badEntities.put("&cedil;", "\u00B8"); // cedilla
397         badEntities.put("&sup1;", "\u00B9"); // superscript one
398         badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
399         badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
400         badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
401         badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
402         badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
403         badEntities.put("&iquest;", "\u00BF"); // inverted question mark
404         badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
405         badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
406         badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
407         badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
408         badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
409         badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
410         badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
411         badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
412         badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
413         badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
414         badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
415         badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
416         badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
417         badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
418         badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
419         badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
420         badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
421         badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
422         badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
423         badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
424         badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
425         badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
426         badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
427         badEntities.put("&times;", "\u00D7"); // multiplication sign
428         badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
429         badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
430         badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
431         badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
432         badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
433         badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
434         badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
435         badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
436         badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
437         badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
438         badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
439         badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
440         badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
441         badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
442         badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
443         badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
444         badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
445         badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
446         badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
447         badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
448         badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
449         badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
450         badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
451         badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
452         badEntities.put("&eth;", "\u00F0"); // latin small letter eth
453         badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
454         badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
455         badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
456         badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
457         badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
458         badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
459         badEntities.put("&divide;", "\u00F7"); // division sign
460         badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
461         badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
462         badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
463         badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
464         badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
465         badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
466         badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
467         badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
468     }
469 
    /**
     * The log stream shared by the static helpers of this class.
     */
    private static final Logger log = Logger.getLogger(XMLUtil.class);
474 
475     /**
476      * Pattern for numeric entities.
477      */
478     private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
479 
480     /**
481      * Pattern that negates the allowable XML 4 byte unicode characters. Valid
482      * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
483      * [#x10000-#x10FFFF]
484      */
485     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
486 
487     /**
488      * Pattern that matches open &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; tags.
489      */
490     private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
491 }
492