| XMLUtil.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 or later
5 * as published by the Free Software Foundation. This program is distributed
6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * Copyright: 2005-2013
18 * The copyright to this program is held by its authors.
19 *
20 */
21 package org.crosswire.common.xml;
22
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.util.HashSet;
26 import java.util.Set;
27 import java.util.regex.Pattern;
28
29 import org.crosswire.common.util.FileUtil;
30 import org.crosswire.common.util.PropertyMap;
31 import org.crosswire.common.util.ResourceUtil;
32 import org.jdom2.Document;
33 import org.jdom2.JDOMException;
34 import org.jdom2.input.SAXBuilder;
35 import org.jdom2.input.sax.XMLReaders;
36 import org.slf4j.Logger;
37 import org.slf4j.LoggerFactory;
38 import org.xml.sax.Attributes;
39 import org.xml.sax.ContentHandler;
40 import org.xml.sax.SAXException;
41
42 /**
43 * Utilities for working with SAX XML parsing.
44 *
45 * @see gnu.lgpl.License for license details.<br>
46 * The copyright to this program is held by its authors.
47 * @author Joe Walker [joe at eireneh dot com]
48 * @author DM Smith
49 */
50 public final class XMLUtil {
/**
 * Private constructor: this class only offers static helpers and is not
 * meant to be instantiated.
 */
private XMLUtil() {
    // intentionally empty
}
56
57 /**
58 * Get and load an XML file from the classpath and a few other places into a
59 * JDOM Document object.
60 *
61 * @param subject
62 * The name of the desired resource (without any extension)
63 * @return The requested resource
64 * @throws IOException
65 * if there is a problem reading the file
66 * @throws JDOMException
67 * If the resource is not valid XML
68 */
69 public static Document getDocument(String subject) throws JDOMException, IOException {
70 String resource = subject + FileUtil.EXTENSION_XML;
71 InputStream in = ResourceUtil.getResourceAsStream(resource);
72
73 log.debug("Loading {}.xml from classpath: [OK]", subject);
74 // With JDom 1.x this passed true
75 SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
76 return builder.build(in);
77 }
78
79 /**
80 * Serialize a SAXEventProvider into an XML String
81 *
82 * @param provider
83 * The source of SAX events
84 * @return a serialized string
85 */
86 public static String writeToString(SAXEventProvider provider) throws SAXException {
87 ContentHandler ser = new PrettySerializingContentHandler();
88 provider.provideSAXEvents(ser);
89 return ser.toString();
90 }
91
92 /**
93 * Get the full name of the attribute, including the namespace if any.
94 *
95 * @param attrs
96 * the collection of attributes
97 * @param index
98 * the index of the desired attribute
99 * @return the requested attribute
100 */
101 public static String getAttributeName(Attributes attrs, int index) {
102 String qName = attrs.getQName(index);
103 if (qName != null) {
104 return qName;
105 }
106 return attrs.getLocalName(index);
107 }
108
109 /**
110 * Show the attributes of an element as debug
111 */
112 public static void debugSAXAttributes(Attributes attrs) {
113 for (int i = 0; i < attrs.getLength(); i++) {
114 log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
115 }
116 }
117
118 /**
119 * Normalizes the given string
120 */
121 public static String escape(String s) {
122 if (s == null) {
123 return s;
124 }
125 int len = s.length();
126 StringBuilder str = new StringBuilder(len);
127
128 for (int i = 0; i < len; i++) {
129 char ch = s.charAt(i);
130 switch (ch) {
131 case '<':
132 str.append("<");
133 break;
134
135 case '>':
136 str.append(">");
137 break;
138
139 case '&':
140 str.append("&");
141 break;
142
143 case '"':
144 str.append(""");
145 break;
146
147 default:
148 str.append(ch);
149 }
150 }
151
152 return str.toString();
153 }
154
155 /**
156 * For each entity in the input that is not allowed in XML, replace the
157 * entity with its unicode equivalent or remove it. For each instance of a
158 * bare &, replace it with &<br/>
159 * XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;.
160 *
161 * @param broken
162 * the string to handle entities
163 * @return the string with entities appropriately fixed up
164 */
165 public static String cleanAllEntities(String broken) {
166 if (broken == null) {
167 return null;
168 }
169
170 String working = broken;
171 int cleanfrom = 0;
172
173 while (true) {
174 int amp = working.indexOf('&', cleanfrom);
175
176 // If there are no more amps then we are done
177 if (amp == -1) {
178 break;
179 }
180
181 // Skip references of the kind &#ddd;
182 if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
183 cleanfrom = working.indexOf(';', amp) + 1;
184 continue;
185 }
186
187 int i = amp + 1;
188 while (true) {
189 // if we are at the end of the string then just escape the '&';
190 if (i >= working.length()) {
191 // String entity = working.substring(amp);
192 // String replace = guessEntity(entity);
193 // DataPolice.report("replacing unterminated entity: '" +
194 // entity + "' with: '" + replace + "'");
195
196 return working.substring(0, amp) + "&" + working.substring(amp + 1);
197 }
198
199 // if we have come to a ; then we have an entity
200 // If it is something that xml can't handle then replace it.
201 char c = working.charAt(i);
202 if (c == ';') {
203 String entity = working.substring(amp, i + 1);
204 String replace = handleEntity(entity);
205 // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
206
207 working = working.substring(0, amp) + replace + working.substring(i + 1);
208 break;
209 }
210
211 // Did we end an entity without finding a closing ;
212 // Then treat it as an '&' that needs to be replaced with &
213 if (!Character.isLetterOrDigit(c)) {
214 // String entity = working.substring(amp, i);
215 // String replace = "&" + working.substring(amp + 1, i);
216 // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
217
218 working = working.substring(0, amp) + "&" + working.substring(amp + 1);
219 amp = i + 4; // account for the 4 extra characters
220 break;
221 }
222
223 i++;
224 }
225
226 cleanfrom = amp + 1;
227 }
228
229 return working;
230 }
231
232 /**
233 * Remove all invalid characters in the input, replacing them with a space. XML has stringent
234 * requirements as to which characters are or are not allowed. The set of
235 * allowable characters are:<br />
236 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br/>
237 * Note: Java handles to ?
238 *
239 * @param broken
240 * the string to be cleaned
241 * @return the cleaned string
242 */
243 public static String cleanAllCharacters(String broken) {
244 return invalidCharacterPattern.matcher(broken).replaceAll(" ");
245 }
246
247 /**
248 * Common HTML tags such as <br>,<hr> and <img> may be
249 * left open causing XML parsing to fail. This method closes these tags.
250 *
251 * @param broken
252 * the string to be cleaned
253 * @return the cleaned string
254 */
255 public static String closeEmptyTags(String broken) {
256 if (broken == null) {
257 return null;
258 }
259
260 return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
261 }
262
263 /**
264 * XML parse failed, so we can try getting rid of all the tags and having
265 * another go. We define a tag to start at a < and end at the end of the
266 * next word (where a word is what comes in between spaces) that does not
267 * contain an = sign, or at a >, whichever is earlier.
268 */
269 public static String cleanAllTags(String broken) {
270 if (broken == null) {
271 return null;
272 }
273
274 String working = broken;
275
276 allTags: while (true) {
277 int lt = working.indexOf('<');
278
279 // If there are no more amps then we are done
280 if (lt == -1) {
281 break allTags;
282 }
283
284 // loop to find the end of this tag
285 int i = lt;
286 int startattr = -1;
287
288 singletag: while (true) {
289 i++;
290
291 // the tag can't exist past the end of the string
292 if (i >= working.length()) {
293 // go back one so we can safely chop
294 i--;
295 break singletag;
296 }
297
298 char c = working.charAt(i);
299
300 // normal end of tag
301 if (c == '>') {
302 break singletag;
303 }
304
305 // we declare end-of-tag if this 'word' is not an attribute
306 if (c == ' ') {
307 if (startattr == -1) {
308 // NOTE(joe): should we skip over consecutive spaces?
309 startattr = i;
310 } else {
311 // so we've already had a space indicating start of
312 // attribute, so this must be the beginning of the next
313 // NOTE(joe): no - spaces can exist in attr values
314 String value = working.substring(startattr, i);
315 if (value.indexOf('=') == -1) {
316 // this 'attribute' does not contain an equals so
317 // we call it a word and end the parse
318 break singletag;
319 }
320 }
321 }
322 }
323
324 // So we have the end of the tag, delete it, but leave a space in it's place
325 // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
326 working = working.substring(0, lt) + " " + working.substring(i + 1);
327 }
328
329 return working;
330 }
331
332 /**
333 * Replace entity with its unicode equivalent, if it is not a valid XML
334 * entity. Otherwise strip it out. XML only allows 4 entities: &amp;,
335 * &quot;, &lt; and &gt;.
336 *
337 * @param entity
338 * the entity to be replaced
339 * @return the substitution for the entity, either itself, the unicode
340 * equivalent or an empty string.
341 */
342 private static String handleEntity(String entity) {
343 if (goodEntities.contains(entity)) {
344 return entity;
345 }
346
347 String replace = badEntities.get(entity);
348 if (replace != null) {
349 return replace;
350 }
351
352 // replace unknown entities with a space
353 return " ";
354 }
355
356 // Map entities to their unicode equivalent
357 private static Set<String> goodEntities = new HashSet<String>();
358 private static PropertyMap badEntities = new PropertyMap();
359 static {
360 // pre-defined XML entities
361 goodEntities.add("""); // quotation mark
362 goodEntities.add("&"); // ampersand
363 goodEntities.add("<"); // less-than sign
364 goodEntities.add(">"); // greater-than sign
365
366 // misc entities
367 badEntities.put("€", "\u20AC"); // euro
368 badEntities.put("‘", "\u2018"); // left single quotation mark
369 badEntities.put("’", "\u2019"); // right single quotation mark
370
371 // Latin 1 entities
372 badEntities.put(" ", "\u00A0"); // no-break space
373 badEntities.put("¡", "\u00A1"); // inverted exclamation mark
374 badEntities.put("¢", "\u00A2"); // cent sign
375 badEntities.put("£", "\u00A3"); // pound sign
376 badEntities.put("¤", "\u00A4"); // currency sign
377 badEntities.put("¥", "\u00A5"); // yen sign
378 badEntities.put("¦", "\u00A6"); // broken vertical bar
379 badEntities.put("§", "\u00A7"); // section sign
380 badEntities.put("¨", "\u00A8"); // diaeresis
381 badEntities.put("©", "\u00A9"); // copyright sign
382 badEntities.put("ª", "\u00AA"); // feminine ordinal indicator
383 badEntities.put("«", "\u00AB"); // left-pointing double angle quotation mark
384 badEntities.put("¬", "\u00AC"); // not sign
385 badEntities.put("­", "\u00AD"); // soft hyphen
386 badEntities.put("®", "\u00AE"); // registered sign
387 badEntities.put("¯", "\u00AF"); // macron
388 badEntities.put("°", "\u00B0"); // degree sign
389 badEntities.put("±", "\u00B1"); // plus-minus sign
390 badEntities.put("²", "\u00B2"); // superscript two
391 badEntities.put("³", "\u00B3"); // superscript three
392 badEntities.put("´", "\u00B4"); // acute accent
393 badEntities.put("µ", "\u00B5"); // micro sign
394 badEntities.put("¶", "\u00B6"); // pilcrow sign
395 badEntities.put("·", "\u00B7"); // middle dot
396 badEntities.put("¸", "\u00B8"); // cedilla
397 badEntities.put("¹", "\u00B9"); // superscript one
398 badEntities.put("º", "\u00BA"); // masculine ordinal indicator
399 badEntities.put("»", "\u00BB"); // right-pointing double angle quotation mark
400 badEntities.put("¼", "\u00BC"); // vulgar fraction one quarter
401 badEntities.put("½", "\u00BD"); // vulgar fraction one half
402 badEntities.put("¾", "\u00BE"); // vulgar fraction three quarters
403 badEntities.put("¿", "\u00BF"); // inverted question mark
404 badEntities.put("À", "\u00C0"); // latin capital letter A with grave
405 badEntities.put("Á", "\u00C1"); // latin capital letter A with acute
406 badEntities.put("Â", "\u00C2"); // latin capital letter A with circumflex
407 badEntities.put("Ã", "\u00C3"); // latin capital letter A with tilde
408 badEntities.put("Ä", "\u00C4"); // latin capital letter A with diaeresis
409 badEntities.put("Å", "\u00C5"); // latin capital letter A with ring above
410 badEntities.put("Æ", "\u00C6"); // latin capital letter AE
411 badEntities.put("Ç", "\u00C7"); // latin capital letter C with cedilla
412 badEntities.put("È", "\u00C8"); // latin capital letter E with grave
413 badEntities.put("É", "\u00C9"); // latin capital letter E with acute
414 badEntities.put("Ê", "\u00CA"); // latin capital letter E with circumflex
415 badEntities.put("Ë", "\u00CB"); // latin capital letter E with diaeresis
416 badEntities.put("Ì", "\u00CC"); // latin capital letter I with grave
417 badEntities.put("Í", "\u00CD"); // latin capital letter I with acute
418 badEntities.put("Î", "\u00CE"); // latin capital letter I with circumflex
419 badEntities.put("Ï", "\u00CF"); // latin capital letter I with diaeresis
420 badEntities.put("Ð", "\u00D0"); // latin capital letter ETH
421 badEntities.put("Ñ", "\u00D1"); // latin capital letter N with tilde
422 badEntities.put("Ò", "\u00D2"); // latin capital letter O with grave
423 badEntities.put("Ó", "\u00D3"); // latin capital letter O with acute
424 badEntities.put("Ô", "\u00D4"); // latin capital letter O with circumflex
425 badEntities.put("Õ", "\u00D5"); // latin capital letter O with tilde
426 badEntities.put("Ö", "\u00D6"); // latin capital letter O with diaeresis
427 badEntities.put("×", "\u00D7"); // multiplication sign
428 badEntities.put("Ø", "\u00D8"); // latin capital letter O with stroke
429 badEntities.put("Ù", "\u00D9"); // latin capital letter U with grave
430 badEntities.put("Ú", "\u00DA"); // latin capital letter U with acute
431 badEntities.put("Û", "\u00DB"); // latin capital letter U with circumflex
432 badEntities.put("Ü", "\u00DC"); // latin capital letter U with diaeresis
433 badEntities.put("Ý", "\u00DD"); // latin capital letter Y with acute
434 badEntities.put("Þ", "\u00DE"); // latin capital letter THORN
435 badEntities.put("ß", "\u00DF"); // latin small letter sharp s
436 badEntities.put("à", "\u00E0"); // latin small letter a with grave
437 badEntities.put("á", "\u00E1"); // latin small letter a with acute
438 badEntities.put("â", "\u00E2"); // latin small letter a with circumflex
439 badEntities.put("ã", "\u00E3"); // latin small letter a with tilde
440 badEntities.put("ä", "\u00E4"); // latin small letter a with diaeresis
441 badEntities.put("å", "\u00E5"); // latin small letter a with ring above
442 badEntities.put("æ", "\u00E6"); // latin small letter ae
443 badEntities.put("ç", "\u00E7"); // latin small letter c with cedilla
444 badEntities.put("è", "\u00E8"); // latin small letter e with grave
445 badEntities.put("é", "\u00E9"); // latin small letter e with acute
446 badEntities.put("ê", "\u00EA"); // latin small letter e with circumflex
447 badEntities.put("ë", "\u00EB"); // latin small letter e with diaeresis
448 badEntities.put("ì", "\u00EC"); // latin small letter i with grave
449 badEntities.put("í", "\u00ED"); // latin small letter i with acute
450 badEntities.put("î", "\u00EE"); // latin small letter i with circumflex
451 badEntities.put("ï", "\u00EF"); // latin small letter i with diaeresis
452 badEntities.put("ð", "\u00F0"); // latin small letter eth
453 badEntities.put("ñ", "\u00F1"); // latin small letter n with tilde
454 badEntities.put("ò", "\u00F2"); // latin small letter o with grave
455 badEntities.put("ó", "\u00F3"); // latin small letter o with acute
456 badEntities.put("ô", "\u00F4"); // latin small letter o with circumflex
457 badEntities.put("õ", "\u00F5"); // latin small letter o with tilde
458 badEntities.put("ö", "\u00F6"); // latin small letter o with diaeresis
459 badEntities.put("÷", "\u00F7"); // division sign
460 badEntities.put("ø", "\u00F8"); // latin small letter o with stroke
461 badEntities.put("ù", "\u00F9"); // latin small letter u with grave
462 badEntities.put("ú", "\u00FA"); // latin small letter u with acute
463 badEntities.put("û", "\u00FB"); // latin small letter u with circumflex
464 badEntities.put("ü", "\u00FC"); // latin small letter u with diaeresis
465 badEntities.put("ý", "\u00FD"); // latin small letter y with acute
466 badEntities.put("þ", "\u00FE"); // latin small letter thorn
467 badEntities.put("ÿ", "\u00FF"); // latin small letter y with diaeresis
468 }
469
470 /**
471 * Pattern for numeric entities.
472 */
473 private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
474
475 /**
476 * Pattern that negates the allowable XML 4 byte unicode characters. Valid
477 * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
478 * [#x10000-#x10FFFF]
479 */
480 private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
481
482 /**
483 * Pattern that matches open <br>,<hr> and <img> tags.
484 */
485 private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
486
487 /**
488 * The log stream
489 */
490 private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
491 }
492