| XMLUtil.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 or later
5 * as published by the Free Software Foundation. This program is distributed
6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * © CrossWire Bible Society, 2005 - 2016
18 *
19 */
20 package org.crosswire.common.xml;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Set;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31
32 import org.crosswire.common.util.FileUtil;
33 import org.crosswire.common.util.PropertyMap;
34 import org.crosswire.common.util.ResourceUtil;
35 import org.jdom2.Document;
36 import org.jdom2.JDOMException;
37 import org.jdom2.input.SAXBuilder;
38 import org.jdom2.input.sax.XMLReaders;
39 import org.slf4j.Logger;
40 import org.slf4j.LoggerFactory;
41 import org.xml.sax.Attributes;
42 import org.xml.sax.ContentHandler;
43 import org.xml.sax.SAXException;
44
45 /**
46 * Utilities for working with SAX XML parsing.
47 *
48 * @see gnu.lgpl.License The GNU Lesser General Public License for details.
49 * @author Joe Walker
50 * @author DM Smith
51 */
52 public final class XMLUtil {
/**
 * Private constructor: every member of this class is static, so no instance
 * is ever needed.
 */
private XMLUtil() {
    // intentionally empty
}
58
59 /**
60 * Get and load an XML file from the classpath and a few other places into a
61 * JDOM Document object.
62 *
63 * @param subject
64 * The name of the desired resource (without any extension)
65 * @return The requested resource
66 * @throws IOException
67 * if there is a problem reading the file
68 * @throws JDOMException
69 * If the resource is not valid XML
70 */
71 public static Document getDocument(String subject) throws JDOMException, IOException {
72 String resource = subject + FileUtil.EXTENSION_XML;
73 InputStream in = ResourceUtil.getResourceAsStream(resource);
74
75 log.debug("Loading {}.xml from classpath: [OK]", subject);
76 // With JDom 1.x this passed true
77 SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
78 return builder.build(in);
79 }
80
81 /**
82 * Serialize a SAXEventProvider into an XML String
83 *
84 * @param provider
85 * The source of SAX events
86 * @return a serialized string
87 * @throws SAXException
88 */
89 public static String writeToString(SAXEventProvider provider) throws SAXException {
90 ContentHandler ser = new PrettySerializingContentHandler();
91 provider.provideSAXEvents(ser);
92 return ser.toString();
93 }
94
95 /**
96 * Get the full name of the attribute, including the namespace if any.
97 *
98 * @param attrs
99 * the collection of attributes
100 * @param index
101 * the index of the desired attribute
102 * @return the requested attribute
103 */
104 public static String getAttributeName(Attributes attrs, int index) {
105 String qName = attrs.getQName(index);
106 if (qName != null) {
107 return qName;
108 }
109 return attrs.getLocalName(index);
110 }
111
112 /**
113 * Show the attributes of an element as debug
114 * @param attrs
115 */
116 public static void debugSAXAttributes(Attributes attrs) {
117 for (int i = 0; i < attrs.getLength(); i++) {
118 log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
119 }
120 }
121
122 /**
123 * Normalizes the given string
124 * @param s
125 * @return the escaped string
126 */
127 public static String escape(String s) {
128 if (s == null) {
129 return s;
130 }
131 int len = s.length();
132 StringBuilder str = new StringBuilder(len);
133
134 for (int i = 0; i < len; i++) {
135 char ch = s.charAt(i);
136 switch (ch) {
137 case '<':
138 str.append("<");
139 break;
140
141 case '>':
142 str.append(">");
143 break;
144
145 case '&':
146 str.append("&");
147 break;
148
149 case '"':
150 str.append(""");
151 break;
152
153 default:
154 str.append(ch);
155 }
156 }
157
158 return str.toString();
159 }
160
161 /**
162 * For each entity in the input that is not allowed in XML, replace the
163 * entity with its unicode equivalent or remove it. For each instance of a
164 * bare &, replace it with &<br>
165 * XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;.
166 *
167 * @param broken
168 * the string to handle entities
169 * @return the string with entities appropriately fixed up
170 */
171 public static String cleanAllEntities(String broken) {
172 if (broken == null) {
173 return null;
174 }
175
176 String working = broken;
177 int cleanfrom = 0;
178
179 while (true) {
180 int amp = working.indexOf('&', cleanfrom);
181
182 // If there are no more amps then we are done
183 if (amp == -1) {
184 break;
185 }
186
187 // Skip references of the kind &#ddd;
188 if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
189 cleanfrom = working.indexOf(';', amp) + 1;
190 continue;
191 }
192
193 int i = amp + 1;
194 while (true) {
195 // if we are at the end of the string then just escape the '&';
196 if (i >= working.length()) {
197 // String entity = working.substring(amp);
198 // String replace = guessEntity(entity);
199 // DataPolice.report("replacing unterminated entity: '" +
200 // entity + "' with: '" + replace + "'");
201
202 return working.substring(0, amp) + "&" + working.substring(amp + 1);
203 }
204
205 // if we have come to a ; then we have an entity
206 // If it is something that xml can't handle then replace it.
207 char c = working.charAt(i);
208 if (c == ';') {
209 String entity = working.substring(amp, i + 1);
210 String replace = handleEntity(entity);
211 // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
212
213 working = working.substring(0, amp) + replace + working.substring(i + 1);
214 break;
215 }
216
217 // Did we end an entity without finding a closing ;
218 // Then treat it as an '&' that needs to be replaced with &
219 if (!Character.isLetterOrDigit(c)) {
220 // String entity = working.substring(amp, i);
221 // String replace = "&" + working.substring(amp + 1, i);
222 // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
223
224 working = working.substring(0, amp) + "&" + working.substring(amp + 1);
225 amp = i + 4; // account for the 4 extra characters
226 break;
227 }
228
229 i++;
230 }
231
232 cleanfrom = amp + 1;
233 }
234
235 return working;
236 }
237
238 /**
239 * Remove all invalid characters in the input, replacing them with a space. XML has stringent
240 * requirements as to which characters are or are not allowed. The set of
241 * allowable characters are:<br>
242 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br>
243 * Note: Java handles to
244 *
245 * @param broken
246 * the string to be cleaned
247 * @return the cleaned string
248 */
249 public static String cleanAllCharacters(String broken) {
250 return invalidCharacterPattern.matcher(broken).replaceAll(" ");
251 }
252
253 /**
254 * Strip all closing tags from the end of the XML fragment, and then
255 * re-close all tags that are open at the end of the string.
256 *
257 * @param broken
258 * the string to be cleaned.
259 * @return cleaned string, or {@code null} if the string could not be
260 * cleaned due to more broken XML
261 */
262 public static String recloseTags(String broken) {
263 String result = broken;
264 // remove closing tags from the end
265 while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) {
266 result = result.substring(0, result.lastIndexOf('<'));
267 }
268 // close tags again
269 List<String> openTags = new ArrayList<String>();
270 Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
271 boolean lTagFound = false;
272 boolean lgTagFound = false;
273 while (m.find()) {
274 String match = m.group();
275 if (match.startsWith("</")) {
276 if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
277 return recloseTags("<l>" + broken);
278 }
279 if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
280 return recloseTags("<lg>" + broken);
281 }
282 if (openTags.size() == 0) {
283 return null;
284 }
285 String lastTag = openTags.remove(openTags.size() - 1);
286 if (!("</" + lastTag).equals(match)) {
287 return null;
288 }
289 } else {
290 int closePos = result.indexOf('>', m.end());
291 if (closePos == -1) {
292 return null;
293 }
294 while (Character.isWhitespace(result.charAt(closePos - 1))) {
295 --closePos;
296 }
297 if (result.charAt(closePos - 1) != '/') {
298 if ("<l".equals(match)) {
299 lTagFound = true;
300 }
301 if ("<lg".equals(match)) {
302 lgTagFound = true;
303 }
304 openTags.add(match.substring(1));
305 }
306 }
307 }
308 Collections.reverse(openTags);
309 for (String openTag : openTags) {
310 result += "</" + openTag + ">";
311 }
312 return result;
313 }
314
315 /**
316 * Common HTML tags such as <br>,<hr> and <img> may be
317 * left open causing XML parsing to fail. This method closes these tags.
318 *
319 * @param broken
320 * the string to be cleaned
321 * @return the cleaned string
322 */
323 public static String closeEmptyTags(String broken) {
324 if (broken == null) {
325 return null;
326 }
327
328 return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
329 }
330
331 /**
332 * XML parse failed, so we can try getting rid of all the tags and having
333 * another go. We define a tag to start at a < and end at the end of the
334 * next word (where a word is what comes in between spaces) that does not
335 * contain an = sign, or at a >, whichever is earlier.
336 * @param broken
337 * @return the string without any tags
338 */
339 public static String cleanAllTags(String broken) {
340 if (broken == null) {
341 return null;
342 }
343
344 String working = broken;
345
346 allTags: while (true) {
347 int lt = working.indexOf('<');
348
349 // If there are no more amps then we are done
350 if (lt == -1) {
351 break allTags;
352 }
353
354 // loop to find the end of this tag
355 int i = lt;
356 int startattr = -1;
357
358 singletag: while (true) {
359 i++;
360
361 // the tag can't exist past the end of the string
362 if (i >= working.length()) {
363 // go back one so we can safely chop
364 i--;
365 break singletag;
366 }
367
368 char c = working.charAt(i);
369
370 // normal end of tag
371 if (c == '>') {
372 break singletag;
373 }
374
375 // we declare end-of-tag if this 'word' is not an attribute
376 if (c == ' ') {
377 if (startattr == -1) {
378 // NOTE(joe): should we skip over consecutive spaces?
379 startattr = i;
380 } else {
381 // so we've already had a space indicating start of
382 // attribute, so this must be the beginning of the next
383 // NOTE(joe): no - spaces can exist in attr values
384 String value = working.substring(startattr, i);
385 if (value.indexOf('=') == -1) {
386 // this 'attribute' does not contain an equals so
387 // we call it a word and end the parse
388 break singletag;
389 }
390 }
391 }
392 }
393
394 // So we have the end of the tag, delete it, but leave a space in it's place
395 // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
396 working = working.substring(0, lt) + " " + working.substring(i + 1);
397 }
398
399 return working;
400 }
401
402 /**
403 * Replace entity with its unicode equivalent, if it is not a valid XML
404 * entity. Otherwise strip it out. XML only allows 4 entities: &amp;,
405 * &quot;, &lt; and &gt;.
406 *
407 * @param entity
408 * the entity to be replaced
409 * @return the substitution for the entity, either itself, the unicode
410 * equivalent or an empty string.
411 */
412 private static String handleEntity(String entity) {
413 if (goodEntities.contains(entity)) {
414 return entity;
415 }
416
417 String replace = badEntities.get(entity);
418 if (replace != null) {
419 return replace;
420 }
421
422 // replace unknown entities with a space
423 return " ";
424 }
425
426 // Map entities to their unicode equivalent
427 private static Set<String> goodEntities = new HashSet<String>();
428 private static PropertyMap badEntities = new PropertyMap();
429 static {
430 // pre-defined XML entities
431 goodEntities.add("""); // quotation mark
432 goodEntities.add("&"); // ampersand
433 goodEntities.add("<"); // less-than sign
434 goodEntities.add(">"); // greater-than sign
435
436 // misc entities
437 badEntities.put("€", "\u20AC"); // euro
438 badEntities.put("‘", "\u2018"); // left single quotation mark
439 badEntities.put("’", "\u2019"); // right single quotation mark
440
441 // Latin 1 entities
442 badEntities.put(" ", "\u00A0"); // no-break space
443 badEntities.put("¡", "\u00A1"); // inverted exclamation mark
444 badEntities.put("¢", "\u00A2"); // cent sign
445 badEntities.put("£", "\u00A3"); // pound sign
446 badEntities.put("¤", "\u00A4"); // currency sign
447 badEntities.put("¥", "\u00A5"); // yen sign
448 badEntities.put("¦", "\u00A6"); // broken vertical bar
449 badEntities.put("§", "\u00A7"); // section sign
450 badEntities.put("¨", "\u00A8"); // diaeresis
451 badEntities.put("©", "\u00A9"); // copyright sign
452 badEntities.put("ª", "\u00AA"); // feminine ordinal indicator
453 badEntities.put("«", "\u00AB"); // left-pointing double angle quotation mark
454 badEntities.put("¬", "\u00AC"); // not sign
455 badEntities.put("­", "\u00AD"); // soft hyphen
456 badEntities.put("®", "\u00AE"); // registered sign
457 badEntities.put("¯", "\u00AF"); // macron
458 badEntities.put("°", "\u00B0"); // degree sign
459 badEntities.put("±", "\u00B1"); // plus-minus sign
460 badEntities.put("²", "\u00B2"); // superscript two
461 badEntities.put("³", "\u00B3"); // superscript three
462 badEntities.put("´", "\u00B4"); // acute accent
463 badEntities.put("µ", "\u00B5"); // micro sign
464 badEntities.put("¶", "\u00B6"); // pilcrow sign
465 badEntities.put("·", "\u00B7"); // middle dot
466 badEntities.put("¸", "\u00B8"); // cedilla
467 badEntities.put("¹", "\u00B9"); // superscript one
468 badEntities.put("º", "\u00BA"); // masculine ordinal indicator
469 badEntities.put("»", "\u00BB"); // right-pointing double angle quotation mark
470 badEntities.put("¼", "\u00BC"); // vulgar fraction one quarter
471 badEntities.put("½", "\u00BD"); // vulgar fraction one half
472 badEntities.put("¾", "\u00BE"); // vulgar fraction three quarters
473 badEntities.put("¿", "\u00BF"); // inverted question mark
474 badEntities.put("À", "\u00C0"); // latin capital letter A with grave
475 badEntities.put("Á", "\u00C1"); // latin capital letter A with acute
476 badEntities.put("Â", "\u00C2"); // latin capital letter A with circumflex
477 badEntities.put("Ã", "\u00C3"); // latin capital letter A with tilde
478 badEntities.put("Ä", "\u00C4"); // latin capital letter A with diaeresis
479 badEntities.put("Å", "\u00C5"); // latin capital letter A with ring above
480 badEntities.put("Æ", "\u00C6"); // latin capital letter AE
481 badEntities.put("Ç", "\u00C7"); // latin capital letter C with cedilla
482 badEntities.put("È", "\u00C8"); // latin capital letter E with grave
483 badEntities.put("É", "\u00C9"); // latin capital letter E with acute
484 badEntities.put("Ê", "\u00CA"); // latin capital letter E with circumflex
485 badEntities.put("Ë", "\u00CB"); // latin capital letter E with diaeresis
486 badEntities.put("Ì", "\u00CC"); // latin capital letter I with grave
487 badEntities.put("Í", "\u00CD"); // latin capital letter I with acute
488 badEntities.put("Î", "\u00CE"); // latin capital letter I with circumflex
489 badEntities.put("Ï", "\u00CF"); // latin capital letter I with diaeresis
490 badEntities.put("Ð", "\u00D0"); // latin capital letter ETH
491 badEntities.put("Ñ", "\u00D1"); // latin capital letter N with tilde
492 badEntities.put("Ò", "\u00D2"); // latin capital letter O with grave
493 badEntities.put("Ó", "\u00D3"); // latin capital letter O with acute
494 badEntities.put("Ô", "\u00D4"); // latin capital letter O with circumflex
495 badEntities.put("Õ", "\u00D5"); // latin capital letter O with tilde
496 badEntities.put("Ö", "\u00D6"); // latin capital letter O with diaeresis
497 badEntities.put("×", "\u00D7"); // multiplication sign
498 badEntities.put("Ø", "\u00D8"); // latin capital letter O with stroke
499 badEntities.put("Ù", "\u00D9"); // latin capital letter U with grave
500 badEntities.put("Ú", "\u00DA"); // latin capital letter U with acute
501 badEntities.put("Û", "\u00DB"); // latin capital letter U with circumflex
502 badEntities.put("Ü", "\u00DC"); // latin capital letter U with diaeresis
503 badEntities.put("Ý", "\u00DD"); // latin capital letter Y with acute
504 badEntities.put("Þ", "\u00DE"); // latin capital letter THORN
505 badEntities.put("ß", "\u00DF"); // latin small letter sharp s
506 badEntities.put("à", "\u00E0"); // latin small letter a with grave
507 badEntities.put("á", "\u00E1"); // latin small letter a with acute
508 badEntities.put("â", "\u00E2"); // latin small letter a with circumflex
509 badEntities.put("ã", "\u00E3"); // latin small letter a with tilde
510 badEntities.put("ä", "\u00E4"); // latin small letter a with diaeresis
511 badEntities.put("å", "\u00E5"); // latin small letter a with ring above
512 badEntities.put("æ", "\u00E6"); // latin small letter ae
513 badEntities.put("ç", "\u00E7"); // latin small letter c with cedilla
514 badEntities.put("è", "\u00E8"); // latin small letter e with grave
515 badEntities.put("é", "\u00E9"); // latin small letter e with acute
516 badEntities.put("ê", "\u00EA"); // latin small letter e with circumflex
517 badEntities.put("ë", "\u00EB"); // latin small letter e with diaeresis
518 badEntities.put("ì", "\u00EC"); // latin small letter i with grave
519 badEntities.put("í", "\u00ED"); // latin small letter i with acute
520 badEntities.put("î", "\u00EE"); // latin small letter i with circumflex
521 badEntities.put("ï", "\u00EF"); // latin small letter i with diaeresis
522 badEntities.put("ð", "\u00F0"); // latin small letter eth
523 badEntities.put("ñ", "\u00F1"); // latin small letter n with tilde
524 badEntities.put("ò", "\u00F2"); // latin small letter o with grave
525 badEntities.put("ó", "\u00F3"); // latin small letter o with acute
526 badEntities.put("ô", "\u00F4"); // latin small letter o with circumflex
527 badEntities.put("õ", "\u00F5"); // latin small letter o with tilde
528 badEntities.put("ö", "\u00F6"); // latin small letter o with diaeresis
529 badEntities.put("÷", "\u00F7"); // division sign
530 badEntities.put("ø", "\u00F8"); // latin small letter o with stroke
531 badEntities.put("ù", "\u00F9"); // latin small letter u with grave
532 badEntities.put("ú", "\u00FA"); // latin small letter u with acute
533 badEntities.put("û", "\u00FB"); // latin small letter u with circumflex
534 badEntities.put("ü", "\u00FC"); // latin small letter u with diaeresis
535 badEntities.put("ý", "\u00FD"); // latin small letter y with acute
536 badEntities.put("þ", "\u00FE"); // latin small letter thorn
537 badEntities.put("ÿ", "\u00FF"); // latin small letter y with diaeresis
538 }
539
540 /**
541 * Pattern for numeric entities.
542 */
543 private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
544
545 /**
546 * Pattern that negates the allowable XML 4 byte unicode characters. Valid
547 * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
548 * [#x10000-#x10FFFF]
549 */
550 private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
551
552 /**
553 * Pattern that matches open <br>,<hr> and <img> tags.
554 */
555 private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
556
557 /**
558 * The log stream
559 */
560 private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
561 }
562