1
20 package org.crosswire.common.xml;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Set;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31
32 import org.crosswire.common.util.FileUtil;
33 import org.crosswire.common.util.PropertyMap;
34 import org.crosswire.common.util.ResourceUtil;
35 import org.jdom2.Document;
36 import org.jdom2.JDOMException;
37 import org.jdom2.input.SAXBuilder;
38 import org.jdom2.input.sax.XMLReaders;
39 import org.slf4j.Logger;
40 import org.slf4j.LoggerFactory;
41 import org.xml.sax.Attributes;
42 import org.xml.sax.ContentHandler;
43 import org.xml.sax.SAXException;
44
45
52 public final class XMLUtil {
53
/**
 * Prevent instantiation: every member of this class is static.
 */
private XMLUtil() {
}
58
59
71 public static Document getDocument(String subject) throws JDOMException, IOException {
72 String resource = subject + FileUtil.EXTENSION_XML;
73 InputStream in = ResourceUtil.getResourceAsStream(resource);
74
75 log.debug("Loading {}.xml from classpath: [OK]", subject);
76 SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
78 return builder.build(in);
79 }
80
81
89 public static String writeToString(SAXEventProvider provider) throws SAXException {
90 ContentHandler ser = new PrettySerializingContentHandler();
91 provider.provideSAXEvents(ser);
92 return ser.toString();
93 }
94
95
104 public static String getAttributeName(Attributes attrs, int index) {
105 String qName = attrs.getQName(index);
106 if (qName != null) {
107 return qName;
108 }
109 return attrs.getLocalName(index);
110 }
111
112
116 public static void debugSAXAttributes(Attributes attrs) {
117 for (int i = 0; i < attrs.getLength(); i++) {
118 log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
119 }
120 }
121
122
127 public static String escape(String s) {
128 if (s == null) {
129 return s;
130 }
131 int len = s.length();
132 StringBuilder str = new StringBuilder(len);
133
134 for (int i = 0; i < len; i++) {
135 char ch = s.charAt(i);
136 switch (ch) {
137 case '<':
138 str.append("<");
139 break;
140
141 case '>':
142 str.append(">");
143 break;
144
145 case '&':
146 str.append("&");
147 break;
148
149 case '"':
150 str.append(""");
151 break;
152
153 default:
154 str.append(ch);
155 }
156 }
157
158 return str.toString();
159 }
160
161
171 public static String cleanAllEntities(String broken) {
172 if (broken == null) {
173 return null;
174 }
175
176 String working = broken;
177 int cleanfrom = 0;
178
179 while (true) {
180 int amp = working.indexOf('&', cleanfrom);
181
182 if (amp == -1) {
184 break;
185 }
186
187 if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
189 cleanfrom = working.indexOf(';', amp) + 1;
190 continue;
191 }
192
193 int i = amp + 1;
194 while (true) {
195 if (i >= working.length()) {
197
202 return working.substring(0, amp) + "&" + working.substring(amp + 1);
203 }
204
205 char c = working.charAt(i);
208 if (c == ';') {
209 String entity = working.substring(amp, i + 1);
210 String replace = handleEntity(entity);
211
213 working = working.substring(0, amp) + replace + working.substring(i + 1);
214 break;
215 }
216
217 if (!Character.isLetterOrDigit(c)) {
220
224 working = working.substring(0, amp) + "&" + working.substring(amp + 1);
225 amp = i + 4; break;
227 }
228
229 i++;
230 }
231
232 cleanfrom = amp + 1;
233 }
234
235 return working;
236 }
237
238
249 public static String cleanAllCharacters(String broken) {
250 return invalidCharacterPattern.matcher(broken).replaceAll(" ");
251 }
252
253
262 public static String recloseTags(String broken) {
263 String result = broken;
264 while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) {
266 result = result.substring(0, result.lastIndexOf('<'));
267 }
268 List<String> openTags = new ArrayList<String>();
270 Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
271 boolean lTagFound = false;
272 boolean lgTagFound = false;
273 while (m.find()) {
274 String match = m.group();
275 if (match.startsWith("</")) {
276 if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
277 return recloseTags("<l>" + broken);
278 }
279 if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
280 return recloseTags("<lg>" + broken);
281 }
282 if (openTags.size() == 0) {
283 return null;
284 }
285 String lastTag = openTags.remove(openTags.size() - 1);
286 if (!("</" + lastTag).equals(match)) {
287 return null;
288 }
289 } else {
290 int closePos = result.indexOf('>', m.end());
291 if (closePos == -1) {
292 return null;
293 }
294 while (Character.isWhitespace(result.charAt(closePos - 1))) {
295 --closePos;
296 }
297 if (result.charAt(closePos - 1) != '/') {
298 if ("<l".equals(match)) {
299 lTagFound = true;
300 }
301 if ("<lg".equals(match)) {
302 lgTagFound = true;
303 }
304 openTags.add(match.substring(1));
305 }
306 }
307 }
308 Collections.reverse(openTags);
309 for (String openTag : openTags) {
310 result += "</" + openTag + ">";
311 }
312 return result;
313 }
314
315
323 public static String closeEmptyTags(String broken) {
324 if (broken == null) {
325 return null;
326 }
327
328 return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
329 }
330
331
339 public static String cleanAllTags(String broken) {
340 if (broken == null) {
341 return null;
342 }
343
344 String working = broken;
345
346 allTags: while (true) {
347 int lt = working.indexOf('<');
348
349 if (lt == -1) {
351 break allTags;
352 }
353
354 int i = lt;
356 int startattr = -1;
357
358 singletag: while (true) {
359 i++;
360
361 if (i >= working.length()) {
363 i--;
365 break singletag;
366 }
367
368 char c = working.charAt(i);
369
370 if (c == '>') {
372 break singletag;
373 }
374
375 if (c == ' ') {
377 if (startattr == -1) {
378 startattr = i;
380 } else {
381 String value = working.substring(startattr, i);
385 if (value.indexOf('=') == -1) {
386 break singletag;
389 }
390 }
391 }
392 }
393
394 working = working.substring(0, lt) + " " + working.substring(i + 1);
397 }
398
399 return working;
400 }
401
402
412 private static String handleEntity(String entity) {
413 if (goodEntities.contains(entity)) {
414 return entity;
415 }
416
417 String replace = badEntities.get(entity);
418 if (replace != null) {
419 return replace;
420 }
421
422 return " ";
424 }
425
426 private static Set<String> goodEntities = new HashSet<String>();
428 private static PropertyMap badEntities = new PropertyMap();
429 static {
430 goodEntities.add("""); goodEntities.add("&"); goodEntities.add("<"); goodEntities.add(">");
436 badEntities.put("€", "\u20AC"); badEntities.put("‘", "\u2018"); badEntities.put("’", "\u2019");
441 badEntities.put(" ", "\u00A0"); badEntities.put("¡", "\u00A1"); badEntities.put("¢", "\u00A2"); badEntities.put("£", "\u00A3"); badEntities.put("¤", "\u00A4"); badEntities.put("¥", "\u00A5"); badEntities.put("¦", "\u00A6"); badEntities.put("§", "\u00A7"); badEntities.put("¨", "\u00A8"); badEntities.put("©", "\u00A9"); badEntities.put("ª", "\u00AA"); badEntities.put("«", "\u00AB"); badEntities.put("¬", "\u00AC"); badEntities.put("­", "\u00AD"); badEntities.put("®", "\u00AE"); badEntities.put("¯", "\u00AF"); badEntities.put("°", "\u00B0"); badEntities.put("±", "\u00B1"); badEntities.put("²", "\u00B2"); badEntities.put("³", "\u00B3"); badEntities.put("´", "\u00B4"); badEntities.put("µ", "\u00B5"); badEntities.put("¶", "\u00B6"); badEntities.put("·", "\u00B7"); badEntities.put("¸", "\u00B8"); badEntities.put("¹", "\u00B9"); badEntities.put("º", "\u00BA"); badEntities.put("»", "\u00BB"); badEntities.put("¼", "\u00BC"); badEntities.put("½", "\u00BD"); badEntities.put("¾", "\u00BE"); badEntities.put("¿", "\u00BF"); badEntities.put("À", "\u00C0"); badEntities.put("Á", "\u00C1"); badEntities.put("Â", "\u00C2"); badEntities.put("Ã", "\u00C3"); badEntities.put("Ä", "\u00C4"); badEntities.put("Å", "\u00C5"); badEntities.put("Æ", "\u00C6"); badEntities.put("Ç", "\u00C7"); badEntities.put("È", "\u00C8"); badEntities.put("É", "\u00C9"); badEntities.put("Ê", "\u00CA"); badEntities.put("Ë", "\u00CB"); badEntities.put("Ì", "\u00CC"); badEntities.put("Í", "\u00CD"); badEntities.put("Î", "\u00CE"); badEntities.put("Ï", "\u00CF"); badEntities.put("Ð", "\u00D0"); badEntities.put("Ñ", "\u00D1"); badEntities.put("Ò", "\u00D2"); badEntities.put("Ó", "\u00D3"); badEntities.put("Ô", "\u00D4"); badEntities.put("Õ", "\u00D5"); badEntities.put("Ö", "\u00D6"); badEntities.put("×", "\u00D7"); badEntities.put("Ø", "\u00D8"); badEntities.put("Ù", "\u00D9"); badEntities.put("Ú", "\u00DA"); badEntities.put("Û", "\u00DB"); badEntities.put("Ü", "\u00DC"); badEntities.put("Ý", "\u00DD"); 
badEntities.put("Þ", "\u00DE"); badEntities.put("ß", "\u00DF"); badEntities.put("à", "\u00E0"); badEntities.put("á", "\u00E1"); badEntities.put("â", "\u00E2"); badEntities.put("ã", "\u00E3"); badEntities.put("ä", "\u00E4"); badEntities.put("å", "\u00E5"); badEntities.put("æ", "\u00E6"); badEntities.put("ç", "\u00E7"); badEntities.put("è", "\u00E8"); badEntities.put("é", "\u00E9"); badEntities.put("ê", "\u00EA"); badEntities.put("ë", "\u00EB"); badEntities.put("ì", "\u00EC"); badEntities.put("í", "\u00ED"); badEntities.put("î", "\u00EE"); badEntities.put("ï", "\u00EF"); badEntities.put("ð", "\u00F0"); badEntities.put("ñ", "\u00F1"); badEntities.put("ò", "\u00F2"); badEntities.put("ó", "\u00F3"); badEntities.put("ô", "\u00F4"); badEntities.put("õ", "\u00F5"); badEntities.put("ö", "\u00F6"); badEntities.put("÷", "\u00F7"); badEntities.put("ø", "\u00F8"); badEntities.put("ù", "\u00F9"); badEntities.put("ú", "\u00FA"); badEntities.put("û", "\u00FB"); badEntities.put("ü", "\u00FC"); badEntities.put("ý", "\u00FD"); badEntities.put("þ", "\u00FE"); badEntities.put("ÿ", "\u00FF"); }
539
540
543 private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
544
545
550 private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
551
552
555 private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
556
557
560 private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
561 }
562