1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
18   *     The copyright to this program is held by it's authors.
19   *
20   * ID: $Id: THMLFilter.java 2223 2012-01-26 21:28:02Z dmsmith $
21   */
22  package org.crosswire.jsword.book.filter.thml;
23  
24  import java.io.IOException;
25  import java.io.StringReader;
26  import java.util.List;
27  
28  import javax.xml.parsers.ParserConfigurationException;
29  import javax.xml.parsers.SAXParser;
30  import javax.xml.parsers.SAXParserFactory;
31  
32  import org.crosswire.common.util.Logger;
33  import org.crosswire.common.xml.XMLUtil;
34  import org.crosswire.jsword.book.Book;
35  import org.crosswire.jsword.book.OSISUtil;
36  import org.crosswire.jsword.book.filter.Filter;
37  import org.crosswire.jsword.passage.Key;
38  import org.jdom.Content;
39  import org.jdom.Element;
40  import org.xml.sax.InputSource;
41  import org.xml.sax.SAXException;
42  import org.xml.sax.SAXParseException;
43  
44  /**
45   * Filter to convert THML to OSIS format.
46   *
47   * <p>
48   * I used the THML ref page: <a
49   * href="http://www.ccel.org/ThML/ThML1.04.htm">http
50   * ://www.ccel.org/ThML/ThML1.04.htm</a> to work out what the tags meant.
51   *
52   * LATER(joe): check nesting on these THML elements
53   *
54   * @see gnu.lgpl.License for license details.<br>
55   *      The copyright to this program is held by it's authors.
56   * @author Joe Walker [joe at eireneh dot com]
57   */
58  public class THMLFilter implements Filter {
59      /* (non-Javadoc)
60       * @see org.crosswire.jsword.book.filter.Filter#toOSIS(org.crosswire.jsword.book.Book, org.crosswire.jsword.passage.Key, java.lang.String)
61       */
62      public List<Content> toOSIS(Book book, Key key, String plain) {
63          Element ele = cleanParse(book, key, plain);
64  
65          if (ele == null) {
66              if (error instanceof SAXParseException) {
67                  SAXParseException spe = (SAXParseException) error;
68                  int colNumber = spe.getColumnNumber();
69                  int start = Math.max(0, colNumber - 40);
70                  int stop = Math.min(finalInput.length(), colNumber + 40);
71                  int here = stop - start;
72                  log.warn("Could not fix " + book.getInitials() + '(' + key.getName() + ") by "
73                           + errorMessage + ": Error here(" + colNumber + ',' + finalInput.length() + ',' + here + "): " + finalInput.substring(start, stop));
74              } else {
75                  log.warn("Could not fix " + book.getInitials() + "(" + key.getName() + ") by " + errorMessage + ": " + error.getMessage());
76              }
77              ele = OSISUtil.factory().createP();
78          }
79  
80          return ele.removeContent();
81      }
82  
83      @Override
84      public THMLFilter clone() {
85          THMLFilter clone = null;
86          try {
87              clone = (THMLFilter) super.clone();
88          } catch (CloneNotSupportedException e) {
89              assert false : e;
90          }
91          return clone;
92      }
93  
94      private Element cleanParse(Book book, Key key, String plain) {
95          // So just try to strip out all XML looking things
96          String clean = XMLUtil.cleanAllEntities(plain);
97          Element ele = parse(book, key, clean, "cleaning entities");
98  
99          if (ele == null) {
100             ele = cleanText(book, key, clean);
101         }
102 
103         return ele;
104     }
105 
106     private Element cleanText(Book book, Key key, String plain) {
107         // So just try to strip out all XML looking things
108         String clean = XMLUtil.cleanAllCharacters(plain);
109         Element ele = parse(book, key, clean, "cleaning text");
110 
111         if (ele == null) {
112             ele = parse(book, key, XMLUtil.closeEmptyTags(clean), "closing empty tags");
113         }
114 
115         if (ele == null) {
116             ele = cleanTags(book, key, clean);
117         }
118 
119         return ele;
120     }
121 
122     private Element cleanTags(Book book, Key key, String plain) {
123         // So just try to strip out all XML looking things
124         String clean = XMLUtil.cleanAllTags(plain);
125         return parse(book, key, clean, "cleaning tags");
126     }
127 
128     private Element parse(Book book, Key key, String plain, String failMessage) {
129         Exception ex = null;
130         // We need to create a root element to house our document fragment
131         // 15 for the tags we add
132         StringBuilder buf = new StringBuilder(15 + plain.length());
133         buf.append('<').append(RootTag.TAG_ROOT).append('>').append(plain).append("</").append(RootTag.TAG_ROOT).append('>');
134         finalInput = buf.toString();
135         try {
136             StringReader in = new StringReader(finalInput);
137             InputSource is = new InputSource(in);
138             SAXParserFactory spf = SAXParserFactory.newInstance();
139             SAXParser parser = spf.newSAXParser();
140             CustomHandler handler = new CustomHandler(book, key);
141 
142             parser.parse(is, handler);
143             return handler.getRootElement();
144         } catch (SAXParseException e) {
145             ex = e;
146         } catch (SAXException e) {
147             ex = e;
148         } catch (IOException e) {
149             ex = e;
150         } catch (ParserConfigurationException e) {
151             ex = e;
152         } catch (IllegalArgumentException e) {
153             // JDOM has a few exceptions which are all derived from this.
154             ex = e;
155         } catch (RuntimeException e) {
156             // Catch everything else so that we handle the exception properly within a Sw*ng callback
157             ex = e;
158         }
159 
160         errorMessage = failMessage;
161         error = ex;
162         return null;
163     }
164 
165     private String errorMessage;
166     private Exception error;
167     private String finalInput;
168 
169     /**
170      * The log stream
171      */
172     private static final Logger log = Logger.getLogger(THMLFilter.class);
173 }
174