1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
18   *     The copyright to this program is held by it's authors.
19   *
20   * ID: $Id: OSISFilter.java 2221 2012-01-25 21:32:57Z dmsmith $
21   */
22  package org.crosswire.jsword.book.filter.osis;
23  
24  import java.io.IOException;
25  import java.io.StringReader;
26  import java.util.List;
27  
28  import org.crosswire.common.util.Logger;
29  import org.crosswire.common.xml.XMLUtil;
30  import org.crosswire.jsword.book.Book;
31  import org.crosswire.jsword.book.DataPolice;
32  import org.crosswire.jsword.book.OSISUtil;
33  import org.crosswire.jsword.book.filter.Filter;
34  import org.crosswire.jsword.passage.Key;
35  import org.jdom.Content;
36  import org.jdom.Document;
37  import org.jdom.Element;
38  import org.jdom.JDOMException;
39  import org.jdom.input.SAXBuilder;
40  import org.xml.sax.InputSource;
41  
42  /**
43   * Filter to convert an OSIS XML string to OSIS format.
44   * 
45   * @see gnu.lgpl.License for license details.<br>
46   *      The copyright to this program is held by it's authors.
47   * @author Joe Walker [joe at eireneh dot com]
48   */
49  public class OSISFilter implements Filter {
50      /**
51       * Default constructor of an OSISFilter
52       */
53      public OSISFilter() {
54          builder = new SAXBuilder();
55          builder.setFastReconfigure(true);
56      }
57  
58      /* (non-Javadoc)
59       * @see org.crosswire.jsword.book.filter.Filter#toOSIS(org.crosswire.jsword.book.Book, org.crosswire.jsword.passage.Key, java.lang.String)
60       */
61      public List<Content> toOSIS(Book book, Key key, String plain) {
62          Element ele = null;
63          Exception ex = null;
64          String clean = plain;
65  
66          // FIXME(dms): this is a major HACK handling a problem with a badly
67          // encoded module.
68          if (book.getInitials().startsWith("NET") && plain.endsWith("</div>")) {
69              clean = clean.substring(0, plain.length() - 6);
70          }
71  
72          try {
73              ele = parse(clean);
74          } catch (JDOMException e) {
75              ex = e;
76          } catch (IOException e) {
77              ex = e;
78          }
79  
80          if (ele == null) {
81              clean = XMLUtil.cleanAllEntities(clean);
82  
83              try {
84                  ele = parse(clean);
85              } catch (JDOMException e) {
86                  ex = e;
87              } catch (IOException e) {
88                  ex = e;
89              }
90          }
91  
92          if (ex != null) {
93              DataPolice.report(book, key, "Parse " + book.getInitials() + "(" + key.getName() + ") failed: " + ex.getMessage() + "\non: " + plain);
94              ele = cleanTags(book, key, clean);
95          }
96  
97          if (ele == null) {
98              ele = OSISUtil.factory().createP();
99          }
100 
101         return ele.removeContent();
102     }
103 
104     @Override
105     public OSISFilter clone() {
106         OSISFilter clone = null;
107         try {
108             clone = (OSISFilter) super.clone();
109         } catch (CloneNotSupportedException e) {
110             assert false : e;
111         }
112         return clone;
113     }
114 
115     private Element cleanTags(Book book, Key key, String plain) {
116         // So just try to strip out all XML looking things
117         String shawn = XMLUtil.cleanAllTags(plain);
118         Exception ex = null;
119         try {
120             return parse(shawn);
121         } catch (JDOMException e) {
122             ex = e;
123         } catch (IOException e) {
124             ex = e;
125         }
126 
127         log.warn("Could not fix " + book.getInitials() + "(" + key.getName() + ")  by cleaning tags: " + ex.getMessage());
128 
129         return null;
130     }
131 
132     /**
133      * If the string is invalid then we might want to have more than one crack
134      * at parsing it
135      */
136     private Element parse(String plain) throws JDOMException, IOException {
137         // create a root element to house our document fragment
138         StringReader in = new StringReader("<div>" + plain + "</div>");
139         InputSource is = new InputSource(in);
140         Document doc = builder.build(is);
141         Element div = doc.getRootElement();
142 
143         return div;
144     }
145 
146     /**
147      * The log stream
148      */
149     private static final Logger log = Logger.getLogger(OSISFilter.class);
150 
151     /**
152      * A reusable SAX Builder
153      */
154     private SAXBuilder builder;
155 }
156