1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005 - 2012
18   *     The copyright to this program is held by it's authors.
19   *
20   * ID: $Id: CustomHandler.java 2221 2012-01-25 21:32:57Z dmsmith $
21   */
22  package org.crosswire.jsword.book.filter.thml;
23  
24  import java.util.HashMap;
25  import java.util.LinkedList;
26  import java.util.Locale;
27  import java.util.Map;
28  
29  import org.crosswire.jsword.book.Book;
30  import org.crosswire.jsword.book.DataPolice;
31  import org.crosswire.jsword.passage.Key;
32  import org.jdom.Content;
33  import org.jdom.Element;
34  import org.jdom.Text;
35  import org.xml.sax.Attributes;
36  import org.xml.sax.SAXException;
37  import org.xml.sax.helpers.DefaultHandler;
38  
39  /**
40   * To convert SAX events into OSIS events.
41   * 
42   * <p>
43   * This is based upon the THML reference page:
44   * <a href="http://www.ccel.org/ThML/ThML1.04.htm">http://www.ccel.org/ThML/ThML1.04.htm</a>
45   * to work out what the tags meant.
46   * 
47   * @see gnu.lgpl.License for license details.<br>
48   *      The copyright to this program is held by it's authors.
49   * @author Joe Walker [joe at eireneh dot com]
50   */
51  public class CustomHandler extends DefaultHandler {
52      /**
53       * Simple ctor
54       */
55      public CustomHandler(Book book, Key key) {
56          this.book = book;
57          this.key = key;
58          this.stack = new LinkedList<Content>();
59      }
60  
61      @Override
62      public void startElement(String uri, String localname, String qname, Attributes attrs) throws SAXException {
63          Element ele = null;
64  
65          // If we are looking at the root element
66          // then the stack is empty
67          if (!stack.isEmpty()) {
68              Object top = stack.getFirst();
69  
70              // If the element and its descendants are to be ignored
71              // then there is a null element on the stack
72              if (top == null) {
73                  return;
74              }
75  
76              // It might be a text element
77              if (top instanceof Element) {
78                  ele = (Element) top;
79              }
80          }
81  
82          Tag t = getTag(localname, qname);
83  
84          if (t != null) {
85              stack.addFirst(t.processTag(book, key, ele, attrs));
86          }
87      }
88  
89      @Override
90      public void characters(char[] data, int offset, int length) {
91          // what we are adding
92          String text = new String(data, offset, length);
93  
94          if (stack.isEmpty()) {
95              stack.addFirst(new Text(text));
96              return;
97          }
98  
99          // What we are adding to
100         Content top = stack.getFirst();
101 
102         // If the element and its descendants are to be ignored
103         // then there is a null element on the stack
104         if (top == null) {
105             return;
106         }
107 
108         if (top instanceof Text) {
109             ((Text) top).append(text);
110             return;
111         }
112 
113         if (top instanceof Element) {
114             Element current = (Element) top;
115 
116             int size = current.getContentSize();
117 
118             // If the last element in the list is a string then we should add
119             // this string on to the end of it rather than add a new list item
120             // because (probably as an artifact of the HTML/XSL transform we get
121             // a space inserted in the output even when 2 calls to this method
122             // split a word.
123             if (size > 0) {
124                 Content last = current.getContent(size - 1);
125                 if (last instanceof Text) {
126                     ((Text) last).append(text);
127                     return;
128                 }
129             }
130             current.addContent(new Text(text));
131         }
132     }
133 
134     @Override
135     public void endElement(String uri, String localname, String qname) {
136         if (stack.isEmpty()) {
137             return;
138         }
139         // When we are done processing an element we need to remove
140         // it from the stack so that nothing more is attached to it.
141         Content top = stack.removeFirst();
142         if (top instanceof Element) {
143             Element finished = (Element) top;
144             Tag t = getTag(localname, qname);
145 
146             if (t != null) {
147                 t.processContent(book, key, finished);
148             }
149 
150             // If it was the last element then it was the root element
151             // so save it
152             if (stack.isEmpty()) {
153                 rootElement = finished;
154             }
155         }
156     }
157 
158     public Element getRootElement() {
159         return rootElement;
160     }
161 
162     private Tag getTag(String localname, String qname) {
163         Tag t = TAG_MAP.get(qname);
164 
165         // Some of the THML books are broken in that they use uppercase
166         // element names, which the spec disallows, but we might as well
167         // look out for them
168         if (t == null) {
169             t = TAG_MAP.get(qname.toLowerCase(Locale.ENGLISH));
170 
171             if (t == null) {
172                 DataPolice.report(book, key, "Unknown thml element: " + localname + " qname=" + qname);
173 
174                 // Report on it only once and make sure the content is output.
175                 t = new AnonymousTag(qname);
176                 TAG_MAP.put(qname, t);
177                 return t;
178             }
179 
180             DataPolice.report(book, key, "Wrong case used in thml element: " + qname);
181         }
182         return t;
183     }
184 
185     /**
186      * When the document is parsed, this is the last element popped off the
187      * stack.
188      */
189     private Element rootElement;
190 
191     /**
192      * The book being parsed.
193      */
194     private Book book;
195 
196     /**
197      * The book being parsed.
198      */
199     private Key key;
200 
201     /**
202      * The stack of elements that we have created
203      */
204     private LinkedList<Content> stack;
205 
206     /**
207      * The known tag types
208      */
209     private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>();
210 
211     static {
212         /*
213          * ThML is based upon Voyager XHTML and all Voyager elements are
214          * allowed. However not all elements make sense.
215          */
216         Tag[] tags = new Tag[] {
217                 // The following are defined in Voyager xhtml 4.0
218                 new ATag(), new AbbrTag(), new AliasTag("acronym", new AbbrTag()),
219                 new AnonymousTag("address"),
220                 new SkipTag("applet"),
221                 new SkipTag("area"),
222                 new BTag(), new SkipTag("base"),
223                 new SkipTag("basefont"),
224                 new IgnoreTag("bdo"),
225                 new BigTag(), new BlockquoteTag(), new IgnoreTag("body"),
226                 new BrTag(), new SkipTag("button"),
227                 new AnonymousTag("caption"),
228                 new CenterTag(), new AnonymousTag("cite"),
229                 new AnonymousTag("code"),
230                 new SkipTag("col"),
231                 new SkipTag("colgroup"),
232                 new AliasTag("dd", new LiTag()),
233                 new AnonymousTag("del"),
234                 new AnonymousTag("dfn"),
235                 new DivTag(), new AliasTag("dl", new UlTag()),
236                 new AliasTag("dt", new LiTag()),
237                 new AliasTag("em", new ITag()),
238                 new IgnoreTag("fieldset"),
239                 new FontTag(), new SkipTag("form"),
240                 new SkipTag("frame"),
241                 new SkipTag("frameset"),
242                 new AliasTag("h1", new HTag(1)),
243                 new AliasTag("h2", new HTag(2)),
244                 new AliasTag("h3", new HTag(3)),
245                 new AliasTag("h4", new HTag(4)),
246                 new AliasTag("h5", new HTag(5)),
247                 new AliasTag("h6", new HTag(6)),
248                 new SkipTag("head"),
249                 new HrTag(), new IgnoreTag("html"),
250                 new IgnoreTag("frameset"),
251                 new ITag(), new SkipTag("iframe"),
252                 new ImgTag(), new SkipTag("input"),
253                 new AnonymousTag("ins"),
254                 new AnonymousTag("kbd"),
255                 new AnonymousTag("label"),
256                 new AnonymousTag("legend"),
257                 new LiTag(), new SkipTag("link"),
258                 new SkipTag("map"),
259                 new SkipTag("meta"),
260                 new SkipTag("noscript"),
261                 new SkipTag("object"),
262                 new OlTag(), new SkipTag("optgroup"),
263                 new SkipTag("option"),
264                 new PTag(), new SkipTag("param"),
265                 new IgnoreTag("pre"),
266                 new QTag(), new RootTag(), new STag(), new AnonymousTag("samp"),
267                 new SkipTag("script"),
268                 new SkipTag("select"),
269                 new SmallTag(), new IgnoreTag("span"),
270                 new AliasTag("strong", new BTag()),
271                 new SkipTag("style"),
272                 new SubTag(), new SupTag(), new SyncTag(), new TableTag(), new IgnoreTag("tbody"),
273                 new TdTag(), new IgnoreTag("tfoot"),
274                 new SkipTag("textarea"),
275                 new SkipTag("title"),
276                 new IgnoreTag("thead"),
277                 new ThTag(), new TrTag(), new TtTag(), new UTag(), new UlTag(), new AnonymousTag("var"),
278 
279                 // ThML adds the following to Voyager
280                 // Note: hymn.mod is not here nor are additional head&DC
281                 // elements
282                 new AnonymousTag("added"),
283                 new AnonymousTag("attr"),
284                 new AnonymousTag("argument"),
285                 new CitationTag(), new AnonymousTag("date"),
286                 new AnonymousTag("deleted"),
287                 new AnonymousTag("def"),
288                 new AliasTag("div1", new DivTag(1)),
289                 new AliasTag("div2", new DivTag(2)),
290                 new AliasTag("div3", new DivTag(3)),
291                 new AliasTag("div4", new DivTag(4)),
292                 new AliasTag("div5", new DivTag(5)),
293                 new AliasTag("div6", new DivTag(6)),
294                 new ForeignTag(), new AnonymousTag("index"),
295                 new AnonymousTag("insertIndex"),
296                 new AnonymousTag("glossary"),
297                 new NoteTag(), new NameTag(), new PbTag(), new AnonymousTag("scripCom"),
298                 new AnonymousTag("scripContext"),
299                 new ScripRefTag(), new ScriptureTag(), new TermTag(), new AnonymousTag("unclear"),
300                 new VerseTag(),
301         };
302         for (int i = 0; i < tags.length; i++) {
303             Tag t = tags[i];
304             String tagName = t.getTagName();
305             TAG_MAP.put(tagName, t);
306         }
307     }
308 
309 }
310