1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
18       *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id: GBFFilter.java 2221 2012-01-25 21:32:57Z dmsmith $
21   */
22  package org.crosswire.jsword.book.filter.gbf;
23  
24  import java.util.ArrayList;
25  import java.util.LinkedList;
26  import java.util.List;
27  
28  import org.crosswire.jsword.book.Book;
29  import org.crosswire.jsword.book.DataPolice;
30  import org.crosswire.jsword.book.OSISUtil;
31  import org.crosswire.jsword.book.filter.Filter;
32  import org.crosswire.jsword.passage.Key;
33  import org.jdom.Content;
34  import org.jdom.Element;
35  
36  /**
37   * Filter to convert GBF data to OSIS format.
38   * 
39   * The best place to go for more information about the GBF spec is:
40   * <a href="http://ebible.org/bible/gbf.htm">http://ebible.org/bible/gbf.htm</a>
41   * 
42   * @see gnu.lgpl.License for license details.<br>
43   *      The copyright to this program is held by it's authors.
44   * @author Joe Walker [joe at eireneh dot com]
45   */
46  public class GBFFilter implements Filter {
47      /* (non-Javadoc)
48       * @see org.crosswire.jsword.book.filter.Filter#toOSIS(org.crosswire.jsword.book.Book, org.crosswire.jsword.passage.Key, java.lang.String)
49       */
50      public List<Content> toOSIS(Book book, Key key, String plain) {
51          Element ele = OSISUtil.factory().createDiv();
52          LinkedList<Content> stack = new LinkedList<Content>();
53          stack.addFirst(ele);
54  
55          List<Tag> taglist = parseTags(book, key, plain.trim());
56          while (true) {
57              if (taglist.isEmpty()) {
58                  break;
59              }
60  
61              Tag tag = taglist.remove(0);
62              tag.updateOsisStack(book, key, stack);
63          }
64  
65          stack.removeFirst();
66          return ele.removeContent();
67      }
68  
69      @Override
70      public GBFFilter clone() {
71          GBFFilter clone = null;
72          try {
73              clone = (GBFFilter) super.clone();
74          } catch (CloneNotSupportedException e) {
75              assert false : e;
76          }
77          return clone;
78      }
79  
80      /**
81       * Turn the string into a list of tags in the order that they appear in the
82       * original string.
83       */
84      private List<Tag> parseTags(Book book, Key key, String aRemains) {
85          String remains = aRemains;
86          List<Tag> taglist = new ArrayList<Tag>();
87  
88          // A GBF code is of the form <XY...> or <Xy...>
89          // where the first letter is always capitalized and
90          // the second letter indicates an open or close tag.
91          // Upper letters are open, lower are close.
92          // The ... is optional and represents an argument.
93          // Sometimes the argument is preceded by a space.
94          // In GBF it is legal to have < and > otherwise.
95          // In at least one module, GerLut1545, << ... >> is used for quotes.
96          while (true) {
97              int ltpos = remains.indexOf('<');
98              int gtpos = remains.indexOf('>', ltpos + 1);
99  
100             // check whether we have unmatched < and >, or no tags at all
101             // If so then we don't have a tag in the remaining.
102             if (ltpos == -1 || gtpos == -1) {
103                 // If the first letter after < is an upper case letter
104                 // then report it as a potential problem
105                 if (ltpos >= 0
106                         && ltpos < remains.length() + 1
107                         && Character.isUpperCase(remains.charAt(ltpos + 1)))
108                 {
109                     DataPolice.report(book, key, "Possible bad GBF tag" + remains);
110                 }
111                 if (gtpos != -1 && ltpos >= 0) {
112                     DataPolice.report(book, key, "Possible bad GBF tag" + remains);
113                 }
114                 int pos = Math.max(ltpos, gtpos) + 1;
115                 // If there were not any <, > or either ended the string
116                 // then we only have text.
117                 if (pos == 0 || pos == remains.length()) {
118                     taglist.add(GBFTagBuilders.getTextTag(remains));
119                     break;
120                 }
121                 taglist.add(GBFTagBuilders.getTextTag(remains.substring(0, pos)));
122                 remains = remains.substring(pos);
123                 continue;
124             }
125 
126             // If the character after the < is not an upper case letter
127             // then we don't have GBF.
128             // So, create a text tag that ends with the found >.
129             // Note that in JST, there are spurious html tags and
130             // this will treat them as valid GBF text.
131             char firstChar = remains.charAt(ltpos + 1);
132             if (!Character.isUpperCase(firstChar)) {
133                 taglist.add(GBFTagBuilders.getTextTag(remains.substring(0, gtpos + 1)));
134                 remains = remains.substring(gtpos + 1);
135                 continue;
136             }
137 
138             // generate tags
139             String start = remains.substring(0, ltpos);
140             int strLen = start.length();
141             if (strLen > 0) {
142                 int beginIndex = 0;
143                 boolean inSepStr = SEPARATORS.indexOf(start.charAt(0)) >= 0;
144                 // split words from separators...
145                 // e.g., "a b c? e g." -> "a b c", "? ", "e g."
146                 // "a b c<tag> e g." -> "a b c", tag, " ", "e g."
147                 for (int i = 1; inSepStr && i < strLen; i++) {
148                     char currentChar = start.charAt(i);
149                     if (!(SEPARATORS.indexOf(currentChar) >= 0)) {
150                         taglist.add(GBFTagBuilders.getTextTag(start.substring(beginIndex, i)));
151                         beginIndex = i;
152                         inSepStr = false;
153                     }
154                 }
155 
156                 if (beginIndex < strLen) {
157                     taglist.add(GBFTagBuilders.getTextTag(start.substring(beginIndex)));
158                 }
159             }
160 
161             String tag = remains.substring(ltpos + 1, gtpos);
162             int length = tag.length();
163             if (length > 0) {
164                 Tag reply = GBFTagBuilders.getTag(book, key, tag);
165                 if (reply != null) {
166                     taglist.add(reply);
167                 }
168             }
169 
170             remains = remains.substring(gtpos + 1);
171         }
172 
173         return taglist;
174     }
175 
176     private static final String SEPARATORS = " ,:;.?!";
177 
178 }
179