1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
 *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id: SentenceUtil.java 2223 2012-01-26 21:28:02Z dmsmith $
21   */
22  package org.crosswire.jsword.book;
23  
24  import java.util.ArrayList;
25  import java.util.List;
26  import java.util.Locale;
27  
28  import org.crosswire.common.util.StringUtil;
29  
30  /**
31   * The SentenceUtil class provide utility functions for the various Books.
32   * 
33   * It is not designed to be used outside of the book package, so using it
34   * outside of these bounds is at your own risk.
35   * 
36   * @see gnu.lgpl.License for license details.<br>
37   *      The copyright to this program is held by it's authors.
38   * @author Joe Walker [joe at eireneh dot com]
39   */
40  public final class SentenceUtil {
41      /**
42       * Ensure we can not be instantiated
43       */
44      private SentenceUtil() {
45      }
46  
47      /**
48       * Take a string and tokenize it using " " and "--" as delimiters into an
49       * Array of Strings. There is a question mark over what to do with initial
50       * spaces. This algorithm discards them, I'm not sure if this is the right
51       * thing to do.
52       * 
53       * @param sentence
54       *            The string to parse.
55       * @return The string array
56       */
57      public static String[] tokenize(String sentence) {
58          List<String> tokens = new ArrayList<String>();
59  
60          int pos = 0;
61          String temp;
62          boolean alive = true;
63  
64          while (alive) {
65              // Find the next space and double dash
66              int nextSpace = sentence.indexOf(' ', pos);
67              int nextDDash = sentence.indexOf("--", pos);
68  
69              // If there is a space just after the ddash then ignore the ddash
70              if (nextSpace == nextDDash + 2) {
71                  nextDDash = -1;
72              }
73  
74              // If there is a ddash just after the space then ignore the space
75              if (nextDDash == nextSpace + 1) {
76                  nextSpace = -1;
77              }
78  
79              // if there are no more tokens then just add in what we've got.
80              if (nextSpace == -1 && nextDDash == -1) {
81                  temp = sentence.substring(pos);
82                  alive = false;
83              } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) {
84                  // Space is next if it is not -1 and it is less than ddash
85                  // The next separator is a space
86                  temp = sentence.substring(pos, nextSpace) + ' ';
87                  pos = nextSpace + 1;
88              } else {
89                  // The next separator is a ddash
90                  temp = sentence.substring(pos, nextDDash) + "--";
91                  pos = nextDDash + 2;
92              }
93  
94              if (temp != null && !"".equals(temp.trim())) {
95                  tokens.add(temp);
96              }
97          }
98  
99          // Create a String[]
100         String[] retcode = new String[tokens.size()];
101         int i = 0;
102         for (String token : tokens) {
103             retcode[i++] = token;
104         }
105 
106         return retcode;
107     }
108 
109     /**
110      * From a sentence get a list of words (in original order) without any
111      * punctuation, and all in lower case.
112      * 
113      * @param words
114      *            Words with punctuation
115      * @return Words without punctuation
116      */
117     public static String[] stripPunctuation(String... words) {
118         String[] retcode = new String[words.length];
119 
120         // Remove the punctuation from the ends of the words.
121         for (int i = 0; i < words.length; i++) {
122             retcode[i] = stripPunctuationWord(words[i]);
123         }
124 
125         return retcode;
126     }
127 
128     /**
129      * From a sentence get a list of words (in original order) without any
130      * punctuation, and all in lower case.
131      * 
132      * @param words
133      *            Words with punctuation
134      * @return Punctuation without words
135      */
136     public static String[] stripWords(String... words) {
137         if (words.length == 0) {
138             return new String[0];
139         }
140 
141         String[] retcode = new String[words.length + 1];
142 
143         // The first bit of punctuation is what comes in front of the first word
144         int first = firstLetter(words[0]);
145         if (first == 0) {
146             retcode[0] = "";
147         } else {
148             retcode[0] = words[0].substring(0, first);
149         }
150 
151         // The rest of the words
152         for (int i = 1; i < words.length; i++) {
153             retcode[i] = stripWords(words[i - 1], words[i]);
154         }
155 
156         // The last bit of punctuation is what comes at the end of the last word
157         int last = lastLetter(words[words.length - 1]);
158         if (last == words[words.length - 1].length()) {
159             retcode[words.length] = "";
160         } else {
161             retcode[words.length] = words[words.length - 1].substring(last + 1);
162         }
163 
164         return retcode;
165     }
166 
167     /**
168      * From a sentence get a list of words (in original order) without any
169      * punctuation, and all in lower case.
170      * 
171      * @param aSentence
172      *            The string to parse.
173      * @return The words split up as an array
174      */
175     public static String[] getWords(String aSentence) {
176         String sentence = aSentence;
177         // First there are some things we regard as word delimiters even if
178         // they are not near space. Note that "-" should not be in this list
179         // because words like abel-beth-maiacha contain them.
180         sentence = sentence.replaceAll("--", " ");
181         sentence = sentence.replace('.', ' ');
182         sentence = sentence.replace('!', ' ');
183         sentence = sentence.replace('?', ' ');
184         sentence = sentence.replace(':', ' ');
185         sentence = sentence.replace(';', ' ');
186         sentence = sentence.replace('"', ' ');
187         sentence = sentence.replace('\'', ' ');
188         sentence = sentence.replace('(', ' ');
189         sentence = sentence.replace(')', ' ');
190 
191         String[] words = StringUtil.split(sentence, " ");
192         String[] retcode = new String[words.length];
193 
194         // Remove the punctuation from the ends of the words.
195         for (int i = 0; i < words.length; i++) {
196             retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
197         }
198 
199         return retcode;
200     }
201 
202     /**
203      * Remove the punctuation from the ends of the word
204      * 
205      * @param word
206      *            Word with punctuation
207      * @return Word without punctuation
208      */
209     public static String stripPunctuationWord(String word) {
210         int first = firstLetter(word);
211         int last = lastLetter(word) + 1;
212 
213         if (first > last) {
214             return word;
215         }
216 
217         return word.substring(first, last);
218     }
219 
220     /**
221      * Remove the punctuation from the ends of the word. The special case is
222      * that if the first word ends "--" and the last word has no punctuation at
223      * the beginning, then the answer is "--" and not "-- ". We miss out the
224      * space because "--" is a special separator.
225      * 
226      * @param first
227      *            The word to grab the punctuation from the end of
228      * @param last
229      *            The word to grab the punctuation from the start of
230      * @return The end of the first, a space, and the end of the first
231      */
232     public static String stripWords(String first, String last) {
233         String init1 = first.substring(lastLetter(first) + 1);
234         String init2 = last.substring(0, firstLetter(last));
235 
236         return init1 + init2;
237     }
238 
239     /**
240      * Where is the first letter in this word
241      * 
242      * @param word
243      *            The word to search for letters
244      * @return The offset of the first letter
245      */
246     public static int firstLetter(String word) {
247         int first;
248 
249         for (first = 0; first < word.length(); first++) {
250             char c = word.charAt(first);
251             if (Character.isLetterOrDigit(c)) {
252                 break;
253             }
254         }
255 
256         return first;
257     }
258 
259     /**
260      * Where is the last letter in this word
261      * 
262      * @param word
263      *            The word to search for letters
264      * @return The offset of the last letter
265      */
266     public static int lastLetter(String word) {
267         int last;
268 
269         for (last = word.length() - 1; last >= 0; last--) {
270             char c = word.charAt(last);
271             if (Character.isLetterOrDigit(c)) {
272                 break;
273             }
274         }
275 
276         return last;
277     }
278 }
279