org.crosswire.jsword.book.SentenceUtil (Java2HTML)

1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2005 - 2016
18   *
19   */
20  package org.crosswire.jsword.book;
21  
22  import java.util.ArrayList;
23  import java.util.List;
24  import java.util.Locale;
25  
26  import org.crosswire.common.util.StringUtil;
27  
28  /**
29   * The SentenceUtil class provide utility functions for the various Books.
30   * 
31   * It is not designed to be used outside of the book package, so using it
32   * outside of these bounds is at your own risk.
33   * 
34   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
35   * @author Joe Walker
36   */
37  public final class SentenceUtil {
38      /**
39       * Ensure we can not be instantiated
40       */
41      private SentenceUtil() {
42      }
43  
44      /**
45       * Take a string and tokenize it using " " and "--" as delimiters into an
46       * Array of Strings. There is a question mark over what to do with initial
47       * spaces. This algorithm discards them, I'm not sure if this is the right
48       * thing to do.
49       * 
50       * @param sentence
51       *            The string to parse.
52       * @return The string array
53       */
54      public static String[] tokenize(String sentence) {
55          List<String> tokens = new ArrayList<String>();
56  
57          int pos = 0;
58          String temp;
59          boolean alive = true;
60  
61          while (alive) {
62              // Find the next space and double dash
63              int nextSpace = sentence.indexOf(' ', pos);
64              int nextDDash = sentence.indexOf("--", pos);
65  
66              // If there is a space just after the ddash then ignore the ddash
67              if (nextSpace == nextDDash + 2) {
68                  nextDDash = -1;
69              }
70  
71              // If there is a ddash just after the space then ignore the space
72              if (nextDDash == nextSpace + 1) {
73                  nextSpace = -1;
74              }
75  
76              // if there are no more tokens then just add in what we've got.
77              if (nextSpace == -1 && nextDDash == -1) {
78                  temp = sentence.substring(pos);
79                  alive = false;
80              } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) {
81                  // Space is next if it is not -1 and it is less than ddash
82                  // The next separator is a space
83                  temp = sentence.substring(pos, nextSpace) + ' ';
84                  pos = nextSpace + 1;
85              } else {
86                  // The next separator is a ddash
87                  temp = sentence.substring(pos, nextDDash) + "--";
88                  pos = nextDDash + 2;
89              }
90  
91              if (temp != null && !"".equals(temp.trim())) {
92                  tokens.add(temp);
93              }
94          }
95  
96          // Create a String[]
97          String[] retcode = new String[tokens.size()];
98          int i = 0;
99          for (String token : tokens) {
100             retcode[i++] = token;
101         }
102 
103         return retcode;
104     }
105 
106     /**
107      * From a sentence get a list of words (in original order) without any
108      * punctuation, and all in lower case.
109      * 
110      * @param words
111      *            Words with punctuation
112      * @return Words without punctuation
113      */
114     public static String[] stripPunctuation(String... words) {
115         String[] retcode = new String[words.length];
116 
117         // Remove the punctuation from the ends of the words.
118         for (int i = 0; i < words.length; i++) {
119             retcode[i] = stripPunctuationWord(words[i]);
120         }
121 
122         return retcode;
123     }
124 
125     /**
126      * From a sentence get a list of words (in original order) without any
127      * punctuation, and all in lower case.
128      * 
129      * @param words
130      *            Words with punctuation
131      * @return Punctuation without words
132      */
133     public static String[] stripWords(String... words) {
134         if (words.length == 0) {
135             return new String[0];
136         }
137 
138         String[] retcode = new String[words.length + 1];
139 
140         // The first bit of punctuation is what comes in front of the first word
141         int first = firstLetter(words[0]);
142         if (first == 0) {
143             retcode[0] = "";
144         } else {
145             retcode[0] = words[0].substring(0, first);
146         }
147 
148         // The rest of the words
149         for (int i = 1; i < words.length; i++) {
150             retcode[i] = stripWords(words[i - 1], words[i]);
151         }
152 
153         // The last bit of punctuation is what comes at the end of the last word
154         int last = lastLetter(words[words.length - 1]);
155         if (last == words[words.length - 1].length()) {
156             retcode[words.length] = "";
157         } else {
158             retcode[words.length] = words[words.length - 1].substring(last + 1);
159         }
160 
161         return retcode;
162     }
163 
164     /**
165      * Remove the punctuation from the ends of the word. The special case is
166      * that if the first word ends "--" and the last word has no punctuation at
167      * the beginning, then the answer is "--" and not "-- ". We miss out the
168      * space because "--" is a special separator.
169      * 
170      * @param first
171      *            The word to grab the punctuation from the end of
172      * @param last
173      *            The word to grab the punctuation from the start of
174      * @return The end of the first, a space, and the end of the first
175      */
176     public static String stripWords(String first, String last) {
177         String init1 = first.substring(lastLetter(first) + 1);
178         String init2 = last.substring(0, firstLetter(last));
179 
180         return init1 + init2;
181     }
182 
183     /**
184      * From a sentence get a list of words (in original order) without any
185      * punctuation, and all in lower case.
186      * 
187      * @param aSentence
188      *            The string to parse.
189      * @return The words split up as an array
190      */
191     public static String[] getWords(String aSentence) {
192         String sentence = aSentence;
193         // First there are some things we regard as word delimiters even if
194         // they are not near space. Note that "-" should not be in this list
195         // because words like abel-beth-maiacha contain them.
196         sentence = sentence.replaceAll("--", " ");
197         sentence = sentence.replace('.', ' ');
198         sentence = sentence.replace('!', ' ');
199         sentence = sentence.replace('?', ' ');
200         sentence = sentence.replace(':', ' ');
201         sentence = sentence.replace(';', ' ');
202         sentence = sentence.replace('"', ' ');
203         sentence = sentence.replace('\'', ' ');
204         sentence = sentence.replace('(', ' ');
205         sentence = sentence.replace(')', ' ');
206 
207         String[] words = StringUtil.split(sentence, " ");
208         String[] retcode = new String[words.length];
209 
210         // Remove the punctuation from the ends of the words.
211         for (int i = 0; i < words.length; i++) {
212             retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
213         }
214 
215         return retcode;
216     }
217 
218     /**
219      * Remove the punctuation from the ends of the word
220      * 
221      * @param word
222      *            Word with punctuation
223      * @return Word without punctuation
224      */
225     public static String stripPunctuationWord(String word) {
226         int first = firstLetter(word);
227         int last = lastLetter(word) + 1;
228 
229         if (first > last) {
230             return word;
231         }
232 
233         return word.substring(first, last);
234     }
235 
236     /**
237      * Where is the first letter in this word
238      * 
239      * @param word
240      *            The word to search for letters
241      * @return The offset of the first letter
242      */
243     public static int firstLetter(String word) {
244         int first;
245 
246         for (first = 0; first < word.length(); first++) {
247             char c = word.charAt(first);
248             if (Character.isLetterOrDigit(c)) {
249                 break;
250             }
251         }
252 
253         return first;
254     }
255 
256     /**
257      * Where is the last letter in this word
258      * 
259      * @param word
260      *            The word to search for letters
261      * @return The offset of the last letter
262      */
263     public static int lastLetter(String word) {
264         int last;
265 
266         for (last = word.length() - 1; last >= 0; last--) {
267             char c = word.charAt(last);
268             if (Character.isLetterOrDigit(c)) {
269                 break;
270             }
271         }
272 
273         return last;
274     }
275 }
276