| SentenceUtil.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 as published by
5 * the Free Software Foundation. This program is distributed in the hope
6 * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * Copyright: 2005
18 * The copyright to this program is held by it's authors.
19 *
20 * ID: $Id: SentenceUtil.java 2223 2012-01-26 21:28:02Z dmsmith $
21 */
22 package org.crosswire.jsword.book;
23
24 import java.util.ArrayList;
25 import java.util.List;
26 import java.util.Locale;
27
28 import org.crosswire.common.util.StringUtil;
29
30 /**
31 * The SentenceUtil class provide utility functions for the various Books.
32 *
33 * It is not designed to be used outside of the book package, so using it
34 * outside of these bounds is at your own risk.
35 *
36 * @see gnu.lgpl.License for license details.<br>
37 * The copyright to this program is held by it's authors.
38 * @author Joe Walker [joe at eireneh dot com]
39 */
40 public final class SentenceUtil {
41 /**
42 * Ensure we can not be instantiated
43 */
44 private SentenceUtil() {
45 }
46
47 /**
48 * Take a string and tokenize it using " " and "--" as delimiters into an
49 * Array of Strings. There is a question mark over what to do with initial
50 * spaces. This algorithm discards them, I'm not sure if this is the right
51 * thing to do.
52 *
53 * @param sentence
54 * The string to parse.
55 * @return The string array
56 */
57 public static String[] tokenize(String sentence) {
58 List<String> tokens = new ArrayList<String>();
59
60 int pos = 0;
61 String temp;
62 boolean alive = true;
63
64 while (alive) {
65 // Find the next space and double dash
66 int nextSpace = sentence.indexOf(' ', pos);
67 int nextDDash = sentence.indexOf("--", pos);
68
69 // If there is a space just after the ddash then ignore the ddash
70 if (nextSpace == nextDDash + 2) {
71 nextDDash = -1;
72 }
73
74 // If there is a ddash just after the space then ignore the space
75 if (nextDDash == nextSpace + 1) {
76 nextSpace = -1;
77 }
78
79 // if there are no more tokens then just add in what we've got.
80 if (nextSpace == -1 && nextDDash == -1) {
81 temp = sentence.substring(pos);
82 alive = false;
83 } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) {
84 // Space is next if it is not -1 and it is less than ddash
85 // The next separator is a space
86 temp = sentence.substring(pos, nextSpace) + ' ';
87 pos = nextSpace + 1;
88 } else {
89 // The next separator is a ddash
90 temp = sentence.substring(pos, nextDDash) + "--";
91 pos = nextDDash + 2;
92 }
93
94 if (temp != null && !"".equals(temp.trim())) {
95 tokens.add(temp);
96 }
97 }
98
99 // Create a String[]
100 String[] retcode = new String[tokens.size()];
101 int i = 0;
102 for (String token : tokens) {
103 retcode[i++] = token;
104 }
105
106 return retcode;
107 }
108
109 /**
110 * From a sentence get a list of words (in original order) without any
111 * punctuation, and all in lower case.
112 *
113 * @param words
114 * Words with punctuation
115 * @return Words without punctuation
116 */
117 public static String[] stripPunctuation(String... words) {
118 String[] retcode = new String[words.length];
119
120 // Remove the punctuation from the ends of the words.
121 for (int i = 0; i < words.length; i++) {
122 retcode[i] = stripPunctuationWord(words[i]);
123 }
124
125 return retcode;
126 }
127
128 /**
129 * From a sentence get a list of words (in original order) without any
130 * punctuation, and all in lower case.
131 *
132 * @param words
133 * Words with punctuation
134 * @return Punctuation without words
135 */
136 public static String[] stripWords(String... words) {
137 if (words.length == 0) {
138 return new String[0];
139 }
140
141 String[] retcode = new String[words.length + 1];
142
143 // The first bit of punctuation is what comes in front of the first word
144 int first = firstLetter(words[0]);
145 if (first == 0) {
146 retcode[0] = "";
147 } else {
148 retcode[0] = words[0].substring(0, first);
149 }
150
151 // The rest of the words
152 for (int i = 1; i < words.length; i++) {
153 retcode[i] = stripWords(words[i - 1], words[i]);
154 }
155
156 // The last bit of punctuation is what comes at the end of the last word
157 int last = lastLetter(words[words.length - 1]);
158 if (last == words[words.length - 1].length()) {
159 retcode[words.length] = "";
160 } else {
161 retcode[words.length] = words[words.length - 1].substring(last + 1);
162 }
163
164 return retcode;
165 }
166
167 /**
168 * From a sentence get a list of words (in original order) without any
169 * punctuation, and all in lower case.
170 *
171 * @param aSentence
172 * The string to parse.
173 * @return The words split up as an array
174 */
175 public static String[] getWords(String aSentence) {
176 String sentence = aSentence;
177 // First there are some things we regard as word delimiters even if
178 // they are not near space. Note that "-" should not be in this list
179 // because words like abel-beth-maiacha contain them.
180 sentence = sentence.replaceAll("--", " ");
181 sentence = sentence.replace('.', ' ');
182 sentence = sentence.replace('!', ' ');
183 sentence = sentence.replace('?', ' ');
184 sentence = sentence.replace(':', ' ');
185 sentence = sentence.replace(';', ' ');
186 sentence = sentence.replace('"', ' ');
187 sentence = sentence.replace('\'', ' ');
188 sentence = sentence.replace('(', ' ');
189 sentence = sentence.replace(')', ' ');
190
191 String[] words = StringUtil.split(sentence, " ");
192 String[] retcode = new String[words.length];
193
194 // Remove the punctuation from the ends of the words.
195 for (int i = 0; i < words.length; i++) {
196 retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
197 }
198
199 return retcode;
200 }
201
202 /**
203 * Remove the punctuation from the ends of the word
204 *
205 * @param word
206 * Word with punctuation
207 * @return Word without punctuation
208 */
209 public static String stripPunctuationWord(String word) {
210 int first = firstLetter(word);
211 int last = lastLetter(word) + 1;
212
213 if (first > last) {
214 return word;
215 }
216
217 return word.substring(first, last);
218 }
219
220 /**
221 * Remove the punctuation from the ends of the word. The special case is
222 * that if the first word ends "--" and the last word has no punctuation at
223 * the beginning, then the answer is "--" and not "-- ". We miss out the
224 * space because "--" is a special separator.
225 *
226 * @param first
227 * The word to grab the punctuation from the end of
228 * @param last
229 * The word to grab the punctuation from the start of
230 * @return The end of the first, a space, and the end of the first
231 */
232 public static String stripWords(String first, String last) {
233 String init1 = first.substring(lastLetter(first) + 1);
234 String init2 = last.substring(0, firstLetter(last));
235
236 return init1 + init2;
237 }
238
239 /**
240 * Where is the first letter in this word
241 *
242 * @param word
243 * The word to search for letters
244 * @return The offset of the first letter
245 */
246 public static int firstLetter(String word) {
247 int first;
248
249 for (first = 0; first < word.length(); first++) {
250 char c = word.charAt(first);
251 if (Character.isLetterOrDigit(c)) {
252 break;
253 }
254 }
255
256 return first;
257 }
258
259 /**
260 * Where is the last letter in this word
261 *
262 * @param word
263 * The word to search for letters
264 * @return The offset of the last letter
265 */
266 public static int lastLetter(String word) {
267 int last;
268
269 for (last = word.length() - 1; last >= 0; last--) {
270 char c = word.charAt(last);
271 if (Character.isLetterOrDigit(c)) {
272 break;
273 }
274 }
275
276 return last;
277 }
278 }
279