Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
SentenceUtil |
|
| 3.6666666666666665;3.667 |
1 | /** | |
2 | * Distribution License: | |
3 | * JSword is free software; you can redistribute it and/or modify it under | |
4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
5 | * as published by the Free Software Foundation. This program is distributed | |
6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
8 | * See the GNU Lesser General Public License for more details. | |
9 | * | |
10 | * The License is available on the internet at: | |
11 | * http://www.gnu.org/copyleft/lgpl.html | |
12 | * or by writing to: | |
13 | * Free Software Foundation, Inc. | |
14 | * 59 Temple Place - Suite 330 | |
15 | * Boston, MA 02111-1307, USA | |
16 | * | |
17 | * © CrossWire Bible Society, 2005 - 2016 | |
18 | * | |
19 | */ | |
20 | package org.crosswire.jsword.book; | |
21 | ||
22 | import java.util.ArrayList; | |
23 | import java.util.List; | |
24 | import java.util.Locale; | |
25 | ||
26 | import org.crosswire.common.util.StringUtil; | |
27 | ||
28 | /** | |
29 | * The SentenceUtil class provide utility functions for the various Books. | |
30 | * | |
31 | * It is not designed to be used outside of the book package, so using it | |
32 | * outside of these bounds is at your own risk. | |
33 | * | |
34 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
35 | * @author Joe Walker | |
36 | */ | |
37 | public final class SentenceUtil { | |
38 | /** | |
39 | * Ensure we can not be instantiated | |
40 | */ | |
41 | 0 | private SentenceUtil() { |
42 | 0 | } |
43 | ||
44 | /** | |
45 | * Take a string and tokenize it using " " and "--" as delimiters into an | |
46 | * Array of Strings. There is a question mark over what to do with initial | |
47 | * spaces. This algorithm discards them, I'm not sure if this is the right | |
48 | * thing to do. | |
49 | * | |
50 | * @param sentence | |
51 | * The string to parse. | |
52 | * @return The string array | |
53 | */ | |
54 | public static String[] tokenize(String sentence) { | |
55 | 0 | List<String> tokens = new ArrayList<String>(); |
56 | ||
57 | 0 | int pos = 0; |
58 | String temp; | |
59 | 0 | boolean alive = true; |
60 | ||
61 | 0 | while (alive) { |
62 | // Find the next space and double dash | |
63 | 0 | int nextSpace = sentence.indexOf(' ', pos); |
64 | 0 | int nextDDash = sentence.indexOf("--", pos); |
65 | ||
66 | // If there is a space just after the ddash then ignore the ddash | |
67 | 0 | if (nextSpace == nextDDash + 2) { |
68 | 0 | nextDDash = -1; |
69 | } | |
70 | ||
71 | // If there is a ddash just after the space then ignore the space | |
72 | 0 | if (nextDDash == nextSpace + 1) { |
73 | 0 | nextSpace = -1; |
74 | } | |
75 | ||
76 | // if there are no more tokens then just add in what we've got. | |
77 | 0 | if (nextSpace == -1 && nextDDash == -1) { |
78 | 0 | temp = sentence.substring(pos); |
79 | 0 | alive = false; |
80 | 0 | } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) { |
81 | // Space is next if it is not -1 and it is less than ddash | |
82 | // The next separator is a space | |
83 | 0 | temp = sentence.substring(pos, nextSpace) + ' '; |
84 | 0 | pos = nextSpace + 1; |
85 | } else { | |
86 | // The next separator is a ddash | |
87 | 0 | temp = sentence.substring(pos, nextDDash) + "--"; |
88 | 0 | pos = nextDDash + 2; |
89 | } | |
90 | ||
91 | 0 | if (temp != null && !"".equals(temp.trim())) { |
92 | 0 | tokens.add(temp); |
93 | } | |
94 | 0 | } |
95 | ||
96 | // Create a String[] | |
97 | 0 | String[] retcode = new String[tokens.size()]; |
98 | 0 | int i = 0; |
99 | 0 | for (String token : tokens) { |
100 | 0 | retcode[i++] = token; |
101 | } | |
102 | ||
103 | 0 | return retcode; |
104 | } | |
105 | ||
106 | /** | |
107 | * From a sentence get a list of words (in original order) without any | |
108 | * punctuation, and all in lower case. | |
109 | * | |
110 | * @param words | |
111 | * Words with punctuation | |
112 | * @return Words without punctuation | |
113 | */ | |
114 | public static String[] stripPunctuation(String... words) { | |
115 | 0 | String[] retcode = new String[words.length]; |
116 | ||
117 | // Remove the punctuation from the ends of the words. | |
118 | 0 | for (int i = 0; i < words.length; i++) { |
119 | 0 | retcode[i] = stripPunctuationWord(words[i]); |
120 | } | |
121 | ||
122 | 0 | return retcode; |
123 | } | |
124 | ||
125 | /** | |
126 | * From a sentence get a list of words (in original order) without any | |
127 | * punctuation, and all in lower case. | |
128 | * | |
129 | * @param words | |
130 | * Words with punctuation | |
131 | * @return Punctuation without words | |
132 | */ | |
133 | public static String[] stripWords(String... words) { | |
134 | 0 | if (words.length == 0) { |
135 | 0 | return new String[0]; |
136 | } | |
137 | ||
138 | 0 | String[] retcode = new String[words.length + 1]; |
139 | ||
140 | // The first bit of punctuation is what comes in front of the first word | |
141 | 0 | int first = firstLetter(words[0]); |
142 | 0 | if (first == 0) { |
143 | 0 | retcode[0] = ""; |
144 | } else { | |
145 | 0 | retcode[0] = words[0].substring(0, first); |
146 | } | |
147 | ||
148 | // The rest of the words | |
149 | 0 | for (int i = 1; i < words.length; i++) { |
150 | 0 | retcode[i] = stripWords(words[i - 1], words[i]); |
151 | } | |
152 | ||
153 | // The last bit of punctuation is what comes at the end of the last word | |
154 | 0 | int last = lastLetter(words[words.length - 1]); |
155 | 0 | if (last == words[words.length - 1].length()) { |
156 | 0 | retcode[words.length] = ""; |
157 | } else { | |
158 | 0 | retcode[words.length] = words[words.length - 1].substring(last + 1); |
159 | } | |
160 | ||
161 | 0 | return retcode; |
162 | } | |
163 | ||
164 | /** | |
165 | * Remove the punctuation from the ends of the word. The special case is | |
166 | * that if the first word ends "--" and the last word has no punctuation at | |
167 | * the beginning, then the answer is "--" and not "-- ". We miss out the | |
168 | * space because "--" is a special separator. | |
169 | * | |
170 | * @param first | |
171 | * The word to grab the punctuation from the end of | |
172 | * @param last | |
173 | * The word to grab the punctuation from the start of | |
174 | * @return The end of the first, a space, and the end of the first | |
175 | */ | |
176 | public static String stripWords(String first, String last) { | |
177 | 0 | String init1 = first.substring(lastLetter(first) + 1); |
178 | 0 | String init2 = last.substring(0, firstLetter(last)); |
179 | ||
180 | 0 | return init1 + init2; |
181 | } | |
182 | ||
183 | /** | |
184 | * From a sentence get a list of words (in original order) without any | |
185 | * punctuation, and all in lower case. | |
186 | * | |
187 | * @param aSentence | |
188 | * The string to parse. | |
189 | * @return The words split up as an array | |
190 | */ | |
191 | public static String[] getWords(String aSentence) { | |
192 | 0 | String sentence = aSentence; |
193 | // First there are some things we regard as word delimiters even if | |
194 | // they are not near space. Note that "-" should not be in this list | |
195 | // because words like abel-beth-maiacha contain them. | |
196 | 0 | sentence = sentence.replaceAll("--", " "); |
197 | 0 | sentence = sentence.replace('.', ' '); |
198 | 0 | sentence = sentence.replace('!', ' '); |
199 | 0 | sentence = sentence.replace('?', ' '); |
200 | 0 | sentence = sentence.replace(':', ' '); |
201 | 0 | sentence = sentence.replace(';', ' '); |
202 | 0 | sentence = sentence.replace('"', ' '); |
203 | 0 | sentence = sentence.replace('\'', ' '); |
204 | 0 | sentence = sentence.replace('(', ' '); |
205 | 0 | sentence = sentence.replace(')', ' '); |
206 | ||
207 | 0 | String[] words = StringUtil.split(sentence, " "); |
208 | 0 | String[] retcode = new String[words.length]; |
209 | ||
210 | // Remove the punctuation from the ends of the words. | |
211 | 0 | for (int i = 0; i < words.length; i++) { |
212 | 0 | retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH); |
213 | } | |
214 | ||
215 | 0 | return retcode; |
216 | } | |
217 | ||
218 | /** | |
219 | * Remove the punctuation from the ends of the word | |
220 | * | |
221 | * @param word | |
222 | * Word with punctuation | |
223 | * @return Word without punctuation | |
224 | */ | |
225 | public static String stripPunctuationWord(String word) { | |
226 | 0 | int first = firstLetter(word); |
227 | 0 | int last = lastLetter(word) + 1; |
228 | ||
229 | 0 | if (first > last) { |
230 | 0 | return word; |
231 | } | |
232 | ||
233 | 0 | return word.substring(first, last); |
234 | } | |
235 | ||
236 | /** | |
237 | * Where is the first letter in this word | |
238 | * | |
239 | * @param word | |
240 | * The word to search for letters | |
241 | * @return The offset of the first letter | |
242 | */ | |
243 | public static int firstLetter(String word) { | |
244 | int first; | |
245 | ||
246 | 0 | for (first = 0; first < word.length(); first++) { |
247 | 0 | char c = word.charAt(first); |
248 | 0 | if (Character.isLetterOrDigit(c)) { |
249 | 0 | break; |
250 | } | |
251 | } | |
252 | ||
253 | 0 | return first; |
254 | } | |
255 | ||
256 | /** | |
257 | * Where is the last letter in this word | |
258 | * | |
259 | * @param word | |
260 | * The word to search for letters | |
261 | * @return The offset of the last letter | |
262 | */ | |
263 | public static int lastLetter(String word) { | |
264 | int last; | |
265 | ||
266 | 0 | for (last = word.length() - 1; last >= 0; last--) { |
267 | 0 | char c = word.charAt(last); |
268 | 0 | if (Character.isLetterOrDigit(c)) { |
269 | 0 | break; |
270 | } | |
271 | } | |
272 | ||
273 | 0 | return last; |
274 | } | |
275 | } |