SentenceUtil.java |
/**
 * Distribution License:
 * JSword is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License, version 2.1 or later
 * as published by the Free Software Foundation. This program is distributed
 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * The License is available on the internet at:
 *      http://www.gnu.org/copyleft/lgpl.html
 * or by writing to:
 *      Free Software Foundation, Inc.
 *      59 Temple Place - Suite 330
 *      Boston, MA 02111-1307, USA
 *
 * © CrossWire Bible Society, 2005 - 2016
 *
 */
package org.crosswire.jsword.book;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.crosswire.common.util.StringUtil;

/**
 * The SentenceUtil class provides utility functions for the various Books:
 * splitting a sentence into tokens, and separating the words of a sentence
 * from the punctuation that surrounds them.
 *
 * It is not designed to be used outside of the book package, so using it
 * outside of these bounds is at your own risk.
 *
 * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 * @author Joe Walker
 */
public final class SentenceUtil {
    /**
     * Single characters that {@link #getWords(String)} treats as word
     * delimiters even when they are not next to a space. Note that "-" is
     * deliberately absent because words like abel-beth-maiacha contain it.
     */
    private static final String WORD_DELIMITERS = ".!?:;\"'()";

    /**
     * Ensure we can not be instantiated
     */
    private SentenceUtil() {
    }

    /**
     * Take a string and tokenize it using " " and "--" as delimiters into an
     * Array of Strings. Each token keeps the delimiter that terminated it
     * (a trailing " " or "--"); the final token has no delimiter appended.
     * Tokens that consist only of whitespace are discarded, which means that
     * leading spaces and runs of spaces are dropped.
     *
     * @param sentence
     *            The string to parse. Must not be null.
     * @return The tokens in their original order; empty if the sentence
     *         contains nothing but whitespace
     */
    public static String[] tokenize(String sentence) {
        List<String> tokens = new ArrayList<String>();

        int pos = 0;
        boolean alive = true;

        while (alive) {
            // Find the next space and double dash
            int nextSpace = sentence.indexOf(' ', pos);
            int nextDDash = sentence.indexOf("--", pos);

            // If there is a space just after the ddash then ignore the ddash,
            // so that "word-- " becomes a single space-terminated token.
            // (When no ddash was found and the space is at index 1 this also
            // matches, but it only re-assigns -1 and so is harmless.)
            if (nextSpace == nextDDash + 2) {
                nextDDash = -1;
            }

            // If there is a ddash just after the space then ignore the space,
            // so that "word --" becomes a single ddash-terminated token.
            // (When no space was found and the ddash is at index 0 this also
            // matches, but it only re-assigns -1 and so is harmless.)
            if (nextDDash == nextSpace + 1) {
                nextSpace = -1;
            }

            String token;
            if (nextSpace == -1 && nextDDash == -1) {
                // No more delimiters: the remainder is the last token.
                token = sentence.substring(pos);
                alive = false;
            } else if ((nextSpace != -1 && nextSpace < nextDDash) || nextDDash == -1) {
                // The next separator is a space
                token = sentence.substring(pos, nextSpace) + ' ';
                pos = nextSpace + 1;
            } else {
                // The next separator is a ddash
                token = sentence.substring(pos, nextDDash) + "--";
                pos = nextDDash + 2;
            }

            // Skip tokens that are nothing but whitespace
            if (token.trim().length() > 0) {
                tokens.add(token);
            }
        }

        return tokens.toArray(new String[tokens.size()]);
    }

    /**
     * From a list of words get a list of the same words (in original order)
     * with the punctuation removed from both ends of each word. The case of
     * the words is not changed.
     *
     * @param words
     *            Words with punctuation
     * @return Words without leading or trailing punctuation, one entry per
     *         input word
     * @see #stripPunctuationWord(String)
     */
    public static String[] stripPunctuation(String... words) {
        String[] retcode = new String[words.length];

        // Remove the punctuation from the ends of the words.
        for (int i = 0; i < words.length; i++) {
            retcode[i] = stripPunctuationWord(words[i]);
        }

        return retcode;
    }

    /**
     * From a list of words get the punctuation that surrounds them. This is
     * the complement of {@link #stripPunctuation(String...)}: for n words the
     * result has n + 1 entries, where entry 0 is whatever precedes the first
     * letter or digit of the first word, entry i (0 &lt; i &lt; n) is the
     * punctuation between word i - 1 and word i, and entry n is whatever
     * follows the last letter or digit of the last word.
     *
     * @param words
     *            Words with punctuation
     * @return Punctuation without words; an empty array if no words are given
     */
    public static String[] stripWords(String... words) {
        if (words.length == 0) {
            return new String[0];
        }

        String[] retcode = new String[words.length + 1];

        // The first bit of punctuation is what comes in front of the first
        // word. This is "" when the word starts with a letter or digit.
        String firstWord = words[0];
        retcode[0] = firstWord.substring(0, firstLetter(firstWord));

        // The punctuation between each pair of neighbouring words
        for (int i = 1; i < words.length; i++) {
            retcode[i] = stripWords(words[i - 1], words[i]);
        }

        // The last bit of punctuation is what comes at the end of the last
        // word. lastLetter() is at most length() - 1, so this is "" when the
        // word ends with a letter or digit.
        String lastWord = words[words.length - 1];
        retcode[words.length] = lastWord.substring(lastLetter(lastWord) + 1);

        return retcode;
    }

    /**
     * Get the punctuation that lies between two neighbouring words: whatever
     * follows the last letter or digit of the first word, immediately followed
     * by whatever precedes the first letter or digit of the second word.
     * Nothing is inserted between the two parts, so if the first word ends
     * "--" and the second word has no punctuation at the beginning, then the
     * answer is "--" and not "-- ".
     *
     * @param first
     *            The word to grab the punctuation from the end of
     * @param last
     *            The word to grab the punctuation from the start of
     * @return The end of the first followed by the start of the last
     */
    public static String stripWords(String first, String last) {
        String trailing = first.substring(lastLetter(first) + 1);
        String leading = last.substring(0, firstLetter(last));

        return trailing + leading;
    }

    /**
     * From a sentence get a list of words (in original order) without any
     * punctuation at their ends, and all in lower case (using English casing
     * rules).
     *
     * @param aSentence
     *            The string to parse.
     * @return The words split up as an array
     */
    public static String[] getWords(String aSentence) {
        // First there are some things we regard as word delimiters even if
        // they are not near space: turn each of them into a space.
        String sentence = aSentence.replace("--", " ");
        for (int i = 0; i < WORD_DELIMITERS.length(); i++) {
            sentence = sentence.replace(WORD_DELIMITERS.charAt(i), ' ');
        }

        String[] words = StringUtil.split(sentence, " ");
        String[] retcode = new String[words.length];

        // Remove the punctuation from the ends of the words.
        for (int i = 0; i < words.length; i++) {
            retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
        }

        return retcode;
    }

    /**
     * Remove the punctuation from the ends of the word. Anything that is not
     * a letter or digit counts as punctuation. Punctuation inside the word is
     * kept.
     *
     * @param word
     *            Word with punctuation
     * @return Word without leading or trailing punctuation. A non-empty word
     *         that contains no letter or digit at all is returned unchanged.
     */
    public static String stripPunctuationWord(String word) {
        int first = firstLetter(word);
        int last = lastLetter(word) + 1;

        // With no letter or digit present, first is word.length() and last is
        // 0, so there is no valid range to cut out: hand the word back as is.
        if (first > last) {
            return word;
        }

        return word.substring(first, last);
    }

    /**
     * Where is the first letter (or digit) in this word
     *
     * @param word
     *            The word to search for letters
     * @return The offset of the first letter or digit, or the length of the
     *         word if it contains none
     */
    public static int firstLetter(String word) {
        int first;

        for (first = 0; first < word.length(); first++) {
            if (Character.isLetterOrDigit(word.charAt(first))) {
                break;
            }
        }

        return first;
    }

    /**
     * Where is the last letter (or digit) in this word
     *
     * @param word
     *            The word to search for letters
     * @return The offset of the last letter or digit, or -1 if the word
     *         contains none
     */
    public static int lastLetter(String word) {
        int last;

        for (last = word.length() - 1; last >= 0; last--) {
            if (Character.isLetterOrDigit(word.charAt(last))) {
                break;
            }
        }

        return last;
    }
}