Coverage Report

Coverage Report - org.crosswire.jsword.book.SentenceUtil

Classes in this File

Line Coverage

Branch Coverage

Complexity

SentenceUtil

0/80

0/44

3.667

 /**
  * Distribution License:
  * JSword is free software; you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License, version 2.1 or later
  * as published by the Free Software Foundation. This program is distributed
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * See the GNU Lesser General Public License for more details.
  *
  * The License is available on the internet at:
  *      http://www.gnu.org/copyleft/lgpl.html
  * or by writing to:
  *      Free Software Foundation, Inc.
  *      59 Temple Place - Suite 330
  *      Boston, MA 02111-1307, USA
  *
  * © CrossWire Bible Society, 2005 - 2016
  *
  */
 package org.crosswire.jsword.book;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
 import org.crosswire.common.util.StringUtil;
 
 /**
  * The SentenceUtil class provide utility functions for the various Books.
  * 
  * It is not designed to be used outside of the book package, so using it
  * outside of these bounds is at your own risk.
  * 
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
  * @author Joe Walker
  */
 public final class SentenceUtil {
     /**
      * Ensure we can not be instantiated
      */
     private SentenceUtil() {
     }
 
     /**
      * Take a string and tokenize it using " " and "--" as delimiters into an
      * Array of Strings. There is a question mark over what to do with initial
      * spaces. This algorithm discards them, I'm not sure if this is the right
      * thing to do.
      * 
      * @param sentence
      *            The string to parse.
      * @return The string array
      */
     public static String[] tokenize(String sentence) {
         List<String> tokens = new ArrayList<String>();
 
         int pos = 0;
         String temp;
         boolean alive = true;
 
         while (alive) {
             // Find the next space and double dash
             int nextSpace = sentence.indexOf(' ', pos);
             int nextDDash = sentence.indexOf("--", pos);
 
             // If there is a space just after the ddash then ignore the ddash
             if (nextSpace == nextDDash + 2) {
                 nextDDash = -1;
             }
 
             // If there is a ddash just after the space then ignore the space
             if (nextDDash == nextSpace + 1) {
                 nextSpace = -1;
             }
 
             // if there are no more tokens then just add in what we've got.
             if (nextSpace == -1 && nextDDash == -1) {
                 temp = sentence.substring(pos);
                 alive = false;
             } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) {
                 // Space is next if it is not -1 and it is less than ddash
                 // The next separator is a space
                 temp = sentence.substring(pos, nextSpace) + ' ';
                 pos = nextSpace + 1;
             } else {
                 // The next separator is a ddash
                 temp = sentence.substring(pos, nextDDash) + "--";
                 pos = nextDDash + 2;
             }
 
             if (temp != null && !"".equals(temp.trim())) {
                 tokens.add(temp);
             }
         }
 
         // Create a String[]
         String[] retcode = new String[tokens.size()];
         int i = 0;
         for (String token : tokens) {
             retcode[i++] = token;
         }
 
         return retcode;
     }
 
     /**
      * From a sentence get a list of words (in original order) without any
      * punctuation, and all in lower case.
      * 
      * @param words
      *            Words with punctuation
      * @return Words without punctuation
      */
     public static String[] stripPunctuation(String... words) {
         String[] retcode = new String[words.length];
 
         // Remove the punctuation from the ends of the words.
         for (int i = 0; i < words.length; i++) {
             retcode[i] = stripPunctuationWord(words[i]);
         }
 
         return retcode;
     }
 
     /**
      * From a sentence get a list of words (in original order) without any
      * punctuation, and all in lower case.
      * 
      * @param words
      *            Words with punctuation
      * @return Punctuation without words
      */
     public static String[] stripWords(String... words) {
         if (words.length == 0) {
             return new String[0];
         }
 
         String[] retcode = new String[words.length + 1];
 
         // The first bit of punctuation is what comes in front of the first word
         int first = firstLetter(words[0]);
         if (first == 0) {
             retcode[0] = "";
         } else {
             retcode[0] = words[0].substring(0, first);
         }
 
         // The rest of the words
         for (int i = 1; i < words.length; i++) {
             retcode[i] = stripWords(words[i - 1], words[i]);
         }
 
         // The last bit of punctuation is what comes at the end of the last word
         int last = lastLetter(words[words.length - 1]);
         if (last == words[words.length - 1].length()) {
             retcode[words.length] = "";
         } else {
             retcode[words.length] = words[words.length - 1].substring(last + 1);
         }
 
         return retcode;
     }
 
     /**
      * Remove the punctuation from the ends of the word. The special case is
      * that if the first word ends "--" and the last word has no punctuation at
      * the beginning, then the answer is "--" and not "-- ". We miss out the
      * space because "--" is a special separator.
      * 
      * @param first
      *            The word to grab the punctuation from the end of
      * @param last
      *            The word to grab the punctuation from the start of
      * @return The end of the first, a space, and the end of the first
      */
     public static String stripWords(String first, String last) {
         String init1 = first.substring(lastLetter(first) + 1);
         String init2 = last.substring(0, firstLetter(last));
 
         return init1 + init2;
     }
 
     /**
      * From a sentence get a list of words (in original order) without any
      * punctuation, and all in lower case.
      * 
      * @param aSentence
      *            The string to parse.
      * @return The words split up as an array
      */
     public static String[] getWords(String aSentence) {
         String sentence = aSentence;
         // First there are some things we regard as word delimiters even if
         // they are not near space. Note that "-" should not be in this list
         // because words like abel-beth-maiacha contain them.
         sentence = sentence.replaceAll("--", " ");
         sentence = sentence.replace('.', ' ');
         sentence = sentence.replace('!', ' ');
         sentence = sentence.replace('?', ' ');
         sentence = sentence.replace(':', ' ');
         sentence = sentence.replace(';', ' ');
         sentence = sentence.replace('"', ' ');
         sentence = sentence.replace('\'', ' ');
         sentence = sentence.replace('(', ' ');
         sentence = sentence.replace(')', ' ');
 
         String[] words = StringUtil.split(sentence, " ");
         String[] retcode = new String[words.length];
 
         // Remove the punctuation from the ends of the words.
         for (int i = 0; i < words.length; i++) {
             retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
         }
 
         return retcode;
     }
 
     /**
      * Remove the punctuation from the ends of the word
      * 
      * @param word
      *            Word with punctuation
      * @return Word without punctuation
      */
     public static String stripPunctuationWord(String word) {
         int first = firstLetter(word);
         int last = lastLetter(word) + 1;
 
         if (first > last) {
             return word;
         }
 
         return word.substring(first, last);
     }
 
     /**
      * Where is the first letter in this word
      * 
      * @param word
      *            The word to search for letters
      * @return The offset of the first letter
      */
     public static int firstLetter(String word) {
         int first;
 
         for (first = 0; first < word.length(); first++) {
             char c = word.charAt(first);
             if (Character.isLetterOrDigit(c)) {
                 break;
             }
         }
 
         return first;
     }
 
     /**
      * Where is the last letter in this word
      * 
      * @param word
      *            The word to search for letters
      * @return The offset of the last letter
      */
     public static int lastLetter(String word) {
         int last;
 
         for (last = word.length() - 1; last >= 0; last--) {
             char c = word.charAt(last);
             if (Character.isLetterOrDigit(c)) {
                 break;
             }
         }
 
         return last;
     }
 }

1		/**
2		* Distribution License:
3		* JSword is free software; you can redistribute it and/or modify it under
4		* the terms of the GNU Lesser General Public License, version 2.1 or later
5		* as published by the Free Software Foundation. This program is distributed
6		* in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7		* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8		* See the GNU Lesser General Public License for more details.
9		*
10		* The License is available on the internet at:
11		* http://www.gnu.org/copyleft/lgpl.html
12		* or by writing to:
13		* Free Software Foundation, Inc.
14		* 59 Temple Place - Suite 330
15		* Boston, MA 02111-1307, USA
16		*
17		* © CrossWire Bible Society, 2005 - 2016
18		*
19		*/
20		package org.crosswire.jsword.book;
21
22		import java.util.ArrayList;
23		import java.util.List;
24		import java.util.Locale;
25
26		import org.crosswire.common.util.StringUtil;
27
28		/**
29		* The SentenceUtil class provides utility functions for the various Books.
30		*
31		* It is not designed to be used outside of the book package, so using it
32		* outside of these bounds is at your own risk.
33		*
34		* @see gnu.lgpl.License The GNU Lesser General Public License for details.
35		* @author Joe Walker
36		*/
37		public final class SentenceUtil {
38		/**
39		* Ensure we can not be instantiated
40		*/
41	0	private SentenceUtil() {
42	0	}
43
44		/**
45		* Take a string and tokenize it using " " and "--" as delimiters into an
46		* Array of Strings. There is a question mark over what to do with initial
47		* spaces. This algorithm discards them, I'm not sure if this is the right
48		* thing to do.
49		*
50		* @param sentence
51		* The string to parse.
52		* @return The string array
53		*/
54		public static String[] tokenize(String sentence) {
55	0	List<String> tokens = new ArrayList<String>();
56
57	0	int pos = 0;
58		String temp;
59	0	boolean alive = true;
60
61	0	while (alive) {
62		// Find the next space and double dash
63	0	int nextSpace = sentence.indexOf(' ', pos);
64	0	int nextDDash = sentence.indexOf("--", pos);
65
66		// If there is a space just after the ddash then ignore the ddash
67	0	if (nextSpace == nextDDash + 2) {
68	0	nextDDash = -1;
69		}
70
71		// If there is a ddash just after the space then ignore the space
72	0	if (nextDDash == nextSpace + 1) {
73	0	nextSpace = -1;
74		}
75
76		// if there are no more tokens then just add in what we've got.
77	0	if (nextSpace == -1 && nextDDash == -1) {
78	0	temp = sentence.substring(pos);
79	0	alive = false;
80	0	} else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) {
81		// Space is next if it is not -1 and it is less than ddash
82		// The next separator is a space
83	0	temp = sentence.substring(pos, nextSpace) + ' ';
84	0	pos = nextSpace + 1;
85		} else {
86		// The next separator is a ddash
87	0	temp = sentence.substring(pos, nextDDash) + "--";
88	0	pos = nextDDash + 2;
89		}
90
91	0	if (temp != null && !"".equals(temp.trim())) {
92	0	tokens.add(temp);
93		}
94	0	}
95
96		// Create a String[]
97	0	String[] retcode = new String[tokens.size()];
98	0	int i = 0;
99	0	for (String token : tokens) {
100	0	retcode[i++] = token;
101		}
102
103	0	return retcode;
104		}
105
106		/**
107		* From a sentence get a list of words (in original order) without any
108		* punctuation, and all in lower case.
109		*
110		* @param words
111		* Words with punctuation
112		* @return Words without punctuation
113		*/
114		public static String[] stripPunctuation(String... words) {
115	0	String[] retcode = new String[words.length];
116
117		// Remove the punctuation from the ends of the words.
118	0	for (int i = 0; i < words.length; i++) {
119	0	retcode[i] = stripPunctuationWord(words[i]);
120		}
121
122	0	return retcode;
123		}
124
125		/**
126		* From a sentence get a list of words (in original order) without any
127		* punctuation, and all in lower case.
128		*
129		* @param words
130		* Words with punctuation
131		* @return Punctuation without words
132		*/
133		public static String[] stripWords(String... words) {
134	0	if (words.length == 0) {
135	0	return new String[0];
136		}
137
138	0	String[] retcode = new String[words.length + 1];
139
140		// The first bit of punctuation is what comes in front of the first word
141	0	int first = firstLetter(words[0]);
142	0	if (first == 0) {
143	0	retcode[0] = "";
144		} else {
145	0	retcode[0] = words[0].substring(0, first);
146		}
147
148		// The rest of the words
149	0	for (int i = 1; i < words.length; i++) {
150	0	retcode[i] = stripWords(words[i - 1], words[i]);
151		}
152
153		// The last bit of punctuation is what comes at the end of the last word
154	0	int last = lastLetter(words[words.length - 1]);
155	0	if (last == words[words.length - 1].length()) {
156	0	retcode[words.length] = "";
157		} else {
158	0	retcode[words.length] = words[words.length - 1].substring(last + 1);
159		}
160
161	0	return retcode;
162		}
163
164		/**
165		* Remove the punctuation from the ends of the word. The special case is
166		* that if the first word ends "--" and the last word has no punctuation at
167		* the beginning, then the answer is "--" and not "-- ". We miss out the
168		* space because "--" is a special separator.
169		*
170		* @param first
171		* The word to grab the punctuation from the end of
172		* @param last
173		* The word to grab the punctuation from the start of
174		* @return The end of the first word followed by the start of the last word
175		*/
176		public static String stripWords(String first, String last) {
177	0	String init1 = first.substring(lastLetter(first) + 1);
178	0	String init2 = last.substring(0, firstLetter(last));
179
180	0	return init1 + init2;
181		}
182
183		/**
184		* From a sentence get a list of words (in original order) without any
185		* punctuation, and all in lower case.
186		*
187		* @param aSentence
188		* The string to parse.
189		* @return The words split up as an array
190		*/
191		public static String[] getWords(String aSentence) {
192	0	String sentence = aSentence;
193		// First there are some things we regard as word delimiters even if
194		// they are not near space. Note that "-" should not be in this list
195		// because words like abel-beth-maiacha contain them.
196	0	sentence = sentence.replaceAll("--", " ");
197	0	sentence = sentence.replace('.', ' ');
198	0	sentence = sentence.replace('!', ' ');
199	0	sentence = sentence.replace('?', ' ');
200	0	sentence = sentence.replace(':', ' ');
201	0	sentence = sentence.replace(';', ' ');
202	0	sentence = sentence.replace('"', ' ');
203	0	sentence = sentence.replace('\'', ' ');
204	0	sentence = sentence.replace('(', ' ');
205	0	sentence = sentence.replace(')', ' ');
206
207	0	String[] words = StringUtil.split(sentence, " ");
208	0	String[] retcode = new String[words.length];
209
210		// Remove the punctuation from the ends of the words.
211	0	for (int i = 0; i < words.length; i++) {
212	0	retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
213		}
214
215	0	return retcode;
216		}
217
218		/**
219		* Remove the punctuation from the ends of the word
220		*
221		* @param word
222		* Word with punctuation
223		* @return Word without punctuation
224		*/
225		public static String stripPunctuationWord(String word) {
226	0	int first = firstLetter(word);
227	0	int last = lastLetter(word) + 1;
228
229	0	if (first > last) {
230	0	return word;
231		}
232
233	0	return word.substring(first, last);
234		}
235
236		/**
237		* Where is the first letter in this word
238		*
239		* @param word
240		* The word to search for letters
241		* @return The offset of the first letter
242		*/
243		public static int firstLetter(String word) {
244		int first;
245
246	0	for (first = 0; first < word.length(); first++) {
247	0	char c = word.charAt(first);
248	0	if (Character.isLetterOrDigit(c)) {
249	0	break;
250		}
251		}
252
253	0	return first;
254		}
255
256		/**
257		* Where is the last letter in this word
258		*
259		* @param word
260		* The word to search for letters
261		* @return The offset of the last letter
262		*/
263		public static int lastLetter(String word) {
264		int last;
265
266	0	for (last = word.length() - 1; last >= 0; last--) {
267	0	char c = word.charAt(last);
268	0	if (Character.isLetterOrDigit(c)) {
269	0	break;
270		}
271		}
272
273	0	return last;
274		}
275		}