Coverage Report - org.crosswire.jsword.book.SentenceUtil
 
Classes in this File Line Coverage Branch Coverage Complexity
SentenceUtil
0%
0/80
0%
0/44
3.667
 
 1  
 /**
 2  
  * Distribution License:
 3  
  * JSword is free software; you can redistribute it and/or modify it under
 4  
  * the terms of the GNU Lesser General Public License, version 2.1 or later
 5  
  * as published by the Free Software Foundation. This program is distributed
 6  
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 7  
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 8  
  * See the GNU Lesser General Public License for more details.
 9  
  *
 10  
  * The License is available on the internet at:
 11  
  *      http://www.gnu.org/copyleft/lgpl.html
 12  
  * or by writing to:
 13  
  *      Free Software Foundation, Inc.
 14  
  *      59 Temple Place - Suite 330
 15  
  *      Boston, MA 02111-1307, USA
 16  
  *
 17  
  * © CrossWire Bible Society, 2005 - 2016
 18  
  *
 19  
  */
 20  
 package org.crosswire.jsword.book;
 21  
 
 22  
 import java.util.ArrayList;
 23  
 import java.util.List;
 24  
 import java.util.Locale;
 25  
 
 26  
 import org.crosswire.common.util.StringUtil;
 27  
 
 28  
 /**
 29  
  * The SentenceUtil class provides utility functions for the various Books.
 30  
  * 
 31  
  * It is not designed to be used outside of the book package, so using it
 32  
  * outside of these bounds is at your own risk.
 33  
  * 
 34  
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 35  
  * @author Joe Walker
 36  
  */
 37  
 public final class SentenceUtil {
 38  
     /**
 39  
      * Ensure we can not be instantiated
 40  
      */
 41  0
     private SentenceUtil() {
 42  0
     }
 43  
 
 44  
     /**
 45  
      * Take a string and tokenize it using " " and "--" as delimiters into an
 46  
      * Array of Strings. There is a question mark over what to do with initial
 47  
      * spaces. This algorithm discards them, I'm not sure if this is the right
 48  
      * thing to do.
 49  
      * 
 50  
      * @param sentence
 51  
      *            The string to parse.
 52  
      * @return The string array
 53  
      */
 54  
     public static String[] tokenize(String sentence) {
 55  0
         List<String> tokens = new ArrayList<String>();
 56  
 
 57  0
         int pos = 0;
 58  
         String temp;
 59  0
         boolean alive = true;
 60  
 
 61  0
         while (alive) {
 62  
             // Find the next space and double dash
 63  0
             int nextSpace = sentence.indexOf(' ', pos);
 64  0
             int nextDDash = sentence.indexOf("--", pos);
 65  
 
 66  
             // If there is a space just after the ddash then ignore the ddash
 67  0
             if (nextSpace == nextDDash + 2) {
 68  0
                 nextDDash = -1;
 69  
             }
 70  
 
 71  
             // If there is a ddash just after the space then ignore the space
 72  0
             if (nextDDash == nextSpace + 1) {
 73  0
                 nextSpace = -1;
 74  
             }
 75  
 
 76  
             // if there are no more tokens then just add in what we've got.
 77  0
             if (nextSpace == -1 && nextDDash == -1) {
 78  0
                 temp = sentence.substring(pos);
 79  0
                 alive = false;
 80  0
             } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) {
 81  
                 // Space is next if it is not -1 and it is less than ddash
 82  
                 // The next separator is a space
 83  0
                 temp = sentence.substring(pos, nextSpace) + ' ';
 84  0
                 pos = nextSpace + 1;
 85  
             } else {
 86  
                 // The next separator is a ddash
 87  0
                 temp = sentence.substring(pos, nextDDash) + "--";
 88  0
                 pos = nextDDash + 2;
 89  
             }
 90  
 
 91  0
             if (temp != null && !"".equals(temp.trim())) {
 92  0
                 tokens.add(temp);
 93  
             }
 94  0
         }
 95  
 
 96  
         // Create a String[]
 97  0
         String[] retcode = new String[tokens.size()];
 98  0
         int i = 0;
 99  0
         for (String token : tokens) {
 100  0
             retcode[i++] = token;
 101  
         }
 102  
 
 103  0
         return retcode;
 104  
     }
 105  
 
 106  
     /**
 107  
      * From a sentence get a list of words (in original order) without any
 108  
      * punctuation, and all in lower case.
 109  
      * 
 110  
      * @param words
 111  
      *            Words with punctuation
 112  
      * @return Words without punctuation
 113  
      */
 114  
     public static String[] stripPunctuation(String... words) {
 115  0
         String[] retcode = new String[words.length];
 116  
 
 117  
         // Remove the punctuation from the ends of the words.
 118  0
         for (int i = 0; i < words.length; i++) {
 119  0
             retcode[i] = stripPunctuationWord(words[i]);
 120  
         }
 121  
 
 122  0
         return retcode;
 123  
     }
 124  
 
 125  
     /**
 126  
      * From a sentence get a list of words (in original order) without any
 127  
      * punctuation, and all in lower case.
 128  
      * 
 129  
      * @param words
 130  
      *            Words with punctuation
 131  
      * @return Punctuation without words
 132  
      */
 133  
     public static String[] stripWords(String... words) {
 134  0
         if (words.length == 0) {
 135  0
             return new String[0];
 136  
         }
 137  
 
 138  0
         String[] retcode = new String[words.length + 1];
 139  
 
 140  
         // The first bit of punctuation is what comes in front of the first word
 141  0
         int first = firstLetter(words[0]);
 142  0
         if (first == 0) {
 143  0
             retcode[0] = "";
 144  
         } else {
 145  0
             retcode[0] = words[0].substring(0, first);
 146  
         }
 147  
 
 148  
         // The rest of the words
 149  0
         for (int i = 1; i < words.length; i++) {
 150  0
             retcode[i] = stripWords(words[i - 1], words[i]);
 151  
         }
 152  
 
 153  
         // The last bit of punctuation is what comes at the end of the last word
 154  0
         int last = lastLetter(words[words.length - 1]);
 155  0
         if (last == words[words.length - 1].length()) {
 156  0
             retcode[words.length] = "";
 157  
         } else {
 158  0
             retcode[words.length] = words[words.length - 1].substring(last + 1);
 159  
         }
 160  
 
 161  0
         return retcode;
 162  
     }
 163  
 
 164  
     /**
 165  
      * Remove the punctuation from the ends of the word. The special case is
 166  
      * that if the first word ends "--" and the last word has no punctuation at
 167  
      * the beginning, then the answer is "--" and not "-- ". We miss out the
 168  
      * space because "--" is a special separator.
 169  
      * 
 170  
      * @param first
 171  
      *            The word to grab the punctuation from the end of
 172  
      * @param last
 173  
      *            The word to grab the punctuation from the start of
 174  
      * @return The end of the first, a space, and the end of the first
 175  
      */
 176  
     public static String stripWords(String first, String last) {
 177  0
         String init1 = first.substring(lastLetter(first) + 1);
 178  0
         String init2 = last.substring(0, firstLetter(last));
 179  
 
 180  0
         return init1 + init2;
 181  
     }
 182  
 
 183  
     /**
 184  
      * From a sentence get a list of words (in original order) without any
 185  
      * punctuation, and all in lower case.
 186  
      * 
 187  
      * @param aSentence
 188  
      *            The string to parse.
 189  
      * @return The words split up as an array
 190  
      */
 191  
     public static String[] getWords(String aSentence) {
 192  0
         String sentence = aSentence;
 193  
         // First there are some things we regard as word delimiters even if
 194  
         // they are not near space. Note that "-" should not be in this list
 195  
         // because words like abel-beth-maiacha contain them.
 196  0
         sentence = sentence.replaceAll("--", " ");
 197  0
         sentence = sentence.replace('.', ' ');
 198  0
         sentence = sentence.replace('!', ' ');
 199  0
         sentence = sentence.replace('?', ' ');
 200  0
         sentence = sentence.replace(':', ' ');
 201  0
         sentence = sentence.replace(';', ' ');
 202  0
         sentence = sentence.replace('"', ' ');
 203  0
         sentence = sentence.replace('\'', ' ');
 204  0
         sentence = sentence.replace('(', ' ');
 205  0
         sentence = sentence.replace(')', ' ');
 206  
 
 207  0
         String[] words = StringUtil.split(sentence, " ");
 208  0
         String[] retcode = new String[words.length];
 209  
 
 210  
         // Remove the punctuation from the ends of the words.
 211  0
         for (int i = 0; i < words.length; i++) {
 212  0
             retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH);
 213  
         }
 214  
 
 215  0
         return retcode;
 216  
     }
 217  
 
 218  
     /**
 219  
      * Remove the punctuation from the ends of the word
 220  
      * 
 221  
      * @param word
 222  
      *            Word with punctuation
 223  
      * @return Word without punctuation
 224  
      */
 225  
     public static String stripPunctuationWord(String word) {
 226  0
         int first = firstLetter(word);
 227  0
         int last = lastLetter(word) + 1;
 228  
 
 229  0
         if (first > last) {
 230  0
             return word;
 231  
         }
 232  
 
 233  0
         return word.substring(first, last);
 234  
     }
 235  
 
 236  
     /**
 237  
      * Where is the first letter in this word
 238  
      * 
 239  
      * @param word
 240  
      *            The word to search for letters
 241  
      * @return The offset of the first letter
 242  
      */
 243  
     public static int firstLetter(String word) {
 244  
         int first;
 245  
 
 246  0
         for (first = 0; first < word.length(); first++) {
 247  0
             char c = word.charAt(first);
 248  0
             if (Character.isLetterOrDigit(c)) {
 249  0
                 break;
 250  
             }
 251  
         }
 252  
 
 253  0
         return first;
 254  
     }
 255  
 
 256  
     /**
 257  
      * Where is the last letter in this word
 258  
      * 
 259  
      * @param word
 260  
      *            The word to search for letters
 261  
      * @return The offset of the last letter
 262  
      */
 263  
     public static int lastLetter(String word) {
 264  
         int last;
 265  
 
 266  0
         for (last = word.length() - 1; last >= 0; last--) {
 267  0
             char c = word.charAt(last);
 268  0
             if (Character.isLetterOrDigit(c)) {
 269  0
                 break;
 270  
             }
 271  
         }
 272  
 
 273  0
         return last;
 274  
     }
 275  
 }