SwordUtil.java |
1 /** 2 * Distribution License: 3 * JSword is free software; you can redistribute it and/or modify it under 4 * the terms of the GNU Lesser General Public License, version 2.1 or later 5 * as published by the Free Software Foundation. This program is distributed 6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 8 * See the GNU Lesser General Public License for more details. 9 * 10 * The License is available on the internet at: 11 * http://www.gnu.org/copyleft/lgpl.html 12 * or by writing to: 13 * Free Software Foundation, Inc. 14 * 59 Temple Place - Suite 330 15 * Boston, MA 02111-1307, USA 16 * 17 * © CrossWire Bible Society, 2005 - 2016 18 * 19 */ 20 package org.crosswire.jsword.book.sword; 21 22 import java.io.IOException; 23 import java.io.RandomAccessFile; 24 import java.io.UnsupportedEncodingException; 25 import java.net.URI; 26 27 import org.crosswire.common.util.NetUtil; 28 import org.crosswire.jsword.JSOtherMsg; 29 import org.crosswire.jsword.book.BookException; 30 import org.crosswire.jsword.book.BookMetaData; 31 import org.slf4j.Logger; 32 import org.slf4j.LoggerFactory; 33 34 /** 35 * Various utilities used by different Sword classes. 36 * 37 * @see gnu.lgpl.License The GNU Lesser General Public License for details. 38 * @author Joe Walker 39 */ 40 public final class SwordUtil { 41 /** 42 * Prevent instantiation 43 */ 44 private SwordUtil() { 45 } 46 47 /** 48 * Read a RandomAccessFile 49 * 50 * @param raf 51 * The file to read 52 * @param offset 53 * The start of the record to read 54 * @param theSize 55 * The number of bytes to read 56 * @return the read data 57 * @throws IOException 58 * on error 59 */ 60 protected static byte[] readRAF(RandomAccessFile raf, long offset, int theSize) throws IOException { 61 raf.seek(offset); 62 return readNextRAF(raf, theSize); 63 } 64 65 /** 66 * Read a RandomAccessFile from the current location in the file. 67 * 68 * @param raf 69 * The file to read 70 * @param theSize 71 * The number of bytes to read 72 * @return the read data 73 * @throws IOException 74 * on error 75 */ 76 protected static byte[] readNextRAF(RandomAccessFile raf, int theSize) throws IOException { 77 long offset = raf.getFilePointer(); 78 int size = theSize; 79 long rafSize = raf.length(); 80 81 // It is common to have an entry that points to nothing. 82 // That is the equivalent of an empty string. 83 if (size == 0) { 84 return new byte[0]; 85 } 86 87 if (size < 0) { 88 log.error("Nothing to read at offset = {} returning empty because negative size={}", Long.toString(offset), Integer.toString(size)); 89 return new byte[0]; 90 } 91 92 if (offset >= rafSize) { 93 log.error("Attempt to read beyond end. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize)); 94 return new byte[0]; 95 } 96 97 if (offset + size > raf.length()) { 98 log.error("Need to reduce size to avoid EOFException. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize)); 99 size = (int) (raf.length() - offset); 100 } 101 102 byte[] read = new byte[size]; 103 raf.readFully(read); 104 105 return read; 106 } 107 108 /** 109 * Writes "data" to a RandomAccessFile at the "offset" position 110 * 111 * @param raf 112 * RandomAccessFile 113 * @param offset 114 * offset to write at 115 * @param data 116 * data to write 117 * @throws IOException 118 * on error 119 */ 120 protected static void writeRAF(RandomAccessFile raf, long offset, byte[] data) throws IOException { 121 raf.seek(offset); 122 writeNextRAF(raf, data); 123 } 124 125 protected static void writeNextRAF(RandomAccessFile raf, byte[] data) throws IOException { 126 if (data == null) { 127 return; 128 } 129 raf.write(data); 130 } 131 132 /** 133 * Read a RandomAccessFile until a particular byte is seen 134 * 135 * @param raf 136 * The file to read 137 * @param offset 138 * The start of the record to read 139 * @param stopByte 140 * The point at which to stop reading 141 * @return the read data 142 * @throws IOException 143 * on error 144 */ 145 protected static byte[] readUntilRAF(RandomAccessFile raf, int offset, byte stopByte) throws IOException { 146 raf.seek(offset); 147 return readUntilRAF(raf, stopByte); 148 } 149 150 /** 151 * Read a RandomAccessFile until a particular byte is seen 152 * 153 * @param raf 154 * The file to read 155 * @param stopByte 156 * The point at which to stop reading 157 * @return the read data 158 * @throws IOException 159 * on error 160 */ 161 protected static byte[] readUntilRAF(RandomAccessFile raf, byte stopByte) throws IOException { 162 // The strategy used here is to read the file twice. 163 // Once to determine how much to read and then getting the actual data. 164 // It may be more efficient to incrementally build up a byte buffer. 165 // Note: that growing a static array by 1 byte at a time is O(n**2) 166 // This is negligible when the n is small, but prohibitive otherwise. 167 long offset = raf.getFilePointer(); 168 int size = 0; 169 170 int nextByte = -1; 171 do { 172 nextByte = raf.read(); 173 174 size++; 175 } while (nextByte != -1 && nextByte != stopByte); 176 177 // Note: we allow for nextByte == -1 to be included in size 178 // so that readRAF will report EOF errors 179 return readRAF(raf, offset, size); 180 } 181 182 /** 183 * Decode little endian data from a byte array. This assumes that the high 184 * order bit is not set as this is used solely for an offset in a file in 185 * bytes. For a practical limit, 2**31 is way bigger than any document that 186 * we can have. 187 * 188 * @param data 189 * the byte[] from which to read 4 bytes 190 * @param offset 191 * the offset into the array 192 * @return The decoded data 193 */ 194 public static int decodeLittleEndian32(byte[] data, int offset) { 195 // Convert from a byte to an int, but prevent sign extension. 196 // So -16 becomes 240 197 int byte1 = data[0 + offset] & 0xFF; 198 int byte2 = (data[1 + offset] & 0xFF) << 8; 199 int byte3 = (data[2 + offset] & 0xFF) << 16; 200 int byte4 = (data[3 + offset] & 0xFF) << 24; 201 202 return byte4 | byte3 | byte2 | byte1; 203 } 204 205 /** 206 * Encode little endian data from a byte array. This assumes that the number 207 * fits in a Java integer. That is, the range of an unsigned C integer is 208 * greater than a signed Java integer. For a practical limit, 2**31 is way 209 * bigger than any document that we can have. If this ever doesn't work, use 210 * a long for the number. 211 * 212 * @param val 213 * the number to encode into little endian 214 * @param data 215 * the byte[] from which to write 4 bytes 216 * @param offset 217 * the offset into the array 218 */ 219 protected static void encodeLittleEndian32(int val, byte[] data, int offset) { 220 data[0 + offset] = (byte) (val & 0xFF); 221 data[1 + offset] = (byte) ((val >> 8) & 0xFF); 222 data[2 + offset] = (byte) ((val >> 16) & 0xFF); 223 data[3 + offset] = (byte) ((val >> 24) & 0xFF); 224 } 225 226 /** 227 * Decode little endian data from a byte array 228 * 229 * @param data 230 * the byte[] from which to read 2 bytes 231 * @param offset 232 * the offset into the array 233 * @return The decoded data 234 */ 235 protected static int decodeLittleEndian16(byte[] data, int offset) { 236 // Convert from a byte to an int, but prevent sign extension. 237 // So -16 becomes 240 238 int byte1 = data[0 + offset] & 0xFF; 239 int byte2 = (data[1 + offset] & 0xFF) << 8; 240 241 return byte2 | byte1; 242 } 243 244 /** 245 * Encode a 16-bit little endian from an integer. It is assumed that the 246 * integer's lower 16 bits are the only that are set. 247 * 248 * @param data 249 * the byte[] from which to write 2 bytes 250 * @param offset 251 * the offset into the array 252 */ 253 protected static void encodeLittleEndian16(int val, byte[] data, int offset) { 254 data[0 + offset] = (byte) (val & 0xFF); 255 data[1 + offset] = (byte) ((val >> 8) & 0xFF); 256 } 257 258 /** 259 * Find a byte of data in an array 260 * 261 * @param data 262 * The array to search 263 * @param sought 264 * The data to search for 265 * @return The index of the found position or -1 if not found 266 */ 267 protected static int findByte(byte[] data, byte sought) { 268 return findByte(data, 0, sought); 269 } 270 271 /** 272 * Find a byte of data in an array 273 * 274 * @param data 275 * The array to search 276 * @param offset 277 * The position in the array to begin looking 278 * @param sought 279 * The data to search for 280 * @return The index of the found position or -1 if not found 281 */ 282 protected static int findByte(byte[] data, int offset, byte sought) { 283 for (int i = offset; i < data.length; i++) { 284 if (data[i] == sought) { 285 return i; 286 } 287 } 288 289 return -1; 290 } 291 292 /** 293 * Transform a byte array into a string given the encoding. If the encoding 294 * is bad then it just does it as a string. 295 * Note: this may modify data. Don't use it to examine data. 296 * 297 * @param key the key 298 * @param data 299 * The byte array to be converted 300 * @param charset 301 * The encoding of the byte array 302 * @return a string that is UTF-8 internally 303 */ 304 public static String decode(String key, byte[] data, String charset) { 305 return decode(key, data, 0, data.length, charset); 306 } 307 308 /** 309 * Transform a portion of a byte array into a string given the encoding. If 310 * the encoding is bad then it just does it as a string. 311 * Note: this may modify data. Don't use it to examine data. 312 * 313 * @param key the key 314 * @param data 315 * The byte array to be converted 316 * @param length 317 * The number of bytes to use. 318 * @param charset 319 * The encoding of the byte array 320 * @return a string that is UTF-8 internally 321 */ 322 public static String decode(String key, byte[] data, int length, String charset) { 323 return decode(key, data, 0, length, charset); 324 } 325 326 /** 327 * Transform a portion of a byte array starting at an offset into a string 328 * given the encoding. If the encoding is bad then it just does it as a 329 * string. Note: this may modify data. Don't use it to examine data. 330 * 331 * @param key the key 332 * @param data 333 * The byte array to be converted 334 * @param offset 335 * The starting position in the byte array 336 * @param length 337 * The number of bytes to use. 338 * @param charset 339 * The encoding of the byte array 340 * @return a string that is UTF-8 internally 341 */ 342 public static String decode(String key, byte[] data, int offset, int length, String charset) { 343 if ("WINDOWS-1252".equals(charset)) { 344 clean1252(key, data, offset, length); 345 } 346 String txt = ""; 347 try { 348 if (offset + length <= data.length) { 349 txt = new String(data, offset, length, charset); 350 } 351 } catch (UnsupportedEncodingException ex) { 352 // It is impossible! In case, use system default... 353 log.error("{}: Encoding {} not supported.", key, charset, ex); 354 txt = new String(data, offset, length); 355 } 356 357 return txt; 358 } 359 360 /** 361 * Remove rogue characters in the source. These are characters that are not 362 * valid in cp1252 aka WINDOWS-1252 and in UTF-8 or are non-printing control 363 * characters in the range of 0-32. 364 */ 365 private static void clean1252(String key, byte[] data, int offset, int length) { 366 int end = offset + length; 367 // make sure it doesn't go off the end 368 if (end > data.length) { 369 end = data.length; 370 } 371 for (int i = offset; i < end; i++) { 372 // between 0-32 only allow whitespace: \t, \n, \r, ' ' 373 // characters 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined in 374 // cp1252 375 int c = data[i] & 0xFF; 376 if ((c >= 0x00 && c < 0x20 && c != 0x09 && c != 0x0A && c != 0x0D) || (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)) { 377 data[i] = 0x20; 378 log.error("{} has bad character 0x{} at position {} in input.", key, Integer.toString(c, 16), Integer.toString(i)); 379 } 380 } 381 } 382 383 /** 384 * Returns where the book should be located 385 * @param bookMetaData meta information about the book 386 * @return the URI locating the resource 387 * @throws BookException thrown if an issue is encountered, e.g. missing data files. 388 */ 389 public static URI getExpandedDataPath(BookMetaData bookMetaData) throws BookException { 390 URI loc = NetUtil.lengthenURI(bookMetaData.getLibrary(), bookMetaData.getProperty(SwordBookMetaData.KEY_DATA_PATH)); 391 392 if (loc == null) { 393 // FIXME(DMS): missing parameter 394 throw new BookException(JSOtherMsg.lookupText("Missing data files for old and new testaments in {0}.")); 395 } 396 397 return loc; 398 } 399 400 /** 401 * The log stream 402 */ 403 private static final Logger log = LoggerFactory.getLogger(SwordUtil.class); 404 405 } 406