| SwordUtil.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 or later
5 * as published by the Free Software Foundation. This program is distributed
6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * © CrossWire Bible Society, 2005 - 2016
18 *
19 */
20 package org.crosswire.jsword.book.sword;
21
22 import java.io.IOException;
23 import java.io.RandomAccessFile;
24 import java.io.UnsupportedEncodingException;
25 import java.net.URI;
26
27 import org.crosswire.common.util.NetUtil;
28 import org.crosswire.jsword.JSOtherMsg;
29 import org.crosswire.jsword.book.BookException;
30 import org.crosswire.jsword.book.BookMetaData;
31 import org.slf4j.Logger;
32 import org.slf4j.LoggerFactory;
33
34 /**
35 * Various utilities used by different Sword classes.
36 *
37 * @see gnu.lgpl.License The GNU Lesser General Public License for details.
38 * @author Joe Walker
39 */
40 public final class SwordUtil {
/**
 * Prevent instantiation: this class only offers static helpers.
 */
private SwordUtil() {
}
46
/**
 * Read a record from a RandomAccessFile starting at an absolute position.
 * The file pointer is moved to <code>offset</code> and the read itself is
 * delegated to {@link #readNextRAF(RandomAccessFile, int)}.
 *
 * @param raf
 *            The file to read
 * @param offset
 *            The start of the record to read
 * @param theSize
 *            The number of bytes to read
 * @return the read data
 * @throws IOException
 *             on error
 */
protected static byte[] readRAF(RandomAccessFile raf, long offset, int theSize) throws IOException {
    // Position the file first; readNextRAF reads from the current pointer.
    raf.seek(offset);
    return readNextRAF(raf, theSize);
}
64
65 /**
66 * Read a RandomAccessFile from the current location in the file.
67 *
68 * @param raf
69 * The file to read
70 * @param theSize
71 * The number of bytes to read
72 * @return the read data
73 * @throws IOException
74 * on error
75 */
76 protected static byte[] readNextRAF(RandomAccessFile raf, int theSize) throws IOException {
77 long offset = raf.getFilePointer();
78 int size = theSize;
79 long rafSize = raf.length();
80
81 // It is common to have an entry that points to nothing.
82 // That is the equivalent of an empty string.
83 if (size == 0) {
84 return new byte[0];
85 }
86
87 if (size < 0) {
88 log.error("Nothing to read at offset = {} returning empty because negative size={}", Long.toString(offset), Integer.toString(size));
89 return new byte[0];
90 }
91
92 if (offset >= rafSize) {
93 log.error("Attempt to read beyond end. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize));
94 return new byte[0];
95 }
96
97 if (offset + size > raf.length()) {
98 log.error("Need to reduce size to avoid EOFException. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize));
99 size = (int) (raf.length() - offset);
100 }
101
102 byte[] read = new byte[size];
103 raf.readFully(read);
104
105 return read;
106 }
107
/**
 * Writes "data" to a RandomAccessFile at the "offset" position. The file
 * pointer is moved to <code>offset</code> and the write itself is delegated
 * to {@link #writeNextRAF(RandomAccessFile, byte[])}.
 *
 * @param raf
 *            RandomAccessFile
 * @param offset
 *            offset to write at
 * @param data
 *            data to write
 * @throws IOException
 *             on error
 */
protected static void writeRAF(RandomAccessFile raf, long offset, byte[] data) throws IOException {
    // The seek happens unconditionally, before data is looked at.
    raf.seek(offset);
    writeNextRAF(raf, data);
}
124
125 protected static void writeNextRAF(RandomAccessFile raf, byte[] data) throws IOException {
126 if (data == null) {
127 return;
128 }
129 raf.write(data);
130 }
131
/**
 * Read a RandomAccessFile from an absolute position until a particular byte
 * is seen. The file pointer is moved to <code>offset</code> and the scan is
 * delegated to {@link #readUntilRAF(RandomAccessFile, byte)}.
 *
 * @param raf
 *            The file to read
 * @param offset
 *            The start of the record to read
 * @param stopByte
 *            The point at which to stop reading
 * @return the read data
 * @throws IOException
 *             on error
 */
protected static byte[] readUntilRAF(RandomAccessFile raf, int offset, byte stopByte) throws IOException {
    // Position the file first; the two-argument overload scans from the current pointer.
    raf.seek(offset);
    return readUntilRAF(raf, stopByte);
}
149
150 /**
151 * Read a RandomAccessFile until a particular byte is seen
152 *
153 * @param raf
154 * The file to read
155 * @param stopByte
156 * The point at which to stop reading
157 * @return the read data
158 * @throws IOException
159 * on error
160 */
161 protected static byte[] readUntilRAF(RandomAccessFile raf, byte stopByte) throws IOException {
162 // The strategy used here is to read the file twice.
163 // Once to determine how much to read and then getting the actual data.
164 // It may be more efficient to incrementally build up a byte buffer.
165 // Note: that growing a static array by 1 byte at a time is O(n**2)
166 // This is negligible when the n is small, but prohibitive otherwise.
167 long offset = raf.getFilePointer();
168 int size = 0;
169
170 int nextByte = -1;
171 do {
172 nextByte = raf.read();
173
174 size++;
175 } while (nextByte != -1 && nextByte != stopByte);
176
177 // Note: we allow for nextByte == -1 to be included in size
178 // so that readRAF will report EOF errors
179 return readRAF(raf, offset, size);
180 }
181
182 /**
183 * Decode little endian data from a byte array. This assumes that the high
184 * order bit is not set as this is used solely for an offset in a file in
185 * bytes. For a practical limit, 2**31 is way bigger than any document that
186 * we can have.
187 *
188 * @param data
189 * the byte[] from which to read 4 bytes
190 * @param offset
191 * the offset into the array
192 * @return The decoded data
193 */
194 public static int decodeLittleEndian32(byte[] data, int offset) {
195 // Convert from a byte to an int, but prevent sign extension.
196 // So -16 becomes 240
197 int byte1 = data[0 + offset] & 0xFF;
198 int byte2 = (data[1 + offset] & 0xFF) << 8;
199 int byte3 = (data[2 + offset] & 0xFF) << 16;
200 int byte4 = (data[3 + offset] & 0xFF) << 24;
201
202 return byte4 | byte3 | byte2 | byte1;
203 }
204
205 /**
206 * Encode little endian data from a byte array. This assumes that the number
207 * fits in a Java integer. That is, the range of an unsigned C integer is
208 * greater than a signed Java integer. For a practical limit, 2**31 is way
209 * bigger than any document that we can have. If this ever doesn't work, use
210 * a long for the number.
211 *
212 * @param val
213 * the number to encode into little endian
214 * @param data
215 * the byte[] from which to write 4 bytes
216 * @param offset
217 * the offset into the array
218 */
219 protected static void encodeLittleEndian32(int val, byte[] data, int offset) {
220 data[0 + offset] = (byte) (val & 0xFF);
221 data[1 + offset] = (byte) ((val >> 8) & 0xFF);
222 data[2 + offset] = (byte) ((val >> 16) & 0xFF);
223 data[3 + offset] = (byte) ((val >> 24) & 0xFF);
224 }
225
226 /**
227 * Decode little endian data from a byte array
228 *
229 * @param data
230 * the byte[] from which to read 2 bytes
231 * @param offset
232 * the offset into the array
233 * @return The decoded data
234 */
235 protected static int decodeLittleEndian16(byte[] data, int offset) {
236 // Convert from a byte to an int, but prevent sign extension.
237 // So -16 becomes 240
238 int byte1 = data[0 + offset] & 0xFF;
239 int byte2 = (data[1 + offset] & 0xFF) << 8;
240
241 return byte2 | byte1;
242 }
243
244 /**
245 * Encode a 16-bit little endian from an integer. It is assumed that the
246 * integer's lower 16 bits are the only that are set.
247 *
248 * @param data
249 * the byte[] from which to write 2 bytes
250 * @param offset
251 * the offset into the array
252 */
253 protected static void encodeLittleEndian16(int val, byte[] data, int offset) {
254 data[0 + offset] = (byte) (val & 0xFF);
255 data[1 + offset] = (byte) ((val >> 8) & 0xFF);
256 }
257
/**
 * Find a byte of data in an array, searching from the start of the array.
 *
 * @param data
 *            The array to search
 * @param sought
 *            The data to search for
 * @return The index of the found position or -1 if not found
 */
protected static int findByte(byte[] data, byte sought) {
    // Same search as the three-argument overload, starting at index 0.
    return findByte(data, 0, sought);
}
270
271 /**
272 * Find a byte of data in an array
273 *
274 * @param data
275 * The array to search
276 * @param offset
277 * The position in the array to begin looking
278 * @param sought
279 * The data to search for
280 * @return The index of the found position or -1 if not found
281 */
282 protected static int findByte(byte[] data, int offset, byte sought) {
283 for (int i = offset; i < data.length; i++) {
284 if (data[i] == sought) {
285 return i;
286 }
287 }
288
289 return -1;
290 }
291
/**
 * Transform a whole byte array into a string given the encoding. If the
 * named encoding is not supported, the platform default encoding is used
 * instead.
 * Note: this may modify data (rogue bytes are overwritten with spaces when
 * the encoding is WINDOWS-1252). Don't use it to examine data.
 *
 * @param key
 *            the key, used to identify the entry in log messages
 * @param data
 *            The byte array to be converted
 * @param charset
 *            The encoding of the byte array
 * @return the decoded string
 */
public static String decode(String key, byte[] data, String charset) {
    // Whole array: start at 0 and use every byte.
    return decode(key, data, 0, data.length, charset);
}
307
/**
 * Transform the leading portion of a byte array into a string given the
 * encoding. If the named encoding is not supported, the platform default
 * encoding is used instead.
 * Note: this may modify data (rogue bytes are overwritten with spaces when
 * the encoding is WINDOWS-1252). Don't use it to examine data.
 *
 * @param key
 *            the key, used to identify the entry in log messages
 * @param data
 *            The byte array to be converted
 * @param length
 *            The number of bytes to use, counted from the start of the array.
 * @param charset
 *            The encoding of the byte array
 * @return the decoded string
 */
public static String decode(String key, byte[] data, int length, String charset) {
    // Leading portion: start at 0 and use the first length bytes.
    return decode(key, data, 0, length, charset);
}
325
326 /**
327 * Transform a portion of a byte array starting at an offset into a string
328 * given the encoding. If the encoding is bad then it just does it as a
329 * string. Note: this may modify data. Don't use it to examine data.
330 *
331 * @param key the key
332 * @param data
333 * The byte array to be converted
334 * @param offset
335 * The starting position in the byte array
336 * @param length
337 * The number of bytes to use.
338 * @param charset
339 * The encoding of the byte array
340 * @return a string that is UTF-8 internally
341 */
342 public static String decode(String key, byte[] data, int offset, int length, String charset) {
343 if ("WINDOWS-1252".equals(charset)) {
344 clean1252(key, data, offset, length);
345 }
346 String txt = "";
347 try {
348 if (offset + length <= data.length) {
349 txt = new String(data, offset, length, charset);
350 }
351 } catch (UnsupportedEncodingException ex) {
352 // It is impossible! In case, use system default...
353 log.error("{}: Encoding {} not supported.", key, charset, ex);
354 txt = new String(data, offset, length);
355 }
356
357 return txt;
358 }
359
360 /**
361 * Remove rogue characters in the source. These are characters that are not
362 * valid in cp1252 aka WINDOWS-1252 and in UTF-8 or are non-printing control
363 * characters in the range of 0-32.
364 */
365 private static void clean1252(String key, byte[] data, int offset, int length) {
366 int end = offset + length;
367 // make sure it doesn't go off the end
368 if (end > data.length) {
369 end = data.length;
370 }
371 for (int i = offset; i < end; i++) {
372 // between 0-32 only allow whitespace: \t, \n, \r, ' '
373 // characters 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined in
374 // cp1252
375 int c = data[i] & 0xFF;
376 if ((c >= 0x00 && c < 0x20 && c != 0x09 && c != 0x0A && c != 0x0D) || (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)) {
377 data[i] = 0x20;
378 log.error("{} has bad character 0x{} at position {} in input.", key, Integer.toString(c, 16), Integer.toString(i));
379 }
380 }
381 }
382
383 /**
384 * Returns where the book should be located
385 * @param bookMetaData meta information about the book
386 * @return the URI locating the resource
387 * @throws BookException thrown if an issue is encountered, e.g. missing data files.
388 */
389 public static URI getExpandedDataPath(BookMetaData bookMetaData) throws BookException {
390 URI loc = NetUtil.lengthenURI(bookMetaData.getLibrary(), bookMetaData.getProperty(SwordBookMetaData.KEY_DATA_PATH));
391
392 if (loc == null) {
393 // FIXME(DMS): missing parameter
394 throw new BookException(JSOtherMsg.lookupText("Missing data files for old and new testaments in {0}."));
395 }
396
397 return loc;
398 }
399
/**
 * The log stream, shared by the static helpers of this class.
 */
private static final Logger log = LoggerFactory.getLogger(SwordUtil.class);
404
405 }
406