1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2005 - 2016
18   *
19   */
20  package org.crosswire.jsword.book.sword;
21  
22  import java.io.IOException;
23  import java.io.RandomAccessFile;
24  import java.io.UnsupportedEncodingException;
25  import java.net.URI;
26  
27  import org.crosswire.common.util.NetUtil;
28  import org.crosswire.jsword.JSOtherMsg;
29  import org.crosswire.jsword.book.BookException;
30  import org.crosswire.jsword.book.BookMetaData;
31  import org.slf4j.Logger;
32  import org.slf4j.LoggerFactory;
33  
34  /**
35   * Various utilities used by different Sword classes.
36   * 
37   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
38   * @author Joe Walker
39   */
40  public final class SwordUtil {
41      /**
42       * Prevent instantiation
43       */
44      private SwordUtil() {
45      }
46  
47      /**
48       * Read a RandomAccessFile
49       * 
50       * @param raf
51       *            The file to read
52       * @param offset
53       *            The start of the record to read
54       * @param theSize
55       *            The number of bytes to read
56       * @return the read data
57       * @throws IOException
58       *             on error
59       */
60      protected static byte[] readRAF(RandomAccessFile raf, long offset, int theSize) throws IOException {
61          raf.seek(offset);
62          return readNextRAF(raf, theSize);
63      }
64  
65      /**
66       * Read a RandomAccessFile from the current location in the file.
67       * 
68       * @param raf
69       *            The file to read
70       * @param theSize
71       *            The number of bytes to read
72       * @return the read data
73       * @throws IOException
74       *             on error
75       */
76      protected static byte[] readNextRAF(RandomAccessFile raf, int theSize) throws IOException {
77          long offset = raf.getFilePointer();
78          int size = theSize;
79          long rafSize = raf.length();
80  
81          // It is common to have an entry that points to nothing.
82          // That is the equivalent of an empty string.
83          if (size == 0) {
84              return new byte[0];
85          }
86  
87          if (size < 0) {
88              log.error("Nothing to read at offset = {} returning empty because negative size={}", Long.toString(offset), Integer.toString(size));
89              return new byte[0];
90          }
91  
92          if (offset >= rafSize) {
93              log.error("Attempt to read beyond end. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize));
94              return new byte[0];
95          }
96  
97          if (offset + size > raf.length()) {
98              log.error("Need to reduce size to avoid EOFException. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize));
99              size = (int) (raf.length() - offset);
100         }
101 
102         byte[] read = new byte[size];
103         raf.readFully(read);
104 
105         return read;
106     }
107 
108     /**
109      * Writes "data" to a RandomAccessFile at the "offset" position
110      * 
111      * @param raf
112      *            RandomAccessFile
113      * @param offset
114      *            offset to write at
115      * @param data
116      *            data to write
117      * @throws IOException
118      *             on error
119      */
120     protected static void writeRAF(RandomAccessFile raf, long offset, byte[] data) throws IOException {
121         raf.seek(offset);
122         writeNextRAF(raf, data);
123     }
124 
125     protected static void writeNextRAF(RandomAccessFile raf, byte[] data) throws IOException {
126         if (data == null) {
127             return;
128         }
129         raf.write(data);
130     }
131 
132     /**
133      * Read a RandomAccessFile until a particular byte is seen
134      * 
135      * @param raf
136      *            The file to read
137      * @param offset
138      *            The start of the record to read
139      * @param stopByte
140      *            The point at which to stop reading
141      * @return the read data
142      * @throws IOException
143      *             on error
144      */
145     protected static byte[] readUntilRAF(RandomAccessFile raf, int offset, byte stopByte) throws IOException {
146         raf.seek(offset);
147         return readUntilRAF(raf, stopByte);
148     }
149 
150     /**
151      * Read a RandomAccessFile until a particular byte is seen
152      * 
153      * @param raf
154      *            The file to read
155      * @param stopByte
156      *            The point at which to stop reading
157      * @return the read data
158      * @throws IOException
159      *             on error
160      */
161     protected static byte[] readUntilRAF(RandomAccessFile raf, byte stopByte) throws IOException {
162         // The strategy used here is to read the file twice.
163         // Once to determine how much to read and then getting the actual data.
164         // It may be more efficient to incrementally build up a byte buffer.
165         // Note: that growing a static array by 1 byte at a time is O(n**2)
166         // This is negligible when the n is small, but prohibitive otherwise.
167         long offset = raf.getFilePointer();
168         int size = 0;
169 
170         int nextByte = -1;
171         do {
172             nextByte = raf.read();
173 
174             size++;
175         } while (nextByte != -1 && nextByte != stopByte);
176 
177         // Note: we allow for nextByte == -1 to be included in size
178         // so that readRAF will report EOF errors
179         return readRAF(raf, offset, size);
180     }
181 
182     /**
183      * Decode little endian data from a byte array. This assumes that the high
184      * order bit is not set as this is used solely for an offset in a file in
185      * bytes. For a practical limit, 2**31 is way bigger than any document that
186      * we can have.
187      * 
188      * @param data
189      *            the byte[] from which to read 4 bytes
190      * @param offset
191      *            the offset into the array
192      * @return The decoded data
193      */
194     public static int decodeLittleEndian32(byte[] data, int offset) {
195         // Convert from a byte to an int, but prevent sign extension.
196         // So -16 becomes 240
197         int byte1 = data[0 + offset] & 0xFF;
198         int byte2 = (data[1 + offset] & 0xFF) << 8;
199         int byte3 = (data[2 + offset] & 0xFF) << 16;
200         int byte4 = (data[3 + offset] & 0xFF) << 24;
201 
202         return byte4 | byte3 | byte2 | byte1;
203     }
204 
205     /**
206      * Encode little endian data from a byte array. This assumes that the number
207      * fits in a Java integer. That is, the range of an unsigned C integer is
208      * greater than a signed Java integer. For a practical limit, 2**31 is way
209      * bigger than any document that we can have. If this ever doesn't work, use
210      * a long for the number.
211      * 
212      * @param val
213      *            the number to encode into little endian
214      * @param data
215      *            the byte[] from which to write 4 bytes
216      * @param offset
217      *            the offset into the array
218      */
219     protected static void encodeLittleEndian32(int val, byte[] data, int offset) {
220         data[0 + offset] = (byte) (val & 0xFF);
221         data[1 + offset] = (byte) ((val >> 8) & 0xFF);
222         data[2 + offset] = (byte) ((val >> 16) & 0xFF);
223         data[3 + offset] = (byte) ((val >> 24) & 0xFF);
224     }
225 
226     /**
227      * Decode little endian data from a byte array
228      * 
229      * @param data
230      *            the byte[] from which to read 2 bytes
231      * @param offset
232      *            the offset into the array
233      * @return The decoded data
234      */
235     protected static int decodeLittleEndian16(byte[] data, int offset) {
236         // Convert from a byte to an int, but prevent sign extension.
237         // So -16 becomes 240
238         int byte1 = data[0 + offset] & 0xFF;
239         int byte2 = (data[1 + offset] & 0xFF) << 8;
240 
241         return byte2 | byte1;
242     }
243 
244     /**
245      * Encode a 16-bit little endian from an integer. It is assumed that the
246      * integer's lower 16 bits are the only that are set.
247      * 
248      * @param data
249      *            the byte[] from which to write 2 bytes
250      * @param offset
251      *            the offset into the array
252      */
253     protected static void encodeLittleEndian16(int val, byte[] data, int offset) {
254         data[0 + offset] = (byte) (val & 0xFF);
255         data[1 + offset] = (byte) ((val >> 8) & 0xFF);
256     }
257 
258     /**
259      * Find a byte of data in an array
260      * 
261      * @param data
262      *            The array to search
263      * @param sought
264      *            The data to search for
265      * @return The index of the found position or -1 if not found
266      */
267     protected static int findByte(byte[] data, byte sought) {
268         return findByte(data, 0, sought);
269     }
270 
271     /**
272      * Find a byte of data in an array
273      * 
274      * @param data
275      *            The array to search
276      * @param offset
277      *            The position in the array to begin looking
278      * @param sought
279      *            The data to search for
280      * @return The index of the found position or -1 if not found
281      */
282     protected static int findByte(byte[] data, int offset, byte sought) {
283         for (int i = offset; i < data.length; i++) {
284             if (data[i] == sought) {
285                 return i;
286             }
287         }
288 
289         return -1;
290     }
291 
292     /**
293      * Transform a byte array into a string given the encoding. If the encoding
294      * is bad then it just does it as a string.
295      * Note: this may modify data. Don't use it to examine data.
296      * 
297      * @param key the key
298      * @param data
299      *            The byte array to be converted
300      * @param charset
301      *            The encoding of the byte array
302      * @return a string that is UTF-8 internally
303      */
304     public static String decode(String key, byte[] data, String charset) {
305         return decode(key, data, 0, data.length, charset);
306     }
307 
308     /**
309      * Transform a portion of a byte array into a string given the encoding. If
310      * the encoding is bad then it just does it as a string.
311      * Note: this may modify data. Don't use it to examine data.
312      * 
313      * @param key the key
314      * @param data
315      *            The byte array to be converted
316      * @param length
317      *            The number of bytes to use.
318      * @param charset
319      *            The encoding of the byte array
320      * @return a string that is UTF-8 internally
321      */
322     public static String decode(String key, byte[] data, int length, String charset) {
323         return decode(key, data, 0, length, charset);
324     }
325 
326     /**
327      * Transform a portion of a byte array starting at an offset into a string
328      * given the encoding. If the encoding is bad then it just does it as a
329      * string. Note: this may modify data. Don't use it to examine data.
330      * 
331      * @param key the key
332      * @param data
333      *            The byte array to be converted
334      * @param offset
335      *            The starting position in the byte array
336      * @param length
337      *            The number of bytes to use.
338      * @param charset
339      *            The encoding of the byte array
340      * @return a string that is UTF-8 internally
341      */
342     public static String decode(String key, byte[] data, int offset, int length, String charset) {
343          if ("WINDOWS-1252".equals(charset)) {
344             clean1252(key, data, offset, length);
345          }
346         String txt = "";
347         try {
348             if (offset + length <= data.length) {
349                 txt = new String(data, offset, length, charset);
350             }
351         } catch (UnsupportedEncodingException ex) {
352             // It is impossible! In case, use system default...
353             log.error("{}: Encoding {} not supported.", key, charset, ex);
354             txt = new String(data, offset, length);
355         }
356 
357         return txt;
358     }
359 
360     /**
361      * Remove rogue characters in the source. These are characters that are not
362      * valid in cp1252 aka WINDOWS-1252 and in UTF-8 or are non-printing control
363      * characters in the range of 0-32.
364      */
365     private static void clean1252(String key, byte[] data, int offset, int length) {
366         int end = offset + length;
367         // make sure it doesn't go off the end
368         if (end > data.length) {
369             end = data.length;
370         }
371         for (int i = offset; i < end; i++) {
372             // between 0-32 only allow whitespace: \t, \n, \r, ' '
373             // characters 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined in
374             // cp1252
375             int c = data[i] & 0xFF;
376             if ((c >= 0x00 && c < 0x20 && c != 0x09 && c != 0x0A && c != 0x0D) || (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)) {
377                 data[i] = 0x20;
378                 log.error("{} has bad character 0x{} at position {} in input.", key, Integer.toString(c, 16), Integer.toString(i));
379             }
380         }
381     }
382 
383     /**
384      * Returns where the book should be located
385      * @param bookMetaData meta information about the book
386      * @return the URI locating the resource
387      * @throws BookException thrown if an issue is encountered, e.g. missing data files.
388      */
389     public static URI getExpandedDataPath(BookMetaData bookMetaData) throws BookException {
390         URI loc = NetUtil.lengthenURI(bookMetaData.getLibrary(), bookMetaData.getProperty(SwordBookMetaData.KEY_DATA_PATH));
391 
392         if (loc == null) {
393             // FIXME(DMS): missing parameter
394             throw new BookException(JSOtherMsg.lookupText("Missing data files for old and new testaments in {0}."));
395         }
396 
397         return loc;
398     }
399 
400     /**
401      * The log stream
402      */
403     private static final Logger log = LoggerFactory.getLogger(SwordUtil.class);
404 
405 }
406