1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
18   *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id: SwordUtil.java 2221 2012-01-25 21:32:57Z dmsmith $
21   */
22  package org.crosswire.jsword.book.sword;
23  
24  import java.io.IOException;
25  import java.io.RandomAccessFile;
26  import java.io.UnsupportedEncodingException;
27  
28  import org.crosswire.common.util.Logger;
29  
30  /**
31   * Various utilities used by different Sword classes.
32   * 
33   * @see gnu.lgpl.License for license details.<br>
34   *      The copyright to this program is held by it's authors.
35   * @author Joe Walker [joe at eireneh dot com]
36   */
37  public final class SwordUtil {
38      /**
39       * Prevent instantiation
40       */
41      private SwordUtil() {
42      }
43  
44      /**
45       * Read a RandomAccessFile
46       * 
47       * @param raf
48       *            The file to read
49       * @param offset
50       *            The start of the record to read
51       * @param theSize
52       *            The number of bytes to read
53       * @return the read data
54       * @throws IOException
55       *             on error
56       */
57      protected static byte[] readRAF(RandomAccessFile raf, long offset, int theSize) throws IOException {
58          raf.seek(offset);
59          return readNextRAF(raf, theSize);
60      }
61  
62      /**
63       * Read a RandomAccessFile from the current location in the file.
64       * 
65       * @param raf
66       *            The file to read
67       * @param theSize
68       *            The number of bytes to read
69       * @return the read data
70       * @throws IOException
71       *             on error
72       */
73      protected static byte[] readNextRAF(RandomAccessFile raf, int theSize) throws IOException {
74          long offset = raf.getFilePointer();
75          int size = theSize;
76          long rafSize = raf.length();
77  
78          if (offset >= rafSize) {
79              log.error("Attempt to read beyond end. offset=" + offset + " size=" + size + " but raf.length=" + rafSize);
80              return new byte[0];
81          }
82  
83          if (offset + size > raf.length()) {
84              log.error("Need to reduce size to avoid EOFException. offset=" + offset + " size=" + size + " but raf.length=" + rafSize);
85              size = (int) (raf.length() - offset);
86          }
87  
88          if (size < 1) {
89              log.error("Nothing to read at offset = " + offset + " returning empty because size=" + size);
90              return new byte[0];
91          }
92  
93          byte[] read = new byte[size];
94          raf.readFully(read);
95  
96          return read;
97      }
98  
99      /**
100      * Writes "data" to a RandomAccessFile at the "offset" position
101      * 
102      * @param raf
103      *            RandomAccessFile
104      * @param offset
105      *            offset to write at
106      * @param data
107      *            data to write
108      * @throws IOException
109      *             on error
110      */
111     protected static void writeRAF(RandomAccessFile raf, long offset, byte[] data) throws IOException {
112         raf.seek(offset);
113         writeNextRAF(raf, data);
114     }
115 
116     protected static void writeNextRAF(RandomAccessFile raf, byte[] data) throws IOException {
117         if (data == null) {
118             return;
119         }
120         raf.write(data);
121     }
122 
123     /**
124      * Read a RandomAccessFile until a particular byte is seen
125      * 
126      * @param raf
127      *            The file to read
128      * @param offset
129      *            The start of the record to read
130      * @param stopByte
131      *            The point at which to stop reading
132      * @return the read data
133      * @throws IOException
134      *             on error
135      */
136     protected static byte[] readUntilRAF(RandomAccessFile raf, int offset, byte stopByte) throws IOException {
137         raf.seek(offset);
138         return readUntilRAF(raf, stopByte);
139     }
140 
141     /**
142      * Read a RandomAccessFile until a particular byte is seen
143      * 
144      * @param raf
145      *            The file to read
146      * @param stopByte
147      *            The point at which to stop reading
148      * @return the read data
149      * @throws IOException
150      *             on error
151      */
152     protected static byte[] readUntilRAF(RandomAccessFile raf, byte stopByte) throws IOException {
153         // The strategy used here is to read the file twice.
154         // Once to determine how much to read and then getting the actual data.
155         // It may be more efficient to incrementally build up a byte buffer.
156         // Note: that growing a static array by 1 byte at a time is O(n**2)
157         // This is negligible when the n is small, but prohibitive otherwise.
158         long offset = raf.getFilePointer();
159         int size = 0;
160 
161         int nextByte = -1;
162         do {
163             nextByte = raf.read();
164 
165             size++;
166         } while (nextByte != -1 && nextByte != stopByte);
167 
168         // Note: we allow for nextByte == -1 to be included in size
169         // so that readRAF will report EOF errors
170         return readRAF(raf, offset, size);
171     }
172 
173     /**
174      * Decode little endian data from a byte array. This assumes that the high
175      * order bit is not set as this is used solely for an offset in a file in
176      * bytes. For a practical limit, 2**31 is way bigger than any document that
177      * we can have.
178      * 
179      * @param data
180      *            the byte[] from which to read 4 bytes
181      * @param offset
182      *            the offset into the array
183      * @return The decoded data
184      */
185     protected static int decodeLittleEndian32(byte[] data, int offset) {
186         // Convert from a byte to an int, but prevent sign extension.
187         // So -16 becomes 240
188         int byte1 = data[0 + offset] & 0xFF;
189         int byte2 = (data[1 + offset] & 0xFF) << 8;
190         int byte3 = (data[2 + offset] & 0xFF) << 16;
191         int byte4 = (data[3 + offset] & 0xFF) << 24;
192 
193         return byte4 | byte3 | byte2 | byte1;
194     }
195 
196     /**
197      * Encode little endian data from a byte array. This assumes that the number
198      * fits in a Java integer. That is, the range of an unsigned C integer is
199      * greater than a signed Java integer. For a practical limit, 2**31 is way
200      * bigger than any document that we can have. If this ever doesn't work, use
201      * a long for the number.
202      * 
203      * @param val
204      *            the number to encode into little endian
205      * @param data
206      *            the byte[] from which to write 4 bytes
207      * @param offset
208      *            the offset into the array
209      */
210     protected static void encodeLittleEndian32(int val, byte[] data, int offset) {
211         data[0 + offset] = (byte) (val & 0xFF);
212         data[1 + offset] = (byte) ((val >> 8) & 0xFF);
213         data[2 + offset] = (byte) ((val >> 16) & 0xFF);
214         data[3 + offset] = (byte) ((val >> 24) & 0xFF);
215     }
216 
217     /**
218      * Decode little endian data from a byte array
219      * 
220      * @param data
221      *            the byte[] from which to read 2 bytes
222      * @param offset
223      *            the offset into the array
224      * @return The decoded data
225      */
226     protected static int decodeLittleEndian16(byte[] data, int offset) {
227         // Convert from a byte to an int, but prevent sign extension.
228         // So -16 becomes 240
229         int byte1 = data[0 + offset] & 0xFF;
230         int byte2 = (data[1 + offset] & 0xFF) << 8;
231 
232         return byte2 | byte1;
233     }
234 
235     /**
236      * Encode a 16-bit little endian from an integer. It is assumed that the
237      * integer's lower 16 bits are the only that are set.
238      * 
239      * @param data
240      *            the byte[] from which to write 2 bytes
241      * @param offset
242      *            the offset into the array
243      */
244     protected static void encodeLittleEndian16(int val, byte[] data, int offset) {
245         data[0 + offset] = (byte) (val & 0xFF);
246         data[1 + offset] = (byte) ((val >> 8) & 0xFF);
247     }
248 
249     /**
250      * Find a byte of data in an array
251      * 
252      * @param data
253      *            The array to search
254      * @param sought
255      *            The data to search for
256      * @return The index of the found position or -1 if not found
257      */
258     protected static int findByte(byte[] data, byte sought) {
259         return findByte(data, 0, sought);
260     }
261 
262     /**
263      * Find a byte of data in an array
264      * 
265      * @param data
266      *            The array to search
267      * @param offset
268      *            The position in the array to begin looking
269      * @param sought
270      *            The data to search for
271      * @return The index of the found position or -1 if not found
272      */
273     protected static int findByte(byte[] data, int offset, byte sought) {
274         for (int i = offset; i < data.length; i++) {
275             if (data[i] == sought) {
276                 return i;
277             }
278         }
279 
280         return -1;
281     }
282 
283     /**
284      * Transform a byte array into a string given the encoding. If the encoding
285      * is bad then it just does it as a string.
286      * 
287      * @param data
288      *            The byte array to be converted
289      * @param charset
290      *            The encoding of the byte array
291      * @return a string that is UTF-8 internally
292      */
293     public static String decode(String key, byte[] data, String charset) {
294         return decode(key, data, 0, data.length, charset);
295     }
296 
297     /**
298      * Transform a portion of a byte array into a string given the encoding. If
299      * the encoding is bad then it just does it as a string.
300      * 
301      * @param data
302      *            The byte array to be converted
303      * @param length
304      *            The number of bytes to use.
305      * @param charset
306      *            The encoding of the byte array
307      * @return a string that is UTF-8 internally
308      */
309     public static String decode(String key, byte[] data, int length, String charset) {
310         return decode(key, data, 0, length, charset);
311     }
312 
313     /**
314      * Transform a portion of a byte array starting at an offset into a string
315      * given the encoding. If the encoding is bad then it just does it as a
316      * string.
317      * 
318      * @param data
319      *            The byte array to be converted
320      * @param offset
321      *            The starting position in the byte array
322      * @param length
323      *            The number of bytes to use.
324      * @param charset
325      *            The encoding of the byte array
326      * @return a string that is UTF-8 internally
327      */
328     public static String decode(String key, byte[] data, int offset, int length, String charset) {
329         if ("WINDOWS-1252".equals(charset)) {
330             clean1252(key, data, length);
331         }
332         String txt = "";
333         try {
334             txt = new String(data, offset, length, charset);
335         } catch (UnsupportedEncodingException ex) {
336             // It is impossible! In case, use system default...
337             log.error(key + ": Encoding: " + charset + " not supported", ex);
338             txt = new String(data, offset, length);
339         }
340 
341         return txt;
342     }
343 
344     /**
345      * Remove rogue characters in the source. These are characters that are not
346      * valid in cp1252 aka WINDOWS-1252 and in UTF-8 or are non-printing control
347      * characters in the range of 0-32.
348      */
349     public static void clean1252(String key, byte[] data) {
350         clean1252(key, data, data.length);
351     }
352 
353     /**
354      * Remove rogue characters in the source. These are characters that are not
355      * valid in cp1252 aka WINDOWS-1252 and in UTF-8 or are non-printing control
356      * characters in the range of 0-32.
357      */
358     public static void clean1252(String key, byte[] data, int length) {
359         for (int i = 0; i < length; i++) {
360             // between 0-32 only allow whitespace
361             // characters 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined in
362             // cp1252
363             int c = data[i] & 0xFF;
364             if ((c >= 0x00 && c < 0x20 && c != 0x09 && c != 0x0A && c != 0x0D) || (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)) {
365                 data[i] = 0x20;
366                 log.error(key + " has bad character 0x" + Integer.toString(c, 16) + " at position " + i + " in input.");
367             }
368         }
369     }
370 
371     /**
372      * The log stream
373      */
374     private static final Logger log = Logger.getLogger(SwordUtil.class);
375 
376 }
377