1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2005 - 2016
18   *
19   */
20  package org.crosswire.jsword.book.sword;
21  
22  import java.io.IOException;
23  import java.io.RandomAccessFile;
24  
25  import org.crosswire.common.compress.CompressorType;
26  import org.crosswire.jsword.JSMsg;
27  import org.crosswire.jsword.book.BookException;
28  import org.crosswire.jsword.book.BookMetaData;
29  import org.crosswire.jsword.book.sword.state.OpenFileStateManager;
30  import org.crosswire.jsword.book.sword.state.ZVerseBackendState;
31  import org.crosswire.jsword.passage.BitwisePassage;
32  import org.crosswire.jsword.passage.Key;
33  import org.crosswire.jsword.passage.KeyUtil;
34  import org.crosswire.jsword.passage.RocketPassage;
35  import org.crosswire.jsword.passage.Verse;
36  import org.crosswire.jsword.versification.Testament;
37  import org.crosswire.jsword.versification.Versification;
38  import org.crosswire.jsword.versification.system.Versifications;
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  
42  /**
43   * A backend to read compressed data verse based files. While the text file
44   * contains data compressed with ZIP or LZSS, it cannot be uncompressed using a
45   * stand alone zip utility, such as WinZip or gzip. The reason for this is that
46   * the data file is a concatenation of blocks of compressed data.
47   * 
48   * <p>
49   * The blocks can either be "b", book (aka testament); "c", chapter or "v",
50   * verse. The choice is a matter of trade offs. The program needs to uncompress
51   * a block into memory. Having it at the book level is very memory expensive.
52   * Having it at the verse level is very disk expensive, but takes the least
53   * amount of memory. The most common is chapter.
54   * </p>
55   * 
56   * <p>
57   * In order to find the data in the text file, we need to find the block. The
58   * first index (idx) is used for this. Each verse is indexed to a tuple (block
59   * number, verse start, verse size). This data allows us to find the correct
60   * block, and to extract the verse from the uncompressed block, but it does not
61   * help us uncompress the block.
62   * </p>
63   * 
64   * <p>
65   * Once the block is known, then the next index (comp) gives the location of the
66   * compressed block, its compressed size and its uncompressed size.
67   * </p>
68   * 
69   * <p>
70   * There are 3 files for each testament, 2 (idx and comp) are indexes into the
71   * third (text) which contains the data. The key into each index is the verse
72   * index within that testament, which is determined by book, chapter and verse
73   * of that key.
74   * </p>
75   * 
76   * <p>
77   * All unsigned numbers are stored 2-complement, little endian.
78   * </p>
79   * <p>
80   * Then proceed as follows, at all times working on the set of files for the
81   * testament in question:
82   * </p>
83   * 
84   * The three files are laid out in the following fashion:
85   * <ul>
86   * <li>The idx file has one entry per verse in the versification. The number
87   * of verses varies by versification and testament. Each entry describes the
88   * compressed block in which it is found, the start of the verse in the
89   * uncompressed block and the length of the verse.
90   * <ul>
91   * <li>Block number - 32-bit/4-bytes - the number of the entry in the comp file.</li>
92   * <li>Verse start - 32 bit/4-bytes - the start of the verse in the uncompressed block in the dat file.</li>
93   * <li>Verse length - 16 bit/4-bytes - the length of the verse in the uncompressed block from the dat file.</li>
94   * </ul>
95   * Algorithm:
96   * <ul>
97   * <li>Given the ordinal value of the verse, seek to the ordinal * 10 and read 10 bytes.
98   * <li>Decode the 10 bytes as Block Number, Verse start and length</li>
99   * </ul>
100  * </li>
101  * <li>The comp file has one entry per block.
102  * Each entry describes the location of a compressed block,
103  * giving its start and size in the next file.
104  * <ul>
105  * <li>Block Start - 32-bit/4-byte - the start of the block in the dat file</li>
106  * <li>Compressed Block Size - 32-bit/4-byte - the size of the compressed block in the dat file</li>
107  * <li>Uncompressed Block Size - 32-bit/4-byte - the size of the block after uncompressing</li>
108  * </ul>
109  * Algorithm:
110  * <ul>
111  * <li>Given a block number, seek to block-index * 12 and read 12 bytes</li>
112  * <li>Decode the 12 bytes as Block Start, Compressed Block Size and Uncompressed Block Size</li>
113  * </ul>
114  * </li>
115  * <li> The dat file is compressed blocks of verses.
116  * <br>
117  * Algorithm:
118  * <ul>
119  * <li>Given the entry from the comp file, seek to the start and read the indicated compressed block size</li>
120  * <li>If the book is enciphered it, decipher it.</li>
121  * <li>Uncompress the block, using the uncompressed size as an optimization.</li>
122  * <li>Using the verse start, seek to that location in the uncompressed block and read the indicated verse size.</li>
123  * <li>Convert the bytes to a String using the books indicated charset.</li>
124  * </ul>
125  * </li>
126  * </ul>
127  * 
128  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
129  * @author Joe Walker
130  * @author DM Smith
131  */
132 public class ZVerseBackend extends AbstractBackend<ZVerseBackendState> {
133     /**
134      * Simple ctor
135      * @param sbmd 
136      * @param blockType 
137      */
138     public ZVerseBackend(SwordBookMetaData sbmd, BlockType blockType) {
139         super(sbmd);
140         this.blockType = blockType;
141     }
142 
143     /* This method assumes single keys. It is the responsibility of the caller to provide the iteration. 
144      * 
145      * FIXME: this could be refactored to push the iterations down, but no performance benefit would be gained since we have a manager that keeps the file accesses open
146      * (non-Javadoc)
147      * @see org.crosswire.jsword.book.sword.AbstractBackend#contains(org.crosswire.jsword.passage.Key)
148      */
149     @Override
150     public boolean contains(Key key) {
151         return getRawTextLength(key) > 0;
152     }
153 
154     /* (non-Javadoc)
155      * @see org.crosswire.jsword.book.sword.AbstractBackend#size(org.crosswire.jsword.passage.Key)
156      */
157     @Override
158     public int getRawTextLength(Key key) {
159         ZVerseBackendState rafBook = null;
160         try {
161             rafBook = initState();
162 
163             String v11nName = getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION);
164             Versification v11n = Versifications.instance().getVersification(v11nName);
165             Verse verse = KeyUtil.getVerse(key);
166 
167             int index = verse.getOrdinal();
168             Testament testament = v11n.getTestament(index);
169             index = v11n.getTestamentOrdinal(index);
170 
171             RandomAccessFile idxRaf = rafBook.getIdxRaf(testament);
172 
173             // If Bible does not contain the desired testament, then false
174             if (idxRaf == null) {
175                 return 0;
176             }
177 
178             // 10 because the index is 10 bytes long for each verse
179             byte[] temp = SwordUtil.readRAF(idxRaf, 1L * index * IDX_ENTRY_SIZE, IDX_ENTRY_SIZE);
180 
181             // If the Bible does not contain the desired verse, return nothing.
182             // Some Bibles have different versification, so the requested verse
183             // may not exist.
184             if (temp == null || temp.length == 0) {
185                 return 0;
186             }
187 
188             // The data is little endian - extract the verseSize
189             return SwordUtil.decodeLittleEndian16(temp, 8);
190 
191         } catch (IOException e) {
192             return 0;
193         } catch (BookException e) {
194             // FIXME(CJB): fail silently as before, but i don't think this is
195             // correct behaviour - would cause API changes
196             log.error("Unable to ascertain key validity", e);
197             return 0;
198         } finally {
199             OpenFileStateManager.instance().release(rafBook);
200         }
201     }
202 
203     /* (non-Javadoc)
204      * @see org.crosswire.jsword.book.sword.AbstractBackend#getGlobalKeyList()
205      */
206     @Override
207     public Key getGlobalKeyList() throws BookException {
208         ZVerseBackendState rafBook = null;
209         try {
210             rafBook = initState();
211 
212             String v11nName = getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION);
213             Versification v11n = Versifications.instance().getVersification(v11nName);
214 
215             Testament[] testaments = new Testament[] {
216                     Testament.OLD, Testament.NEW
217             };
218 
219             BitwisePassage passage = new RocketPassage(v11n);
220             passage.raiseEventSuppresion();
221             passage.raiseNormalizeProtection();
222 
223             for (Testament currentTestament : testaments) {
224                 RandomAccessFile idxRaf = rafBook.getIdxRaf(currentTestament);
225 
226                 // If Bible does not contain the desired testament, then false
227                 if (idxRaf == null) {
228                     // no keys in this testament
229                     continue;
230                 }
231 
232                 int maxIndex = v11n.getCount(currentTestament) - 1;
233 
234                 // Read in the whole index, a few hundred Kb at most.
235                 byte[] temp = SwordUtil.readRAF(idxRaf, 0, IDX_ENTRY_SIZE * maxIndex);
236 
237                 // For each entry of 10 bytes, the length of the verse in bytes
238                 // is in the last 2 bytes. If both bytes are 0, then there is no content.
239                 for (int ii = 0; ii < temp.length; ii += IDX_ENTRY_SIZE) {
240                     // This can be simplified to temp[ii + 8] == 0 && temp[ii + 9] == 0.
241                     // int verseSize = SwordUtil.decodeLittleEndian16(temp, ii + 8);
242                     // if (verseSize > 0) {
243                     if (temp[ii + 8] != 0 || temp[ii + 9] != 0) {
244                         int ordinal = ii / IDX_ENTRY_SIZE;
245                         passage.addVersifiedOrdinal(v11n.getOrdinal(currentTestament, ordinal));
246                     }
247                 }
248             }
249 
250             passage.lowerNormalizeProtection();
251             passage.lowerEventSuppressionAndTest();
252 
253             return passage;
254         } catch (IOException e) {
255             throw new BookException(JSMsg.gettext("Unable to read key list from book."));
256         } finally {
257             OpenFileStateManager.instance().release(rafBook);
258         }
259     }
260 
261     /* (non-Javadoc)
262      * @see org.crosswire.jsword.book.sword.StatefulFileBackedBackend#initState()
263      */
264     public ZVerseBackendState initState() throws BookException {
265         return OpenFileStateManager.instance().getZVerseBackendState(getBookMetaData(), blockType);
266     }
267 
268     /* (non-Javadoc)
269      * @see org.crosswire.jsword.book.sword.StatefulFileBackedBackend#readRawContent(org.crosswire.jsword.book.sword.state.OpenFileState, org.crosswire.jsword.passage.Key)
270      */
271     public String readRawContent(ZVerseBackendState rafBook, Key key) throws IOException {
272 
273         BookMetaData bookMetaData = getBookMetaData();
274         final String charset = bookMetaData.getBookCharset();
275         final String compressType = bookMetaData.getProperty(SwordBookMetaData.KEY_COMPRESS_TYPE);
276 
277         final String v11nName = getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION);
278         final Versification v11n = Versifications.instance().getVersification(v11nName);
279         Verse verse = KeyUtil.getVerse(key);
280 
281         int index = verse.getOrdinal();
282         final Testament testament = v11n.getTestament(index);
283         index = v11n.getTestamentOrdinal(index);
284         final RandomAccessFile idxRaf;
285         final RandomAccessFile compRaf;
286         final RandomAccessFile textRaf;
287 
288         idxRaf = rafBook.getIdxRaf(testament);
289         compRaf = rafBook.getCompRaf(testament);
290         textRaf = rafBook.getTextRaf(testament);
291 
292         // If Bible does not contain the desired testament, return nothing.
293         if (idxRaf == null) {
294             return "";
295         }
296 
297         //dumpIdxRaf(v11n, 0, compRaf);
298         //dumpCompRaf(idxRaf);
299         // 10 because the index is 10 bytes long for each verse
300         byte[] temp = SwordUtil.readRAF(idxRaf, 1L * index * IDX_ENTRY_SIZE, IDX_ENTRY_SIZE);
301 
302         // If the Bible does not contain the desired verse, return nothing.
303         // Some Bibles have different versification, so the requested verse
304         // may not exist.
305         if (temp == null || temp.length == 0) {
306             return "";
307         }
308 
309         // The data is little endian - extract the blockNum, verseStart
310         // and
311         // verseSize
312         final long blockNum = SwordUtil.decodeLittleEndian32(temp, 0);
313         final int verseStart = SwordUtil.decodeLittleEndian32(temp, 4);
314         final int verseSize = SwordUtil.decodeLittleEndian16(temp, 8);
315 
316         // Can we get the data from the cache
317         byte[] uncompressed = null;
318         if (blockNum == rafBook.getLastBlockNum() && testament == rafBook.getLastTestament()) {
319             uncompressed = rafBook.getLastUncompressed();
320         } else {
321             // Then seek using this index into the idx file
322             temp = SwordUtil.readRAF(compRaf, blockNum * COMP_ENTRY_SIZE, COMP_ENTRY_SIZE);
323             if (temp == null || temp.length == 0) {
324                 return "";
325             }
326 
327             final int blockStart = SwordUtil.decodeLittleEndian32(temp, 0);
328             final int blockSize = SwordUtil.decodeLittleEndian32(temp, 4);
329             final int uncompressedSize = SwordUtil.decodeLittleEndian32(temp, 8);
330 
331             // Read from the data file.
332             final byte[] data = SwordUtil.readRAF(textRaf, blockStart, blockSize);
333 
334             decipher(data);
335 
336             uncompressed = CompressorType.fromString(compressType).getCompressor(data).uncompress(uncompressedSize).toByteArray();
337 
338             // cache the uncompressed data for next time
339             rafBook.setLastBlockNum(blockNum);
340             rafBook.setLastTestament(testament);
341             rafBook.setLastUncompressed(uncompressed);
342         }
343 
344         // and cut out the required section.
345         final byte[] chopped = new byte[verseSize];
346         System.arraycopy(uncompressed, verseStart, chopped, 0, verseSize);
347 
348         return SwordUtil.decode(key.getName(), chopped, charset);
349 
350     }
351 
352     /* (non-Javadoc)
353      * @see org.crosswire.jsword.book.sword.AbstractBackend#setAliasKey(org.crosswire.jsword.passage.Key, org.crosswire.jsword.passage.Key)
354      */
355     public void setAliasKey(ZVerseBackendState rafBook, Key alias, Key source) throws IOException {
356         throw new UnsupportedOperationException();
357     }
358 
359     /* (non-Javadoc)
360      * @see org.crosswire.jsword.book.sword.AbstractBackend#setRawText(org.crosswire.jsword.passage.Key, java.lang.String)
361      */
362     public void setRawText(ZVerseBackendState rafBook, Key key, String text) throws BookException, IOException {
363         throw new UnsupportedOperationException();
364     }
365 
366     /** 
367      * Experimental code.
368      * 
369      * @param v11n
370      * @param ordinalStart
371      * @param raf
372      */
373     public void dumpIdxRaf(Versification v11n, int ordinalStart, RandomAccessFile raf) {
374         long end = -1;
375         try {
376             end = raf.length();
377         } catch (IOException e) {
378             // TODO Auto-generated catch block
379             e.printStackTrace();
380         }
381 
382         int i = ordinalStart;
383         StringBuilder buf = new StringBuilder();
384         System.out.println("osisID\tblock\tstart\tsize");
385         for (long offset = 0; offset < end; offset += IDX_ENTRY_SIZE) {
386             // 10 because the index is 10 bytes long for each verse
387             byte[] temp = null;
388             try {
389                 temp = SwordUtil.readRAF(raf, offset, IDX_ENTRY_SIZE);
390             } catch (IOException e) {
391                 e.printStackTrace();
392             }
393 
394             // If the Bible does not contain the desired verse, return nothing.
395             // Some Bibles have different versification, so the requested verse
396             // may not exist.
397             long blockNum = -1;
398             int verseStart = -1;
399             int verseSize = -1;
400             if (temp != null && temp.length > 0) {
401                 // The data is little endian - extract the blockNum, verseStart and verseSize
402                 blockNum = SwordUtil.decodeLittleEndian32(temp, 0);
403                 verseStart = SwordUtil.decodeLittleEndian32(temp, 4);
404                 verseSize = SwordUtil.decodeLittleEndian16(temp, 8);
405             }
406             buf.setLength(0);
407             buf.append(v11n.decodeOrdinal(i++).getOsisID());
408             buf.append('\t');
409             buf.append(blockNum);
410             buf.append('\t');
411             buf.append(verseStart);
412             buf.append('\t');
413             buf.append(verseSize);
414             System.out.println(buf.toString());
415         }
416     }
417 
418     /**
419      * Experimental code.
420      * 
421      * @param raf
422      */
423     public void dumpCompRaf(RandomAccessFile raf) {
424         long end = -1;
425         try {
426             end = raf.length();
427         } catch (IOException e) {
428             // TODO Auto-generated catch block
429             e.printStackTrace();
430         }
431 
432         int blockNum = 0;
433         StringBuilder buf = new StringBuilder();
434         System.out.println("block\tstart\tsize\tuncompressed");
435         for (long offset = 0; offset < end; offset += COMP_ENTRY_SIZE) {
436             // 12 because the index is 12 bytes long for each verse
437             byte[] temp = null;
438             try {
439                 temp = SwordUtil.readRAF(raf, offset, COMP_ENTRY_SIZE);
440             } catch (IOException e) {
441                 e.printStackTrace();
442             }
443 
444             // If the Bible does not contain the desired verse, return nothing.
445             // Some Bibles have different versification, so the requested verse
446             // may not exist.
447             int blockStart = -1;
448             int blockSize = -1;
449             int uncompressedSize = -1;
450             if (temp != null && temp.length > 0) {
451                 // The data is little endian - extract the blockNum, verseStar and verseSize
452                  blockStart = SwordUtil.decodeLittleEndian32(temp, 0);
453                  blockSize = SwordUtil.decodeLittleEndian32(temp, 4);
454                  uncompressedSize = SwordUtil.decodeLittleEndian32(temp, 8);
455             }
456             buf.setLength(0);
457             buf.append(blockNum);
458             buf.append('\t');
459             buf.append(blockStart);
460             buf.append('\t');
461             buf.append(blockSize);
462             buf.append('\t');
463             buf.append(uncompressedSize);
464             System.out.println(buf.toString());
465         }
466     }
467 
468     /**
469      * Whether the book is blocked by Book, Chapter or Verse.
470      */
471     private final BlockType blockType;
472 
473     /**
474      * How many bytes in the idx index?
475      */
476     private static final int IDX_ENTRY_SIZE = 10;
477 
478     /**
479      * How many bytes in the comp index?
480      */
481     private static final int COMP_ENTRY_SIZE = 12;
482 
483     /**
484      * The log stream
485      */
486     private static final Logger log = LoggerFactory.getLogger(ZVerseBackend.class);
487 }
488