Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
ZVerseBackend |
|
| 5.0;5 |
1 | /** | |
2 | * Distribution License: | |
3 | * JSword is free software; you can redistribute it and/or modify it under | |
4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
5 | * as published by the Free Software Foundation. This program is distributed | |
6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
8 | * See the GNU Lesser General Public License for more details. | |
9 | * | |
10 | * The License is available on the internet at: | |
11 | * http://www.gnu.org/copyleft/lgpl.html | |
12 | * or by writing to: | |
13 | * Free Software Foundation, Inc. | |
14 | * 59 Temple Place - Suite 330 | |
15 | * Boston, MA 02111-1307, USA | |
16 | * | |
17 | * © CrossWire Bible Society, 2005 - 2016 | |
18 | * | |
19 | */ | |
20 | package org.crosswire.jsword.book.sword; | |
21 | ||
22 | import java.io.IOException; | |
23 | import java.io.RandomAccessFile; | |
24 | ||
25 | import org.crosswire.common.compress.CompressorType; | |
26 | import org.crosswire.jsword.JSMsg; | |
27 | import org.crosswire.jsword.book.BookException; | |
28 | import org.crosswire.jsword.book.BookMetaData; | |
29 | import org.crosswire.jsword.book.sword.state.OpenFileStateManager; | |
30 | import org.crosswire.jsword.book.sword.state.ZVerseBackendState; | |
31 | import org.crosswire.jsword.passage.BitwisePassage; | |
32 | import org.crosswire.jsword.passage.Key; | |
33 | import org.crosswire.jsword.passage.KeyUtil; | |
34 | import org.crosswire.jsword.passage.RocketPassage; | |
35 | import org.crosswire.jsword.passage.Verse; | |
36 | import org.crosswire.jsword.versification.Testament; | |
37 | import org.crosswire.jsword.versification.Versification; | |
38 | import org.crosswire.jsword.versification.system.Versifications; | |
39 | import org.slf4j.Logger; | |
40 | import org.slf4j.LoggerFactory; | |
41 | ||
42 | /** | |
43 | * A backend to read compressed data verse based files. While the text file | |
44 | * contains data compressed with ZIP or LZSS, it cannot be uncompressed using a | |
45 | * stand alone zip utility, such as WinZip or gzip. The reason for this is that | |
46 | * the data file is a concatenation of blocks of compressed data. | |
47 | * | |
48 | * <p> | |
49 | * The blocks can either be "b", book (aka testament); "c", chapter or "v", | |
50 | * verse. The choice is a matter of trade offs. The program needs to uncompress | |
51 | * a block into memory. Having it at the book level is very memory expensive. | |
52 | * Having it at the verse level is very disk expensive, but takes the least | |
53 | * amount of memory. The most common is chapter. | |
54 | * </p> | |
55 | * | |
56 | * <p> | |
57 | * In order to find the data in the text file, we need to find the block. The | |
58 | * first index (idx) is used for this. Each verse is indexed to a tuple (block | |
59 | * number, verse start, verse size). This data allows us to find the correct | |
60 | * block, and to extract the verse from the uncompressed block, but it does not | |
61 | * help us uncompress the block. | |
62 | * </p> | |
63 | * | |
64 | * <p> | |
65 | * Once the block is known, then the next index (comp) gives the location of the | |
66 | * compressed block, its compressed size and its uncompressed size. | |
67 | * </p> | |
68 | * | |
69 | * <p> | |
70 | * There are 3 files for each testament, 2 (idx and comp) are indexes into the | |
71 | * third (text) which contains the data. The key into each index is the verse | |
72 | * index within that testament, which is determined by book, chapter and verse | |
73 | * of that key. | |
74 | * </p> | |
75 | * | |
76 | * <p> | |
77 | * All unsigned numbers are stored 2-complement, little endian. | |
78 | * </p> | |
79 | * <p> | |
80 | * Then proceed as follows, at all times working on the set of files for the | |
81 | * testament in question: | |
82 | * </p> | |
83 | * | |
84 | * The three files are laid out in the following fashion: | |
85 | * <ul> | |
86 | * <li>The idx file has one entry per verse in the versification. The number | |
87 | * of verses varies by versification and testament. Each entry describes the | |
88 | * compressed block in which it is found, the start of the verse in the | |
89 | * uncompressed block and the length of the verse. | |
90 | * <ul> | |
91 | * <li>Block number - 32-bit/4-bytes - the number of the entry in the comp file.</li> | |
92 | * <li>Verse start - 32-bit/4-bytes - the start of the verse in the uncompressed block in the dat file.</li> | |
93 | * <li>Verse length - 16-bit/2-bytes - the length of the verse in the uncompressed block from the dat file.</li> | |
94 | * </ul> | |
95 | * Algorithm: | |
96 | * <ul> | |
97 | * <li>Given the ordinal value of the verse, seek to the ordinal * 10 and read 10 bytes.</li> | |
98 | * <li>Decode the 10 bytes as Block Number, Verse start and length</li> | |
99 | * </ul> | |
100 | * </li> | |
101 | * <li>The comp file has one entry per block. | |
102 | * Each entry describes the location of a compressed block, | |
103 | * giving its start and size in the next file. | |
104 | * <ul> | |
105 | * <li>Block Start - 32-bit/4-byte - the start of the block in the dat file</li> | |
106 | * <li>Compressed Block Size - 32-bit/4-byte - the size of the compressed block in the dat file</li> | |
107 | * <li>Uncompressed Block Size - 32-bit/4-byte - the size of the block after uncompressing</li> | |
108 | * </ul> | |
109 | * Algorithm: | |
110 | * <ul> | |
111 | * <li>Given a block number, seek to block-index * 12 and read 12 bytes</li> | |
112 | * <li>Decode the 12 bytes as Block Start, Compressed Block Size and Uncompressed Block Size</li> | |
113 | * </ul> | |
114 | * </li> | |
115 | * <li> The dat file is compressed blocks of verses. | |
116 | * <br> | |
117 | * Algorithm: | |
118 | * <ul> | |
119 | * <li>Given the entry from the comp file, seek to the start and read the indicated compressed block size</li> | |
120 | * <li>If the book is enciphered, decipher it.</li> | |
121 | * <li>Uncompress the block, using the uncompressed size as an optimization.</li> | |
122 | * <li>Using the verse start, seek to that location in the uncompressed block and read the indicated verse size.</li> | |
123 | * <li>Convert the bytes to a String using the books indicated charset.</li> | |
124 | * </ul> | |
125 | * </li> | |
126 | * </ul> | |
127 | * | |
128 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
129 | * @author Joe Walker | |
130 | * @author DM Smith | |
131 | */ | |
132 | 0 | public class ZVerseBackend extends AbstractBackend<ZVerseBackendState> { |
/**
 * Creates a backend that reads compressed, verse-indexed data for a book.
 *
 * @param sbmd
 *            the metadata of the book to read; passed on to the superclass
 *            constructor
 * @param blockType
 *            the granularity of the compressed blocks; stored and later
 *            supplied when the open-file state is requested
 */
public ZVerseBackend(SwordBookMetaData sbmd, BlockType blockType) {
    super(sbmd);
    this.blockType = blockType;
}
142 | ||
143 | /* This method assumes single keys. It is the responsibility of the caller to provide the iteration. | |
144 | * | |
145 | * FIXME: this could be refactored to push the iterations down, but no performance benefit would be gained since we have a manager that keeps the file accesses open | |
146 | * (non-Javadoc) | |
147 | * @see org.crosswire.jsword.book.sword.AbstractBackend#contains(org.crosswire.jsword.passage.Key) | |
148 | */ | |
149 | @Override | |
150 | public boolean contains(Key key) { | |
151 | 0 | return getRawTextLength(key) > 0; |
152 | } | |
153 | ||
154 | /* (non-Javadoc) | |
155 | * @see org.crosswire.jsword.book.sword.AbstractBackend#size(org.crosswire.jsword.passage.Key) | |
156 | */ | |
157 | @Override | |
158 | public int getRawTextLength(Key key) { | |
159 | 0 | ZVerseBackendState rafBook = null; |
160 | try { | |
161 | 0 | rafBook = initState(); |
162 | ||
163 | 0 | String v11nName = getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION); |
164 | 0 | Versification v11n = Versifications.instance().getVersification(v11nName); |
165 | 0 | Verse verse = KeyUtil.getVerse(key); |
166 | ||
167 | 0 | int index = verse.getOrdinal(); |
168 | 0 | Testament testament = v11n.getTestament(index); |
169 | 0 | index = v11n.getTestamentOrdinal(index); |
170 | ||
171 | 0 | RandomAccessFile idxRaf = rafBook.getIdxRaf(testament); |
172 | ||
173 | // If Bible does not contain the desired testament, then false | |
174 | 0 | if (idxRaf == null) { |
175 | 0 | return 0; |
176 | } | |
177 | ||
178 | // 10 because the index is 10 bytes long for each verse | |
179 | 0 | byte[] temp = SwordUtil.readRAF(idxRaf, 1L * index * IDX_ENTRY_SIZE, IDX_ENTRY_SIZE); |
180 | ||
181 | // If the Bible does not contain the desired verse, return nothing. | |
182 | // Some Bibles have different versification, so the requested verse | |
183 | // may not exist. | |
184 | 0 | if (temp == null || temp.length == 0) { |
185 | 0 | return 0; |
186 | } | |
187 | ||
188 | // The data is little endian - extract the verseSize | |
189 | 0 | return SwordUtil.decodeLittleEndian16(temp, 8); |
190 | ||
191 | 0 | } catch (IOException e) { |
192 | 0 | return 0; |
193 | 0 | } catch (BookException e) { |
194 | // FIXME(CJB): fail silently as before, but i don't think this is | |
195 | // correct behaviour - would cause API changes | |
196 | 0 | log.error("Unable to ascertain key validity", e); |
197 | 0 | return 0; |
198 | } finally { | |
199 | 0 | OpenFileStateManager.instance().release(rafBook); |
200 | } | |
201 | } | |
202 | ||
203 | /* (non-Javadoc) | |
204 | * @see org.crosswire.jsword.book.sword.AbstractBackend#getGlobalKeyList() | |
205 | */ | |
206 | @Override | |
207 | public Key getGlobalKeyList() throws BookException { | |
208 | 0 | ZVerseBackendState rafBook = null; |
209 | try { | |
210 | 0 | rafBook = initState(); |
211 | ||
212 | 0 | String v11nName = getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION); |
213 | 0 | Versification v11n = Versifications.instance().getVersification(v11nName); |
214 | ||
215 | 0 | Testament[] testaments = new Testament[] { |
216 | Testament.OLD, Testament.NEW | |
217 | }; | |
218 | ||
219 | 0 | BitwisePassage passage = new RocketPassage(v11n); |
220 | 0 | passage.raiseEventSuppresion(); |
221 | 0 | passage.raiseNormalizeProtection(); |
222 | ||
223 | 0 | for (Testament currentTestament : testaments) { |
224 | 0 | RandomAccessFile idxRaf = rafBook.getIdxRaf(currentTestament); |
225 | ||
226 | // If Bible does not contain the desired testament, then false | |
227 | 0 | if (idxRaf == null) { |
228 | // no keys in this testament | |
229 | 0 | continue; |
230 | } | |
231 | ||
232 | 0 | int maxIndex = v11n.getCount(currentTestament) - 1; |
233 | ||
234 | // Read in the whole index, a few hundred Kb at most. | |
235 | 0 | byte[] temp = SwordUtil.readRAF(idxRaf, 0, IDX_ENTRY_SIZE * maxIndex); |
236 | ||
237 | // For each entry of 10 bytes, the length of the verse in bytes | |
238 | // is in the last 2 bytes. If both bytes are 0, then there is no content. | |
239 | 0 | for (int ii = 0; ii < temp.length; ii += IDX_ENTRY_SIZE) { |
240 | // This can be simplified to temp[ii + 8] == 0 && temp[ii + 9] == 0. | |
241 | // int verseSize = SwordUtil.decodeLittleEndian16(temp, ii + 8); | |
242 | // if (verseSize > 0) { | |
243 | 0 | if (temp[ii + 8] != 0 || temp[ii + 9] != 0) { |
244 | 0 | int ordinal = ii / IDX_ENTRY_SIZE; |
245 | 0 | passage.addVersifiedOrdinal(v11n.getOrdinal(currentTestament, ordinal)); |
246 | } | |
247 | } | |
248 | } | |
249 | ||
250 | 0 | passage.lowerNormalizeProtection(); |
251 | 0 | passage.lowerEventSuppressionAndTest(); |
252 | ||
253 | 0 | return passage; |
254 | 0 | } catch (IOException e) { |
255 | 0 | throw new BookException(JSMsg.gettext("Unable to read key list from book.")); |
256 | } finally { | |
257 | 0 | OpenFileStateManager.instance().release(rafBook); |
258 | } | |
259 | } | |
260 | ||
261 | /* (non-Javadoc) | |
262 | * @see org.crosswire.jsword.book.sword.StatefulFileBackedBackend#initState() | |
263 | */ | |
264 | public ZVerseBackendState initState() throws BookException { | |
265 | 0 | return OpenFileStateManager.instance().getZVerseBackendState(getBookMetaData(), blockType); |
266 | } | |
267 | ||
268 | /* (non-Javadoc) | |
269 | * @see org.crosswire.jsword.book.sword.StatefulFileBackedBackend#readRawContent(org.crosswire.jsword.book.sword.state.OpenFileState, org.crosswire.jsword.passage.Key) | |
270 | */ | |
271 | public String readRawContent(ZVerseBackendState rafBook, Key key) throws IOException { | |
272 | ||
273 | 0 | BookMetaData bookMetaData = getBookMetaData(); |
274 | 0 | final String charset = bookMetaData.getBookCharset(); |
275 | 0 | final String compressType = bookMetaData.getProperty(SwordBookMetaData.KEY_COMPRESS_TYPE); |
276 | ||
277 | 0 | final String v11nName = getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION); |
278 | 0 | final Versification v11n = Versifications.instance().getVersification(v11nName); |
279 | 0 | Verse verse = KeyUtil.getVerse(key); |
280 | ||
281 | 0 | int index = verse.getOrdinal(); |
282 | 0 | final Testament testament = v11n.getTestament(index); |
283 | 0 | index = v11n.getTestamentOrdinal(index); |
284 | final RandomAccessFile idxRaf; | |
285 | final RandomAccessFile compRaf; | |
286 | final RandomAccessFile textRaf; | |
287 | ||
288 | 0 | idxRaf = rafBook.getIdxRaf(testament); |
289 | 0 | compRaf = rafBook.getCompRaf(testament); |
290 | 0 | textRaf = rafBook.getTextRaf(testament); |
291 | ||
292 | // If Bible does not contain the desired testament, return nothing. | |
293 | 0 | if (idxRaf == null) { |
294 | 0 | return ""; |
295 | } | |
296 | ||
297 | //dumpIdxRaf(v11n, 0, compRaf); | |
298 | //dumpCompRaf(idxRaf); | |
299 | // 10 because the index is 10 bytes long for each verse | |
300 | 0 | byte[] temp = SwordUtil.readRAF(idxRaf, 1L * index * IDX_ENTRY_SIZE, IDX_ENTRY_SIZE); |
301 | ||
302 | // If the Bible does not contain the desired verse, return nothing. | |
303 | // Some Bibles have different versification, so the requested verse | |
304 | // may not exist. | |
305 | 0 | if (temp == null || temp.length == 0) { |
306 | 0 | return ""; |
307 | } | |
308 | ||
309 | // The data is little endian - extract the blockNum, verseStart | |
310 | // and | |
311 | // verseSize | |
312 | 0 | final long blockNum = SwordUtil.decodeLittleEndian32(temp, 0); |
313 | 0 | final int verseStart = SwordUtil.decodeLittleEndian32(temp, 4); |
314 | 0 | final int verseSize = SwordUtil.decodeLittleEndian16(temp, 8); |
315 | ||
316 | // Can we get the data from the cache | |
317 | 0 | byte[] uncompressed = null; |
318 | 0 | if (blockNum == rafBook.getLastBlockNum() && testament == rafBook.getLastTestament()) { |
319 | 0 | uncompressed = rafBook.getLastUncompressed(); |
320 | } else { | |
321 | // Then seek using this index into the idx file | |
322 | 0 | temp = SwordUtil.readRAF(compRaf, blockNum * COMP_ENTRY_SIZE, COMP_ENTRY_SIZE); |
323 | 0 | if (temp == null || temp.length == 0) { |
324 | 0 | return ""; |
325 | } | |
326 | ||
327 | 0 | final int blockStart = SwordUtil.decodeLittleEndian32(temp, 0); |
328 | 0 | final int blockSize = SwordUtil.decodeLittleEndian32(temp, 4); |
329 | 0 | final int uncompressedSize = SwordUtil.decodeLittleEndian32(temp, 8); |
330 | ||
331 | // Read from the data file. | |
332 | 0 | final byte[] data = SwordUtil.readRAF(textRaf, blockStart, blockSize); |
333 | ||
334 | 0 | decipher(data); |
335 | ||
336 | 0 | uncompressed = CompressorType.fromString(compressType).getCompressor(data).uncompress(uncompressedSize).toByteArray(); |
337 | ||
338 | // cache the uncompressed data for next time | |
339 | 0 | rafBook.setLastBlockNum(blockNum); |
340 | 0 | rafBook.setLastTestament(testament); |
341 | 0 | rafBook.setLastUncompressed(uncompressed); |
342 | } | |
343 | ||
344 | // and cut out the required section. | |
345 | 0 | final byte[] chopped = new byte[verseSize]; |
346 | 0 | System.arraycopy(uncompressed, verseStart, chopped, 0, verseSize); |
347 | ||
348 | 0 | return SwordUtil.decode(key.getName(), chopped, charset); |
349 | ||
350 | } | |
351 | ||
/*
 * Aliasing keys is not supported by this backend: every call throws
 * UnsupportedOperationException and none of the arguments are used.
 *
 * (non-Javadoc)
 * @see org.crosswire.jsword.book.sword.AbstractBackend#setAliasKey(org.crosswire.jsword.passage.Key, org.crosswire.jsword.passage.Key)
 */
public void setAliasKey(ZVerseBackendState rafBook, Key alias, Key source) throws IOException {
    throw new UnsupportedOperationException();
}
358 | ||
/*
 * Writing text is not supported by this backend: every call throws
 * UnsupportedOperationException and none of the arguments are used.
 *
 * (non-Javadoc)
 * @see org.crosswire.jsword.book.sword.AbstractBackend#setRawText(org.crosswire.jsword.passage.Key, java.lang.String)
 */
public void setRawText(ZVerseBackendState rafBook, Key key, String text) throws BookException, IOException {
    throw new UnsupportedOperationException();
}
365 | ||
366 | /** | |
367 | * Experimental code. | |
368 | * | |
369 | * @param v11n | |
370 | * @param ordinalStart | |
371 | * @param raf | |
372 | */ | |
373 | public void dumpIdxRaf(Versification v11n, int ordinalStart, RandomAccessFile raf) { | |
374 | 0 | long end = -1; |
375 | try { | |
376 | 0 | end = raf.length(); |
377 | 0 | } catch (IOException e) { |
378 | // TODO Auto-generated catch block | |
379 | 0 | e.printStackTrace(); |
380 | 0 | } |
381 | ||
382 | 0 | int i = ordinalStart; |
383 | 0 | StringBuilder buf = new StringBuilder(); |
384 | 0 | System.out.println("osisID\tblock\tstart\tsize"); |
385 | 0 | for (long offset = 0; offset < end; offset += IDX_ENTRY_SIZE) { |
386 | // 10 because the index is 10 bytes long for each verse | |
387 | 0 | byte[] temp = null; |
388 | try { | |
389 | 0 | temp = SwordUtil.readRAF(raf, offset, IDX_ENTRY_SIZE); |
390 | 0 | } catch (IOException e) { |
391 | 0 | e.printStackTrace(); |
392 | 0 | } |
393 | ||
394 | // If the Bible does not contain the desired verse, return nothing. | |
395 | // Some Bibles have different versification, so the requested verse | |
396 | // may not exist. | |
397 | 0 | long blockNum = -1; |
398 | 0 | int verseStart = -1; |
399 | 0 | int verseSize = -1; |
400 | 0 | if (temp != null && temp.length > 0) { |
401 | // The data is little endian - extract the blockNum, verseStart and verseSize | |
402 | 0 | blockNum = SwordUtil.decodeLittleEndian32(temp, 0); |
403 | 0 | verseStart = SwordUtil.decodeLittleEndian32(temp, 4); |
404 | 0 | verseSize = SwordUtil.decodeLittleEndian16(temp, 8); |
405 | } | |
406 | 0 | buf.setLength(0); |
407 | 0 | buf.append(v11n.decodeOrdinal(i++).getOsisID()); |
408 | 0 | buf.append('\t'); |
409 | 0 | buf.append(blockNum); |
410 | 0 | buf.append('\t'); |
411 | 0 | buf.append(verseStart); |
412 | 0 | buf.append('\t'); |
413 | 0 | buf.append(verseSize); |
414 | 0 | System.out.println(buf.toString()); |
415 | } | |
416 | 0 | } |
417 | ||
418 | /** | |
419 | * Experimental code. | |
420 | * | |
421 | * @param raf | |
422 | */ | |
423 | public void dumpCompRaf(RandomAccessFile raf) { | |
424 | 0 | long end = -1; |
425 | try { | |
426 | 0 | end = raf.length(); |
427 | 0 | } catch (IOException e) { |
428 | // TODO Auto-generated catch block | |
429 | 0 | e.printStackTrace(); |
430 | 0 | } |
431 | ||
432 | 0 | int blockNum = 0; |
433 | 0 | StringBuilder buf = new StringBuilder(); |
434 | 0 | System.out.println("block\tstart\tsize\tuncompressed"); |
435 | 0 | for (long offset = 0; offset < end; offset += COMP_ENTRY_SIZE) { |
436 | // 12 because the index is 12 bytes long for each verse | |
437 | 0 | byte[] temp = null; |
438 | try { | |
439 | 0 | temp = SwordUtil.readRAF(raf, offset, COMP_ENTRY_SIZE); |
440 | 0 | } catch (IOException e) { |
441 | 0 | e.printStackTrace(); |
442 | 0 | } |
443 | ||
444 | // If the Bible does not contain the desired verse, return nothing. | |
445 | // Some Bibles have different versification, so the requested verse | |
446 | // may not exist. | |
447 | 0 | int blockStart = -1; |
448 | 0 | int blockSize = -1; |
449 | 0 | int uncompressedSize = -1; |
450 | 0 | if (temp != null && temp.length > 0) { |
451 | // The data is little endian - extract the blockNum, verseStar and verseSize | |
452 | 0 | blockStart = SwordUtil.decodeLittleEndian32(temp, 0); |
453 | 0 | blockSize = SwordUtil.decodeLittleEndian32(temp, 4); |
454 | 0 | uncompressedSize = SwordUtil.decodeLittleEndian32(temp, 8); |
455 | } | |
456 | 0 | buf.setLength(0); |
457 | 0 | buf.append(blockNum); |
458 | 0 | buf.append('\t'); |
459 | 0 | buf.append(blockStart); |
460 | 0 | buf.append('\t'); |
461 | 0 | buf.append(blockSize); |
462 | 0 | buf.append('\t'); |
463 | 0 | buf.append(uncompressedSize); |
464 | 0 | System.out.println(buf.toString()); |
465 | } | |
466 | 0 | } |
467 | ||
/**
 * Whether the book is blocked by Book, Chapter or Verse. Set once by the
 * constructor and supplied when the open-file state is requested.
 */
private final BlockType blockType;

/**
 * Size in bytes of one entry in the idx file: a 32-bit block number at
 * offset 0, a 32-bit verse start at offset 4 and a 16-bit verse size at
 * offset 8.
 */
private static final int IDX_ENTRY_SIZE = 10;

/**
 * Size in bytes of one entry in the comp file: three 32-bit values giving
 * the block start (offset 0), the compressed block size (offset 4) and the
 * uncompressed block size (offset 8).
 */
private static final int COMP_ENTRY_SIZE = 12;

/**
 * The log stream
 */
private static final Logger log = LoggerFactory.getLogger(ZVerseBackend.class);
487 | } |