1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2005 - 2016
18   *
19   */
20  package org.crosswire.jsword.book.sword;
21  
22  import java.io.IOException;
23  import java.io.ObjectInputStream;
24  import java.io.UnsupportedEncodingException;
25  import java.text.DecimalFormat;
26  import java.text.MessageFormat;
27  import java.util.Calendar;
28  import java.util.Date;
29  import java.util.GregorianCalendar;
30  import java.util.Locale;
31  import java.util.regex.Matcher;
32  import java.util.regex.Pattern;
33  
34  import org.crosswire.common.icu.DateFormatter;
35  import org.crosswire.common.util.StringUtil;
36  import org.crosswire.jsword.JSMsg;
37  import org.crosswire.jsword.book.BookCategory;
38  import org.crosswire.jsword.book.BookException;
39  import org.crosswire.jsword.book.BookMetaData;
40  import org.crosswire.jsword.book.FeatureType;
41  import org.crosswire.jsword.book.sword.state.OpenFileStateManager;
42  import org.crosswire.jsword.book.sword.state.RawLDBackendState;
43  import org.crosswire.jsword.passage.DefaultLeafKeyList;
44  import org.crosswire.jsword.passage.Key;
45  
46  /**
 * An implementation of AbstractKeyBackend that reads RAW format files.
48   * 
49   * @param <T> The type of the RawLDBackendState that this class extends.
50   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
51   * @author Joe Walker
52   * @author DM Smith
53   */
54  public class RawLDBackend<T extends RawLDBackendState> extends AbstractKeyBackend<RawLDBackendState> {
55      /**
56       * Simple ctor
57       * 
58       * @param sbmd 
59       * @param datasize
60       *            We need to know how many bytes in the size portion of the
61       *            index
62       */
63      public RawLDBackend(SwordBookMetaData sbmd, int datasize) {
64          super(sbmd);
65          this.datasize = datasize;
66          this.entrysize = OFFSETSIZE + datasize;
67      }
68  
69      public String readRawContent(RawLDBackendState state, Key key) throws IOException {
70          return doReadRawContent(state, key.getName());
71      }
72  
73      public RawLDBackendState initState() throws BookException {
74          return OpenFileStateManager.instance().getRawLDBackendState(getBookMetaData());
75      }
76  
77      private String doReadRawContent(RawLDBackendState state, String key) throws IOException {
78          if (key == null || key.length() == 0) {
79              return "";
80          }
81          int pos = search(state, key);
82          if (pos >= 0) {
83              DataIndex index = getIndex(state, pos);
84              DataEntry entry = getEntry(state, key, index);
85              entry = getEntry(state, entry);
86              if (entry.isLinkEntry()) {
87                  return doReadRawContent(state, entry.getLinkTarget());
88              }
89              return getRawText(entry);
90          }
91  
92          // TRANSLATOR: Error condition: Indicates that something could not
93          // be found in the book. {0} is a placeholder for the unknown key.
94          throw new IOException(JSMsg.gettext("Key not found {0}", key));
95      }
96  
97      protected String getRawText(DataEntry entry) {
98          String cipherKeyString = getBookMetaData().getProperty(SwordBookMetaData.KEY_CIPHER_KEY);
99          byte[] cipherKeyBytes = null;
100         if (cipherKeyString != null) {
101             try {
102                 cipherKeyBytes = cipherKeyString.getBytes(getBookMetaData().getBookCharset());
103             } catch (UnsupportedEncodingException e) {
104                 cipherKeyBytes = cipherKeyString.getBytes();
105             }
106         }
107         return entry.getRawText(cipherKeyBytes);
108     }
109 
110     /* (non-Javadoc)
111      * @see org.crosswire.jsword.passage.Key#getCardinality()
112      */
113     public int getCardinality() {
114         RawLDBackendState state = null;
115         try {
116             state = initState();
117 
118             if (state.getSize() == -1) {
119                 state.setSize((int) (state.getIdxRaf().length() / entrysize));
120             }
121             return state.getSize();
122         } catch (BookException e) {
123             return 0;
124         } catch (IOException e) {
125             return 0;
126         } finally {
127             OpenFileStateManager.instance().release(state);
128         }
129     }
130 
131     /* (non-Javadoc)
132      * @see org.crosswire.jsword.passage.Key#get(int)
133      */
134     public Key get(int index) {
135         RawLDBackendState state = null;
136         try {
137             state = initState();
138 
139             if (index < getCardinality()) {
140                 DataIndex dataIndex = getIndex(state, index);
141                 DataEntry entry = getEntry(state, getBookMetaData().getInitials(), dataIndex);
142                 String keytitle = internal2external(entry.getKey());
143                 return new DefaultLeafKeyList(keytitle);
144             }
145         } catch (BookException e) {
146             // This is allowed
147             // Fall through to throw an AIOOBE.
148         } catch (IOException e) {
149             // This is allowed
150             // Fall through to throw an AIOOBE.
151         } finally {
152             OpenFileStateManager.instance().release(state);
153         }
154         throw new ArrayIndexOutOfBoundsException(index);
155     }
156 
157     /* (non-Javadoc)
158      * @see org.crosswire.jsword.passage.Key#indexOf(org.crosswire.jsword.passage.Key)
159      */
160     public int indexOf(Key that) {
161         RawLDBackendState state = null;
162         try {
163             state = initState();
164             return search(state, that.getName());
165         } catch (IOException e) {
166             return -getCardinality() - 1;
167         } catch (BookException e) {
168             return -getCardinality() - 1;
169         } finally {
170             OpenFileStateManager.instance().release(state);
171         }
172     }
173 
174     /* (non-Javadoc)
175      * @see org.crosswire.jsword.book.sword.AbstractBackend#size(org.crosswire.jsword.passage.Key)
176      */
177     @Override
178     public int getRawTextLength(Key key) {
179         RawLDBackendState state = null;
180         try {
181             state = initState();
182             int entry = search(state, key.getName());
183             // Read the offset and size for this key from the index
184             byte[] buffer = SwordUtil.readRAF(state.getIdxRaf(), entry * entrysize, entrysize);
185             int entrySize = 0;
186             switch (datasize) {
187             case 2:
188                 entrySize = SwordUtil.decodeLittleEndian16(buffer, 4);
189                 break;
190             case 4:
191                 entrySize = SwordUtil.decodeLittleEndian32(buffer, 4);
192                 break;
193             default:
194                 assert false : datasize;
195             }
196 
197             return entrySize;
198         } catch (IOException e) {
199             return 0;
200         } catch (BookException e) {
201             return 0;
202         } finally {
203             OpenFileStateManager.instance().release(state);
204         }
205     }
206 
207     /**
208      * Get the Index (that is offset and size) for an entry.
209      * 
210      * @param entry
211      * @return the index of the entry
212      * @throws IOException
213      */
214     protected DataIndex getIndex(RawLDBackendState state, long entry) throws IOException {
215         // Read the offset and size for this key from the index
216         byte[] buffer = SwordUtil.readRAF(state.getIdxRaf(), entry * entrysize, entrysize);
217         int entryOffset = SwordUtil.decodeLittleEndian32(buffer, 0);
218         int entrySize = -1;
219         switch (datasize) {
220         case 2:
221             entrySize = SwordUtil.decodeLittleEndian16(buffer, 4);
222             break;
223         case 4:
224             entrySize = SwordUtil.decodeLittleEndian32(buffer, 4);
225             break;
226         default:
227             assert false : datasize;
228         }
229         return new DataIndex(entryOffset, entrySize);
230     }
231 
232     /**
233      * Get the text for an indexed entry in the book.
234      * 
235      * @param state
236      *            the state object for the storage
237      * @param reply
238      *            the context for this dataIndex, used for debugging
239      * @param dataIndex
240      *            the entry to get
241      * @return the text for the entry.
242      * @throws IOException
243      */
244     private DataEntry getEntry(RawLDBackendState state, String reply, DataIndex dataIndex) throws IOException {
245 //        DataIndex dataIndex = getIndex(state, index);
246         // Now read the data file for this key using the offset and size
247         byte[] data = SwordUtil.readRAF(state.getDatRaf(), dataIndex.getOffset(), dataIndex.getSize());
248         return new DataEntry(reply, data, getBookMetaData().getBookCharset());
249     }
250 
251     /**
252      * Get the entry indicated by this entry. If this entry doesn't indicate any other entry
253      * then it returns the entry. Note, this is used by compressed dictionaries to get the deeper stuff.
254      * 
255      * @param state the state where the entry can be found
256      * @param entry the entry that might indicate a deeper entry
257      * @return the final entry
258      */
259     protected DataEntry getEntry(RawLDBackendState state, DataEntry entry) {
260         return entry;
261     }
262 
263     /**
264      * Find a matching entry, returning it's index. Otherwise return &lt; 0, such
265      * that (-pos - 1) gives the insertion index.
266      * 
267      * @param key
268      * @return the match
269      * @throws IOException
270      */
271     private int search(RawLDBackendState state, String key) throws IOException {
272         // Note: In some dictionaries, the first element is out of order and
273         // represents the title of the work.
274         // So, do the bin search from 1 to end and if not found, check the
275         // first element as a special case.
276         // If that does not match return the position found otherwise.
277 
278         // Initialize to one beyond both ends.
279         int total = getCardinality();
280         int low = 0;
281         int high = total;
282         int match = -1;
283         DataIndex dataIndex = null;
284 
285         String suppliedKey = null;
286         while (high - low > 1) {
287             // use >>> to keep mid always in range
288             int mid = (low + high) >>> 1;
289 
290             // Get the key for the item at "mid"
291             dataIndex = getIndex(state, mid);
292             // Occasionally there's a bogus index entry (size == 0)
293             // in the middle of the index. It needs to be skipped.
294             while (dataIndex.getSize() == 0) {
295                 // reset mid toward the longer end
296                 mid += high - mid > mid - low ? 1 : -1;
297                 // ensure that we are in bounds.
298                 if (mid < low || mid > high) {
299                     break;
300                 }
301                 dataIndex = getIndex(state, mid);
302             }
303             String entryKey = normalizeForSearch(getEntry(state, key, dataIndex).getKey());
304             // Normalize the key based upon the first entry looked at.
305             if (suppliedKey == null) {
306                 suppliedKey = normalizeForSearch(external2internal(key, entryKey));
307             }
308             int cmp = entryKey.compareTo(suppliedKey);
309             if (cmp < 0) {
310                 low = mid;
311             } else if (cmp > 0) {
312                 high = mid;
313             } else {
314                 match = mid;
315                 break;
316             }
317         }
318 
319         // Do we have an exact match?
320         if (match >= 0) {
321             return match;
322         }
323 
324         // Many dictionaries have an introductory entry, so check it for a match.
325         dataIndex = getIndex(state, 0);
326         String entryKey = normalizeForSearch(getEntry(state, key, dataIndex).getKey());
327         if (suppliedKey == null) {
328             suppliedKey = normalizeForSearch(external2internal(key, entryKey));
329         }
330         if (entryKey.compareTo(suppliedKey) == 0) {
331             return 0;
332         }
333 
334         // It wasn't found so see if it is present in a linear search if case sensitive keys are used.
335         if ("true".equalsIgnoreCase(getBookMetaData().getProperty(SwordBookMetaData.KEY_CASE_SENSITIVE_KEYS))) {
336            for (int i = 0; i < total; i++) {
337                dataIndex = getIndex(state, i);
338                if (getEntry(state, key, dataIndex).getKey().compareTo(key) == 0) {
339                    return i;
340                }
341            }
342         }
343 
344         return -(high + 1);
345     }
346 
347     /**
348      * Convert the supplied key to something that can be understood by the module.
349      * Use firstKey to determine the pattern for Strong's numbers.
350      * 
351      * @param externalKey The external key to normalize
352      * @param pattern The first non-introduction key in the module.
353      * @return the internal representation of the key.
354      */
355     private String external2internal(String externalKey, String pattern) {
356         if (externalKey.length() == 0) {
357             return externalKey;
358         }
359         BookMetaData bmd = getBookMetaData();
360         String keytitle = externalKey;
361         if (BookCategory.DAILY_DEVOTIONS.equals(bmd.getBookCategory())) {
362             // Is it already in internal format? If so, just return it.
363             Matcher m = DEVOTION_PATTERN.matcher(keytitle);
364             if (m.matches()) {
365                 return keytitle;
366             }
367             Calendar greg = new GregorianCalendar();
368             DateFormatter nameDF = DateFormatter.getDateInstance();
369             nameDF.setLenient(true);
370             Date date = nameDF.parse(keytitle);
371             greg.setTime(date);
372             Object[] objs = {
373                     Integer.valueOf(1 + greg.get(Calendar.MONTH)), Integer.valueOf(greg.get(Calendar.DATE))
374             };
375             return DATE_KEY_FORMAT.format(objs);
376         }
377 
378         if (bmd.hasFeature(FeatureType.GREEK_DEFINITIONS) || bmd.hasFeature(FeatureType.HEBREW_DEFINITIONS)) {
379             // Is the string valid?
380             Matcher m = STRONGS_PATTERN.matcher(keytitle);
381             if (!m.matches()) {
382                 return keytitle;
383             }
384             if ("true".equalsIgnoreCase(bmd.getProperty(SwordBookMetaData.KEY_STRONGS_PADDING))) {
385                 // pad to 4 digits
386                 // NASB has trailing letters!
387                 int pos = keytitle.length() - 1;
388                 char lastLetter = keytitle.charAt(pos);
389                 boolean hasTrailingLetter = Character.isLetter(lastLetter);
390                 if (hasTrailingLetter) {
391                     keytitle = keytitle.substring(0, pos);
392                     // And it might be preceded by a !
393                     pos--;
394                     if (pos > 0 && keytitle.charAt(pos) == '!') {
395                         keytitle = keytitle.substring(0, pos);
396                     }
397                 }
398 
399                 // Get the G or the H.
400                 char type = keytitle.charAt(0);
401 
402                 // Get the number after the G or H
403                 int strongsNumber = Integer.parseInt(keytitle.substring(1));
404                 // If it has both Greek and Hebrew, then the G and H are needed.
405                 StringBuilder buf = new StringBuilder();
406                 if (bmd.hasFeature(FeatureType.GREEK_DEFINITIONS) && bmd.hasFeature(FeatureType.HEBREW_DEFINITIONS)) {
407                     // The convention is that a Strong's dictionary with both Greek
408                     // and Hebrew have G or H prefix
409                     buf.append(type);
410                     buf.append(getZero4Pad().format(strongsNumber));
411 
412                     // The NAS lexicon has some entries that end in A-Z, but it is
413                     // not preceded by a !
414                     if (hasTrailingLetter && "naslex".equalsIgnoreCase(bmd.getInitials())) {
415                         buf.append(lastLetter);
416                     }
417                     return buf.toString();
418                 }
419 
420                 m = STRONGS_PATTERN.matcher(pattern);
421                 if (m.matches()) {
422                     buf.append(type);
423                     int numLength = m.group(2).length();
424                     if (numLength == 4) {
425                         buf.append(getZero4Pad().format(strongsNumber));
426                     } else {
427                         buf.append(getZero5Pad().format(strongsNumber));
428                     }
429                     // The NAS lexicon has some entries that end in A-Z, but it is
430                     // not preceded by a !
431                     if (hasTrailingLetter && "naslex".equalsIgnoreCase(bmd.getInitials())) {
432                         buf.append(lastLetter);
433                     }
434                     return buf.toString();
435                 }
436 
437                 // It is just the number
438                 return getZero5Pad().format(strongsNumber);
439             }
440             // else unpad, E.g. G0001 to G1
441             // This test is merely an optimization to prevent unnecessary work.
442             if (keytitle.charAt(1) == '0') {
443                 char type = keytitle.charAt(0);
444                 // NASB has trailing letters!
445                 int pos = keytitle.length() - 1;
446                 char lastLetter = keytitle.charAt(pos);
447                 boolean hasTrailingLetter = Character.isLetter(lastLetter);
448                 if (hasTrailingLetter) {
449                     keytitle = keytitle.substring(0, pos);
450                     // And it might be preceded by a !
451                     pos--;
452                     if (pos > 0 && keytitle.charAt(pos) == '!') {
453                         keytitle = keytitle.substring(0, pos);
454                     }
455                 }
456                 // Get the number after the G or H
457                 int strongsNumber = Integer.parseInt(keytitle.substring(1));
458                 // The convention is that a Strong's dictionary with both Greek
459                 // and Hebrew have G or H prefix
460                 StringBuilder buf = new StringBuilder();
461                 buf.append(type);
462                 buf.append(strongsNumber);
463 
464                 // The NAS lexicon has some entries that end in A-Z, but it is
465                 // not preceded by a !
466                 if (hasTrailingLetter && "naslex".equalsIgnoreCase(bmd.getInitials())) {
467                     buf.append(lastLetter);
468                 }
469             }
470         }
471         return keytitle;
472     }
473 
474     private String internal2external(String internalKey) {
475         BookMetaData bmd = getBookMetaData();
476         String keytitle = internalKey;
477         if (BookCategory.DAILY_DEVOTIONS.equals(bmd.getBookCategory()) && keytitle.length() >= 3) {
478             Calendar greg = new GregorianCalendar();
479             DateFormatter nameDF = DateFormatter.getDateInstance();
480             String[] spec = StringUtil.splitAll(keytitle, '.');
481             greg.set(Calendar.MONTH, Integer.parseInt(spec[0]) - 1);
482             greg.set(Calendar.DATE, Integer.parseInt(spec[1]));
483             keytitle = nameDF.format(greg.getTime());
484         }
485         return keytitle;
486     }
487 
488     private String normalizeForSearch(String internalKey) {
489         BookMetaData bmd = getBookMetaData();
490         String keytitle = internalKey;
491         String caseSensitive = bmd.getProperty(SwordBookMetaData.KEY_CASE_SENSITIVE_KEYS);
492         if (!"true".equalsIgnoreCase(caseSensitive) && !BookCategory.DAILY_DEVOTIONS.equals(bmd.getBookCategory())) {
493             return keytitle.toUpperCase(Locale.US);
494         }
495 
496         return keytitle;
497     }
498 
499     /**
500      * A means to normalize Strong's Numbers.
501      */
502     private DecimalFormat getZero5Pad() {
503         return new DecimalFormat("00000");
504     }
505 
506     /**
507      * A means to normalize Strong's Numbers.
508      */
509     private DecimalFormat getZero4Pad() {
510         return new DecimalFormat("0000");
511     }
512 
513     /**
514      * Serialization support.
515      * 
516      * @param is
517      * @throws IOException
518      * @throws ClassNotFoundException
519      */
520     private void readObject(ObjectInputStream is) throws IOException, ClassNotFoundException {
521         is.defaultReadObject();
522     }
523 
524     /** 
525      * Experimental code.
526      */
527     public void dumpIdxRaf() {
528         RawLDBackendState state = null;
529         long end = -1;
530         try {
531             state = initState();
532             end = getCardinality();
533             StringBuilder buf = new StringBuilder();
534             System.out.println("index\toffset\tsize\tkey\tvalue");
535             for (long i = 0; i < end; ++i) {
536                 DataIndex index = getIndex(state, i);
537                 int offset = index.getOffset();
538                 int size   = index.getSize();
539                 buf.setLength(0);
540                 buf.append(i);
541                 buf.append('\t');
542                 buf.append(offset);
543                 buf.append('\t');
544                 buf.append(size);
545                 if (size > 0) {
546                     // Now read the data file for this key using the offset and size
547                     byte[] data = SwordUtil.readRAF(state.getDatRaf(), offset, size);
548                     DataEntry entry = new DataEntry(Long.toString(i), data, getBookMetaData().getBookCharset());
549                     String key = entry.getKey();
550                     String raw = getRawText(entry);
551                     buf.append('\t');
552                     buf.append(key);
553                     buf.append('\t');
554                     if (raw.length() > 43) {
555                         buf.append(raw.substring(0, 40).replace('\n', ' '));
556                         buf.append("...");
557                     } else {
558                         buf.append(raw.replace('\n', ' '));
559                     }
560                 } else {
561                     buf.append("\t\t");
562                 }
563                 System.out.println(buf.toString());
564             }
565         } catch (IOException e) {
566             // TODO Auto-generated catch block
567             e.printStackTrace();
568         } catch (BookException e) {
569             // TODO Auto-generated catch block
570             e.printStackTrace();
571         } finally {
572             OpenFileStateManager.instance().release(state);
573         }
574     }
575 
576     /** 
577      * Experimental code.
578      */
579     public void toIMP() {
580         RawLDBackendState state = null;
581         long end = -1;
582         try {
583             state = initState();
584             end = getCardinality();
585             StringBuilder buf = new StringBuilder();
586             for (long i = 0; i < end; ++i) {
587                 DataIndex index = getIndex(state, i);
588                 int offset = index.getOffset();
589                 int size   = index.getSize();
590                 buf.setLength(0);
591                 buf.append("$$$");
592                 if (size > 0) {
593                     // Now read the data file for this key using the offset and size
594                     byte[] data = SwordUtil.readRAF(state.getDatRaf(), offset, size);
595                     DataEntry entry = new DataEntry(Long.toString(i), data, getBookMetaData().getBookCharset());
596                     String key = entry.getKey();
597                     String raw = getRawText(entry);
598                     buf.append(key);
599                     buf.append("\n");
600                     buf.append(raw);
601                 }
602                 System.out.println(buf.toString());
603             }
604         } catch (IOException e) {
605             // TODO Auto-generated catch block
606             e.printStackTrace();
607         } catch (BookException e) {
608             // TODO Auto-generated catch block
609             e.printStackTrace();
610         } finally {
611             OpenFileStateManager.instance().release(state);
612         }
613     }
614     /**
615      * Date formatter
616      */
617     private static final MessageFormat DATE_KEY_FORMAT = new MessageFormat("{0,number,00}.{1,number,00}");
618 
619     /**
620      * This is the pattern of a Strong's Number. It begins with a G or H. Is
621      * followed by a number. It can be followed by a ! and a letter or just a
622      * letter.
623      */
624     private static final Pattern STRONGS_PATTERN = Pattern.compile("^([GH])(\\d+)((!)?([a-z])?)$");
625     /**
626      * This is the pattern of a date for a DailyDevotion, DD.MM
627      */
628     private static final Pattern DEVOTION_PATTERN = Pattern.compile("^\\d\\d\\.\\d\\d$");
629 
630     /**
631      * The number of bytes in the size count in the index
632      */
633     private final int datasize;
634 
635     /**
636      * The number of bytes for each entry in the index: either 6 or 8
637      */
638     private final int entrysize;
639 
640     /**
641      * How many bytes in the offset pointers in the index
642      */
643     private static final int OFFSETSIZE = 4;
644 
645     /**
646      * Serialization ID
647      */
648     private static final long serialVersionUID = 818089833394450383L;
649 }
650