1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
18   *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id:LuceneIndex.java 984 2006-01-23 14:18:33 -0500 (Mon, 23 Jan 2006) dmsmith $
21   */
22  package org.crosswire.jsword.index.lucene;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.net.URI;
27  import java.util.ArrayList;
28  import java.util.List;
29  
30  import org.apache.lucene.analysis.Analyzer;
31  import org.apache.lucene.document.Document;
32  import org.apache.lucene.document.Field;
33  import org.apache.lucene.index.IndexWriter;
34  import org.apache.lucene.queryParser.ParseException;
35  import org.apache.lucene.queryParser.QueryParser;
36  import org.apache.lucene.search.IndexSearcher;
37  import org.apache.lucene.search.Query;
38  import org.apache.lucene.search.ScoreDoc;
39  import org.apache.lucene.search.Searcher;
40  import org.apache.lucene.search.TopScoreDocCollector;
41  import org.apache.lucene.store.Directory;
42  import org.apache.lucene.store.FSDirectory;
43  import org.apache.lucene.store.RAMDirectory;
44  import org.apache.lucene.util.Version;
45  import org.crosswire.common.activate.Activatable;
46  import org.crosswire.common.activate.Activator;
47  import org.crosswire.common.activate.Lock;
48  import org.crosswire.common.progress.JobManager;
49  import org.crosswire.common.progress.Progress;
50  import org.crosswire.common.util.Logger;
51  import org.crosswire.common.util.NetUtil;
52  import org.crosswire.common.util.Reporter;
53  import org.crosswire.jsword.JSMsg;
54  import org.crosswire.jsword.book.Book;
55  import org.crosswire.jsword.book.BookData;
56  import org.crosswire.jsword.book.BookException;
57  import org.crosswire.jsword.book.FeatureType;
58  import org.crosswire.jsword.book.OSISUtil;
59  import org.crosswire.jsword.index.AbstractIndex;
60  import org.crosswire.jsword.index.IndexStatus;
61  import org.crosswire.jsword.index.lucene.analysis.LuceneAnalyzer;
62  import org.crosswire.jsword.index.search.SearchModifier;
63  import org.crosswire.jsword.passage.AbstractPassage;
64  import org.crosswire.jsword.passage.Key;
65  import org.crosswire.jsword.passage.NoSuchKeyException;
66  import org.crosswire.jsword.passage.NoSuchVerseException;
67  import org.crosswire.jsword.passage.PassageTally;
68  import org.crosswire.jsword.passage.VerseFactory;
69  import org.crosswire.jsword.versification.Versification;
70  import org.crosswire.jsword.versification.system.Versifications;
71  import org.jdom.Element;
72  
73  /**
74   * Implement the SearchEngine using Lucene as the search engine.
75   * 
76   * @see gnu.lgpl.License for license details.<br>
77   *      The copyright to this program is held by it's authors.
78   * @author Joe Walker [joe at eireneh dot com]
79   */
80  public class LuceneIndex extends AbstractIndex implements Activatable {
81      /*
82       * The following fields are named the same as Sword in the hopes of sharing
83       * indexes.
84       */
85      /**
86       * The Lucene field for the osisID
87       */
88      public static final String FIELD_KEY = "key";
89  
90      /**
91       * The Lucene field for the text contents
92       */
93      public static final String FIELD_BODY = "content";
94  
95      /**
96       * The Lucene field for the strong numbers
97       */
98      public static final String FIELD_STRONG = "strong";
99  
100     /**
101      * The Lucene field for headings
102      */
103     public static final String FIELD_HEADING = "heading";
104 
105     /**
106      * The Lucene field for cross references
107      */
108     public static final String FIELD_XREF = "xref";
109 
110     /**
111      * The Lucene field for the notes
112      */
113     public static final String FIELD_NOTE = "note";
114 
115     /**
116      * Read an existing index and use it.
117      * 
118      * @throws BookException
119      *             If we fail to read the index files
120      */
121     public LuceneIndex(Book book, URI storage) throws BookException {
122         this.book = book;
123 
124         try {
125             this.path = NetUtil.getAsFile(storage).getCanonicalPath();
126         } catch (IOException ex) {
127             // TRANSLATOR: Error condition: Could not initialize a search index.
128             throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex);
129         }
130     }
131 
132     /**
133      * Generate an index to use, telling the job about progress as you go.
134      * 
135      * @throws BookException
136      *             If we fail to read the index files
137      */
138     public LuceneIndex(Book book, URI storage, boolean create) throws BookException {
139         assert create;
140 
141         this.book = book;
142         File finalPath = null;
143         try {
144             finalPath = NetUtil.getAsFile(storage);
145             this.path = finalPath.getCanonicalPath();
146         } catch (IOException ex) {
147             // TRANSLATOR: Error condition: Could not initialize a search index. Lucene is the name of the search technology being used.
148             throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex);
149         }
150 
151         // TRANSLATOR: Progress label indicating the start of indexing. {0} is a placeholder for the book's short name.
152         String jobName = JSMsg.gettext("Creating index. Processing {0}", book.getInitials());
153         Progress job = JobManager.createJob(jobName, Thread.currentThread());
154         job.beginJob(jobName);
155 
156         IndexStatus finalStatus = IndexStatus.UNDONE;
157 
158         Analyzer analyzer = new LuceneAnalyzer(book);
159 
160         List<Key> errors = new ArrayList<Key>();
161         File tempPath = new File(path + '.' + IndexStatus.CREATING.toString());
162 
163         try {
164             synchronized (CREATING) {
165 
166                 book.setIndexStatus(IndexStatus.CREATING);
167 
168                 // An index is created by opening an IndexWriter with the create
169                 // argument set to true.
170                 // IndexWriter writer = new
171                 // IndexWriter(tempPath.getCanonicalPath(), analyzer, true);
172 
173                 // Create the index in core.
174                 final RAMDirectory ramDir = new RAMDirectory();
175                 IndexWriter writer = new IndexWriter(ramDir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
176 
177                 generateSearchIndexImpl(job, errors, writer, book.getGlobalKeyList(), 0);
178 
179                 // TRANSLATOR: Progress label for optimizing a search index. This may take a bit of time, so we have a label for it.
180                 job.setSectionName(JSMsg.gettext("Optimizing"));
181                 job.setWork(95);
182 
183                 // Consolidate the index into the minimum number of files.
184                 // writer.optimize(); /* Optimize is done by addIndexes */
185                 writer.close();
186 
187                 // Write the core index to disk.
188                 final Directory destination = FSDirectory.open(new File(tempPath.getCanonicalPath()));
189                 IndexWriter fsWriter = new IndexWriter(destination, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
190                 fsWriter.addIndexesNoOptimize(new Directory[] {
191                     ramDir
192                 });
193                 fsWriter.optimize();
194                 fsWriter.close();
195 
196                 // Free up the space used by the ram directory
197                 ramDir.close();
198 
199                 job.setCancelable(false);
200                 if (!job.isFinished()) {
201                     if (!tempPath.renameTo(finalPath)) {
202                         // TRANSLATOR: The search index could not be moved to it's final location.
203                         throw new BookException(JSMsg.gettext("Installation failed."));
204                     }
205                 }
206 
207                 if (finalPath.exists()) {
208                     finalStatus = IndexStatus.DONE;
209                 }
210 
211                 if (!errors.isEmpty()) {
212                     StringBuilder buf = new StringBuilder();
213                     for (Key error : errors) {
214                         buf.append(error);
215                         buf.append('\n');
216                     }
217                     // TRANSLATOR: It is likely that one or more verses could not be indexed due to errors in those verses.
218                     // This message gives a listing of them to the user.
219                     Reporter.informUser(this, JSMsg.gettext("The following verses have errors and could not be indexed\n{0}", buf));
220                 }
221 
222             }
223         } catch (IOException ex) {
224             job.cancel();
225             // TRANSLATOR: Common error condition: Some error happened while creating a search index.
226             throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex);
227         } finally {
228             book.setIndexStatus(finalStatus);
229             job.done();
230         }
231     }
232 
233     /*
234      * (non-Javadoc)
235      * 
236      * @see org.crosswire.jsword.index.search.Index#findWord(java.lang.String)
237      */
238     public Key find(String search) throws BookException {
239         checkActive();
240         String v11nName = book.getBookMetaData().getProperty("Versification").toString();
241         Versification v11n = Versifications.instance().getVersification(v11nName);
242 
243         SearchModifier modifier = getSearchModifier();
244         Key results = null;
245 
246         if (search != null) {
247             try {
248                 Analyzer analyzer = new LuceneAnalyzer(book);
249 
250                 QueryParser parser = new QueryParser(Version.LUCENE_29, LuceneIndex.FIELD_BODY, analyzer);
251                 parser.setAllowLeadingWildcard(true);
252                 Query query = parser.parse(search);
253                 log.info("ParsedQuery-" + query.toString());
254 
255                 // For ranking we use a PassageTally
256                 if (modifier != null && modifier.isRanked()) {
257                     PassageTally tally = new PassageTally(v11n);
258                     tally.raiseEventSuppresion();
259                     tally.raiseNormalizeProtection();
260                     results = tally;
261 
262                     TopScoreDocCollector collector = TopScoreDocCollector.create(modifier.getMaxResults(), false);
263                     searcher.search(query, collector);
264                     tally.setTotal(collector.getTotalHits());
265                     ScoreDoc[] hits = collector.topDocs().scoreDocs;
266                     for (int i = 0; i < hits.length; i++) {
267                         int docId = hits[i].doc;
268                         Document doc = searcher.doc(docId);
269                         Key key = VerseFactory.fromString(v11n, doc.get(LuceneIndex.FIELD_KEY));
270                         // PassageTally understands a score of 0 as the verse
271                         // not participating
272                         int score = (int) (hits[i].score * 100 + 1);
273                         tally.add(key, score);
274                     }
275                     tally.lowerNormalizeProtection();
276                     tally.lowerEventSuppresionAndTest();
277                 } else {
278                     results = book.createEmptyKeyList();
279                     // If we have an abstract passage,
280                     // make sure it does not try to fire change events.
281                     AbstractPassage passage = null;
282                     if (results instanceof AbstractPassage) {
283                         passage = (AbstractPassage) results;
284                         passage.raiseEventSuppresion();
285                         passage.raiseNormalizeProtection();
286                     }
287                     searcher.search(query, new VerseCollector(v11n, searcher, results));
288                     if (passage != null) {
289                         passage.lowerNormalizeProtection();
290                         passage.lowerEventSuppresionAndTest();
291                     }
292                 }
293             } catch (IOException e) {
294                 // The VerseCollector may throw IOExceptions that merely wrap a
295                 // NoSuchVerseException
296                 Throwable cause = e.getCause();
297                 if (cause instanceof NoSuchVerseException) {
298                     // TRANSLATOR: Error condition: An unexpected error happened that caused search to fail.
299                     throw new BookException(JSMsg.gettext("Search failed."), cause);
300                 }
301 
302                 // TRANSLATOR: Error condition: An unexpected error happened that caused search to fail.
303                 throw new BookException(JSMsg.gettext("Search failed."), e);
304             } catch (NoSuchVerseException e) {
305                 // TRANSLATOR: Error condition: An unexpected error happened that caused search to fail.
306                 throw new BookException(JSMsg.gettext("Search failed."), e);
307             } catch (ParseException e) {
308                 // TRANSLATOR: Error condition: An unexpected error happened that caused search to fail.
309                 throw new BookException(JSMsg.gettext("Search failed."), e);
310             } finally {
311                 Activator.deactivate(this);
312             }
313         }
314 
315         if (results == null) {
316             if (modifier != null && modifier.isRanked()) {
317                 results = new PassageTally(v11n);
318             } else {
319                 results = book.createEmptyKeyList();
320             }
321         }
322         return results;
323     }
324 
325     /*
326      * (non-Javadoc)
327      * 
328      * @see org.crosswire.jsword.index.search.Index#getKey(java.lang.String)
329      */
330     public Key getKey(String name) throws NoSuchKeyException {
331         return book.getKey(name);
332     }
333 
334     /*
335      * (non-Javadoc)
336      * 
337      * @see
338      * org.crosswire.common.activate.Activatable#activate(org.crosswire.common
339      * .activate.Lock)
340      */
341     public final void activate(Lock lock) {
342         try {
343             directory = FSDirectory.open(new File(path));
344             searcher = new IndexSearcher(directory, true);
345         } catch (IOException ex) {
346             log.warn("second load failure", ex);
347         }
348 
349         active = true;
350     }
351 
352     /*
353      * (non-Javadoc)
354      * 
355      * @see
356      * org.crosswire.common.activate.Activatable#deactivate(org.crosswire.common
357      * .activate.Lock)
358      */
359     public final void deactivate(Lock lock) {
360         try {
361             searcher.close();
362             directory.close();
363         } catch (IOException ex) {
364             Reporter.informUser(this, ex);
365         } finally {
366             searcher = null;
367             directory = null;
368         }
369 
370         active = false;
371     }
372 
373     /**
374      * Helper method so we can quickly activate ourselves on access
375      */
376     protected final void checkActive() {
377         if (!active) {
378             Activator.activate(this);
379         }
380     }
381 
382     /**
383      * Dig down into a Key indexing as we go.
384      */
385     private void generateSearchIndexImpl(Progress job, List<Key> errors, IndexWriter writer, Key key, int count) throws BookException, IOException {
386         String v11nName = book.getBookMetaData().getProperty("Versification").toString();
387         Versification v11n = Versifications.instance().getVersification(v11nName);
388         boolean hasStrongs = book.getBookMetaData().hasFeature(FeatureType.STRONGS_NUMBERS);
389         boolean hasXRefs = book.getBookMetaData().hasFeature(FeatureType.SCRIPTURE_REFERENCES);
390         boolean hasNotes = book.getBookMetaData().hasFeature(FeatureType.FOOTNOTES);
391         boolean hasHeadings = book.getBookMetaData().hasFeature(FeatureType.HEADINGS);
392 
393         String oldRootName = "";
394         int percent = 0;
395         String rootName = "";
396         BookData data = null;
397         Element osis = null;
398 
399         // Set up for reuse.
400         Document doc = new Document();
401         Field keyField = new Field(FIELD_KEY, "", Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO);
402         Field bodyField = new Field(FIELD_BODY, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
403         Field strongField = new Field(FIELD_STRONG, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
404         Field xrefField = new Field(FIELD_XREF, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
405         Field noteField = new Field(FIELD_NOTE, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
406         Field headingField = new Field(FIELD_HEADING, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
407 
408         int size = key.getCardinality();
409         int subCount = count;
410         for (Key subkey : key) {
411             if (subkey.canHaveChildren()) {
412                 generateSearchIndexImpl(job, errors, writer, subkey, subCount);
413             } else {
414                 data = new BookData(book, subkey);
415                 osis = null;
416 
417                 try {
418                     osis = data.getOsisFragment();
419                 } catch (BookException e) {
420                     errors.add(subkey);
421                     continue;
422                 }
423 
424                 // Remove all fields from the document
425                 doc.getFields().clear();
426 
427                 // Do the actual indexing
428                 // Always add the key
429                 keyField.setValue(subkey.getOsisRef());
430                 doc.add(keyField);
431 
432                 addField(doc, bodyField, OSISUtil.getCanonicalText(osis));
433 
434                 if (hasStrongs) {
435                     addField(doc, strongField, OSISUtil.getStrongsNumbers(osis));
436                 }
437 
438                 if (hasXRefs) {
439                     addField(doc, xrefField, OSISUtil.getReferences(v11n, osis));
440                 }
441 
442                 if (hasNotes) {
443                     addField(doc, noteField, OSISUtil.getNotes(osis));
444                 }
445 
446                 if (hasHeadings) {
447                     addField(doc, headingField, OSISUtil.getHeadings(osis));
448                 }
449 
450                 // Add the document if we added more than just the key.
451                 if (doc.getFields().size() > 1) {
452                     writer.addDocument(doc);
453                 }
454 
455                 // report progress
456                 rootName = subkey.getRootName();
457                 if (!rootName.equals(oldRootName)) {
458                     oldRootName = rootName;
459                     job.setSectionName(rootName);
460                 }
461 
462                 subCount++;
463                 int oldPercent = percent;
464                 percent = 95 * subCount / size;
465 
466                 if (oldPercent != percent) {
467                     job.setWork(percent);
468                 }
469 
470                 // This could take a long time ...
471                 Thread.yield();
472                 if (Thread.currentThread().isInterrupted()) {
473                     break;
474                 }
475             }
476         }
477     }
478 
479     private void addField(Document doc, Field field, String text) {
480         if (text != null && text.length() > 0) {
481             field.setValue(text);
482             doc.add(field);
483         }
484     }
485 
486     /**
487      * A synchronization lock point to prevent us from doing 2 index runs at a
488      * time.
489      */
490     private static final Object CREATING = new Object();
491 
492     /**
493      * Are we active
494      */
495     private boolean active;
496 
497     /**
498      * The log stream
499      */
500     private static final Logger log = Logger.getLogger(LuceneIndex.class);
501 
502     /**
503      * The Book that we are indexing
504      */
505     protected Book book;
506 
507     /**
508      * The location of this index
509      */
510     private String path;
511 
512     /**
513      * The Lucene directory for the path.
514      */
515     protected Directory directory;
516 
517     /**
518      * The Lucene search engine
519      */
520     protected Searcher searcher;
521 }
522