[jsword-svn] r1318 - in trunk/jsword/src/main/java/org/crosswire/jsword: book book/sword index/lucene

dmsmith at www.crosswire.org dmsmith at www.crosswire.org
Sun May 6 08:36:36 MST 2007


Author: dmsmith
Date: 2007-05-06 08:36:35 -0700 (Sun, 06 May 2007)
New Revision: 1318

Added:
   trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java
Modified:
   trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java
   trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java
   trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java
   trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java
   trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java
Log:
Added the ability to index Strong's Numbers and cross references.

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java	2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java	2007-05-06 15:36:35 UTC (rev 1318)
@@ -88,6 +88,24 @@
     }
 
     /**
+     * Return just the Strong's numbers.
+     * @return The Book's Strong's numbers as a space separated string.
+     */
+    public String getStrongsNumbers()
+    {
+        return OSISUtil.getStrongsNumbers(getOsis());
+    }
+
+    /**
+     * Return just the scripture references in the book.
+     * @return The Book's scripture references
+     */
+    public String getReferences()
+    {
+        return OSISUtil.getReferences(getOsis());
+    }
+
+    /**
      * Check that a BibleData is valid.
      * Currently, this does nothing, and isn't used. it was broken when we used
      * JAXB, however it wasn't much use then becuase JAXB did a lot to keep the

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java	2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java	2007-05-06 15:36:35 UTC (rev 1318)
@@ -52,7 +52,6 @@
      */
     public static final FeatureType HEBREW_PARSE = new FeatureType("HebrewParse"); //$NON-NLS-1$
 
-
     /**
      * The book is one of Daily Devotions. 
      */
@@ -64,8 +63,38 @@
     public static final FeatureType GLOSSARY = new FeatureType("Glossary"); //$NON-NLS-1$
 
     /**
-     * @param name The name of the BookCategory
+     * The book contains Strong's Numbers
      */
+    public static final FeatureType STRONGS_NUMBERS = new FeatureType("StrongsNumbers"); //$NON-NLS-1$
+
+    /**
+     * The book contains footnotes
+     */
+    public static final FeatureType FOOTNOTES = new FeatureType("Footnotes"); //$NON-NLS-1$
+
+    /**
+     * The book contains Scripture cross references
+     */
+    public static final FeatureType SCRIPTURE_REFERENCES = new FeatureType("Scripref"); //$NON-NLS-1$
+
+    /**
+     * The book marks the Words of Christ
+     */
+    public static final FeatureType WORDS_OF_CHRIST = new FeatureType("RedLetterText"); //$NON-NLS-1$
+
+    /**
+     * The book contains Morphology info
+     */
+    public static final FeatureType MORPHOLOGY = new FeatureType("Morph"); //$NON-NLS-1$
+
+    /**
+     * The book contains Headings
+     */
+    public static final FeatureType HEADINGS = new FeatureType("Headings"); //$NON-NLS-1$
+
+    /**
+     * @param name The name of the FeatureType
+     */
     private FeatureType(String name)
     {
         this.name = name;
@@ -106,7 +135,7 @@
     }
 
     /**
-     * The name of the BookCategory
+     * The name of the FeatureType
      */
     private String name;
 
@@ -127,6 +156,12 @@
         HEBREW_PARSE,
         DAILY_DEVOTIONS,
         GLOSSARY,
+        STRONGS_NUMBERS,
+        FOOTNOTES,
+        SCRIPTURE_REFERENCES,
+        WORDS_OF_CHRIST,
+        MORPHOLOGY,
+        HEADINGS,
     };
 
     /**

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java	2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java	2007-05-06 15:36:35 UTC (rev 1318)
@@ -28,9 +28,15 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.crosswire.common.util.Logger;
+import org.crosswire.jsword.passage.Key;
+import org.crosswire.jsword.passage.KeyFactory;
+import org.crosswire.jsword.passage.NoSuchKeyException;
 import org.crosswire.jsword.passage.NoSuchVerseException;
+import org.crosswire.jsword.passage.PassageKeyFactory;
 import org.crosswire.jsword.passage.Verse;
 import org.crosswire.jsword.passage.VerseFactory;
 import org.jdom.Content;
@@ -617,6 +623,86 @@
         return buffer.toString().trim();
     }
 
+    /**
+     * A space separated string containing Strong's numbers.
+     * @return The Strong's numbers in the text
+     */
+    public static String getStrongsNumbers(Element root)
+    {
+        StringBuffer buffer = new StringBuffer();
+
+        Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_W).iterator();
+        while (contentIter.hasNext())
+        {
+            Element ele = (Element) contentIter.next();
+            String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
+            if (attr != null)
+            {
+                if (buffer.length() > 0)
+                {
+                    buffer.append(' ');
+                }
+
+                buffer.append(attr);
+            }
+        }
+
+        String lemmas = buffer.toString();
+
+        // Clear out the buffer for re-use
+        int len = buffer.length();
+        if (len > 0)
+        {
+            buffer.delete(0, len);
+        }
+
+        Matcher matcher = strongsNumberPattern.matcher(lemmas);
+        while (matcher.find())
+        {
+            String strongType = matcher.group(1);
+            String strongsNum = matcher.group(2);
+            if (buffer.length() > 0)
+            {
+                buffer.append(' ');
+            }
+            buffer.append(strongType);
+            buffer.append(strongsNum);
+        }
+         
+        return buffer.toString().trim();
+    }
+
+    /**
+     * A space separated string containing the osisIDs from the reference elements.
+     * @return The references in the text
+     */
+    public static String getReferences(Element root)
+    {
+        KeyFactory keyf = PassageKeyFactory.instance();
+        Key collector = keyf.createEmptyKeyList();
+
+        Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE).iterator();
+        while (contentIter.hasNext())
+        {
+            Element ele = (Element) contentIter.next();
+            String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
+            if (attr != null)
+            {
+                try
+                {
+                    Key key = keyf.getKey(attr);
+                    collector.addAll(key);
+                }
+                catch (NoSuchKeyException e)
+                {
+                    log.warn("Unable to parse: " + attr, e); //$NON-NLS-1$
+                }
+            }
+        }
+
+        return collector.getOsisID();
+    }
+
     private static void getCanonicalContent(String sName, String sID, Iterator iter, StringBuffer buffer)
     {
         Object data = null;
@@ -697,6 +783,7 @@
                 throw new BookException(Msg.OSIS_BADID, ex, new Object[] { osisid });
             }
         }
+
         // So we just walk up the tree trying to find a verse
         Parent parent = ele.getParent();
         if (parent instanceof Element)
@@ -742,11 +829,17 @@
             reply.add(start);
         }
 
+        Object data = null;
+        Element ele = null;
         Iterator contentIter = start.getContent().iterator();
         while (contentIter.hasNext())
         {
-            Element ele = (Element) contentIter.next();
-            recurseDeepContent(ele, name, reply);
+            data = contentIter.next();
+            if (data instanceof Element)
+            {
+                ele = (Element) data;
+                recurseDeepContent(ele, name, reply);
+            }
         }
     }
 
@@ -785,4 +878,7 @@
             recurseElement(sub, buffer);
         }
     }
+
+    private static String strongsNumber = "strong:([GH])0*([0-9]+)"; //$NON-NLS-1$
+    private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
 }

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java	2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java	2007-05-06 15:36:35 UTC (rev 1318)
@@ -263,8 +263,22 @@
      */
     /* @Override */
     public boolean hasFeature(FeatureType feature)
-    {
-        return cet.match(ConfigEntryType.FEATURE, feature.toString());
+    {        
+        if (cet.match(ConfigEntryType.FEATURE, feature.toString()))
+        {
+            return true;
+        }
+        // Many "features" are GlobalOptionFilters, which in the Sword C++ API
+        // indicate a class to use for filtering.
+        // These mostly have the source type prepended to the feature
+        StringBuffer buffer = new StringBuffer(getProperty(ConfigEntryType.SOURCE_TYPE));
+        buffer.append(feature);
+        if (cet.match(ConfigEntryType.GLOBAL_OPTION_FILTER, buffer.toString()))
+        {
+            return true;
+        }
+        // But some do not
+        return cet.match(ConfigEntryType.GLOBAL_OPTION_FILTER, feature.toString());
     }
 
     private void buildProperties()

Added: trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java	                        (rev 0)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java	2007-05-06 15:36:35 UTC (rev 1318)
@@ -0,0 +1,45 @@
+package org.crosswire.jsword.index.lucene;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+public class LuceneAnalyzer extends Analyzer
+{
+
+    public LuceneAnalyzer()
+    {
+    }
+
+    public TokenStream tokenStream(String fieldName, Reader reader)
+    {
+        // do not tokenize keys
+        if (LuceneIndex.FIELD_KEY.equals(fieldName))
+        {
+            return KEYWORD.tokenStream(fieldName, reader);
+        }
+        // Split Strong's Numbers on whitespace
+        else if (LuceneIndex.FIELD_STRONG.equals(fieldName))
+        {
+            return WHITESPACE.tokenStream(fieldName, reader);
+        }
+        // Split xrefs on whitespace
+        else if (LuceneIndex.FIELD_XREF.equals(fieldName))
+        {
+            return WHITESPACE.tokenStream(fieldName, reader);
+        }
+        // for every other field use the SimpleAnalyzer
+        else
+        {
+            return SIMPLE.tokenStream(fieldName, reader);
+        }
+    }
+
+    private static final Analyzer KEYWORD = new KeywordAnalyzer();
+    private static final Analyzer WHITESPACE = new WhitespaceAnalyzer();
+    private static final Analyzer SIMPLE = new SimpleAnalyzer();
+}

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java	2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java	2007-05-06 15:36:35 UTC (rev 1318)
@@ -30,7 +30,6 @@
 import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
@@ -53,6 +52,7 @@
 import org.crosswire.jsword.book.Book;
 import org.crosswire.jsword.book.BookData;
 import org.crosswire.jsword.book.BookException;
+import org.crosswire.jsword.book.FeatureType;
 import org.crosswire.jsword.index.AbstractIndex;
 import org.crosswire.jsword.index.IndexStatus;
 import org.crosswire.jsword.index.search.SearchModifier;
@@ -114,7 +114,7 @@
         Progress job = JobManager.createJob(Msg.INDEX_START.toString(), Thread.currentThread(), false);
 
         IndexStatus finalStatus = IndexStatus.UNDONE;
-        Analyzer analyzer = new SimpleAnalyzer();
+        Analyzer analyzer = new LuceneAnalyzer();
         List errors = new ArrayList();
         File tempPath = new File(path + '.' + IndexStatus.CREATING.toString());
 
@@ -202,7 +202,7 @@
             try
             {
 
-                Analyzer analyzer = new SimpleAnalyzer();
+                Analyzer analyzer = new LuceneAnalyzer();
                 QueryParser parser = new QueryParser(LuceneIndex.FIELD_BODY, analyzer);
                 Query query = parser.parse(search);
                 Hits hits = searcher.search(query);
@@ -216,7 +216,7 @@
                     results = tally;
                     for (int i = 0; i < hits.length(); i++)
                     {
-                        Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_NAME));
+                        Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_KEY));
                         // PassageTally understands a score of 0 as the verse not participating
                         int score = (int) (hits.score(i) * 100 + 1);
                         tally.add(key, score);
@@ -238,7 +238,7 @@
                     }
                     for (int i = 0; i < hits.length(); i++)
                     {
-                        Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_NAME));
+                        Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_KEY));
                         results.addAll(key);
                     }
                     if (passage != null)
@@ -339,10 +339,15 @@
      */
     private void generateSearchIndexImpl(Progress job, List errors, IndexWriter writer, Key key, int count) throws BookException, IOException
     {
+        boolean hasStrongs = book.getBookMetaData().hasFeature(FeatureType.STRONGS_NUMBERS);
+        boolean hasXRefs = book.getBookMetaData().hasFeature(FeatureType.SCRIPTURE_REFERENCES);
+
         String oldRootName = ""; //$NON-NLS-1$
         int percent = 0;
         String rootName = ""; //$NON-NLS-1$
         String text = ""; //$NON-NLS-1$
+        String strongs = ""; //$NON-NLS-1$
+        String xrefs = ""; //$NON-NLS-1$
         BookData data = null;
         Key subkey = null;
         Document doc = null;
@@ -369,13 +374,36 @@
                 }
 
                 text = data.getCanonicalText();
+                if (hasStrongs)
+                {
+                    strongs = data.getStrongsNumbers();
+                }
 
                 // Do the actual indexing
                 if (text != null && text.length() > 0)
                 {
                     doc = new Document();
-                    doc.add(new Field(FIELD_NAME, subkey.getOsisRef(), Field.Store.YES, Field.Index.NO));
+                    doc.add(new Field(FIELD_KEY, subkey.getOsisRef(), Field.Store.YES, Field.Index.UN_TOKENIZED));
                     doc.add(new Field(FIELD_BODY, new StringReader(text)));
+
+                    if (hasStrongs)
+                    {
+                        strongs = data.getStrongsNumbers();
+                        if (strongs != null && strongs.length() > 0)
+                        {
+                            doc.add(new Field(FIELD_STRONG, strongs, Field.Store.NO, Field.Index.TOKENIZED));
+                        }
+                    }
+
+                    if (hasXRefs)
+                    {
+                        xrefs = data.getReferences();
+                        if (xrefs != null && xrefs.length() > 0)
+                        {
+                            doc.add(new Field(FIELD_XREF, xrefs, Field.Store.NO, Field.Index.TOKENIZED));
+                        }
+                    }
+
                     writer.addDocument(doc);
                 }
 
@@ -423,7 +451,7 @@
     /**
      * The Lucene field for the osisID
      */
-    protected static final String FIELD_NAME = "key"; //$NON-NLS-1$
+    protected static final String FIELD_KEY = "key"; //$NON-NLS-1$
 
     /**
      * The Lucene field for the text contents
@@ -436,6 +464,16 @@
     protected static final String FIELD_STRONG = "strong"; //$NON-NLS-1$
 
     /**
+     * The Lucene field for cross references
+     */
+    protected static final String FIELD_XREF = "xref"; //$NON-NLS-1$
+
+    /**
+     * The Lucene field for notes
+     */
+    protected static final String FIELD_NOTES = "note"; //$NON-NLS-1$
+
+    /**
      * The Book that we are indexing
      */
     protected Book book;




More information about the jsword-svn mailing list