[jsword-svn] jsword/java/limbo/org/crosswire/jsword/book/search/ser s

jswordcvs at crosswire.org jswordcvs at crosswire.org
Sat Oct 9 14:45:06 MST 2004


Update of /cvs/jsword/jsword/java/limbo/org/crosswire/jsword/book/search/ser
In directory www.crosswire.org:/tmp/cvs-serv30045/java/limbo/org/crosswire/jsword/book/search/ser

Added Files:
	Msg.properties Msg.java package.html SerIndexManager.java 
	SerIndex.java 
Log Message:
indexing updates

--- NEW FILE: package.html ---
<html>
<body>

<p>
  An implementation of SearchEngine that uses simple serialized files.
</p>

</body>
</html>

--- NEW FILE: Msg.properties ---
# The naming convention for the keys in the file is ClassName.MessageName
# Where ClassName is the name of the class using the property.
# When the resource is used by more than one class it should be the one
# with which the resource is most closely associated.
# The MessageName should be mixed case, with a leading capital.
# It should have no spaces or other punctuation (e.g. _, -, ', ...)

SerIndex.Indexing=Creating index. Processing {0}
SerIndex.Init=Failed to initialise serialized search index.
SerIndex.DeleteFailed=Failed to delete search index
SerIndex.WriteError=Write Error.
SerIndex.FindingWords=Finding Words ({0})
SerIndex.WritingWords=Writing Words ({0})
SerIndex.Saving=Saving Index

--- NEW FILE: Msg.java ---
package org.crosswire.jsword.book.search.ser;

import org.crosswire.common.util.MsgBase;

/**
 * Compile safe Msg resource settings.
 * 
 * <p><table border='1' cellPadding='3' cellSpacing='0'>
 * <tr><td bgColor='white' class='TableRowColor'><font size='-7'>
 *
 * Distribution Licence:<br />
 * JSword is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.<br />
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.<br />
 * The License is available on the internet
 * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, or by writing to:
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA<br />
 * The copyright to this program is held by its authors.
 * </font></td></tr></table>
 * @see gnu.gpl.Licence
 * @author Joe Walker [joe at eireneh dot com]
 * @version $Id: Msg.java,v 1.1 2004/10/09 21:45:04 joe Exp $
 */
class Msg extends MsgBase
{
    /**
     * Passthrough ctor, private so that the constants declared below are
     * the only instances.
     * @param name The key of the message in Msg.properties
     */
    private Msg(String name)
    {
        super(name);
    }

    // Each constant is bound to the ClassName.MessageName key of the
    // same message in Msg.properties

    /** Title of the progress job while an index is created */
    static final Msg INDEX_START = new Msg("SerIndex.Indexing"); //$NON-NLS-1$

    /** Reported when an index could not be set up */
    static final Msg SER_INIT = new Msg("SerIndex.Init"); //$NON-NLS-1$

    /** Reported when an index could not be deleted */
    static final Msg DELETE_FAILED = new Msg("SerIndex.DeleteFailed"); //$NON-NLS-1$

    /** Reported when the index files could not be written */
    static final Msg WRITE_ERROR = new Msg("SerIndex.WriteError"); //$NON-NLS-1$

    /** Progress text while the words in the book are collected */
    static final Msg FINDING_WORDS = new Msg("SerIndex.FindingWords"); //$NON-NLS-1$

    /** Progress text while the collected words are written out */
    static final Msg WRITING_WORDS = new Msg("SerIndex.WritingWords"); //$NON-NLS-1$

    /** Progress text while the word index file is saved */
    static final Msg SAVING = new Msg("SerIndex.Saving"); //$NON-NLS-1$
}

--- NEW FILE: SerIndex.java ---
package org.crosswire.jsword.book.search.ser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.crosswire.common.activate.Activatable;
import org.crosswire.common.activate.Activator;
import org.crosswire.common.activate.Lock;
import org.crosswire.common.progress.Job;
import org.crosswire.common.progress.JobManager;
import org.crosswire.common.util.FileUtil;
import org.crosswire.common.util.Logger;
import org.crosswire.common.util.NetUtil;
import org.crosswire.common.util.Reporter;
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.BookData;
import org.crosswire.jsword.book.BookException;
import org.crosswire.jsword.book.SentanceUtil;
import org.crosswire.jsword.book.search.Grammar;
import org.crosswire.jsword.book.search.Index;
import org.crosswire.jsword.book.search.Thesaurus;
import org.crosswire.jsword.passage.BibleInfo;
import org.crosswire.jsword.passage.Key;
import org.crosswire.jsword.passage.KeyUtil;
import org.crosswire.jsword.passage.NoSuchKeyException;
import org.crosswire.jsword.passage.Passage;
import org.crosswire.jsword.passage.PassageKeyFactory;
import org.crosswire.jsword.passage.Verse;

/**
 * A search engine - This is a stepping stone on the way to allowing use of
 * Lucene in place of our search engine.
 * 
 * <p><table border='1' cellPadding='3' cellSpacing='0'>
 * <tr><td bgColor='white' class='TableRowColor'><font size='-7'>
 *
 * Distribution Licence:<br />
 * JSword is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.<br />
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.<br />
 * The License is available on the internet
 * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, or by writing to:
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA<br />
 * The copyright to this program is held by its authors.
 * </font></td></tr></table>
 * @see gnu.gpl.Licence
 * @author Joe Walker [joe at eireneh dot com]
 * @version $Id: SerIndex.java,v 1.1 2004/10/09 21:45:04 joe Exp $
 */
public class SerIndex implements Index, Activatable, Thesaurus
{
    /**
     * Create a handle on the index for a book. This constructor only
     * remembers its arguments; no files are touched here.
     * @param newbook The book that this index belongs to
     * @param storage The location of the index files
     */
    public SerIndex(Book newbook, URL storage)
    {
        this.url = storage;
        this.book = newbook;
    }

    /**
     * Build a new index for a book, reporting progress through a Job
     * as the work proceeds.
     * @param book The book to index
     * @param storage The location in which to write the index files
     * @param create Expected to be true; this is only checked by an assertion
     * @throws BookException If anything at all goes wrong while indexing
     */
    public SerIndex(Book book, URL storage, boolean create) throws BookException
    {
        assert create;

        this.url = storage;
        this.book = book;

        String title = Msg.INDEX_START.toString();
        Job progress = JobManager.createJob(title, Thread.currentThread(), false);

        try
        {
            // The shared lock stops 2 index runs from overlapping
            synchronized (creating)
            {
                generateSearchIndex(progress);
            }
        }
        catch (Exception ex)
        {
            progress.ignoreTimings();
            throw new BookException(Msg.SER_INIT, ex);
        }
        finally
        {
            // The job is always finished off, whether indexing worked or not
            progress.done();
        }
    }

    /* (non-Javadoc)
     * @see org.crosswire.jsword.book.search.Index#getKey(java.lang.String)
     */
    public Key getKey(String name) throws NoSuchKeyException
    {
        // Interpreting the name is left entirely to the book being indexed
        Key parsed = book.getKey(name);
        return parsed;
    }

    /**
     * Find the indexed words that start with the root of the given word.
     * If the root reported by Grammar is not itself an indexed word then
     * the word is used as given instead.
     * @param word The word to find related words for
     * @return The matching words, as a view onto the keys of the index map
     */
    public Collection getSynonyms(String word)
    {
        checkActive();

        String root = Grammar.getRoot(word);

        // Check that the root is still a word. If not then we
        // use the full version. This catches misses like se is
        // the root of seed, and matches sea and so on ...
        Key ref = findWord(root);
        if (ref.isEmpty())
        {
            root = word;
        }

        // Search on the root chosen above. Previously the root was worked
        // out and then ignored in favour of the unmodified word.
        String start = root.toLowerCase();
        SortedMap submap = datamap.subMap(start, start + "\u9999"); //$NON-NLS-1$
        return submap.keySet();
    }

    /**
     * Look up the references recorded against a single word.
     * @param word The word to look up; it is lower-cased before the lookup
     * @return The references found, or an empty key list if the word is
     * null, is not in the index, or its data could not be read
     */
    public Key findWord(String word)
    {
        checkActive();

        if (word == null)
        {
            return book.createEmptyKeyList();
        }

        Section section = (Section) datamap.get(word.toLowerCase());
        if (section == null)
        {
            return book.createEmptyKeyList();
        }

        try
        {
            // Read blob. readFully() keeps reading until the buffer is
            // full and throws if the file ends first, whereas a plain
            // read() is allowed to return early with fewer bytes.
            byte[] blob = new byte[section.length];
            dataRaf.seek(section.offset);
            dataRaf.readFully(blob);

            // De-serialize
            return PassageKeyFactory.fromBinaryRepresentation(blob);
        }
        catch (Exception ex)
        {
            log.warn("Search failed on:"); //$NON-NLS-1$
            log.warn("  word=" + word); //$NON-NLS-1$
            log.warn("  offset=" + section.offset); //$NON-NLS-1$
            log.warn("  length=" + section.length); //$NON-NLS-1$
            Reporter.informUser(this, ex);

            return book.createEmptyKeyList();
        }
    }

    /**
     * Build the index in 3 steps: collect the references for every word
     * in the book, write those references to the data file, then save the
     * word index that says where each word's data lives.
     * @param job The job to report progress to
     * @throws BookException If the book can't be read or the files can't be written
     */
    public void generateSearchIndex(Job job) throws BookException
    {
        // create a word/passage hashmap
        Map matchmap = new HashMap();
        generateSearchIndexImpl(job, book.getGlobalKeyList(), matchmap);

        // For the progress listener
        int count = 0;
        int words = matchmap.size();

        // Now we need to write the words into our index
        try
        {
            NetUtil.makeDirectory(url);
            URL dataUrl = NetUtil.lengthenURL(url, FILE_DATA);
            dataRaf = new RandomAccessFile(NetUtil.getAsFile(dataUrl), FileUtil.MODE_WRITE);
        }
        catch (IOException ex)
        {
            throw new BookException(Msg.WRITE_ERROR, ex);
        }

        for (Iterator it = matchmap.entrySet().iterator(); it.hasNext(); )
        {
            Map.Entry entry = (Map.Entry) it.next();
            String word = (String) entry.getKey();
            Key match = (Key) entry.getValue();
            recordFoundPassage(word, match);

            // Fire a progress event. The write phase covers the range
            // PERCENT_READ to PERCENT_READ + PERCENT_WRITE in proportion
            // to the number of words written so far.
            int percent = PERCENT_READ + PERCENT_WRITE * count++ / words;
            job.setProgress(percent, Msg.WRITING_WORDS.toString(word));

            // This could take a long time ...
            Thread.yield();
            if (Thread.currentThread().isInterrupted())
            {
                break;
            }
        }

        // Store the indexes on disk
        PrintWriter indexout = null;
        try
        {
            job.setProgress(PERCENT_READ + PERCENT_WRITE, Msg.SAVING.toString());

            // Save the ascii Passage index, one word:offset:length per line
            URL indexurl = NetUtil.lengthenURL(url, FILE_INDEX);
            indexout = new PrintWriter(NetUtil.getOutputStream(indexurl));
            for (Iterator it = datamap.entrySet().iterator(); it.hasNext(); )
            {
                Map.Entry entry = (Map.Entry) it.next();
                Section section = (Section) entry.getValue();
                indexout.println(entry.getKey() + ":" + section.offset + ":" + section.length); //$NON-NLS-1$ //$NON-NLS-2$
            }

            // PrintWriter never throws on a failed write, so we have to ask
            if (indexout.checkError())
            {
                throw new IOException("Failed writing " + FILE_INDEX); //$NON-NLS-1$
            }
        }
        catch (IOException ex)
        {
            throw new BookException(Msg.WRITE_ERROR, ex);
        }
        finally
        {
            if (indexout != null)
            {
                indexout.close();
            }
        }
    }

    /**
     * Dig down into a Key indexing as we go. Keys that can have children
     * are recursed into, other keys have their text split into words and
     * are added to the references held against each of those words.
     * @param job The job to report progress to
     * @param key The key whose members are to be indexed
     * @param matchmap The map of lower case word to references to add to
     * @throws BookException If the text of the book can't be read
     */
    private void generateSearchIndexImpl(Job job, Key key, Map matchmap) throws BookException
    {
        // loop through all the verses

        int percent = 0;
        for (Iterator it = key.iterator(); it.hasNext(); )
        {
            Key sublist = (Key) it.next();
            if (sublist.canHaveChildren())
            {
                generateSearchIndexImpl(job, sublist, matchmap);
            }
            else
            {
                BookData data = book.getData(sublist);
                String text = data.getPlainText();

                String[] words = SentanceUtil.getWords(text);
                for (int i = 0; i < words.length; i++)
                {
                    // recordFoundPassage() ignores null words, so there
                    // is no point in collecting them
                    if (words[i] == null)
                    {
                        continue;
                    }

                    // The index is keyed on lower case words, so collect
                    // under the lower case form. Otherwise "The" and "the"
                    // get separate entries here and one overwrites the
                    // other when they are recorded.
                    String word = words[i].toLowerCase();

                    // ensure there is a Passage for this word in the word/passage hashmap
                    Key matches = (Key) matchmap.get(word);
                    if (matches == null)
                    {
                        matches = book.createEmptyKeyList();
                        matchmap.put(word, matches);
                    }

                    // add this verse to this words passage
                    matches.addAll(sublist);
                }

                // report progress
                if (sublist instanceof Passage)
                {
                    Verse verse = KeyUtil.getVerse(sublist);
                    percent = PERCENT_READ * verse.getOrdinal() / BibleInfo.versesInBible();
                }

                job.setProgress(percent, Msg.FINDING_WORDS.toString(sublist.getName()));

                // This could take a long time ...
                Thread.yield();
            }

            // Checked after recursing too, so that an interrupt unwinds
            // every level instead of just the innermost loop
            if (Thread.currentThread().isInterrupted())
            {
                break;
            }
        }
    }

    /**
     * Append the references for a word to the data file, and note where
     * they were put against the lower case form of the word.
     * @param word The word to write, null words are quietly skipped
     * @param key The references to the word
     * @throws BookException If the references can't be converted or written
     */
    private void recordFoundPassage(String word, Key key) throws BookException
    {
        if (word != null)
        {
            try
            {
                byte[] blob = PassageKeyFactory.toBinaryRepresentation(KeyUtil.getPassage(key));

                // Where the file pointer is now is where this blob will start
                long start = dataRaf.getFilePointer();
                dataRaf.write(blob);

                datamap.put(word.toLowerCase(), new Section(start, blob.length));
            }
            catch (Exception ex)
            {
                throw new BookException(Msg.WRITE_ERROR, ex);
            }
        }
    }

    /**
     * Open the data file for reading and load the word index, which has
     * one word:offset:length entry per line, into memory. Failures are
     * logged and not thrown, and the index is marked active regardless.
     * @param lock Not used by this implementation
     */
    public final void activate(Lock lock)
    {
        // Don't leak a handle left open by index generation or by an
        // earlier activation
        if (dataRaf != null)
        {
            try
            {
                dataRaf.close();
            }
            catch (IOException ex)
            {
                log.error("Failed to close previous data file", ex); //$NON-NLS-1$
            }
            dataRaf = null;
        }

        BufferedReader indexIn = null;
        try
        {
            URL dataUrl = NetUtil.lengthenURL(url, FILE_DATA);
            dataRaf = new RandomAccessFile(NetUtil.getAsFile(dataUrl), FileUtil.MODE_READ);

            URL indexUrl = NetUtil.lengthenURL(url, FILE_INDEX);
            indexIn = new BufferedReader(new InputStreamReader(indexUrl.openStream()));

            for (String line = indexIn.readLine(); line != null; line = indexIn.readLine())
            {
                // Split on the last 2 colons, so that a word that itself
                // contains a colon can still be read back
                int colon2 = line.lastIndexOf(':');
                int colon1 = (colon2 > 0) ? line.lastIndexOf(':', colon2 - 1) : -1;
                if (colon1 < 0)
                {
                    // Without this check substring() would throw out of here
                    log.warn("Malformed index line: " + line); //$NON-NLS-1$
                    continue;
                }

                try
                {
                    String word = line.substring(0, colon1);
                    long offset = Long.parseLong(line.substring(colon1 + 1, colon2));
                    int length = Integer.parseInt(line.substring(colon2 + 1));

                    datamap.put(word, new Section(offset, length));
                }
                catch (NumberFormatException ex)
                {
                    log.error("NumberFormatException reading line: " + line, ex); //$NON-NLS-1$
                }
            }
        }
        catch (IOException ex)
        {
            log.error("Read failed on indexin", ex); //$NON-NLS-1$
        }
        finally
        {
            // The index is fully in memory now, so the reader can go
            if (indexIn != null)
            {
                try
                {
                    indexIn.close();
                }
                catch (IOException ex)
                {
                    log.error("Failed to close indexin", ex); //$NON-NLS-1$
                }
            }
        }

        active = true;
    }

    /**
     * Forget the in-memory word index and close the data file.
     * @param lock Not used by this implementation
     */
    public final void deactivate(Lock lock)
    {
        datamap.clear();

        // Close the file and don't just drop the reference, or the
        // underlying file handle is leaked
        if (dataRaf != null)
        {
            try
            {
                dataRaf.close();
            }
            catch (IOException ex)
            {
                log.error("Failed to close data file", ex); //$NON-NLS-1$
            }
            dataRaf = null;
        }

        active = false;
    }

    /**
     * Helper method that asks the Activator to activate us if we are
     * not active already.
     */
    private final void checkActive()
    {
        if (active)
        {
            return;
        }

        Activator.activate(this);
    }

    /**
     * A synchronization lock point to prevent us from doing 2 index runs at a
     * time. It is static, so it is shared by every SerIndex.
     */
    private static final Object creating = new Object();

    /**
     * Are we active. Set by activate() and cleared by deactivate().
     */
    private boolean active = false;

    /**
     * The name of the data file, which holds the binary references
     */
    private static final String FILE_DATA = "ref.data"; //$NON-NLS-1$

    /**
     * The name of the index file, which holds a word:offset:length line
     * for each word
     */
    protected static final String FILE_INDEX = "ref.index"; //$NON-NLS-1$

    /**
     * The book we are indexing
     */
    protected Book book;

    /**
     * The location that holds the data and index files
     */
    private URL url;

    /**
     * The passages random access file. Opened for writing while an index
     * is generated and for reading when we are activated.
     */
    private RandomAccessFile dataRaf;

    /**
     * A sorted map from word to the Section of the passages file that holds
     * the references for that word
     */
    private SortedMap datamap = new TreeMap();

    /**
     * The log stream
     */
    private static final Logger log = Logger.getLogger(SerIndex.class);

    /**
     * The percentages of the progress bar taken up by the different parts
     * of index generation: reading the book and writing the data file.
     */
    private static final int PERCENT_READ = 60;
    private static final int PERCENT_WRITE = 39;
    // private static final int PERCENT_INDEX = 1;

    /**
     * A simple holder for the place in the passages random access file
     * where the data for one word is kept.
     */
    public static class Section
    {
        /** Where in the file the data starts */
        protected long offset;

        /** How many bytes of data there are */
        protected int length;

        /**
         * @param offset Where in the file the data starts
         * @param length How many bytes of data there are
         */
        protected Section(long offset, int length)
        {
            this.length = length;
            this.offset = offset;
        }
    }
}

--- NEW FILE: SerIndexManager.java ---
package org.crosswire.jsword.book.search.ser;

import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

import org.crosswire.common.util.Logger;
import org.crosswire.common.util.NetUtil;
import org.crosswire.common.util.Reporter;
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.BookException;
import org.crosswire.jsword.book.search.Index;
import org.crosswire.jsword.book.search.IndexManager;
import org.crosswire.jsword.util.Project;

/**
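 * An IndexManager that creates, caches and deletes SerIndex instances, which
 * it keeps in the "ser" area of the project's temporary scratch space.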
 * 
 * <p><table border='1' cellPadding='3' cellSpacing='0'>
 * <tr><td bgColor='white' class='TableRowColor'><font size='-7'>
 *
 * Distribution Licence:<br />
 * JSword is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.<br />
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.<br />
 * The License is available on the internet
 * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, or by writing to:
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA<br />
 * The copyright to this program is held by its authors.
 * </font></td></tr></table>
 * @see gnu.gpl.Licence
 * @author Joe Walker [joe at eireneh dot com]
 * @version $Id: SerIndexManager.java,v 1.1 2004/10/09 21:45:04 joe Exp $
 */
public class SerIndexManager implements IndexManager
{
    /**
     * Has an index been saved for this book? This is judged by whether
     * the index file exists in the storage area for the book.
     * @param book The book to check
     * @return true if the index file exists, false if it doesn't or if
     * the storage area could not be found
     */
    public boolean isIndexed(Book book)
    {
        try
        {
            URL storage = getStorageArea(book);
            URL longer = NetUtil.lengthenURL(storage, SerIndex.FILE_INDEX);
            return NetUtil.isFile(longer);
        }
        catch (IOException ex)
        {
            // This used to blame lucene, which has nothing to do with us
            log.error("Failed to find ser index storage area.", ex); //$NON-NLS-1$
            return false;
        }
    }

    /**
     * Get the index for a book. An index is created the first time that
     * a book is asked for, and the same one is handed back after that.
     * @param book The book to get an index for
     * @return The index for the book
     * @throws BookException If the storage area could not be found
     */
    public Index getIndex(Book book) throws BookException
    {
        try
        {
            // The map is a plain HashMap that is also written to from the
            // thread started by scheduleIndexCreation(), so it must not
            // be used without holding its lock
            synchronized (indexes)
            {
                Index reply = (Index) indexes.get(book);
                if (reply == null)
                {
                    URL storage = getStorageArea(book);
                    reply = new SerIndex(book, storage);
                    indexes.put(book, reply);
                }

                return reply;
            }
        }
        catch (IOException ex)
        {
            throw new BookException(Msg.SER_INIT, ex);
        }
    }

    /**
     * Start a new thread to create an index for a book. When the index is
     * complete it is stored so that getIndex() can find it. Any failure
     * is reported to the user from the new thread.
     * @param book The book to index
     */
    public void scheduleIndexCreation(final Book book)
    {
        Thread work = new Thread(new Runnable()
        {
            public void run()
            {
                try
                {
                    URL storage = getStorageArea(book);
                    Index index = new SerIndex(book, storage, true);

                    // We are on our own thread here and the map is a plain
                    // HashMap, so take its lock. The lock is not held for
                    // the indexing above because that can take ages.
                    synchronized (indexes)
                    {
                        indexes.put(book, index);
                    }
                }
                catch (Exception ex)
                {
                    Reporter.informUser(SerIndexManager.this, ex);
                }
            }
        });
        work.start();
    }

    /**
     * Delete the stored index for a book, and forget any index that we
     * have handed out for it.
     * @param book The book whose index is to be deleted
     * @throws BookException If the storage area can't be found or deleted
     */
    public void deleteIndex(Book book) throws BookException
    {
        try
        {
            // TODO(joe): This needs some checks that it isn't being used
            URL storage = getStorageArea(book);
            NetUtil.delete(storage);

            // Otherwise getIndex() would carry on handing out an index
            // whose files have just been deleted
            synchronized (indexes)
            {
                indexes.remove(book);
            }
        }
        catch (IOException ex)
        {
            throw new BookException(Msg.DELETE_FAILED, ex);
        }
    }

    /**
     * Work out where the index for a book lives. The answer is the ser
     * scratch area, plus the driver name of the book, plus its initials.
     * @param book The book to be indexed
     * @return A URL to store stuff in
     * @throws IOException If there is a problem in finding where to store stuff
     */
    protected URL getStorageArea(Book book) throws IOException
    {
        String driverDir = book.getBookMetaData().getDriverName();
        String initials = book.getBookMetaData().getInitials();

        assert driverDir != null;
        assert initials != null;

        URL scratch = Project.instance().getTempScratchSpace(DIR_SER, false);

        // scratch/driver/initials
        return NetUtil.lengthenURL(NetUtil.lengthenURL(scratch, driverDir), initials);
    }

    /**
     * The created indexes, keyed by the Book that they index. Being static
     * this is shared by every SerIndexManager.
     */
    protected static final Map indexes = new HashMap();

    /**
     * The ser search index directory, as asked for from the project's
     * temporary scratch space
     */
    private static final String DIR_SER = "ser"; //$NON-NLS-1$

    /**
     * The log stream
     */
    private static final Logger log = Logger.getLogger(SerIndexManager.class);
}



More information about the jsword-svn mailing list