1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2007
18   *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id:  $
21   */
22  package org.crosswire.jsword.index.lucene.analysis;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.util.HashMap;
27  import java.util.Set;
28  import java.util.regex.Pattern;
29  
30  import org.apache.lucene.analysis.LowerCaseTokenizer;
31  import org.apache.lucene.analysis.PorterStemFilter;
32  import org.apache.lucene.analysis.StopAnalyzer;
33  import org.apache.lucene.analysis.StopFilter;
34  import org.apache.lucene.analysis.TokenStream;
35  import org.apache.lucene.analysis.de.GermanAnalyzer;
36  import org.apache.lucene.analysis.fr.FrenchAnalyzer;
37  import org.apache.lucene.analysis.nl.DutchAnalyzer;
38  import org.apache.lucene.analysis.snowball.SnowballFilter;
39  import org.apache.lucene.util.Version;
40  import org.crosswire.jsword.book.Book;
41  
42  /**
43   * An Analyzer whose {@link TokenStream} is built from a
44   * {@link LowerCaseTokenizer} filtered with {@link SnowballFilter} (optional)
45   * and {@link StopFilter} (optional) Default behavior: Stemming is done, Stop
46   * words not removed A snowball stemmer is configured according to the language
47   * of the Book. Currently it takes following stemmer names (available stemmers
48   * in lucene snowball package net.sf.snowball.ext)
49   * 
50   * <pre>
51   *     Danish
52   *     Dutch
53   *     English
54   *     Finnish
55   *     French
56   *     German2
57   *     German
58   *     Italian
59   *     Kp
60   *     Lovins
61   *     Norwegian
62   *     Porter
63   *     Portuguese
64   *     Russian
65   *     Spanish
66   *     Swedish
67   * </pre>
68   * 
69   * This list is expected to expand, as and when Snowball project support more
70   * languages
71   * 
72   * @see gnu.lgpl.License for license details.<br>
73   *      The copyright to this program is held by it's authors.
74   * @author sijo cherian [sijocherian at yahoo dot com]
75   */
76  public class ConfigurableSnowballAnalyzer extends AbstractBookAnalyzer {
77      public ConfigurableSnowballAnalyzer() {
78      }
79  
80      /**
81       * Filters {@link LowerCaseTokenizer} with {@link StopFilter} if enabled and
82       * {@link SnowballFilter}.
83       */
84      @Override
85      public final TokenStream tokenStream(String fieldName, Reader reader) {
86          TokenStream result = new LowerCaseTokenizer(reader);
87          if (doStopWords && stopSet != null) {
88              result = new StopFilter(false, result, stopSet);
89          }
90  
91          // Configure Snowball filter based on language/stemmerName
92          if (doStemming) {
93              result = new SnowballFilter(result, stemmerName);
94          }
95  
96          return result;
97      }
98  
99      /* (non-Javadoc)
100      * @see org.apache.lucene.analysis.Analyzer#reusableTokenStream(java.lang.String, java.io.Reader)
101      */
102     @Override
103     public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
104         SavedStreams streams = (SavedStreams) getPreviousTokenStream();
105         if (streams == null) {
106             streams = new SavedStreams(new LowerCaseTokenizer(reader));
107             if (doStopWords && stopSet != null) {
108                 streams.setResult(new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), streams.getResult(), stopSet));
109             }
110 
111             if (doStemming) {
112                 streams.setResult(new PorterStemFilter(streams.getResult()));
113             }
114 
115             setPreviousTokenStream(streams);
116         } else {
117             streams.getSource().reset(reader);
118         }
119         return streams.getResult();
120     }
121 
122     @Override
123     public void setBook(Book newBook) {
124         book = newBook;
125         stemmerName = null;
126         if (book != null) {
127             // stemmer name are same as language name, in most cases
128             pickStemmer(book.getLanguage().getName());
129         }
130     }
131 
132     /**
133      * Given the name of a stemmer, use that one.
134      * 
135      * @param language
136      */
137     public void pickStemmer(String language) {
138         stemmerName = language;
139         if (stemmerName != null) {
140             // Check for allowed stemmers
141             if (!allowedStemmers.matcher(stemmerName).matches()) {
142                 throw new IllegalArgumentException("SnowballAnalyzer configured for unavailable stemmer " + stemmerName);
143             }
144 
145             // Initialize the default stop words
146             if (defaultStopWordMap.containsKey(stemmerName)) {
147                 stopSet = defaultStopWordMap.get(stemmerName);
148             }
149         }
150     }
151 
152     /**
153      * The name of the stemmer to use.
154      */
155     private String stemmerName;
156 
157     private static Pattern allowedStemmers = Pattern
158             .compile("(Danish|Dutch|English|Finnish|French|German2|German|Italian|Kp|Lovins|Norwegian|Porter|Portuguese|Russian|Spanish|Swedish)");
159 
160     // Maps StemmerName > String array of standard stop words
161     private static HashMap<String, Set<?>> defaultStopWordMap = new HashMap<String, Set<?>>();
162     static {
163         defaultStopWordMap.put("French", FrenchAnalyzer.getDefaultStopSet());
164         defaultStopWordMap.put("German", GermanAnalyzer.getDefaultStopSet());
165         defaultStopWordMap.put("German2", GermanAnalyzer.getDefaultStopSet());
166         defaultStopWordMap.put("Dutch", DutchAnalyzer.getDefaultStopSet());
167         defaultStopWordMap.put("English", StopAnalyzer.ENGLISH_STOP_WORDS_SET);
168         defaultStopWordMap.put("Porter", StopAnalyzer.ENGLISH_STOP_WORDS_SET);
169     }
170 
171     private final Version matchVersion = Version.LUCENE_29;
172 }
173