1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
package org.crosswire.jsword.index.lucene.analysis; |
21 | |
|
22 | |
import java.io.IOException; |
23 | |
import java.io.Reader; |
24 | |
import java.util.HashMap; |
25 | |
import java.util.Map; |
26 | |
import java.util.Set; |
27 | |
|
28 | |
import org.apache.lucene.analysis.LowerCaseTokenizer; |
29 | |
import org.apache.lucene.analysis.StopAnalyzer; |
30 | |
import org.apache.lucene.analysis.StopFilter; |
31 | |
import org.apache.lucene.analysis.TokenStream; |
32 | |
import org.apache.lucene.analysis.de.GermanAnalyzer; |
33 | |
import org.apache.lucene.analysis.fr.FrenchAnalyzer; |
34 | |
import org.apache.lucene.analysis.nl.DutchAnalyzer; |
35 | |
import org.apache.lucene.analysis.snowball.SnowballFilter; |
36 | |
import org.apache.lucene.util.Version; |
37 | |
import org.crosswire.jsword.book.Book; |
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
|
69 | |
|
70 | |
|
71 | |
|
72 | |
public class ConfigurableSnowballAnalyzer extends AbstractBookAnalyzer { |
73 | 0 | public ConfigurableSnowballAnalyzer() { |
74 | 0 | } |
75 | |
|
76 | |
|
77 | |
|
78 | |
|
79 | |
|
80 | |
@Override |
81 | |
public final TokenStream tokenStream(String fieldName, Reader reader) { |
82 | 0 | TokenStream result = new LowerCaseTokenizer(reader); |
83 | 0 | if (doStopWords && stopSet != null) { |
84 | 0 | result = new StopFilter(false, result, stopSet); |
85 | |
} |
86 | |
|
87 | |
|
88 | 0 | if (doStemming) { |
89 | 0 | result = new SnowballFilter(result, stemmerName); |
90 | |
} |
91 | |
|
92 | 0 | return result; |
93 | |
} |
94 | |
|
95 | |
|
96 | |
|
97 | |
|
98 | |
@Override |
99 | |
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { |
100 | 0 | SavedStreams streams = (SavedStreams) getPreviousTokenStream(); |
101 | 0 | if (streams == null) { |
102 | 0 | streams = new SavedStreams(new LowerCaseTokenizer(reader)); |
103 | 0 | if (doStopWords && stopSet != null) { |
104 | 0 | streams.setResult(new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), streams.getResult(), stopSet)); |
105 | |
} |
106 | |
|
107 | 0 | if (doStemming) { |
108 | 0 | streams.setResult(new SnowballFilter(streams.getResult(), stemmerName)); |
109 | |
} |
110 | |
|
111 | 0 | setPreviousTokenStream(streams); |
112 | |
} else { |
113 | 0 | streams.getSource().reset(reader); |
114 | |
} |
115 | 0 | return streams.getResult(); |
116 | |
} |
117 | |
|
118 | |
@Override |
119 | |
public void setBook(Book newBook) { |
120 | 0 | book = newBook; |
121 | 0 | stemmerName = null; |
122 | 0 | if (book != null) { |
123 | |
|
124 | 0 | pickStemmer(book.getLanguage().getCode()); |
125 | |
} |
126 | 0 | } |
127 | |
|
128 | |
|
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
public void pickStemmer(String languageCode) { |
134 | 0 | if (languageCode != null) { |
135 | |
|
136 | 0 | if (languageCodeToStemmerLanguageNameMap.containsKey(languageCode)) { |
137 | 0 | stemmerName = languageCodeToStemmerLanguageNameMap.get(languageCode); |
138 | |
} else { |
139 | 0 | throw new IllegalArgumentException("SnowballAnalyzer configured for unavailable stemmer " + stemmerName); |
140 | |
} |
141 | |
|
142 | |
|
143 | 0 | if (defaultStopWordMap.containsKey(languageCode)) { |
144 | 0 | stopSet = defaultStopWordMap.get(languageCode); |
145 | |
} |
146 | |
} |
147 | 0 | } |
148 | |
|
149 | |
|
150 | |
|
151 | |
|
152 | |
private String stemmerName; |
153 | |
|
154 | 0 | private static Map<String, String> languageCodeToStemmerLanguageNameMap = new HashMap<String, String>(); |
155 | |
static { |
156 | 0 | languageCodeToStemmerLanguageNameMap.put("da", "Danish"); |
157 | 0 | languageCodeToStemmerLanguageNameMap.put("nl", "Dutch"); |
158 | 0 | languageCodeToStemmerLanguageNameMap.put("en", "English"); |
159 | 0 | languageCodeToStemmerLanguageNameMap.put("fi", "Finnish"); |
160 | 0 | languageCodeToStemmerLanguageNameMap.put("fr", "French"); |
161 | 0 | languageCodeToStemmerLanguageNameMap.put("de", "German"); |
162 | 0 | languageCodeToStemmerLanguageNameMap.put("it", "Italian"); |
163 | 0 | languageCodeToStemmerLanguageNameMap.put("no", "Norwegian"); |
164 | 0 | languageCodeToStemmerLanguageNameMap.put("pt", "Portuguese"); |
165 | 0 | languageCodeToStemmerLanguageNameMap.put("ru", "Russian"); |
166 | 0 | languageCodeToStemmerLanguageNameMap.put("es", "Spanish"); |
167 | 0 | languageCodeToStemmerLanguageNameMap.put("sv", "Swedish"); |
168 | |
} |
169 | |
|
170 | |
|
171 | 0 | private static HashMap<String, Set<?>> defaultStopWordMap = new HashMap<String, Set<?>>(); |
172 | |
static { |
173 | 0 | defaultStopWordMap.put("fr", FrenchAnalyzer.getDefaultStopSet()); |
174 | 0 | defaultStopWordMap.put("de", GermanAnalyzer.getDefaultStopSet()); |
175 | 0 | defaultStopWordMap.put("nl", DutchAnalyzer.getDefaultStopSet()); |
176 | 0 | defaultStopWordMap.put("en", StopAnalyzer.ENGLISH_STOP_WORDS_SET); |
177 | 0 | } |
178 | |
|
179 | 0 | private final Version matchVersion = Version.LUCENE_29; |
180 | |
} |