| PersianLuceneAnalyzer.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 as published by
5 * the Free Software Foundation. This program is distributed in the hope
6 * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * Copyright: 2009
18 * The copyright to this program is held by its authors.
19 *
20 * ID: $Id: $
21 */
22 package org.crosswire.jsword.index.lucene.analysis;
23
24 import java.io.IOException;
25 import java.io.Reader;
26
27 import org.apache.lucene.analysis.LowerCaseFilter;
28 import org.apache.lucene.analysis.StopFilter;
29 import org.apache.lucene.analysis.TokenStream;
30 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
31 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
32 import org.apache.lucene.analysis.fa.PersianAnalyzer;
33 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
34 import org.apache.lucene.util.Version;
35
36 /**
37 * An Analyzer whose {@link TokenStream} is built from a
38 * {@link ArabicLetterTokenizer} filtered with {@link LowerCaseFilter},
39 * {@link ArabicNormalizationFilter}, {@link PersianNormalizationFilter} and
40 * Persian {@link StopFilter} (optional)
41 *
42 * @see gnu.lgpl.License for license details.<br>
43 * The copyright to this program is held by it's authors.
44 * @author DM Smith [dmsmith555 at yahoo dot com]
45 */
46 public class PersianLuceneAnalyzer extends AbstractBookAnalyzer {
47 public PersianLuceneAnalyzer() throws IOException {
48 stopSet = PersianAnalyzer.getDefaultStopSet();
49 }
50
51 /*
52 * (non-Javadoc)
53 *
54 * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String,
55 * java.io.Reader)
56 */
57 @Override
58 public final TokenStream tokenStream(String fieldName, Reader reader) {
59 TokenStream result = new ArabicLetterTokenizer(reader);
60 result = new LowerCaseFilter(result);
61 result = new ArabicNormalizationFilter(result);
62 /* additional persian-specific normalization */
63 result = new PersianNormalizationFilter(result);
64 /*
65 * the order here is important: the stop set is normalized with the
66 * above!
67 */
68 if (doStopWords && stopSet != null) {
69 result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
70 }
71
72 return result;
73 }
74
75 /**
76 * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
77 * text in the provided {@link Reader}.
78 *
79 * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
80 * filtered with {@link LowerCaseFilter},
81 * {@link ArabicNormalizationFilter},
82 * {@link PersianNormalizationFilter} and Persian Stop words
83 */
84 @Override
85 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
86 SavedStreams streams = (SavedStreams) getPreviousTokenStream();
87 if (streams == null) {
88 streams = new SavedStreams(new ArabicLetterTokenizer(reader));
89 streams.setResult(new LowerCaseFilter(streams.getResult()));
90 streams.setResult(new ArabicNormalizationFilter(streams.getResult()));
91 /* additional persian-specific normalization */
92 streams.setResult(new PersianNormalizationFilter(streams.getResult()));
93 /*
94 * the order here is important: the stop set is normalized with the
95 * above!
96 */
97 if (doStopWords && stopSet != null) {
98 streams.setResult(new StopFilter(false, streams.getResult(), stopSet));
99 }
100 setPreviousTokenStream(streams);
101 } else {
102 streams.getSource().reset(reader);
103 }
104 return streams.getResult();
105 }
106 private final Version matchVersion = Version.LUCENE_29;
107 }
108