1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2009
18   *     The copyright to this program is held by it's authors.
19   *
20   * ID: $Id:  $
21   */
22  package org.crosswire.jsword.index.lucene.analysis;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  
27  import org.apache.lucene.analysis.LowerCaseFilter;
28  import org.apache.lucene.analysis.StopFilter;
29  import org.apache.lucene.analysis.TokenStream;
30  import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
31  import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
32  import org.apache.lucene.analysis.fa.PersianAnalyzer;
33  import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
34  import org.apache.lucene.util.Version;
35  
36  /**
37   * An Analyzer whose {@link TokenStream} is built from a
38   * {@link ArabicLetterTokenizer} filtered with {@link LowerCaseFilter},
39   * {@link ArabicNormalizationFilter}, {@link PersianNormalizationFilter} and
40   * Persian {@link StopFilter} (optional)
41   * 
42   * @see gnu.lgpl.License for license details.<br>
43   *      The copyright to this program is held by it's authors.
44   * @author DM Smith [dmsmith555 at yahoo dot com]
45   */
46  public class PersianLuceneAnalyzer extends AbstractBookAnalyzer {
47      public PersianLuceneAnalyzer() throws IOException {
48          stopSet = PersianAnalyzer.getDefaultStopSet();
49      }
50  
51      /*
52       * (non-Javadoc)
53       * 
54       * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String,
55       * java.io.Reader)
56       */
57      @Override
58      public final TokenStream tokenStream(String fieldName, Reader reader) {
59          TokenStream result = new ArabicLetterTokenizer(reader);
60          result = new LowerCaseFilter(result);
61          result = new ArabicNormalizationFilter(result);
62          /* additional persian-specific normalization */
63          result = new PersianNormalizationFilter(result);
64          /*
65           * the order here is important: the stop set is normalized with the
66           * above!
67           */
68          if (doStopWords && stopSet != null) {
69              result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
70          }
71  
72          return result;
73      }
74  
75      /**
76       * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
77       * text in the provided {@link Reader}.
78       * 
79       * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
80       *         filtered with {@link LowerCaseFilter},
81       *         {@link ArabicNormalizationFilter},
82       *         {@link PersianNormalizationFilter} and Persian Stop words
83       */
84      @Override
85      public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
86          SavedStreams streams = (SavedStreams) getPreviousTokenStream();
87          if (streams == null) {
88              streams = new SavedStreams(new ArabicLetterTokenizer(reader));
89              streams.setResult(new LowerCaseFilter(streams.getResult()));
90              streams.setResult(new ArabicNormalizationFilter(streams.getResult()));
91              /* additional persian-specific normalization */
92              streams.setResult(new PersianNormalizationFilter(streams.getResult()));
93              /*
94               * the order here is important: the stop set is normalized with the
95               * above!
96               */
97              if (doStopWords && stopSet != null) {
98                  streams.setResult(new StopFilter(false, streams.getResult(), stopSet));
99              }
100             setPreviousTokenStream(streams);
101         } else {
102             streams.getSource().reset(reader);
103         }
104         return streams.getResult();
105     }
106     private final Version matchVersion = Version.LUCENE_29;
107 }
108