1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2007
18   *     The copyright to this program is held by it's authors.
19   *
20   * ID: $Id: Bitap.java 2149 2011-04-09 15:51:45Z dmsmith $
21   */
22  package org.crosswire.common.diff;
23  
24  import java.util.HashMap;
25  import java.util.Map;
26  
27  /**
28   * An implementation of the Bitap algorithm for finding a "fuzzy" location of a
29   * match.
30   * 
31   * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br>
32   * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a>
33   * 
34   * @see gnu.lgpl.License for license details.<br>
35   *      The copyright to this program is held by it's authors.
36   * @author DM Smith [dmsmith555 at yahoo dot com]
37   */
38  public class Bitap implements Locator {
39      /**
40       * Locate the best instance of 'pattern' in 'text' near 'loc'.
41       * 
42       * @param text
43       *            The text to search
44       * @param pattern
45       *            The pattern to search for
46       * @param loc
47       *            The location to search around
48       */
49      public Bitap(String text, String pattern, int loc) {
50          this.text = text;
51          this.pattern = pattern;
52          this.loc = loc;
53          alphabet = new HashMap<Character, Integer>();
54      }
55  
56      /*
57       * (non-Javadoc)
58       * 
59       * @see org.crosswire.common.diff.Locator#maxPatternLength()
60       */
61      public int maxPatternLength() {
62          return MAXBITS;
63      }
64  
65      /*
66       * (non-Javadoc)
67       * 
68       * @see org.crosswire.common.diff.Locator#locate()
69       */
70      public int locate() {
71          // Initialize the alphabet.
72          alphabet();
73  
74          // Coerce the text length between reasonable maximums and minimums.
75          scoreTextLength = Math.max(text.length(), Bitap.minLength);
76          scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength);
77  
78          // Highest score beyond which we give up.
79          double scoreThreshold = Bitap.threshold;
80  
81          // Is there a nearby exact match? (speedup)
82          int bestLoc = text.indexOf(pattern, loc);
83          if (bestLoc != -1) {
84              scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
85          }
86  
87          // What about in the other direction? (speedup)
88          bestLoc = text.lastIndexOf(pattern, loc + pattern.length());
89          if (bestLoc != -1) {
90              scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
91          }
92  
93          // Initialize the bit arrays.
94          int matchmask = (int) Math.pow(2, pattern.length() - 1);
95  
96          bestLoc = -1;
97  
98          int binMin;
99          int binMid;
100         int binMax = Math.max(loc + loc, text.length());
101         int[] lastrd = new int[0];
102         for (int d = 0; d < pattern.length(); d++) {
103             // Scan for the best match; each iteration allows for one more
104             // error.
105             int[] rd = new int[text.length()];
106 
107             // Run a binary search to determine how far from 'loc' we can stray
108             // at this error level.
109             binMin = loc;
110             binMid = binMax;
111             while (binMin < binMid) {
112                 if (bitapScore(d, binMid) < scoreThreshold) {
113                     binMin = binMid;
114                 } else {
115                     binMax = binMid;
116                 }
117                 binMid = (binMax - binMin) / 2 + binMin;
118             }
119 
120             binMax = binMid; // Use the result from this iteration as the
121             // maximum for the next.
122             int start = Math.max(0, loc - (binMid - loc) - 1);
123             int finish = Math.min(text.length() - 1, pattern.length() + binMid);
124 
125             if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) {
126                 rd[finish] = (int) Math.pow(2, d + 1) - 1;
127             } else {
128                 rd[finish] = (int) Math.pow(2, d) - 1;
129             }
130 
131             for (int j = finish - 1; j >= start; j--) {
132                 Character curChar = Character.valueOf(text.charAt(j));
133                 int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0;
134                 if (d == 0) { // First pass: exact match.
135                     rd[j] = ((rd[j + 1] << 1) | 1) & mask;
136                 } else { // Subsequent passes: fuzzy match.
137                     rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1];
138                 }
139 
140                 if ((rd[j] & matchmask) != 0) {
141                     double score = bitapScore(d, j);
142                     // This match will almost certainly be better than any
143                     // existing match. But check anyway.
144                     if (score <= scoreThreshold) {
145                         // Told you so.
146                         scoreThreshold = score;
147                         bestLoc = j;
148                         if (j > loc) {
149                             // When passing loc, don't exceed our current
150                             // distance from loc.
151                             start = Math.max(0, loc - (j - loc));
152                         } else {
153                             // Already passed loc, downhill from here on in.
154                             break;
155                         }
156                     }
157                 }
158             }
159 
160             if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a
161             // (better) match at
162             // greater error
163             // levels.
164             {
165                 // No hope for a (better) match at greater error levels.
166                 break;
167             }
168 
169             lastrd = rd;
170         }
171 
172         return bestLoc;
173     }
174 
175     protected Map<Character, Integer> getAlphabet() {
176         return alphabet;
177     }
178 
179     /**
180      * Compute and return the score for a match with e errors and x location.
181      * 
182      * @param e
183      *            Number of errors in match
184      * @param x
185      *            Location of match
186      * @return Overall score for match
187      */
188     private double bitapScore(int e, int x) {
189         // Compute and return the score for a match with e errors and x
190         // location.
191         int d = Math.abs(loc - x);
192         return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance));
193     }
194 
195     // Initialize the alphabet for the Bitap algorithm.
196     protected void alphabet() {
197         int len = pattern.length();
198 
199         assert len <= Bitap.MAXBITS : "Pattern too long for this application.";
200 
201         for (int i = 0; i < len; i++) {
202             Character c = Character.valueOf(pattern.charAt(i));
203             Integer value = alphabet.get(c);
204             int mask = value == null ? 0 : value.intValue();
205             mask |= (int) Math.pow(2, len - i - 1);
206             alphabet.put(c, Integer.valueOf(mask));
207         }
208     }
209 
210     public static void setBalance(float newBalance) {
211         balance = newBalance;
212     }
213 
214     public static void setThreshold(float newThreshold) {
215         threshold = newThreshold;
216     }
217 
218     public static void setMinLength(int newMinLength) {
219         minLength = newMinLength;
220     }
221 
222     public static void setMaxLength(int newMaxLength) {
223         maxLength = newMaxLength;
224     }
225 
226     /**
227      * The maximum number of bits in an int. Change this to 64 if long is used
228      * by alphabet.
229      */
230     private static final int MAXBITS = 32;
231 
232     /**
233      * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity)
234      */
235     private static final float BALANCE = 0.5f;
236     private static float balance = BALANCE;
237 
238     /**
239      * At what point is no match declared (0.0 = perfection, 1.0 = very loose)
240      */
241     private static final float THRESHOLD = 0.5f;
242     private static float threshold = THRESHOLD;
243 
244     /**
245      * The min and max cutoffs used when computing text lengths.
246      */
247     private static final int MINLENGTH = 100;
248     private static int minLength = MINLENGTH;
249     private static final int MAXLENGTH = 1000;
250     private static int maxLength = MAXLENGTH;
251 
252     /**
253      * The text to search.
254      */
255     private String text;
256 
257     /**
258      * The pattern to find in the text.
259      */
260     private String pattern;
261 
262     /**
263      * The location in text to focus the search.
264      */
265     private int loc;
266 
267     /**
268      * The length of the string constrained between MINLENGHT and MAXLENGTH
269      */
270     private int scoreTextLength;
271 
272     /**
273      * Alphabet is the compiled representation of the pattern.
274      */
275     private Map<Character, Integer> alphabet;
276 }
277