1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2007 - 2016
18   *
19   */
20  package org.crosswire.common.diff;
21  
22  import java.util.HashMap;
23  import java.util.Map;
24  
25  /**
26   * An implementation of the Bitap algorithm for finding a "fuzzy" location of a
27   * match.
28   * 
29   * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br>
30   * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a>
31   * 
32   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
33   * @author DM Smith
34   */
35  public class Bitap implements Locator {
36      /**
37       * Locate the best instance of 'pattern' in 'text' near 'loc'.
38       * 
39       * @param text
40       *            The text to search
41       * @param pattern
42       *            The pattern to search for
43       * @param loc
44       *            The location to search around
45       */
46      public Bitap(String text, String pattern, int loc) {
47          this.text = text;
48          this.pattern = pattern;
49          this.loc = loc;
50          alphabet = new HashMap<Character, Integer>();
51      }
52  
53      /*
54       * (non-Javadoc)
55       * 
56       * @see org.crosswire.common.diff.Locator#maxPatternLength()
57       */
58      public int maxPatternLength() {
59          return MAXBITS;
60      }
61  
62      /*
63       * (non-Javadoc)
64       * 
65       * @see org.crosswire.common.diff.Locator#locate()
66       */
67      public int locate() {
68          // Initialize the alphabet.
69          alphabet();
70  
71          // Coerce the text length between reasonable maximums and minimums.
72          scoreTextLength = Math.max(text.length(), Bitap.minLength);
73          scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength);
74  
75          // Highest score beyond which we give up.
76          double scoreThreshold = Bitap.threshold;
77  
78          // Is there a nearby exact match? (speedup)
79          int bestLoc = text.indexOf(pattern, loc);
80          if (bestLoc != -1) {
81              scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
82          }
83  
84          // What about in the other direction? (speedup)
85          bestLoc = text.lastIndexOf(pattern, loc + pattern.length());
86          if (bestLoc != -1) {
87              scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
88          }
89  
90          // Initialize the bit arrays.
91          int matchmask = (int) Math.pow(2, pattern.length() - 1);
92  
93          bestLoc = -1;
94  
95          int binMin;
96          int binMid;
97          int binMax = Math.max(loc + loc, text.length());
98          int[] lastrd = new int[0];
99          for (int d = 0; d < pattern.length(); d++) {
100             // Scan for the best match; each iteration allows for one more
101             // error.
102             int[] rd = new int[text.length()];
103 
104             // Run a binary search to determine how far from 'loc' we can stray
105             // at this error level.
106             binMin = loc;
107             binMid = binMax;
108             while (binMin < binMid) {
109                 if (bitapScore(d, binMid) < scoreThreshold) {
110                     binMin = binMid;
111                 } else {
112                     binMax = binMid;
113                 }
114                 binMid = (binMax - binMin) / 2 + binMin;
115             }
116 
117             binMax = binMid; // Use the result from this iteration as the
118             // maximum for the next.
119             int start = Math.max(0, loc - (binMid - loc) - 1);
120             int finish = Math.min(text.length() - 1, pattern.length() + binMid);
121 
122             if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) {
123                 rd[finish] = (int) Math.pow(2, d + 1) - 1;
124             } else {
125                 rd[finish] = (int) Math.pow(2, d) - 1;
126             }
127 
128             for (int j = finish - 1; j >= start; j--) {
129                 Character curChar = Character.valueOf(text.charAt(j));
130                 int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0;
131                 if (d == 0) { // First pass: exact match.
132                     rd[j] = ((rd[j + 1] << 1) | 1) & mask;
133                 } else { // Subsequent passes: fuzzy match.
134                     rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1];
135                 }
136 
137                 if ((rd[j] & matchmask) != 0) {
138                     double score = bitapScore(d, j);
139                     // This match will almost certainly be better than any
140                     // existing match. But check anyway.
141                     if (score <= scoreThreshold) {
142                         // Told you so.
143                         scoreThreshold = score;
144                         bestLoc = j;
145                         if (j > loc) {
146                             // When passing loc, don't exceed our current
147                             // distance from loc.
148                             start = Math.max(0, loc - (j - loc));
149                         } else {
150                             // Already passed loc, downhill from here on in.
151                             break;
152                         }
153                     }
154                 }
155             }
156 
157             if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a
158             // (better) match at
159             // greater error
160             // levels.
161             {
162                 // No hope for a (better) match at greater error levels.
163                 break;
164             }
165 
166             lastrd = rd;
167         }
168 
169         return bestLoc;
170     }
171 
172     protected Map<Character, Integer> getAlphabet() {
173         return alphabet;
174     }
175 
176     /**
177      * Compute and return the score for a match with e errors and x location.
178      * 
179      * @param e
180      *            Number of errors in match
181      * @param x
182      *            Location of match
183      * @return Overall score for match
184      */
185     private double bitapScore(int e, int x) {
186         // Compute and return the score for a match with e errors and x
187         // location.
188         int d = Math.abs(loc - x);
189         return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance));
190     }
191 
192     // Initialize the alphabet for the Bitap algorithm.
193     protected void alphabet() {
194         int len = pattern.length();
195 
196         assert len <= Bitap.MAXBITS : "Pattern too long for this application.";
197 
198         for (int i = 0; i < len; i++) {
199             Character c = Character.valueOf(pattern.charAt(i));
200             Integer value = alphabet.get(c);
201             int mask = value == null ? 0 : value.intValue();
202             mask |= (int) Math.pow(2, len - i - 1);
203             alphabet.put(c, Integer.valueOf(mask));
204         }
205     }
206 
207     public static void setBalance(float newBalance) {
208         balance = newBalance;
209     }
210 
211     public static void setThreshold(float newThreshold) {
212         threshold = newThreshold;
213     }
214 
215     public static void setMinLength(int newMinLength) {
216         minLength = newMinLength;
217     }
218 
219     public static void setMaxLength(int newMaxLength) {
220         maxLength = newMaxLength;
221     }
222 
223     /**
224      * The maximum number of bits in an int. Change this to 64 if long is used
225      * by alphabet.
226      */
227     private static final int MAXBITS = 32;
228 
229     /**
230      * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity)
231      */
232     private static final float BALANCE = 0.5f;
233     private static float balance = BALANCE;
234 
235     /**
236      * At what point is no match declared (0.0 = perfection, 1.0 = very loose)
237      */
238     private static final float THRESHOLD = 0.5f;
239     private static float threshold = THRESHOLD;
240 
241     /**
242      * The min and max cutoffs used when computing text lengths.
243      */
244     private static final int MINLENGTH = 100;
245     private static int minLength = MINLENGTH;
246     private static final int MAXLENGTH = 1000;
247     private static int maxLength = MAXLENGTH;
248 
249     /**
250      * The text to search.
251      */
252     private String text;
253 
254     /**
255      * The pattern to find in the text.
256      */
257     private String pattern;
258 
259     /**
260      * The location in text to focus the search.
261      */
262     private int loc;
263 
264     /**
265      * The length of the string constrained between MINLENGHT and MAXLENGTH
266      */
267     private int scoreTextLength;
268 
269     /**
270      * Alphabet is the compiled representation of the pattern.
271      */
272     private Map<Character, Integer> alphabet;
273 }
274