| Bitap.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 or later
5 * as published by the Free Software Foundation. This program is distributed
6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * © CrossWire Bible Society, 2007 - 2016
18 *
19 */
20 package org.crosswire.common.diff;
21
22 import java.util.HashMap;
23 import java.util.Map;
24
25 /**
26 * An implementation of the Bitap algorithm for finding a "fuzzy" location of a
27 * match.
28 *
29 * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br>
30 * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a>
31 *
32 * @see gnu.lgpl.License The GNU Lesser General Public License for details.
33 * @author DM Smith
34 */
35 public class Bitap implements Locator {
36 /**
37 * Locate the best instance of 'pattern' in 'text' near 'loc'.
38 *
39 * @param text
40 * The text to search
41 * @param pattern
42 * The pattern to search for
43 * @param loc
44 * The location to search around
45 */
46 public Bitap(String text, String pattern, int loc) {
47 this.text = text;
48 this.pattern = pattern;
49 this.loc = loc;
50 alphabet = new HashMap<Character, Integer>();
51 }
52
53 /*
54 * (non-Javadoc)
55 *
56 * @see org.crosswire.common.diff.Locator#maxPatternLength()
57 */
58 public int maxPatternLength() {
59 return MAXBITS;
60 }
61
62 /*
63 * (non-Javadoc)
64 *
65 * @see org.crosswire.common.diff.Locator#locate()
66 */
67 public int locate() {
68 // Initialize the alphabet.
69 alphabet();
70
71 // Coerce the text length between reasonable maximums and minimums.
72 scoreTextLength = Math.max(text.length(), Bitap.minLength);
73 scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength);
74
75 // Highest score beyond which we give up.
76 double scoreThreshold = Bitap.threshold;
77
78 // Is there a nearby exact match? (speedup)
79 int bestLoc = text.indexOf(pattern, loc);
80 if (bestLoc != -1) {
81 scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
82 }
83
84 // What about in the other direction? (speedup)
85 bestLoc = text.lastIndexOf(pattern, loc + pattern.length());
86 if (bestLoc != -1) {
87 scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
88 }
89
90 // Initialize the bit arrays.
91 int matchmask = (int) Math.pow(2, pattern.length() - 1);
92
93 bestLoc = -1;
94
95 int binMin;
96 int binMid;
97 int binMax = Math.max(loc + loc, text.length());
98 int[] lastrd = new int[0];
99 for (int d = 0; d < pattern.length(); d++) {
100 // Scan for the best match; each iteration allows for one more
101 // error.
102 int[] rd = new int[text.length()];
103
104 // Run a binary search to determine how far from 'loc' we can stray
105 // at this error level.
106 binMin = loc;
107 binMid = binMax;
108 while (binMin < binMid) {
109 if (bitapScore(d, binMid) < scoreThreshold) {
110 binMin = binMid;
111 } else {
112 binMax = binMid;
113 }
114 binMid = (binMax - binMin) / 2 + binMin;
115 }
116
117 binMax = binMid; // Use the result from this iteration as the
118 // maximum for the next.
119 int start = Math.max(0, loc - (binMid - loc) - 1);
120 int finish = Math.min(text.length() - 1, pattern.length() + binMid);
121
122 if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) {
123 rd[finish] = (int) Math.pow(2, d + 1) - 1;
124 } else {
125 rd[finish] = (int) Math.pow(2, d) - 1;
126 }
127
128 for (int j = finish - 1; j >= start; j--) {
129 Character curChar = Character.valueOf(text.charAt(j));
130 int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0;
131 if (d == 0) { // First pass: exact match.
132 rd[j] = ((rd[j + 1] << 1) | 1) & mask;
133 } else { // Subsequent passes: fuzzy match.
134 rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1];
135 }
136
137 if ((rd[j] & matchmask) != 0) {
138 double score = bitapScore(d, j);
139 // This match will almost certainly be better than any
140 // existing match. But check anyway.
141 if (score <= scoreThreshold) {
142 // Told you so.
143 scoreThreshold = score;
144 bestLoc = j;
145 if (j > loc) {
146 // When passing loc, don't exceed our current
147 // distance from loc.
148 start = Math.max(0, loc - (j - loc));
149 } else {
150 // Already passed loc, downhill from here on in.
151 break;
152 }
153 }
154 }
155 }
156
157 if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a
158 // (better) match at
159 // greater error
160 // levels.
161 {
162 // No hope for a (better) match at greater error levels.
163 break;
164 }
165
166 lastrd = rd;
167 }
168
169 return bestLoc;
170 }
171
172 protected Map<Character, Integer> getAlphabet() {
173 return alphabet;
174 }
175
176 /**
177 * Compute and return the score for a match with e errors and x location.
178 *
179 * @param e
180 * Number of errors in match
181 * @param x
182 * Location of match
183 * @return Overall score for match
184 */
185 private double bitapScore(int e, int x) {
186 // Compute and return the score for a match with e errors and x
187 // location.
188 int d = Math.abs(loc - x);
189 return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance));
190 }
191
192 // Initialize the alphabet for the Bitap algorithm.
193 protected void alphabet() {
194 int len = pattern.length();
195
196 assert len <= Bitap.MAXBITS : "Pattern too long for this application.";
197
198 for (int i = 0; i < len; i++) {
199 Character c = Character.valueOf(pattern.charAt(i));
200 Integer value = alphabet.get(c);
201 int mask = value == null ? 0 : value.intValue();
202 mask |= (int) Math.pow(2, len - i - 1);
203 alphabet.put(c, Integer.valueOf(mask));
204 }
205 }
206
207 public static void setBalance(float newBalance) {
208 balance = newBalance;
209 }
210
211 public static void setThreshold(float newThreshold) {
212 threshold = newThreshold;
213 }
214
215 public static void setMinLength(int newMinLength) {
216 minLength = newMinLength;
217 }
218
219 public static void setMaxLength(int newMaxLength) {
220 maxLength = newMaxLength;
221 }
222
223 /**
224 * The maximum number of bits in an int. Change this to 64 if long is used
225 * by alphabet.
226 */
227 private static final int MAXBITS = 32;
228
229 /**
230 * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity)
231 */
232 private static final float BALANCE = 0.5f;
233 private static float balance = BALANCE;
234
235 /**
236 * At what point is no match declared (0.0 = perfection, 1.0 = very loose)
237 */
238 private static final float THRESHOLD = 0.5f;
239 private static float threshold = THRESHOLD;
240
241 /**
242 * The min and max cutoffs used when computing text lengths.
243 */
244 private static final int MINLENGTH = 100;
245 private static int minLength = MINLENGTH;
246 private static final int MAXLENGTH = 1000;
247 private static int maxLength = MAXLENGTH;
248
249 /**
250 * The text to search.
251 */
252 private String text;
253
254 /**
255 * The pattern to find in the text.
256 */
257 private String pattern;
258
259 /**
260 * The location in text to focus the search.
261 */
262 private int loc;
263
264 /**
265 * The length of the string constrained between MINLENGHT and MAXLENGTH
266 */
267 private int scoreTextLength;
268
269 /**
270 * Alphabet is the compiled representation of the pattern.
271 */
272 private Map<Character, Integer> alphabet;
273 }
274