| Bitap.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 as published by
5 * the Free Software Foundation. This program is distributed in the hope
6 * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * Copyright: 2007
18 * The copyright to this program is held by it's authors.
19 *
20 * ID: $Id: Bitap.java 2149 2011-04-09 15:51:45Z dmsmith $
21 */
22 package org.crosswire.common.diff;
23
24 import java.util.HashMap;
25 import java.util.Map;
26
27 /**
28 * An implementation of the Bitap algorithm for finding a "fuzzy" location of a
29 * match.
30 *
31 * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br>
32 * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a>
33 *
34 * @see gnu.lgpl.License for license details.<br>
35 * The copyright to this program is held by it's authors.
36 * @author DM Smith [dmsmith555 at yahoo dot com]
37 */
38 public class Bitap implements Locator {
39 /**
40 * Locate the best instance of 'pattern' in 'text' near 'loc'.
41 *
42 * @param text
43 * The text to search
44 * @param pattern
45 * The pattern to search for
46 * @param loc
47 * The location to search around
48 */
49 public Bitap(String text, String pattern, int loc) {
50 this.text = text;
51 this.pattern = pattern;
52 this.loc = loc;
53 alphabet = new HashMap<Character, Integer>();
54 }
55
56 /*
57 * (non-Javadoc)
58 *
59 * @see org.crosswire.common.diff.Locator#maxPatternLength()
60 */
61 public int maxPatternLength() {
62 return MAXBITS;
63 }
64
65 /*
66 * (non-Javadoc)
67 *
68 * @see org.crosswire.common.diff.Locator#locate()
69 */
70 public int locate() {
71 // Initialize the alphabet.
72 alphabet();
73
74 // Coerce the text length between reasonable maximums and minimums.
75 scoreTextLength = Math.max(text.length(), Bitap.minLength);
76 scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength);
77
78 // Highest score beyond which we give up.
79 double scoreThreshold = Bitap.threshold;
80
81 // Is there a nearby exact match? (speedup)
82 int bestLoc = text.indexOf(pattern, loc);
83 if (bestLoc != -1) {
84 scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
85 }
86
87 // What about in the other direction? (speedup)
88 bestLoc = text.lastIndexOf(pattern, loc + pattern.length());
89 if (bestLoc != -1) {
90 scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
91 }
92
93 // Initialize the bit arrays.
94 int matchmask = (int) Math.pow(2, pattern.length() - 1);
95
96 bestLoc = -1;
97
98 int binMin;
99 int binMid;
100 int binMax = Math.max(loc + loc, text.length());
101 int[] lastrd = new int[0];
102 for (int d = 0; d < pattern.length(); d++) {
103 // Scan for the best match; each iteration allows for one more
104 // error.
105 int[] rd = new int[text.length()];
106
107 // Run a binary search to determine how far from 'loc' we can stray
108 // at this error level.
109 binMin = loc;
110 binMid = binMax;
111 while (binMin < binMid) {
112 if (bitapScore(d, binMid) < scoreThreshold) {
113 binMin = binMid;
114 } else {
115 binMax = binMid;
116 }
117 binMid = (binMax - binMin) / 2 + binMin;
118 }
119
120 binMax = binMid; // Use the result from this iteration as the
121 // maximum for the next.
122 int start = Math.max(0, loc - (binMid - loc) - 1);
123 int finish = Math.min(text.length() - 1, pattern.length() + binMid);
124
125 if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) {
126 rd[finish] = (int) Math.pow(2, d + 1) - 1;
127 } else {
128 rd[finish] = (int) Math.pow(2, d) - 1;
129 }
130
131 for (int j = finish - 1; j >= start; j--) {
132 Character curChar = Character.valueOf(text.charAt(j));
133 int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0;
134 if (d == 0) { // First pass: exact match.
135 rd[j] = ((rd[j + 1] << 1) | 1) & mask;
136 } else { // Subsequent passes: fuzzy match.
137 rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1];
138 }
139
140 if ((rd[j] & matchmask) != 0) {
141 double score = bitapScore(d, j);
142 // This match will almost certainly be better than any
143 // existing match. But check anyway.
144 if (score <= scoreThreshold) {
145 // Told you so.
146 scoreThreshold = score;
147 bestLoc = j;
148 if (j > loc) {
149 // When passing loc, don't exceed our current
150 // distance from loc.
151 start = Math.max(0, loc - (j - loc));
152 } else {
153 // Already passed loc, downhill from here on in.
154 break;
155 }
156 }
157 }
158 }
159
160 if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a
161 // (better) match at
162 // greater error
163 // levels.
164 {
165 // No hope for a (better) match at greater error levels.
166 break;
167 }
168
169 lastrd = rd;
170 }
171
172 return bestLoc;
173 }
174
175 protected Map<Character, Integer> getAlphabet() {
176 return alphabet;
177 }
178
179 /**
180 * Compute and return the score for a match with e errors and x location.
181 *
182 * @param e
183 * Number of errors in match
184 * @param x
185 * Location of match
186 * @return Overall score for match
187 */
188 private double bitapScore(int e, int x) {
189 // Compute and return the score for a match with e errors and x
190 // location.
191 int d = Math.abs(loc - x);
192 return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance));
193 }
194
195 // Initialize the alphabet for the Bitap algorithm.
196 protected void alphabet() {
197 int len = pattern.length();
198
199 assert len <= Bitap.MAXBITS : "Pattern too long for this application.";
200
201 for (int i = 0; i < len; i++) {
202 Character c = Character.valueOf(pattern.charAt(i));
203 Integer value = alphabet.get(c);
204 int mask = value == null ? 0 : value.intValue();
205 mask |= (int) Math.pow(2, len - i - 1);
206 alphabet.put(c, Integer.valueOf(mask));
207 }
208 }
209
210 public static void setBalance(float newBalance) {
211 balance = newBalance;
212 }
213
214 public static void setThreshold(float newThreshold) {
215 threshold = newThreshold;
216 }
217
218 public static void setMinLength(int newMinLength) {
219 minLength = newMinLength;
220 }
221
222 public static void setMaxLength(int newMaxLength) {
223 maxLength = newMaxLength;
224 }
225
226 /**
227 * The maximum number of bits in an int. Change this to 64 if long is used
228 * by alphabet.
229 */
230 private static final int MAXBITS = 32;
231
232 /**
233 * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity)
234 */
235 private static final float BALANCE = 0.5f;
236 private static float balance = BALANCE;
237
238 /**
239 * At what point is no match declared (0.0 = perfection, 1.0 = very loose)
240 */
241 private static final float THRESHOLD = 0.5f;
242 private static float threshold = THRESHOLD;
243
244 /**
245 * The min and max cutoffs used when computing text lengths.
246 */
247 private static final int MINLENGTH = 100;
248 private static int minLength = MINLENGTH;
249 private static final int MAXLENGTH = 1000;
250 private static int maxLength = MAXLENGTH;
251
252 /**
253 * The text to search.
254 */
255 private String text;
256
257 /**
258 * The pattern to find in the text.
259 */
260 private String pattern;
261
262 /**
263 * The location in text to focus the search.
264 */
265 private int loc;
266
267 /**
268 * The length of the string constrained between MINLENGHT and MAXLENGTH
269 */
270 private int scoreTextLength;
271
272 /**
273 * Alphabet is the compiled representation of the pattern.
274 */
275 private Map<Character, Integer> alphabet;
276 }
277