| Distance.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 or later
5 * as published by the Free Software Foundation. This program is distributed
6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * © CrossWire Bible Society, 2007 - 2016
18 *
19 */
20 package org.crosswire.common.diff;
21
22 /**
23 * Compute the distance between 2 strings. The larger the number the greater the
24 * distance.
25 *
26 * @see gnu.lgpl.License The GNU Lesser General Public License for details.
27 * @author DM Smith
28 */
29 public final class Distance {
30 /**
31 * Prevent instantiation.
32 */
33 private Distance() {
34 }
35
36 /**
37 * Compute the LevenshteinDistance between two strings. See <a
38 * href="http://www.merriampark.com/ldjava.htm"
39 * >www.merriampark.com/ldjava.htm</a> for original implementation.
40 *
41 * @param source
42 * the baseline text
43 * @param target
44 * the changed text
45 * @return the distance
46 */
47 public static int getLevenshteinDistance(String source, String target) {
48 if (source == null || target == null) {
49 throw new IllegalArgumentException("Strings must not be null");
50 }
51
52 /*
53 * The difference between this impl. and the previous is that, rather
54 * than creating and retaining a matrix of size s.length()+1 by
55 * t.length()+1, we maintain two single-dimensional arrays of length
56 * s.length()+1. The first, d, is the 'current working' distance array
57 * that maintains the newest distance cost counts as we iterate through
58 * the characters of String s. Each time we increment the index of
59 * String t we are comparing, d is copied to p, the second int[]. Doing
60 * so allows us to retain the previous cost counts as required by the
61 * algorithm (taking the minimum of the cost count to the left, up one,
62 * and diagonally up and to the left of the current cost count being
63 * calculated). (Note that the arrays aren't really copied anymore, just
64 * switched...this is clearly much better than cloning an array or doing
65 * a System.arraycopy() each time through the outer loop.)
66 *
67 * Effectively, the difference between the two implementations is this
68 * one does not cause an out of memory condition when calculating the LD
69 * over two very large strings.
70 */
71
72 int sourceLength = source.length(); // length of source
73 int targetLength = target.length(); // length of target
74
75 if (sourceLength == 0) {
76 return targetLength;
77 } else if (targetLength == 0) {
78 return sourceLength;
79 }
80
81 int[] prevDist = new int[sourceLength + 1]; // 'previous' cost array,
82 // horizontally
83 int[] dist = new int[sourceLength + 1]; // cost array, horizontally
84 int[] swap; // placeholder to assist in swapping prevDist and dist
85
86 // indexes into strings source and target
87 int i; // iterates through source
88 int j; // iterates through target
89
90 char targetJ; // jth character of t
91
92 int cost;
93
94 for (i = 0; i <= sourceLength; i++) {
95 prevDist[i] = i;
96 }
97
98 for (j = 1; j <= targetLength; j++) {
99 targetJ = target.charAt(j - 1);
100 dist[0] = j;
101
102 for (i = 1; i <= sourceLength; i++) {
103 cost = source.charAt(i - 1) == targetJ ? 0 : 1;
104 // minimum of cell to the left + 1, to the top + 1, diagonally
105 // left and up +cost
106 dist[i] = Math.min(Math.min(dist[i - 1] + 1, prevDist[i] + 1), prevDist[i - 1] + cost);
107 }
108
109 // copy current distance counts to 'previous row' distance counts
110 swap = prevDist;
111 prevDist = dist;
112 dist = swap;
113 }
114
115 // our last action in the above loop was to switch d and p, so p now
116 // actually has the most recent cost counts
117 return prevDist[sourceLength];
118 }
119 }
120