Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Distance |
|
| 6.5;6.5 |
1 | /** | |
2 | * Distribution License: | |
3 | * JSword is free software; you can redistribute it and/or modify it under | |
4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
5 | * as published by the Free Software Foundation. This program is distributed | |
6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
8 | * See the GNU Lesser General Public License for more details. | |
9 | * | |
10 | * The License is available on the internet at: | |
11 | * http://www.gnu.org/copyleft/lgpl.html | |
12 | * or by writing to: | |
13 | * Free Software Foundation, Inc. | |
14 | * 59 Temple Place - Suite 330 | |
15 | * Boston, MA 02111-1307, USA | |
16 | * | |
17 | * © CrossWire Bible Society, 2007 - 2016 | |
18 | * | |
19 | */ | |
20 | package org.crosswire.common.diff; | |
21 | ||
22 | /** | |
23 | * Compute the distance between 2 strings. The larger the number the greater the | |
24 | * distance. | |
25 | * | |
26 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
27 | * @author DM Smith | |
28 | */ | |
29 | public final class Distance { | |
30 | /** | |
31 | * Prevent instantiation. | |
32 | */ | |
33 | 0 | private Distance() { |
34 | 0 | } |
35 | ||
36 | /** | |
37 | * Compute the LevenshteinDistance between two strings. See <a | |
38 | * href="http://www.merriampark.com/ldjava.htm" | |
39 | * >www.merriampark.com/ldjava.htm</a> for original implementation. | |
40 | * | |
41 | * @param source | |
42 | * the baseline text | |
43 | * @param target | |
44 | * the changed text | |
45 | * @return the distance | |
46 | */ | |
47 | public static int getLevenshteinDistance(String source, String target) { | |
48 | 0 | if (source == null || target == null) { |
49 | 0 | throw new IllegalArgumentException("Strings must not be null"); |
50 | } | |
51 | ||
52 | /* | |
53 | * The difference between this impl. and the previous is that, rather | |
54 | * than creating and retaining a matrix of size s.length()+1 by | |
55 | * t.length()+1, we maintain two single-dimensional arrays of length | |
56 | * s.length()+1. The first, d, is the 'current working' distance array | |
57 | * that maintains the newest distance cost counts as we iterate through | |
58 | * the characters of String s. Each time we increment the index of | |
59 | * String t we are comparing, d is copied to p, the second int[]. Doing | |
60 | * so allows us to retain the previous cost counts as required by the | |
61 | * algorithm (taking the minimum of the cost count to the left, up one, | |
62 | * and diagonally up and to the left of the current cost count being | |
63 | * calculated). (Note that the arrays aren't really copied anymore, just | |
64 | * switched...this is clearly much better than cloning an array or doing | |
65 | * a System.arraycopy() each time through the outer loop.) | |
66 | * | |
67 | * Effectively, the difference between the two implementations is this | |
68 | * one does not cause an out of memory condition when calculating the LD | |
69 | * over two very large strings. | |
70 | */ | |
71 | ||
72 | 0 | int sourceLength = source.length(); // length of source |
73 | 0 | int targetLength = target.length(); // length of target |
74 | ||
75 | 0 | if (sourceLength == 0) { |
76 | 0 | return targetLength; |
77 | 0 | } else if (targetLength == 0) { |
78 | 0 | return sourceLength; |
79 | } | |
80 | ||
81 | 0 | int[] prevDist = new int[sourceLength + 1]; // 'previous' cost array, |
82 | // horizontally | |
83 | 0 | int[] dist = new int[sourceLength + 1]; // cost array, horizontally |
84 | int[] swap; // placeholder to assist in swapping prevDist and dist | |
85 | ||
86 | // indexes into strings source and target | |
87 | int i; // iterates through source | |
88 | int j; // iterates through target | |
89 | ||
90 | char targetJ; // jth character of t | |
91 | ||
92 | int cost; | |
93 | ||
94 | 0 | for (i = 0; i <= sourceLength; i++) { |
95 | 0 | prevDist[i] = i; |
96 | } | |
97 | ||
98 | 0 | for (j = 1; j <= targetLength; j++) { |
99 | 0 | targetJ = target.charAt(j - 1); |
100 | 0 | dist[0] = j; |
101 | ||
102 | 0 | for (i = 1; i <= sourceLength; i++) { |
103 | 0 | cost = source.charAt(i - 1) == targetJ ? 0 : 1; |
104 | // minimum of cell to the left + 1, to the top + 1, diagonally | |
105 | // left and up +cost | |
106 | 0 | dist[i] = Math.min(Math.min(dist[i - 1] + 1, prevDist[i] + 1), prevDist[i - 1] + cost); |
107 | } | |
108 | ||
109 | // copy current distance counts to 'previous row' distance counts | |
110 | 0 | swap = prevDist; |
111 | 0 | prevDist = dist; |
112 | 0 | dist = swap; |
113 | } | |
114 | ||
115 | // our last action in the above loop was to switch d and p, so p now | |
116 | // actually has the most recent cost counts | |
117 | 0 | return prevDist[sourceLength]; |
118 | } | |
119 | } |