Distance.java |
1 /** 2 * Distribution License: 3 * JSword is free software; you can redistribute it and/or modify it under 4 * the terms of the GNU Lesser General Public License, version 2.1 or later 5 * as published by the Free Software Foundation. This program is distributed 6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 8 * See the GNU Lesser General Public License for more details. 9 * 10 * The License is available on the internet at: 11 * http://www.gnu.org/copyleft/lgpl.html 12 * or by writing to: 13 * Free Software Foundation, Inc. 14 * 59 Temple Place - Suite 330 15 * Boston, MA 02111-1307, USA 16 * 17 * © CrossWire Bible Society, 2007 - 2016 18 * 19 */ 20 package org.crosswire.common.diff; 21 22 /** 23 * Compute the distance between 2 strings. The larger the number the greater the 24 * distance. 25 * 26 * @see gnu.lgpl.License The GNU Lesser General Public License for details. 27 * @author DM Smith 28 */ 29 public final class Distance { 30 /** 31 * Prevent instantiation. 32 */ 33 private Distance() { 34 } 35 36 /** 37 * Compute the LevenshteinDistance between two strings. See <a 38 * href="http://www.merriampark.com/ldjava.htm" 39 * >www.merriampark.com/ldjava.htm</a> for original implementation. 40 * 41 * @param source 42 * the baseline text 43 * @param target 44 * the changed text 45 * @return the distance 46 */ 47 public static int getLevenshteinDistance(String source, String target) { 48 if (source == null || target == null) { 49 throw new IllegalArgumentException("Strings must not be null"); 50 } 51 52 /* 53 * The difference between this impl. and the previous is that, rather 54 * than creating and retaining a matrix of size s.length()+1 by 55 * t.length()+1, we maintain two single-dimensional arrays of length 56 * s.length()+1. The first, d, is the 'current working' distance array 57 * that maintains the newest distance cost counts as we iterate through 58 * the characters of String s. Each time we increment the index of 59 * String t we are comparing, d is copied to p, the second int[]. Doing 60 * so allows us to retain the previous cost counts as required by the 61 * algorithm (taking the minimum of the cost count to the left, up one, 62 * and diagonally up and to the left of the current cost count being 63 * calculated). (Note that the arrays aren't really copied anymore, just 64 * switched...this is clearly much better than cloning an array or doing 65 * a System.arraycopy() each time through the outer loop.) 66 * 67 * Effectively, the difference between the two implementations is this 68 * one does not cause an out of memory condition when calculating the LD 69 * over two very large strings. 70 */ 71 72 int sourceLength = source.length(); // length of source 73 int targetLength = target.length(); // length of target 74 75 if (sourceLength == 0) { 76 return targetLength; 77 } else if (targetLength == 0) { 78 return sourceLength; 79 } 80 81 int[] prevDist = new int[sourceLength + 1]; // 'previous' cost array, 82 // horizontally 83 int[] dist = new int[sourceLength + 1]; // cost array, horizontally 84 int[] swap; // placeholder to assist in swapping prevDist and dist 85 86 // indexes into strings source and target 87 int i; // iterates through source 88 int j; // iterates through target 89 90 char targetJ; // jth character of t 91 92 int cost; 93 94 for (i = 0; i <= sourceLength; i++) { 95 prevDist[i] = i; 96 } 97 98 for (j = 1; j <= targetLength; j++) { 99 targetJ = target.charAt(j - 1); 100 dist[0] = j; 101 102 for (i = 1; i <= sourceLength; i++) { 103 cost = source.charAt(i - 1) == targetJ ? 0 : 1; 104 // minimum of cell to the left + 1, to the top + 1, diagonally 105 // left and up +cost 106 dist[i] = Math.min(Math.min(dist[i - 1] + 1, prevDist[i] + 1), prevDist[i - 1] + cost); 107 } 108 109 // copy current distance counts to 'previous row' distance counts 110 swap = prevDist; 111 prevDist = dist; 112 dist = swap; 113 } 114 115 // our last action in the above loop was to switch d and p, so p now 116 // actually has the most recent cost counts 117 return prevDist[sourceLength]; 118 } 119 } 120