1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2007 - 2016
18   *
19   */
20  package org.crosswire.common.diff;
21  
22  /**
23   * Compute the distance between 2 strings. The larger the number the greater the
24   * distance.
25   * 
26   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
27   * @author DM Smith
28   */
29  public final class Distance {
30      /**
31       * Prevent instantiation.
32       */
33      private Distance() {
34      }
35  
36      /**
37       * Compute the LevenshteinDistance between two strings. See <a
38       * href="http://www.merriampark.com/ldjava.htm"
39       * >www.merriampark.com/ldjava.htm</a> for original implementation.
40       * 
41       * @param source
42       *            the baseline text
43       * @param target
44       *            the changed text
45       * @return the distance
46       */
47      public static int getLevenshteinDistance(String source, String target) {
48          if (source == null || target == null) {
49              throw new IllegalArgumentException("Strings must not be null");
50          }
51  
52          /*
53           * The difference between this impl. and the previous is that, rather
54           * than creating and retaining a matrix of size s.length()+1 by
55           * t.length()+1, we maintain two single-dimensional arrays of length
56           * s.length()+1. The first, d, is the 'current working' distance array
57           * that maintains the newest distance cost counts as we iterate through
58           * the characters of String s. Each time we increment the index of
59           * String t we are comparing, d is copied to p, the second int[]. Doing
60           * so allows us to retain the previous cost counts as required by the
61           * algorithm (taking the minimum of the cost count to the left, up one,
62           * and diagonally up and to the left of the current cost count being
63           * calculated). (Note that the arrays aren't really copied anymore, just
64           * switched...this is clearly much better than cloning an array or doing
65           * a System.arraycopy() each time through the outer loop.)
66           * 
67           * Effectively, the difference between the two implementations is this
68           * one does not cause an out of memory condition when calculating the LD
69           * over two very large strings.
70           */
71  
72          int sourceLength = source.length(); // length of source
73          int targetLength = target.length(); // length of target
74  
75          if (sourceLength == 0) {
76              return targetLength;
77          } else if (targetLength == 0) {
78              return sourceLength;
79          }
80  
81          int[] prevDist = new int[sourceLength + 1]; // 'previous' cost array,
82          // horizontally
83          int[] dist = new int[sourceLength + 1]; // cost array, horizontally
84          int[] swap; // placeholder to assist in swapping prevDist and dist
85  
86          // indexes into strings source and target
87          int i; // iterates through source
88          int j; // iterates through target
89  
90          char targetJ; // jth character of t
91  
92          int cost;
93  
94          for (i = 0; i <= sourceLength; i++) {
95              prevDist[i] = i;
96          }
97  
98          for (j = 1; j <= targetLength; j++) {
99              targetJ = target.charAt(j - 1);
100             dist[0] = j;
101 
102             for (i = 1; i <= sourceLength; i++) {
103                 cost = source.charAt(i - 1) == targetJ ? 0 : 1;
104                 // minimum of cell to the left + 1, to the top + 1, diagonally
105                 // left and up +cost
106                 dist[i] = Math.min(Math.min(dist[i - 1] + 1, prevDist[i] + 1), prevDist[i - 1] + cost);
107             }
108 
109             // copy current distance counts to 'previous row' distance counts
110             swap = prevDist;
111             prevDist = dist;
112             dist = swap;
113         }
114 
115         // our last action in the above loop was to switch d and p, so p now
116         // actually has the most recent cost counts
117         return prevDist[sourceLength];
118     }
119 }
120