/* You may freely copy, distribute, modify and use this class as long as the original author attribution remains intact. See message below. Copyright (C) 2003 Christian Pesch. All Rights Reserved. */ package slash.metamusic.distance; /** * Taken from: http://www.merriampark.com/ld.htm * <p/> * Levenshtein distance (distance) is a measure of the similarity between * two strings, which we will refer to as the source string (s) and * the target string (t). The distance is the number of deletions, * insertions, or substitutions required to transform s into t. * <p/> * For example, * If s is "test" and t is "test", then distance(s,t) = 0, because no * transformations are needed. The strings are already identical. * If s is "test" and t is "tent", then distance(s,t) = 1, because one * substitution (change "s" to "n") is sufficient to transform s * into t. * <p/> * The greater the Levenshtein distance, the more different the * strings are. * <p/> * Levenshtein distance is named after the Russian scientist * Vladimir Levenshtein, who devised the algorithm in 1965. If * you can't spell or pronounce Levenshtein, the metric is also * sometimes called edit distance. * <p/> * The algorithm: * <p/> * Step Description * 1 Set n to be the length of s. * Set m to be the length of t. * If n = 0, return m and exit. * If m = 0, return n and exit. * Construct a matrix containing 0..m rows and 0..n columns. * 2 Initialize the first row to 0..n. * Initialize the first column to 0..m. * 3 Examine each character of s (i from 1 to n). * 4 Examine each character of t (j from 1 to m). * 5 If s[i] equals t[j], the cost is 0. * If s[i] doesn't equals t[j], the cost is 1. * 6 Set cell d[i,j] of the matrix equals to the minimum of: * a. The cell immediately above plus 1: d[i-1,j] + 1. * b. The cell immediately to the left plus 1: d[i,j-1] + 1. * c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost. * 7 After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m]. * <p/> * Example * <p/> * This section shows how the Levenshtein distance is computed when the * source string is "GUMBO" and the target string is "GAMBOL". * <p/> * Steps 1 and 2 * G U M B O * 0 1 2 3 4 5 * G 1 * A 2 * M 3 * B 4 * O 5 * L 6 * <p/> * Steps 3 to 6 When i = 1 * G U M B O * 0 1 2 3 4 5 * G 1 0 * A 2 1 * M 3 2 * B 4 3 * O 5 4 * L 6 5 * <p/> * Steps 3 to 6 When i = 2 * G U M B O * 0 1 2 3 4 5 * G 1 0 1 * A 2 1 1 * M 3 2 2 * B 4 3 3 * O 5 4 4 * L 6 5 5 * <p/> * Steps 3 to 6 When i = 3 * G U M B O * 0 1 2 3 4 5 * G 1 0 1 2 * A 2 1 1 2 * M 3 2 2 1 * B 4 3 3 2 * O 5 4 4 3 * L 6 5 5 4 * <p/> * Steps 3 to 6 When i = 4 * G U M B O * 0 1 2 3 4 5 * G 1 0 1 2 3 * A 2 1 1 2 3 * M 3 2 2 1 2 * B 4 3 3 2 1 * O 5 4 4 3 2 * L 6 5 5 4 3 * <p/> * Steps 3 to 6 When i = 5 * G U M B O * 0 1 2 3 4 5 * G 1 0 1 2 3 4 * A 2 1 1 2 3 4 * M 3 2 2 1 2 3 * B 4 3 3 2 1 2 * O 5 4 4 3 2 1 * L 6 5 5 4 3 2 * <p/> * Step 7 * The distance is in the lower right hand corner of the matrix, * i.e. 2. This corresponds to our intuitive realization that * "GUMBO" can be transformed into "GAMBOL" by substituting "A" * for "U" and adding "L" (one substitution and 1 insertion = * 2 changes). * * @author Christian Pesch * @version $Id: Levenshtein.java 911 2006-12-23 17:25:04Z cpesch $ */ public class Levenshtein { /** * Calculate the minimum of three values */ private static int minimum(int a, int b, int c) { int mi; mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } return mi; } /** * Compute the Levenshtein distance between s and t. */ public static int distance(String s, String t) { // Step 1 int n = s.length(); int m = t.length(); if (n == 0) { return m; } if (m == 0) { return n; } // Step 2 int[][] matrix = new int[n + 1][m + 1]; for (int i = 0; i <= n; i++) { matrix[i][0] = i; } for (int j = 0; j <= m; j++) { matrix[0][j] = j; } // Step 3 int cost; for (int i = 1; i <= n; i++) { char s_i = s.charAt(i - 1); // Step 4 for (int j = 1; j <= m; j++) { char t_j = t.charAt(j - 1); // Step 5 if (s_i == t_j) { cost = 0; } else { cost = 1; } // Step 6 matrix[i][j] = minimum(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost); } } // Step 7 return matrix[n][m]; } public static void main(String[] args) { if (args.length != 2) { System.out.println(Levenshtein.class + " <string1> <string2>"); System.exit(5); } System.out.println(Levenshtein.class + "#distance: " + distance(args[0], args[1])); System.exit(0); } }