/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2008 Alex Buloichik Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.core.matching; import org.omegat.util.OStrings; import org.omegat.util.Token; /** * Class to compute Levenshtein Distance. * * <p> * Levenshtein distance (LD) is a measure of the similarity between two strings, * which we will refer to as the source string (s) and the target string (t). * The distance is the number of deletions, insertions, or substitutions * required to transform s into t. * * <p> * For example, * <ul> * <li>If s is "test" and t is "test", then LD(s,t) = 0, because no * transformations are needed. The strings are already identical. * <li>If s is "test" and t is "tent", then LD(s,t) = 1, because one * substitution (change "s" to "n") is sufficient to transform s into t. * </ul> * * <p> * The greater the Levenshtein distance, the more different the strings are. * <p> * Levenshtein distance is named after the Russian scientist Vladimir * Levenshtein, who devised the algorithm in 1965. If you can't spell or * pronounce Levenshtein, the metric is also sometimes called edit distance. * * alex73's comment: We can't make 'compute' mathod static, because in this case * LevenshteinDistance will not be thread-safe(see 'd' and 'p' arrays). We can't * create these arrays inside 'compute' method, because it's enough slow * operation. We have to create LevenshteinDistance instance one for each thread * where we will call it. It's best way for best performance. * * @see <a href="http://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Fall2006/Assignments/editdistance/Levenshtein%20Distance.htm">Levenshtein Distance, in Three Flavors</a> * * @author Vladimir Levenshtein * @author Michael Gilleland, Merriam Park Software * @author Chas Emerick, Apache Software Foundation * @author Maxym Mykhalchuk * @author Alex Buloichik (alex73mail@gmail.com) */ public class LevenshteinDistance implements ISimilarityCalculator { /** * Get minimum of three values */ private static short minimum(int a, int b, int c) { return (short) Math.min(a, Math.min(b, c)); } /** Maximal number of items compared. */ private static final int MAX_N = 1000; /** * Cost array, horizontally. Here to avoid excessive allocation and garbage * collection. */ private short[] d = new short[MAX_N + 1]; /** * "Previous" cost array, horizontally. Here to avoid excessive allocation * and garbage collection. */ private short[] p = new short[MAX_N + 1]; /* * Compute Levenshtein distance between two lists. * * <p> The difference between this impl. and the canonical one is that, * rather than creating and retaining a matrix of size s.length()+1 by * t.length()+1, we maintain two single-dimensional arrays of length * s.length()+1. * * <p> The first, d, is the 'current working' distance array that maintains * the newest distance cost counts as we iterate through the characters of * String s. Each time we increment the index of String t we are comparing, * d is copied to p, the second int[]. Doing so allows us to retain the * previous cost counts as required by the algorithm (taking the minimum of * the cost count to the left, up one, and diagonally up and to the left of * the current cost count being calculated). <p> (Note that the arrays * aren't really copied anymore, just switched... this is clearly much * better than cloning an array or doing a System.arraycopy() each time * through the outer loop.) * * <p> Effectively, the difference between the two implementations is this * one does not cause an out of memory condition when calculating the LD * over two very large strings. * * <p> For perfomance reasons the maximal number of compared items is {@link * #MAX_N}. */ public int compute(Token[] s, Token[] t) { if (s == null || t == null) throw new IllegalArgumentException(OStrings.getString("LD_NULL_ARRAYS_ERROR")); int n = s.length; // length of s int m = t.length; // length of t if (n == 0) return m; else if (m == 0) return n; if (n > MAX_N) n = MAX_N; if (m > MAX_N) m = MAX_N; short[] swap; // placeholder to assist in swapping p and d // indexes into strings s and t short i; // iterates through s short j; // iterates through t Token t_j = null; // jth object of t short cost; // cost for (i = 0; i <= n; i++) p[i] = i; for (j = 1; j <= m; j++) { t_j = t[j - 1]; d[0] = j; Token s_i = null; // ith object of s for (i = 1; i <= n; i++) { s_i = s[i - 1]; cost = s_i.equals(t_j) ? (short) 0 : (short) 1; // minimum of cell to the left+1, to the top+1, diagonally left // and up +cost d[i] = minimum(d[i - 1] + 1, p[i] + 1, p[i - 1] + cost); } // copy current distance counts to 'previous row' distance counts swap = p; p = d; d = swap; } // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts return p[n]; } }