/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2008 Alex Buloichik
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.core.matching;
import org.omegat.util.OStrings;
import org.omegat.util.Token;
/**
* Class to compute Levenshtein Distance.
*
* <p>
* Levenshtein distance (LD) is a measure of the similarity between two strings,
* which we will refer to as the source string (s) and the target string (t).
* The distance is the number of deletions, insertions, or substitutions
* required to transform s into t.
*
* <p>
* For example,
* <ul>
* <li>If s is "test" and t is "test", then LD(s,t) = 0, because no
* transformations are needed. The strings are already identical.
* <li>If s is "test" and t is "tent", then LD(s,t) = 1, because one
* substitution (change "s" to "n") is sufficient to transform s into t.
* </ul>
*
* <p>
* The greater the Levenshtein distance, the more different the strings are.
* <p>
* Levenshtein distance is named after the Russian scientist Vladimir
* Levenshtein, who devised the algorithm in 1965. If you can't spell or
* pronounce Levenshtein, the metric is also sometimes called edit distance.
*
* alex73's comment: We can't make the 'compute' method static, because in that
* case LevenshteinDistance would not be thread-safe (see the 'd' and 'p'
* arrays). We can't create these arrays inside the 'compute' method, because
* allocating them on every call is a rather slow operation. Instead, create
* one LevenshteinDistance instance for each thread that will call it. This is
* the best way to get the best performance.
*
* @see <a href="http://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Fall2006/Assignments/editdistance/Levenshtein%20Distance.htm">Levenshtein Distance, in Three Flavors</a>
*
* @author Vladimir Levenshtein
* @author Michael Gilleland, Merriam Park Software
* @author Chas Emerick, Apache Software Foundation
* @author Maxym Mykhalchuk
* @author Alex Buloichik (alex73mail@gmail.com)
*/
public class LevenshteinDistance implements ISimilarityCalculator {

    /** Upper bound on the number of tokens taken from each input array. */
    private static final int MAX_N = 1000;

    /**
     * Scratch row that receives the costs currently being calculated. Held as
     * a field so that it is allocated once per instance rather than once per
     * call.
     */
    private short[] d = new short[MAX_N + 1];

    /**
     * Scratch row holding the costs of the previously completed pass. Held as
     * a field so that it is allocated once per instance rather than once per
     * call.
     */
    private short[] p = new short[MAX_N + 1];

    /**
     * Returns the smallest of three values, narrowed to a short.
     */
    private static short minimum(int a, int b, int c) {
        int smallest = a;
        if (b < smallest) {
            smallest = b;
        }
        if (c < smallest) {
            smallest = c;
        }
        return (short) smallest;
    }

    /**
     * Computes the Levenshtein distance between two token arrays.
     *
     * <p>
     * Instead of a full (s.length + 1) by (t.length + 1) matrix, only two rows
     * are kept: the row finished in the previous pass and the row being filled
     * in now. After each pass over s the two rows exchange roles, so nothing
     * is copied.
     *
     * <p>
     * At most {@link #MAX_N} tokens of each array take part in the
     * comparison; tokens beyond that limit are ignored. The shortcut for an
     * empty array is taken before this limit is applied, so in that case the
     * full length of the other array is returned.
     *
     * <p>
     * The scratch rows are instance fields, so a single instance must not be
     * used by several threads at the same time.
     *
     * @param s
     *            first token array
     * @param t
     *            second token array
     * @return number of token insertions, deletions and substitutions needed
     *         to turn s into t
     * @throws IllegalArgumentException
     *             if either array is null
     */
    public int compute(Token[] s, Token[] t) {
        if (s == null || t == null) {
            throw new IllegalArgumentException(OStrings.getString("LD_NULL_ARRAYS_ERROR"));
        }

        // Distance to or from an empty sequence is the other one's length.
        if (s.length == 0) {
            return t.length;
        }
        if (t.length == 0) {
            return s.length;
        }

        final int sourceLen = Math.min(s.length, MAX_N);
        final int targetLen = Math.min(t.length, MAX_N);

        short[] previous = p;
        short[] current = d;

        // Row zero: turning the first 'col' tokens of s into nothing costs
        // 'col' deletions.
        for (int col = 0; col <= sourceLen; col++) {
            previous[col] = (short) col;
        }

        for (int row = 1; row <= targetLen; row++) {
            final Token targetToken = t[row - 1];
            current[0] = (short) row;

            for (int col = 1; col <= sourceLen; col++) {
                // Diagonal neighbour, plus one when the tokens differ.
                int substitution = previous[col - 1];
                if (!s[col - 1].equals(targetToken)) {
                    substitution++;
                }
                // Left neighbour + 1, upper neighbour + 1, or the diagonal.
                current[col] = minimum(current[col - 1] + 1, previous[col] + 1, substitution);
            }

            // The row just filled becomes the reference for the next pass.
            final short[] finished = current;
            current = previous;
            previous = finished;
        }

        // Keep the instance's scratch rows in the same roles the loop ended
        // with; 'previous' holds the most recent costs after the final swap.
        p = previous;
        d = current;
        return previous[sourceLen];
    }
}