package edu.stanford.nlp.util; import edu.stanford.nlp.util.logging.Redwood; import java.util.Arrays; /** Find the (Levenshtein) edit distance between two Strings or Character * arrays. * By default it allows transposition. * <br> * This is an object so that you can save on the cost of allocating / * deallocating the large array when possible * @author Dan Klein * @author John Bauer - rewrote using DP instead of memorization */ public class EditDistance { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(EditDistance.class); final boolean allowTranspose; protected double[][] score = null; public EditDistance() { allowTranspose = true; } public EditDistance(boolean allowTranspose) { this.allowTranspose = allowTranspose; } protected void clear(int sourceLength, int targetLength) { if (score == null || score.length < sourceLength + 1 || score[0].length < targetLength + 1) { score = new double[sourceLength + 1][targetLength + 1]; } for (double[] aScore : score) { Arrays.fill(aScore, worst()); } } // CONSTRAINT SEMIRING START protected double best() { return 0.0; } protected double worst() { return Double.POSITIVE_INFINITY; } protected double unit() { return 1.0; } protected double better(double x, double y) { if (x < y) { return x; } return y; } protected double combine(double x, double y) { return x + y; } // CONSTRAINT SEMIRING END // COST FUNCTION BEGIN protected double insertCost(Object o) { return unit(); } protected double deleteCost(Object o) { return unit(); } protected double substituteCost(Object source, Object target) { if (source.equals(target)) { return best(); } return unit(); } double transposeCost(Object s1, Object s2, Object t1, Object t2) { if (s1.equals(t2) && s2.equals(t1)) { if (allowTranspose) { return unit(); } else { return 2*unit(); } } return worst(); } // COST FUNCTION END double score(Object[] source, int sPos, Object[] target, int tPos) { for (int i = 0; i <= sPos; ++i) { for (int j = 0; j <= tPos; ++j) { double bscore = score[i][j]; if (bscore != worst()) continue; if (i == 0 && j == 0) { bscore = best(); } else { if (i > 0) { bscore = better(bscore, (combine(score[i - 1][j], deleteCost(source[i - 1])))); } if (j > 0) { bscore = better(bscore, (combine(score[i][j - 1], insertCost(target[j - 1])))); } if (i > 0 && j > 0) { bscore = better(bscore, (combine(score[i - 1][j - 1], substituteCost(source[i - 1], target[j - 1])))); } if (i > 1 && j > 1) { bscore = better(bscore, (combine(score[i - 2][j - 2], transposeCost(source[i - 2], source[i - 1], target[j - 2], target[j - 1])))); } } score[i][j] = bscore; } } return score[sPos][tPos]; } public double score(Object[] source, Object[] target) { clear(source.length, target.length); return score(source, source.length, target, target.length); } public double score(String sourceStr, String targetStr) { if(sourceStr.equals(targetStr)) return 0; Object[] source = Characters.asCharacterArray(sourceStr); Object[] target = Characters.asCharacterArray(targetStr); clear(source.length, target.length); return score(source, source.length, target, target.length); } public static void main(String[] args) { if (args.length >= 2) { EditDistance d = new EditDistance(); System.out.println(d.score(args[0], args[1])); } else { log.info("usage: java EditDistance str1 str2"); } } }