package edu.stanford.nlp.patterns; import static java.lang.Math.abs; import static java.lang.Math.max; import java.util.Arrays; /** * COPIED FROM https://gist.github.com/steveash (public domain license) * Implementation of the OSA (optimal string alignment) which is similar * to the Damerau-Levenshtein in that it allows for transpositions to * count as a single edit distance, but is not a true metric and can * over-estimate the cost because it disallows substrings to edited more than * once. See wikipedia for more discussion on OSA vs DL * <p/> * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for more * information. * <p/> * This also has a set of local buffer implementations to avoid allocating new * buffers each time, which might be a premature optimization * <p/> * * @author Steve Ash, copied by Sonal Gupta (changed to remove dependence on Google code) */ public class EditDistanceDamerauLevenshteinLike { private static final int threadLocalBufferSize = 64; private static final ThreadLocal<short[]> costLocal = new ThreadLocal<short[]>() { @Override protected short[] initialValue() { return new short[threadLocalBufferSize]; } }; private static final ThreadLocal<short[]> back1Local = new ThreadLocal<short[]>() { @Override protected short[] initialValue() { return new short[threadLocalBufferSize]; } }; private static final ThreadLocal<short[]> back2Local = new ThreadLocal<short[]>() { @Override protected short[] initialValue() { return new short[threadLocalBufferSize]; } }; //return -1 if the edit distance is more than the threshold public static int editDistance(CharSequence s, CharSequence t, int threshold) { assert(s!=null); assert(t!=null); assert(threshold >= 0); //"Cannot take edit distance of strings longer than 32k chars" assert(s.length() < Short.MAX_VALUE); assert(t.length() < Short.MAX_VALUE ); if (s.length() + 1 > threadLocalBufferSize || t.length() + 1 > threadLocalBufferSize) return editDistanceWithNewBuffers(s, t, (short)threshold); short[] cost = costLocal.get(); short[] back1 = back1Local.get(); short[] back2 = back2Local.get(); return editDistanceWithBuffers(s, t, (short)threshold, back2, back1, cost); } static int editDistanceWithNewBuffers(CharSequence s, CharSequence t, short threshold) { int slen = s.length(); short[] back1 = new short[slen + 1]; // "up 1" row in table short[] back2 = new short[slen + 1]; // "up 2" row in table short[] cost = new short[slen + 1]; // "current cost" return editDistanceWithBuffers(s, t, threshold, back2, back1, cost); } private static int editDistanceWithBuffers(CharSequence s, CharSequence t, short threshold, short[] back2, short[] back1, short[] cost) { short slen = (short) s.length(); short tlen = (short) t.length(); // if one string is empty, the edit distance is necessarily the length of // the other if (slen == 0) { return tlen <= threshold ? tlen : -1; } else if (tlen == 0) { return slen <= threshold ? slen : -1; } // if lengths are different > k, then can't be within edit distance if (abs(slen - tlen) > threshold) return -1; if (slen > tlen) { // swap the two strings to consume less memory CharSequence tmp = s; s = t; t = tmp; slen = tlen; tlen = (short) t.length(); } initMemoiseTables(threshold, back2, back1, cost, slen); for (short j = 1; j <= tlen; j++) { cost[0] = j; // j is the cost of inserting this many characters // stripe bounds int min = max(1, j - threshold); int max = min(slen, (short) (j + threshold)); // at this iteration the left most entry is "too much" so reset it if (min > 1) { cost[min - 1] = Short.MAX_VALUE; } iterateOverStripe(s, t, j, cost, back1, back2, min, max); // swap our cost arrays to move on to the next "row" short[] tempCost = back2; back2 = back1; back1 = cost; cost = tempCost; } // after exit, the current cost is in back1 // if back1[slen] > k then we exceeded, so return -1 if (back1[slen] > threshold) { return -1; } return back1[slen]; } private static void iterateOverStripe(CharSequence s, CharSequence t, short j, short[] cost, short[] back1, short[] back2, int min, int max) { // iterates over the stripe for (int i = min; i <= max; i++) { if (s.charAt(i - 1) == t.charAt(j - 1)) { cost[i] = back1[i - 1]; } else { cost[i] = (short) (1 + min(cost[i - 1], back1[i], back1[i - 1])); } if (i >= 2 && j >= 2) { // possible transposition to check for if ((s.charAt(i - 2) == t.charAt(j - 1)) && s.charAt(i - 1) == t.charAt(j - 2)) { cost[i] = min(cost[i], (short) (back2[i - 2] + 1)); } } } } private static void initMemoiseTables(short threshold, short[] back2, short[] back1, short[] cost, short slen) { // initial "starting" values for inserting all the letters short boundary = (short) (min(slen, threshold) + 1); for (short i = 0; i < boundary; i++) { back1[i] = i; back2[i] = i; } // need to make sure that we don't read a default value when looking "up" Arrays.fill(back1, boundary, slen + 1, Short.MAX_VALUE); Arrays.fill(back2, boundary, slen + 1, Short.MAX_VALUE); Arrays.fill(cost, 0, slen + 1, Short.MAX_VALUE); } private static short min(short a, short b) { return (a <= b ? a : b); } private static short min(short a, short b, short c) { return min(a, min(b, c)); } }