package org.basex.util; import static org.basex.util.Token.*; /** * Damerau-Levenshtein implementation. Based on the publications from * Levenshtein (1965): Binary codes capable of correcting spurious insertions * and deletions of ones, and Damerau (1964): A technique for computer * detection and correction of spelling errors. * * @author BaseX Team 2005-12, BSD License * @author Christian Gruen */ public final class Levenshtein { /** Maximum token size. */ private static final int MAX = 50; /** Static matrix for Levenshtein distance. */ private final int[][] m = new int[MAX + 2][MAX + 2]; /** * Constructor. */ public Levenshtein() { for(int i = 0; i < m.length; ++i) { m[0][i] = i; m[i][0] = i; } } /** * Compares two character arrays for similarity. * @param token token to be compared * @param sub second token to be compared * @param err number of allowed errors; dynamic calculation if value is 0 * @return true if the arrays are similar */ public boolean similar(final byte[] token, final byte[] sub, final int err) { int sl = 0, tl = 0; for(int s = 0; s < sub.length; s += cl(sub, s)) ++sl; for(int t = 0; t < token.length; t += cl(token, t)) ++tl; if(tl == 0) return false; // use exact search for too short and too long values if(sl < 4 || tl > MAX || sl > MAX) return sl == tl && same(token, sub); // skip different tokens with too different lengths final int k = err == 0 ? Math.max(1, sl >> 2) : err; return Math.abs(sl - tl) <= k && ls(token, tl, sub, sl, k); } /** * Calculates a Levenshtein distance. * @param tk token to be compared * @param tl token length * @param sb sub token to be compared * @param sl string length * @param k maximum number of accepted errors * @return true if the arrays are similar */ private boolean ls(final byte[] tk, final int tl, final byte[] sb, final int sl, final int k) { int e2 = -1, f2 = -1; for(int t = 0; t < tl; t += cl(tk, t)) { final int e = norm(lc(cp(tk, t))); int d = Integer.MAX_VALUE; for(int s = 0; s < sl; s += cl(sb, s)) { final int f = norm(lc(cp(sb, s))); int c = m(m[t][s + 1] + 1, m[t + 1][s] + 1, m[t][s] + (e == f ? 0 : 1)); if(e == f2 && f == e2) c = m[t][s]; m[t + 1][s + 1] = c; d = Math.min(d, c); f2 = f; } if(d > k) return false; e2 = e; } return m[tl][sl] <= k; } /** * Gets the minimum of three values. * @param a 1st value * @param b 2nd value * @param c 3rd value * @return minimum */ private static int m(final int a, final int b, final int c) { final int d = a < b ? a : b; return d < c ? d : c; } /** * Compares two character arrays for equality. * @param tk token to be compared * @param sb second token to be compared * @return true if the arrays are equal */ private static boolean same(final byte[] tk, final byte[] sb) { int t = 0, s = 0; for(; t < tk.length && s < sb.length; t += cl(tk, t), s += cl(sb, s)) { if(lc(norm(cp(tk, t))) != lc(norm(cp(sb, t)))) return false; } return true; } }