/*
* eXist Open Source Native XML Database
* Copyright (C) 2001-09 Wolfgang M. Meier
* wolfgang@exist-db.org
* http://exist.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* $Id$
*/
package org.exist.xquery.functions.text;
import org.exist.storage.TermMatcher;
/**
* A fuzzy implementation of {@link org.exist.storage.TermMatcher}. It calculates
* the Levenshtein distance between the index and the search term.
*
* @author Wolfgang Meier (wolfgang@exist-db.org)
*/
public class FuzzyMatcher implements TermMatcher {
private final String searchTerm;
private final int termLength;
private final double threshold;
public FuzzyMatcher(String searchTerm, double threshold) {
this.searchTerm = searchTerm;
this.termLength = searchTerm.length();
this.threshold = threshold;
}
/* (non-Javadoc)
* @see org.exist.storage.TermMatcher#matches(java.lang.String)
*/
public boolean matches(CharSequence text) {
if(searchTerm.equals(text))
return true;
int textlen = text.length();
int dist = editDistance(text, searchTerm, textlen, termLength);
double distance = 1 - ((double)dist / (double)Math.min(textlen, termLength));
return distance > threshold;
}
/**
Finds and returns the smallest of three integers
*/
private static final int min(int a, int b, int c) {
int t = (a < b) ? a : b;
return (t < c) ? t : c;
}
/**
* This static array saves us from the time required to create a new array
* everytime editDistance is called.
*/
private int e[][] = new int[1][1];
/**
Levenshtein distance also known as edit distance is a measure of similiarity
between two strings where the distance is measured as the number of character
deletions, insertions or substitutions required to transform one string to
the other string.
<p>This method takes in four parameters; two strings and their respective
lengths to compute the Levenshtein distance between the two strings.
The result is returned as an integer.
*/
private final int editDistance(CharSequence s, String t, int n, int m) {
if (e.length <= n || e[0].length <= m) {
e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
}
int d[][] = e; // matrix
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
if (n == 0) return m;
if (m == 0) return n;
// init matrix d
for (i = 0; i <= n; i++) d[i][0] = i;
for (j = 0; j <= m; j++) d[0][j] = j;
// start computing edit distance
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
for (j = 1; j <= m; j++) {
if (s_i != t.charAt(j-1))
d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
}
}
// we got the result!
return d[n][m];
}
}