package org.owasp.webscarab.util;
/*
* Calculates the Levenshtein distance between two sequences (Lists) of tokens
* This is great for showing which responses are similar or different
* to others. However, it is VERY slow, O(n*m), which bogs down really
* quickly if we start looking at sequences of a few thousand bytes :-(
*
* We optimize by tokenising the input into words, and comparing those
*
* An alternative might be the XDelta algorithm, see e.g.
* http://sourceforge.net/projects/javaxdelta/
*
* Also see a paper "A Linear Time, Constant Space Differencing Algorithm" by Burns and Long
*/
import java.util.List;
import java.util.Iterator;
/**
 * Computes the Levenshtein (edit) distance between a fixed baseline sequence
 * and any number of target sequences of the same element type.
 *
 * <p>The distance is the minimum number of single-element insertions,
 * deletions and substitutions needed to turn one sequence into the other.
 * Elements are compared with {@code equals}; {@code null} elements are
 * allowed and are considered equal only to other {@code null} elements.
 *
 * <p>Only two rows of the dynamic-programming matrix are kept, and they are
 * allocated once per instance and reused by every call. Each call visits
 * every (target element, baseline element) pair once.
 *
 * @param <T> the element (token) type of the compared sequences
 */
public class LevenshteinDistance<T> {

    /** Private snapshot of the baseline; its size always matches the row buffers. */
    private final List<T> _baseline;

    /** Row of the cost matrix currently being filled in. */
    private int[] _current;

    /** Row of the cost matrix completed for the previous target element. */
    private int[] _previous;

    /**
     * Creates a calculator that measures distances against {@code baseline}.
     *
     * <p>The list is copied, so later changes made by the caller to the list
     * passed in neither affect the results nor invalidate the row buffers,
     * which are sized from the baseline length.
     *
     * @param baseline the sequence every target is compared against
     * @throws NullPointerException if {@code baseline} is {@code null}
     */
    public LevenshteinDistance(List<T> baseline) {
        if (baseline == null) {
            throw new NullPointerException("baseline");
        }
        _baseline = new java.util.ArrayList<T>(baseline);
        _current = new int[_baseline.size() + 1];
        _previous = new int[_baseline.size() + 1];
    }

    /**
     * Returns the Levenshtein distance between the baseline and {@code target}.
     *
     * <p>The method is synchronized because the two row buffers are shared
     * instance state that every call overwrites.
     *
     * @param target the sequence to compare with the baseline
     * @return the number of insertions, deletions and substitutions needed to
     *         transform one sequence into the other
     * @throws NullPointerException if {@code target} is {@code null}
     */
    public synchronized int getDistance(List<T> target) {
        final int baseSize = _baseline.size();
        // Against an empty sequence the distance is simply the other length.
        if (baseSize == 0) {
            return target.size();
        }
        if (target.size() == 0) {
            return baseSize;
        }

        // Row 0: turning the empty target prefix into the first i baseline
        // elements costs i insertions. This also resets any state left in the
        // buffer by an earlier call.
        for (int i = 0; i <= baseSize; i++) {
            _current[i] = i;
        }

        for (T targObj : target) {
            // The row just completed becomes "previous"; the older buffer is
            // recycled as the row to fill next.
            int[] recycled = _previous;
            _previous = _current;
            _current = recycled;

            // Column 0: one more target element matched against an empty
            // baseline prefix costs one more edit.
            _current[0] = _previous[0] + 1;

            int i = 0;
            for (T baseObj : _baseline) {
                i++;
                int cost = sameElement(baseObj, targObj) ? 0 : 1;
                int viaPreviousRow = _previous[i] + 1;
                int viaCurrentRow = _current[i - 1] + 1;
                int viaDiagonal = _previous[i - 1] + cost;
                _current[i] = Math.min(Math.min(viaPreviousRow, viaCurrentRow), viaDiagonal);
            }
        }
        return _current[baseSize];
    }

    /**
     * Null-safe equality test: two {@code null}s match, otherwise the decision
     * is delegated to {@code baseObj.equals(targObj)}.
     */
    private static boolean sameElement(Object baseObj, Object targObj) {
        if (baseObj == null) {
            return targObj == null;
        }
        return baseObj.equals(targObj);
    }

    /** Splits {@code word} into a list with one {@code Character} per char. */
    private static List<Character> toCharacterList(String word) {
        List<Character> chars = new java.util.ArrayList<Character>(word.length());
        for (int k = 0; k < word.length(); k++) {
            chars.add(Character.valueOf(word.charAt(k)));
        }
        return chars;
    }

    /**
     * Small demonstration: prints the distance between the words
     * "meilenstein" and "levenshtein".
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String baselineWord = "levenshtein";
        String targetWord = "meilenstein";
        LevenshteinDistance<Character> ld =
                new LevenshteinDistance<Character>(toCharacterList(baselineWord));
        int distance = ld.getDistance(toCharacterList(targetWord));
        System.out.println("Distance between \"" + targetWord + "\" and \"" + baselineWord + "\": " + distance);
    }
}