package vroom.common.utilities;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import vroom.common.utilities.Utilities.Math;
/**
 * The class <code>LevenshteinDistance</code> computes the Levenshtein (edit) distance between two sequences of
 * objects, either with a lazily evaluated matrix of {@link Cell cells} ({@link #getDistance(List, List)},
 * {@link #getEditSequence(List, List)}) or with the classic two-row dynamic program
 * ({@link #getDistanceClassic(List, List, boolean)}).
 * <p>
 * In the lazy variant only the cells that are actually needed are created and evaluated; the original author states
 * a complexity of {@code O(n.d)} where {@code d} is the distance between the two seqs and {@code n} is the length of
 * the longest seq. Note that the backing array of cell references is nonetheless allocated eagerly with
 * {@code (|ref|+1) x (|seq|+1)} slots.
 * </p>
 * <p>
 * Creation date: Nov 29, 2011 - 2:52:33 PM
 *
 * @author Victor Pillac, <a href="http://uniandes.edu.co">Universidad de Los Andes</a>-<a
 *         href="http://copa.uniandes.edu.co">Copa</a> <a href="http://www.emn.fr">Ecole des Mines de Nantes</a>-<a
 *         href="http://www.irccyn.ec-nantes.fr/irccyn/d/en/equipes/Slp">SLP</a>
 * @version 1.0
 */
public class LevenshteinDistance<T> {

    /** Lazily populated matrix of {@link Cell} instances, indexed by {@code [ref index][seq index]} */
    private final Object[][] mMatrix;
    /** The reference sequence, prefixed with a {@code null} sentinel so that real elements start at index 1 */
    private final ArrayList<T> mRef;
    /** The evaluated sequence, prefixed with a {@code null} sentinel so that real elements start at index 1 */
    private final ArrayList<T> mSeq;

    /**
     * Creates a new <code>LevenshteinDistance</code>
     *
     * @param ref
     *            the reference sequence
     * @param seq
     *            the evaluated sequence
     */
    private LevenshteinDistance(List<T> ref, List<T> seq) {
        // Index 0 of both lists is a sentinel standing for the empty prefix
        mRef = new ArrayList<T>(ref.size() + 1);
        mRef.add(null);
        mRef.addAll(ref);

        mSeq = new ArrayList<T>(seq.size() + 1);
        mSeq.add(null);
        mSeq.addAll(seq);

        mMatrix = new Object[mRef.size()][mSeq.size()];
    }

    /**
     * Returns the distance between {@code ref} and {@code seq}, using lazy evaluation.
     *
     * @param ref
     *            the reference sequence
     * @param seq
     *            the evaluated sequence
     * @return the distance between {@code ref} and {@code seq}
     */
    public static <T> int getDistance(List<T> ref, List<T> seq) {
        LevenshteinDistance<T> dist = new LevenshteinDistance<T>(ref, seq);
        return dist.getDistance();
    }

    /**
     * Returns a minimum length edit sequence to transform {@code ref} into {@code seq}, using lazy evaluation.
     * <p>
     * The edits are listed from the end of the sequences towards their start (the matrix is walked back from its
     * south east corner), and {@link EditType#NONE} steps are omitted.
     * </p>
     * <p>
     * The original author states a time complexity of {@code O(n(1+d))} (where {@code d} is the distance); the
     * backing matrix of cell references is allocated in full, with {@code (|ref|+1) x (|seq|+1)} slots.
     * </p>
     *
     * @param ref
     *            the reference sequence
     * @param seq
     *            the evaluated sequence
     * @return a minimum length edit sequence to transform {@code ref} into {@code seq}
     */
    public static <T> List<Edit<T>> getEditSequence(List<T> ref, List<T> seq) {
        LevenshteinDistance<T> dist = new LevenshteinDistance<T>(ref, seq);
        return dist.getEditSequence();
    }

    /**
     * Returns the <a href="http://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein distance</a> between two
     * sequences of objects, using the classic version of the dynamic programming algorithm.
     * <p>
     * Time complexity of {@code O(n²)} and space complexity of {@code O(n)} (only two rows of the table are kept).
     * </p>
     * <p>
     * The returned counters always satisfy {@code distance == deletions + insertions + substitutions}.
     * </p>
     *
     * @param s
     *            the first sequence
     * @param t
     *            the second sequence
     * @param directed
     *            {@code true} if {@code s} is the reference and {@code t} the evaluated sequence, changes the outputs
     *            of the number of deletions, insertions and substitutions. Set to {@code false} if only the distance is
     *            required (the two sequences may then be swapped so that the shorter one indexes the columns)
     * @return an array containing: [distance, deletions, insertions, substitutions]
     */
    public static <T> int[] getDistanceClassic(List<T> s, List<T> t, boolean directed) {
        if (!directed && s.size() < t.size()) {
            // Keep the rolling rows as short as possible
            List<T> o = t;
            t = s;
            s = o;
        }

        // Two rolling rows; each cell stores [distance, deletions, insertions, substitutions]
        int[][][] d = new int[2][t.size() + 1][4];

        // Row 0: turning the empty prefix of s into the first j elements of t takes j insertions
        for (int j = 0; j < d[0].length; j++) {
            d[0][j][0] = j;
            d[0][j][2] = j;
        }

        int iRow = 1, predIRow = 0;
        int i = 1;
        for (Object sNext : s) {
            // Column 0: turning the first i elements of s into the empty sequence takes i deletions.
            // The row is recycled, hence every component is reset explicitly.
            d[iRow][0][0] = i;
            d[iRow][0][1] = i;
            d[iRow][0][2] = 0;
            d[iRow][0][3] = 0;

            int j = 1;
            for (Object tNext : t) {
                if (Utilities.equal(tNext, sNext)) {
                    // Same element: inherit everything from the diagonal predecessor
                    System.arraycopy(d[predIRow][j - 1], 0, d[iRow][j], 0, 4);
                } else {
                    int[][] pred = new int[][] { //
                            d[predIRow][j], // a deletion
                            d[iRow][j - 1], // an insertion
                            d[predIRow][j - 1] // a substitution
                    };
                    // argMin is used as [index of the minimum, value of the minimum]
                    int[] argMin = Math.argMin(//
                            new int[] { pred[0][0], pred[1][0], pred[2][0] });

                    // Update the distance
                    d[iRow][j][0] = argMin[1] + 1;
                    // Copy the counters from the selected predecessor
                    System.arraycopy(pred[argMin[0]], 1, d[iRow][j], 1, 3);
                    // Increment the counter of the detected move (pred order matches the counter order)
                    d[iRow][j][argMin[0] + 1]++;
                }
                j++;
            }

            predIRow = iRow;
            iRow = 1 - iRow;
            i++;
        }

        // predIRow is the last row that was filled (row 0 if s is empty)
        return d[predIRow][t.size()];
    }

    /**
     * Returns the distance between the reference and the seq
     *
     * @return the distance between the reference and the seq
     */
    private int getDistance() {
        return getSouthEastCell().value();
    }

    /**
     * Return a minimum length edit sequence to go from the reference seq to the evaluated seq.
     * <p>
     * Edits are collected while walking back from the south east cell to the origin, so they are ordered from the end
     * of the sequences towards their start. Steps of type {@link EditType#NONE} are skipped.
     * </p>
     *
     * @return a minimum length edit sequence to go from the reference seq to the evaluated seq
     */
    private List<Edit<T>> getEditSequence() {
        // Evaluating the distance first is required: predecessor links are only set once a cell is evaluated
        int distance = getDistance();
        ArrayList<Edit<T>> edits = new ArrayList<Edit<T>>(distance);

        Cell cell = getSouthEastCell();
        while (cell.getPred() != null) {
            if (cell.getEdit().getType() != EditType.NONE) {
                edits.add(cell.getEdit());
            }
            cell = cell.getPred();
        }
        return edits;
    }

    /**
     * Returns the cell at position {@code [i,j]}, creating it if it does not exist yet
     *
     * @param i
     *            the index in the (sentinel-prefixed) reference seq
     * @param j
     *            the index in the (sentinel-prefixed) evaluated seq
     * @return the cell at position {@code [i,j]}
     */
    @SuppressWarnings("unchecked")
    private Cell getCell(int i, int j) {
        if (mMatrix[i][j] == null) {
            mMatrix[i][j] = new Cell(i, j);
        }
        // Only Cell instances of this enclosing instance are ever stored in the matrix
        return (Cell) mMatrix[i][j];
    }

    /**
     * Returns the south east cell, i.e. the cell holding the distance between the complete sequences
     *
     * @return the south east cell
     */
    private Cell getSouthEastCell() {
        return getCell(mRef.size() - 1, mSeq.size() - 1);
    }

    /**
     * <code>Cell</code> represents a cell of the {@link LevenshteinDistance}, it contains a lazy evaluated value, and a
     * reference to its predecessor and the move to go from the predecessor to this cell
     */
    protected class Cell {
        final int i, j;
        /** The value of this cell, or {@code -1} while it has not been evaluated */
        int mEval;
        /** The predecessor of this cell, {@code null} for the origin or while the cell has not been evaluated */
        Cell mPred;
        EditType mType;
        /** Lazily created edit describing the move from the predecessor to this cell */
        Edit<T> mEdit;

        /**
         * Creates a new <code>Cell</code>. Cells of the first row and first column are evaluated immediately (which
         * recursively creates their predecessors along the border); all other cells are evaluated on demand.
         *
         * @param i
         *            the index in the reference seq
         * @param j
         *            the index in the evaluated seq
         */
        public Cell(int i, int j) {
            this.i = i;
            this.j = j;
            mEval = -1;
            mPred = null;
            if (i == 0 && j == 0) {
                // Origin: empty prefix against empty prefix
                mEval = 0;
                mType = EditType.NONE;
            } else if (i == 0) {
                // First row: only insertions
                mEval = j;
                mPred = getCell(i, j - 1);
                mType = EditType.INS;
            } else if (j == 0) {
                // First column: only deletions
                mEval = i;
                mPred = getCell(i - 1, j);
                mType = EditType.DEL;
            }
        }

        /**
         * Returns this cell predecessor
         *
         * @return this cell predecessor, {@code null} for the origin cell or if this cell was not evaluated yet
         */
        private Cell getPred() {
            return mPred;
        }

        /**
         * Lazy evaluation of this cell
         *
         * @return the value of this cell
         */
        private int value() {
            if (mEval == -1) {
                evaluate();
            }
            return mEval;
        }

        /**
         * Evaluate this cell: selects the predecessor and the edit type, then derives the value from the predecessor
         */
        private void evaluate() {
            // NorthWest cell
            Cell nw = getCell(i - 1, j - 1);
            if (Utilities.equal(mRef.get(i), mSeq.get(j))) {
                mPred = nw;
                mType = EditType.NONE;
            } else {
                // West cell (insertion)
                Cell w = getCell(i, j - 1);
                if (w.value() < nw.value()) {
                    // Optimization: we know that w<=n in this case (see Lloyd page),
                    // so the north cell does not need to be evaluated
                    mPred = w;
                    mType = EditType.INS;
                } else {
                    // North cell (deletion)
                    Cell n = getCell(i - 1, j);
                    if (nw.value() <= n.value()) {
                        mPred = nw;
                        mType = EditType.SUB;
                    } else {
                        mPred = n;
                        mType = EditType.DEL;
                    }
                }
            }
            // Update the evaluation
            mEval = mPred.value() + mType.getCost();
        }

        /**
         * Return the edit required to move from this cell predecessor to this cell. The edit index is this cell's row
         * index {@code i}; the new element is taken from the evaluated seq for {@link EditType#SUB} and
         * {@link EditType#INS}, and is the reference element itself otherwise.
         *
         * @return the edit required to move from this cell predecessor to this cell
         */
        private Edit<T> getEdit() {
            if (mEdit == null) {
                if (mType == EditType.SUB || mType == EditType.INS) {
                    mEdit = new Edit<T>(mType, i, mRef.get(i), mSeq.get(j));
                } else {
                    mEdit = new Edit<T>(mType, i, mRef.get(i), mRef.get(i));
                }
            }
            return mEdit;
        }
    }

    /**
     * <code>Edit</code> describes a single edit operation: its type, where it applies in the reference sequence, and
     * the elements involved.
     */
    public static class Edit<T> {
        private final EditType mType;
        private final int mEditIndex;
        private final T mEditedElement;
        private final T mNewElement;

        /**
         * Getter for <code>type</code>
         *
         * @return the type
         */
        public EditType getType() {
            return mType;
        }

        /**
         * Getter for the index at which the edit occurred in the reference sequence: the deleted element for
         * {@link EditType#DEL}, the element that was replaced for {@link EditType#SUB}, and the element after which an
         * element was inserted for {@link EditType#INS}.
         * <p>
         * For edits produced by {@link LevenshteinDistance#getEditSequence(List, List)} the index is 1-based: the
         * first element of the reference sequence has index {@code 1}, and index {@code 0} denotes an insertion before
         * the first element.
         * </p>
         *
         * @return the index at which the edit occurred in the reference sequence
         */
        public int getEditIndex() {
            return mEditIndex;
        }

        /**
         * Getter for the edited element in the reference sequence: the deleted element for {@link EditType#DEL}, the
         * element that was replaced for {@link EditType#SUB}, and the element after which an element was inserted for
         * {@link EditType#INS} ({@code null} when inserting before the first element).
         *
         * @return the edited element in the reference sequence
         */
        public T getEditedElement() {
            return mEditedElement;
        }

        /**
         * Getter for the new element: the deleted element for {@link EditType#DEL}, the new element for
         * {@link EditType#SUB}, and the inserted element for {@link EditType#INS}.
         *
         * @return the new element
         */
        public T getNewElement() {
            return mNewElement;
        }

        /**
         * Creates a new <code>Edit</code>
         *
         * @param type
         *            the type of edit
         * @param editIndex
         *            the index at which the edit was made
         * @param editedElement
         *            the element that was edited
         * @param newElement
         *            the element that was removed, inserted, or replaced a previous element
         */
        public Edit(EditType type, int editIndex, T editedElement, T newElement) {
            mType = type;
            mEditIndex = editIndex;
            mEditedElement = editedElement;
            mNewElement = newElement;
        }

        @Override
        public String toString() {
            if (mNewElement == null) {
                return String.format("%s[%s@%s]", mType, mEditedElement, mEditIndex);
            } else {
                return String
                        .format("%s[%s@%s:%s]", mType, mEditedElement, mEditIndex, mNewElement);
            }
        }
    }

    /**
     * The enum <code>EditType</code> represent the different edits measured by the Levenshtein distance.
     * <p>
     * Creation date: Nov 29, 2011 - 2:54:32 PM
     *
     * @author Victor Pillac, <a href="http://uniandes.edu.co">Universidad de Los Andes</a>-<a
     *         href="http://copa.uniandes.edu.co">Copa</a> <a href="http://www.emn.fr">Ecole des Mines de Nantes</a>-<a
     *         href="http://www.irccyn.ec-nantes.fr/irccyn/d/en/equipes/Slp">SLP</a>
     * @version 1.0
     */
    public static enum EditType {
        NONE(0), DEL(1), SUB(1), INS(1);

        final int mCost;

        /**
         * Returns the cost of this edit
         *
         * @return the cost of this edit
         */
        public int getCost() {
            return mCost;
        }

        private EditType(int cost) {
            mCost = cost;
        }
    }
}