/*
* Freeplane - mind map editor
* Copyright (C) 2012 Dimitry Polivaev
*
* This file's author is Felix Natter
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.freeplane.features.filter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Stack;
/**
 * Pseudo-Damerau-Levenshtein (aka "Optimal String Alignment" distance)
 * implementation which allows some non-adjacent transpositions(?)
 * Computes the edit distance with insertions/deletions/substitutions/transpositions.
 *
 * Optionally the edit distance of a semi-global alignment is computed which
 * allows the search term to be shifted free-of-cost (i.e. dist("file", "a file is")==0).
 *
 * Typical usage: call {@link #init(String, String, boolean, boolean)} first, then
 * {@link #distance()}, {@link #matchProb()} or {@link #computeAlignments(double)}.
 * The instance keeps the current term/text and the dynamic programming matrix in
 * mutable fields.
 *
 * Some properties are explained in the unit test, {@link org.freeplane.features.filter.EditDistanceStringMatchingStrategiesTest}.
 *
 * TODO: use unicode code points instead of chars !!
 * (but neither simplyhtml nor freeplane are currently codepoint-safe...)
 *
 * @author Felix Natter <fnatter@gmx.net>
 *
 */
public class PseudoDamerauLevenshtein implements EditDistanceStringMatchingStrategy {
    /** Cost of one inserted or deleted character. */
    private static final int COST_INDEL = 1;
    /** Cost of substituting one character by another. */
    private static final int COST_MISMATCH = 1;
    /** Cost of swapping two adjacent characters. */
    private static final int COST_TRANSPOS = 1;

    /** Dynamic programming matrix, indexed [position in searchTerm][position in searchText]. */
    private int[][] matrix;
    private String searchTerm;
    private String searchText;
    private Type type;
    /** Partial alignments whose trace-back through the matrix is not finished yet. */
    private Stack<Alignment> alignmentsInProgress;
    /** Alignments that were traced back to the matrix origin. */
    private ArrayList<Alignment> alignmentsDone;

    /**
     * One (possibly still partial) alignment of the search term with the search text.
     *
     * The two aligned strings are built from right to left while the matrix is
     * traced back; gaps are written as '-'. {@code r}/{@code c} hold the matrix
     * cell from which the trace-back still has to continue, so an alignment is
     * complete when both are 0.
     */
    public class Alignment implements Comparable<Alignment>
    {
        private final String searchTermString;
        private final String searchTextString;
        private final double prob;
        private final int matchStart;
        private final int matchEnd;
        private final int r, c;

        /**
         * @param searchTermString aligned part of the search term (gaps as '-')
         * @param searchTextString aligned part of the search text (gaps as '-')
         * @param prob match probability assigned to this alignment
         * @param matchStart start index (inclusive) of the match in the search text
         * @param matchEnd end index (exclusive) of the match in the search text
         * @param r matrix row (search term position) still to be traced back
         * @param c matrix column (search text position) still to be traced back
         */
        public Alignment(final String searchTermString, final String searchTextString, final double prob,
                final int matchStart, final int matchEnd, final int r, final int c)
        {
            this.searchTermString = searchTermString;
            this.searchTextString = searchTextString;
            this.prob = prob;
            this.matchStart = matchStart;
            this.matchEnd = matchEnd;
            this.r = r;
            this.c = c;
        }

        /** @return start index (inclusive) of the match in the search text */
        public int getMatchStart()
        {
            return matchStart;
        }

        /** @return end index (exclusive) of the match in the search text */
        public int getMatchEnd()
        {
            return matchEnd;
        }

        /**
         * @param other alignment to compare the match range with
         * @return true if the match start of one alignment lies inside the range
         *         [matchStart, matchEnd) of the other one
         */
        public boolean overlapsWith(final Alignment other)
        {
            final boolean otherStartsInThis = matchStart <= other.matchStart && other.matchStart <= matchEnd - 1;
            final boolean thisStartsInOther = other.matchStart <= matchStart && matchStart <= other.matchEnd - 1;
            return otherStartsInThis || thisStartsInOther;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + getOuterType().hashCode();
            result = prime * result + c;
            result = prime * result + matchEnd;
            result = prime * result + matchStart;
            final long probBits = Double.doubleToLongBits(prob);
            result = prime * result + (int) (probBits ^ (probBits >>> 32));
            result = prime * result + r;
            result = prime * result + ((searchTermString == null) ? 0 : searchTermString.hashCode());
            result = prime * result + ((searchTextString == null) ? 0 : searchTextString.hashCode());
            return result;
        }

        /**
         * Two alignments are equal if they belong to the same enclosing instance
         * and all of their fields are equal.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final Alignment other = (Alignment) obj;
            return getOuterType().equals(other.getOuterType())
                && c == other.c
                && matchEnd == other.matchEnd
                && matchStart == other.matchStart
                && Double.doubleToLongBits(prob) == Double.doubleToLongBits(other.prob)
                && r == other.r
                && sameString(searchTermString, other.searchTermString)
                && sameString(searchTextString, other.searchTextString);
        }

        /**
         * @return the matched part of the enclosing instance's <em>current</em>
         *         search text, i.e. {@code searchText.substring(matchStart, matchEnd)}
         */
        public String getMatch()
        {
            return searchText.substring(matchStart, matchEnd);
        }

        /**
         * Orders alignments by probability and, for equal probabilities, by the
         * length of the match.
         */
        @Override
        public int compareTo(final Alignment other)
        {
            if (prob == other.prob)
            {
                // the match length is matchEnd - matchStart; no need to build the substrings
                return compareInts(matchEnd - matchStart, other.matchEnd - other.matchStart);
            }
            return Double.compare(prob, other.prob);
        }

        /** Debugging aid: writes this alignment to standard output. */
        public void print()
        {
            System.out.format("Alignment@%x[%.2f]:\n%s\n%s\n=> matches '%s' [%d,%d]\n",
                    hashCode(), prob, searchTermString, searchTextString, getMatch(),
                    matchStart,matchEnd);
        }

        @Override
        public String toString()
        {
            return String.format("Ali@%x[%s,%.2f,%d,%d]", hashCode(), getMatch(), prob, matchStart, matchEnd);
        }

        private PseudoDamerauLevenshtein getOuterType() {
            return PseudoDamerauLevenshtein.this;
        }
    }

    public PseudoDamerauLevenshtein() {
    }

    /** Null-safe string equality. */
    private static boolean sameString(final String a, final String b)
    {
        return a == null ? b == null : a.equals(b);
    }

    /** Three-way comparison of two ints (negative, zero or positive). */
    private static int compareInts(final int a, final int b)
    {
        return a < b ? -1 : (a == b ? 0 : 1);
    }

    /**
     * @param i 1-based position in the search term
     * @param j 1-based position in the search text
     * @return true if both characters are equal or the search text character is '-'
     */
    private boolean isMatch(int i, int j)
    {
        final char termChar = searchTerm.charAt(i - 1);
        final char textChar = searchText.charAt(j - 1);
        return termChar == textChar || textChar == '-';
    }

    /**
     * Fills the dynamic programming matrix for the current term/text and
     * returns the edit distance. The matrix is kept in a field afterwards.
     *
     * @return for a global alignment the distance between the complete strings,
     *         otherwise the minimum of the last matrix row (best distance of the
     *         search term to any position in the search text)
     */
    public int distance() {
        final int termLength = searchTerm.length();
        final int textLength = searchText.length();
        matrix = new int[termLength + 1][textLength + 1]; // [row][col]

        // first column: start-gap penalties for searchTerm
        for (int i = 0; i <= termLength; i++)
        {
            matrix[i][0] = i * COST_INDEL;
        }
        // first row: start-gap penalties for searchText; for a semi-global alignment
        // the row keeps the zeros of the freshly allocated array (free shift)
        if (type == Type.Global)
        {
            for (int j = 1; j <= textLength; j++)
            {
                matrix[0][j] = j * COST_INDEL;
            }
        }

        // compute the rest of the matrix
        for (int i = 1; i <= termLength; i++)
        {
            for (int j = 1; j <= textLength; j++)
            {
                final int costTryMatch = matrix[i-1][j-1] + (isMatch(i, j) ? 0 : COST_MISMATCH);
                final int costIns = matrix[i-1][j] + COST_INDEL;
                final int costDel = matrix[i][j-1] + COST_INDEL;
                int best = Math.min(costTryMatch, Math.min(costIns, costDel));
                // transposition of two adjacent characters
                if (i >= 2 && j >= 2 &&
                        searchTerm.charAt(i-2) == searchText.charAt(j-1) &&
                        searchTerm.charAt(i-1) == searchText.charAt(j-2))
                {
                    best = Math.min(best, matrix[i-2][j-2] + COST_TRANSPOS);
                }
                matrix[i][j] = best;
            }
        }

        if (type == Type.Global)
        {
            return matrix[termLength][textLength];
        }
        int min = Integer.MAX_VALUE;
        for (int j = 0; j <= textLength; j++)
        {
            min = Math.min(min, matrix[termLength][j]);
        }
        return min;
    }

    /** Debugging aid: writes the given matrix to standard output. */
    private void writeMatrix(int[][] H)
    {
        for (int i = 0; i < H.length; i++)
        {
            for (int j = 0; j < H[0].length; j++)
            {
                System.out.format(" %3d", H[i][j]);
            }
            System.out.println();
        }
    }

    /**
     * Computes the non-overlapping alignments (matches) of the current search
     * term in the current search text whose probability exceeds the threshold.
     *
     * @param minProb only alignments with a probability greater than this value are considered
     * @return the alignments sorted by match start position
     */
    public List<Alignment> computeAlignments(final double minProb)
    {
        alignmentsInProgress = new Stack<Alignment>();
        alignmentsDone = new ArrayList<Alignment>();
        final int dist = distance(); // this computes the Dynamic Programming matrix according to Levenshtein
        final int termLength = searchTerm.length();
        final int textLength = searchText.length();

        if (type == Type.Global)
        {
            // A global alignment has exactly one end point. If it misses the threshold
            // there is no match at all (it must not fall through to the semi-global case).
            final double prob = getMatchProb(dist);
            if (prob > minProb)
            {
                alignmentsInProgress.push(new Alignment("", "", prob, 0, textLength,
                        termLength, textLength));
            }
        }
        else
        {
            // semi-global "substring" alignment: every cell of the last row whose
            // probability is good enough is a possible end point of a match
            final StringBuilder searchTermSuffix = new StringBuilder();
            final StringBuilder searchTextSuffix = new StringBuilder();
            for (int c = textLength + 1; c >= 1; c--)
            {
                if (c <= textLength)
                {
                    // the text behind the end point is aligned with gaps
                    searchTermSuffix.append('-');
                    searchTextSuffix.insert(0, searchText.charAt(c-1));
                }
                final double prob = getMatchProb(matrix[termLength][c-1]);
                if (prob > minProb)
                {
                    alignmentsInProgress.push(new Alignment(searchTermSuffix.toString(), searchTextSuffix.toString(),
                            prob, 0, textLength - searchTextSuffix.length(), termLength, c - 1));
                }
            }
        }

        while (!alignmentsInProgress.isEmpty())
        {
            developAlignment(alignmentsInProgress.pop());
        }

        // filter (overlapping) alignments
        alignmentsDone = filterAlignments(alignmentsDone);
        sortAlignments(alignmentsDone);

        matrix = null;
        return alignmentsDone;
    }

    /**
     * Keep only non-overlapping matches (alignments) while preferring alignments with high score (prob)
     * TODO: this is a heuristic, is the problem NP complete?
     * => probably, see "maximum set packing"
     *
     * The given list is not modified.
     *
     * @param alignments alignments list to filter
     * @return filtered alignment list (a new list, best alignment first)
     */
    static ArrayList<Alignment> filterAlignments(final ArrayList<Alignment> alignments)
    {
        // work on a copy so that the caller's list is neither reordered nor shortened
        final ArrayList<Alignment> bestFirst = new ArrayList<Alignment>(alignments);
        // sort by score and match length (see Alignment.compareTo())
        Collections.sort(bestFirst, Collections.reverseOrder());

        final ArrayList<Alignment> selected = new ArrayList<Alignment>(bestFirst.size());
        for (Alignment candidate : bestFirst)
        {
            boolean overlapsSelected = false;
            for (Alignment kept : selected)
            {
                if (candidate.overlapsWith(kept))
                {
                    // 'kept' was sorted before 'candidate', so it is at least as good: drop 'candidate'
                    overlapsSelected = true;
                    break;
                }
            }
            if (!overlapsSelected)
            {
                selected.add(candidate);
            }
        }
        return selected;
    }

    /**
     * Sort alignments (matches) by start positions
     * @param alignments list of alignments to sort (sorted in place)
     */
    static void sortAlignments(final ArrayList<Alignment> alignments)
    {
        Collections.sort(alignments, new Comparator<Alignment>()
        {
            public int compare(Alignment o1, Alignment o2) {
                return compareInts(o1.matchStart, o2.matchStart);
            }
        });
    }

    /**
     * Performs one trace-back step for a partial alignment: a finished alignment
     * (matrix origin reached) is added to the done list, otherwise every
     * predecessor cell that explains the value of the current cell yields a new
     * partial alignment on the work stack.
     *
     * @param ali alignment to continue
     */
    private void developAlignment(final Alignment ali)
    {
        if (ali.r == 0 && ali.c == 0)
        {
            alignmentsDone.add(ali);
            return;
        }
        final int current = matrix[ali.r][ali.c];

        // match/mismatch: consume one character of both strings
        if (ali.r >= 1 && ali.c >= 1
                && current == matrix[ali.r-1][ali.c-1] + (isMatch(ali.r, ali.c) ? 0 : COST_MISMATCH))
        {
            alignmentsInProgress.push(new Alignment(
                    searchTerm.charAt(ali.r-1) + ali.searchTermString,
                    searchText.charAt(ali.c-1) + ali.searchTextString,
                    ali.prob, ali.matchStart, ali.matchEnd, ali.r - 1, ali.c - 1)
            );
        }
        // semi-global: the search term is used up, the remaining text prefix is
        // skipped free of cost and the match starts behind it
        if (type == Type.SemiGlobal && ali.r == 0)
        {
            final int prefixLength = ali.c;
            final StringBuilder gaps = new StringBuilder(prefixLength);
            for (int k = 0; k < prefixLength; k++)
            {
                gaps.append('-');
            }
            alignmentsInProgress.push(new Alignment(
                    gaps.toString() + ali.searchTermString,
                    searchText.substring(0, prefixLength) + ali.searchTextString,
                    ali.prob, ali.matchStart + prefixLength, ali.matchEnd, 0, 0)
            );
        }
        // insertion: a text character is aligned with a gap in the term
        if (ali.c >= 1 && current == matrix[ali.r][ali.c-1] + COST_INDEL)
        {
            alignmentsInProgress.push(new Alignment(
                    "-" + ali.searchTermString,
                    searchText.charAt(ali.c-1) + ali.searchTextString,
                    ali.prob, ali.matchStart, ali.matchEnd, ali.r, ali.c - 1)
            );
        }
        // deletion: a term character is aligned with a gap in the text
        if (ali.r >= 1 && current == matrix[ali.r-1][ali.c] + COST_INDEL)
        {
            alignmentsInProgress.push(new Alignment(
                    searchTerm.charAt(ali.r-1) + ali.searchTermString,
                    "-" + ali.searchTextString,
                    ali.prob, ali.matchStart, ali.matchEnd, ali.r - 1, ali.c)
            );
        }
        // Damerau-Extension (transpositions): consume two swapped characters of both strings
        if (ali.r >= 2 && ali.c >= 2 && current == matrix[ali.r-2][ali.c-2] + COST_TRANSPOS &&
                searchTerm.charAt(ali.r-2) == searchText.charAt(ali.c-1) &&
                searchTerm.charAt(ali.r-1) == searchText.charAt(ali.c-2))
        {
            alignmentsInProgress.push(new Alignment(
                    searchTerm.substring(ali.r - 2, ali.r) + ali.searchTermString,
                    searchText.substring(ali.c - 2, ali.c) + ali.searchTextString,
                    ali.prob, ali.matchStart, ali.matchEnd, ali.r - 2, ali.c - 2)
            );
        }
    }

    /**
     * Converts an edit distance into a match probability: 1 - distance / length,
     * where length is the search term length (semi-global) or the length of the
     * shorter string (global).
     *
     * If that length is 0 the float division yields NaN or an infinity, so the
     * result is NaN or negative infinity and never exceeds a finite threshold.
     *
     * @param distance edit distance
     * @return match probability (at most 1)
     */
    private float getMatchProb(final int distance)
    {
        final int divisor = (type == Type.SemiGlobal)
            ? searchTerm.length()
            : Math.min(searchTerm.length(), searchText.length());
        return 1.0F - ((float) distance / divisor);
    }

    /**
     * Computes the match probability of the current term/text and releases the
     * dynamic programming matrix.
     *
     * @return match probability, see {@link #distance()}
     */
    public float matchProb()
    {
        final int dist = distance();
        matrix = null;
        return getMatchProb(dist);
    }

    /**
     * Sets the strings to compare and the kind of alignment.
     *
     * @param searchTerm the term to look for
     * @param searchText the text to search in
     * @param subStringMatch true for a semi-global (substring) alignment, false for a global one
     * @param caseSensitive if false, both strings are lower-cased (default locale) before comparing
     * @throws IllegalArgumentException if searchTerm or searchText is null
     */
    public void init(String searchTerm, String searchText,
            boolean subStringMatch, boolean caseSensitive)
    {
        if (searchTerm == null || searchText == null)
        {
            throw new IllegalArgumentException("Null searchText/searchTerm!");
        }
        if (caseSensitive)
        {
            this.searchTerm = searchTerm;
            this.searchText = searchText;
        }
        else
        {
            this.searchTerm = searchTerm.toLowerCase();
            this.searchText = searchText.toLowerCase();
        }
        this.type = subStringMatch ? Type.SemiGlobal : Type.Global;
    }

    /**
     * Initializes this instance with the given strings and checks whether the
     * match probability exceeds {@code StringMatchingStrategy.APPROXIMATE_MATCHING_MINPROB}.
     *
     * @param searchTerm the term to look for
     * @param searchText the text to search in
     * @param subStringMatch true for a semi-global (substring) alignment, false for a global one
     * @param caseSensitive if false, the comparison ignores case
     * @return true if the strings match approximately
     */
    public boolean matches(String searchTerm, String searchText,
            boolean subStringMatch, boolean caseSensitive)
    {
        init(searchTerm, searchText, subStringMatch, caseSensitive);
        return matchProb() > StringMatchingStrategy.APPROXIMATE_MATCHING_MINPROB;
    }
}