package com.bericotech.clavin.util; import java.util.Arrays; import java.util.SortedMap; import java.util.TreeMap; /*##################################################################### * * CLAVIN (Cartographic Location And Vicinity INdexer) * --------------------------------------------------- * * Copyright (C) 2012-2013 Berico Technologies * http://clavin.bericotechnologies.com * * ==================================================================== * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * ==================================================================== * * DamerauLevenshtein.java * *###################################################################*/ /** * Utility functions for calculating edit distance between strings, * consistent with the Damerau-Levenshtein metric. * * Transposition, substitution, insertion, and deletion operations are * all considered to be one edit each. Multiple substring edits (e.g., * adjacent transpositions) are supported, unlike in "optimal string * alignment distance" where no substring may be edited more than once. */ public class DamerauLevenshtein { // sentinel value for the end of contents in an "infinite" array final static Null endMarker = new Null(); /** * Computes the true Damerau–Levenshtein edit distance * (with adjacent transpositions) between two given strings.<br><br> * * Based on <a href="http://en.wikipedia.org/wiki/Damerau–Levenshtein_distance">C# code from Wikipedia</a>. * * @param str1 First string being compared * @param str2 Second string being compared * @return Edit distance between strings */ public static int damerauLevenshteinDistance(String str1, String str2) { // return fast if one or both strings is empty or null if ((str1 == null) || str1.isEmpty()) { if ((str2 == null) || str2.isEmpty()) { return 0; } else { return str2.length(); } } else if ((str2 == null) || str2.isEmpty()) { return str1.length(); } // split strings into string arrays String[] stringArray1 = str1.split(""); String[] stringArray2 = str2.split(""); // initialize matrix values int[][] matrix = new int[stringArray1.length + 2][stringArray2.length + 2]; int bound = stringArray1.length + stringArray2.length; matrix[0][0] = bound; for (int i = 0; i <= stringArray1.length; i++) { matrix[i + 1][1] = i; matrix[i + 1][0] = bound; } for (int j = 0; j <= stringArray2.length; j++) { matrix[1][j + 1] = j; matrix[0][j + 1] = bound; } // initialize dictionary SortedMap<String, Integer> dictionary = new TreeMap<String, Integer>(); for (String letter : (str1 + str2).split("")) { if (!dictionary.containsKey(letter)) { dictionary.put(letter, 0); } } // compute edit distance between strings for (int i = 1; i <= stringArray1.length; i++) { int index = 0; for (int j = 1; j <= stringArray2.length; j++) { int i1 = dictionary.get(stringArray2[j - 1]); int j1 = index; if (stringArray1[i - 1].equals(stringArray2[j - 1])) { matrix[i + 1][j + 1] = matrix[i][j]; index = j; } else { matrix[i + 1][j + 1] = Math.min(matrix[i][j], Math.min(matrix[i + 1][j], matrix[i][j + 1])) + 1; } matrix[i + 1][j + 1] = Math.min(matrix[i + 1][j + 1], matrix[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)); } dictionary.put(stringArray1[i - 1], i); } return matrix[stringArray1.length + 1][stringArray2.length + 1]; } /** * Convenience method for calling {@link #damerauLevenshteinDistance(String str1, String str2)} * when you don't care about case sensitivity. * * @param str1 First string being compared * @param str2 Second string being compared * @return Case-insensitive edit distance between strings */ public static int damerauLevenshteinDistanceCaseInsensitive(String str1, String str2) { return damerauLevenshteinDistance(str1.toLowerCase(), str2.toLowerCase()); } /** * Fast method for determining whether the Damerau-Levenshtein edit * distance between two strings is less than 2. * * Returns as quick as possibly by stopping once multiple edits are * found. Significantly faster than {@link #damerauLevenshteinDistance(String str1, String str2)} * which explores every path between every string to get the exact * edit distance. Despite the speed boost, we maintain consistency * with {@link #damerauLevenshteinDistance(String str1, String str2)}. * * @param str1 First string being compared * @param str2 Second string being compared * @return True if DL edit distance < 2, false otherwise */ public static boolean isEditDistance1(String str1, String str2) { // one or both strings is empty or null if ((str1 == null) || str1.isEmpty()) { if ((str2 == null) || str2.isEmpty()) { return true; } else { return (str2.length() <= 1); } } else if ((str2 == null) || str2.isEmpty()) { return (str1.length() <= 1); } // difference between string lengths ensures edit distance > bound if (Math.abs(str1.length() - str2.length()) > 1) return false; // initialize counters int offset1 = 0; int offset2 = 0; int i = 0; InfiniteCharArray chars1 = new InfiniteCharArray(str1.toCharArray()); InfiniteCharArray chars2 = new InfiniteCharArray(str2.toCharArray()); while (!chars1.get(i + offset1).equals(endMarker) || !chars2.get(i + offset2).equals(endMarker)) { if (!chars1.get(i + offset1).equals(chars2.get(i + offset2))) { // character mismatch if ((chars1.get(i + offset1).equals(chars2.get(i + offset2 + 1))) && (chars1.get(i + offset1 + 1).equals(chars2.get(i + offset2))) && (chars1.remainder(i + offset1 + 2).equals(chars2.remainder(i + offset2 + 2)))) { // transposition i = i + 2; // move past the transposition } else if (chars1.remainder(i + offset1).equals(chars2.remainder(i + offset2 + 1))) { // insertion offset2++; // realign } else if (chars1.remainder(i + offset1 + 1).equals(chars2.remainder(i + offset2))) { // deletion offset1++; // realign } else if (chars1.remainder(i + offset1 + 1).equals(chars2.remainder(i + offset2 + 1))) { // substitution i++; // } else return false; // multiple edits } i++; } return true; } } /** * Convenience class allowing us to avoid running into * ArrayIndexOutOfBoundsExceptions (thanks, Java!) * */ class InfiniteCharArray { // the array being encapsulated private char[] array; /** * Sole constructor. * * @param array the array to be encapsulated */ protected InfiniteCharArray(char[] array) { this.array = array; } /** * If we try to retrieve what lies beyond the end of values, return * a {@link Null} object instead of throwing an * ArrayIndexOutOfBoundsException. Otherwise, return the value at * at the given index. * * @param index the position in the array for which we seek a value * @return the value at that index or a Null object if we're "out of bounds" */ protected Object get(int index) { if (index < this.array.length) { return this.array[index]; } else { return new Null(); } } /** * Get the contents of the char array to the right of the given * index, and return it as a String. * * @param index left bound of the string we're pulling from the char array * @return a string representing everything to the right of the index */ protected String remainder(int index) { if (index > this.array.length) return ""; else return new String(Arrays.copyOfRange(this.array, index, this.array.length)); } } /** * This is what gets returned instead of an ArrayIndexOutOfBoundsException * when we use an {@link InfiniteCharArray}. * */ class Null { /** * Call the "super" constructor. */ protected Null() { super(); } /** * For pretty printing. */ @Override public String toString() { return "Null"; } /** * All instances of this class are effectively the same, so treat * them as equal. */ @Override public boolean equals(Object anObject) { if (anObject.getClass().equals(this.getClass())) { return true; } else return false; } }