/* * Copyright 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.google.gwt.dev.util.editdistance; /** * A collection of instance generators for the GeneralEditDistance interface. */ public class GeneralEditDistances { /** * Chooses the best implementation of Levenshtein string edit distance * available at the current time. */ /* * As of 2007-08-23, the best algorithm known (to the author=mwyoung) for * short strings is one due to Eugene Myers, except for the special case * where the distance limit is 0 or 1. The Myers algorithm also has good * worst-case performance for long strings when the edit distance is not * reasonably bounded. * * When there is a good bound, a variant of the Ukkonen algorithm due to * Berghel and Roach (modified by Michael Young to use linear space) * is faster for long strings. * * Note that other algorithms that perform better in some cases for running * text searches do not outperform Myers for rigid distance computations. * Notably: * Navarro/Baeza-Yates (Algorithmica 23,2) simulates an NFA with an * epsilon-cycle on the initial state (appropriate for running texts) * and reports success without computing exact distance. When adjusted * to a fixed starting point and computing distance, its state machine * is larger and it underperforms. * * BITAP (Baeza-Yates/Gonnet, Manber/Wu) also simulates an NFA, and * Navarro claims that it wins for small patterns and small limits for * running search. Experiments with a Java implementation showed that * it beat Myers on pure string edit distance only for limits where the * special 0-1 limit applied, where special-case comparison beats all. * * A survey of algorithms for running text search by Navarro appeared * in ACM Computing Surveys 33#1: http://portal.acm.org/citation.cfm?id=375365 * Another algorithm (Four Russians) that Navarro claims superior for very * long patterns and high limits was not evaluated for inclusion here. * Filtering algorithms also improve running search, but do not help * for pure edit distance. */ private static class Levenshtein implements GeneralEditDistance { /** * Long+bounded implementation class: distance-only Berghel-Roach. */ private ModifiedBerghelRoachEditDistance berghel; /** * Short/unbounded implementation class: Myers bit-parallel. */ private MyersBitParallelEditDistance myers; /** * Saved pattern, for specialized comparisons. */ private final CharSequence pattern; /** * Length of saved pattern. */ private final int patternLength; private Levenshtein(CharSequence pattern) { this.pattern = pattern; this.patternLength = pattern.length(); } public GeneralEditDistance duplicate() { Levenshtein dup = new Levenshtein(pattern); /* Duplicate the Myers engine, as it is cheaper than rebuilding */ if (this.myers != null) { dup.myers = (MyersBitParallelEditDistance) this.myers.duplicate(); } /* Do not duplicate the Berghel engine; it provides no savings. */ return dup; } public int getDistance(CharSequence target, int limit) { /* When the limit is 0 or 1, specialized comparisons are much faster. */ if (limit <= 1) { return limit == 0 ? (pattern.equals(target) ? 0 : 1) : atMostOneError(pattern, target); } /* * The best algorithm for long strings depends on the resulting * edit distance (or the limit placed on it). Without further * information on the likelihood of a low distance, we guess * based on the provided limit. We currently lean toward using * the Myers algorithm unless we are pretty sure that the * Berghel-Roach algorithm will win (based on the limit). * * Note that when the string lengths are small (fewer characters * than bits in a long), Myers wins regardless of limit. */ if ((patternLength > 64) && (limit < (target.length() / 10))) { if (berghel == null) { berghel = ModifiedBerghelRoachEditDistance.getInstance(pattern); } return berghel.getDistance(target, limit); } if (myers == null) { myers = MyersBitParallelEditDistance.getInstance(pattern); } return myers.getDistance(target, limit); } } /** * Compares two strings for at most one insert/delete/substitute difference. * Since operations cannot be composed, a simple case analysis is possible. * * @param s1 one string to be compared * @param s2 the other string to be compared * @return Levenshtein edit distance if no greater than 1; * otherwise, more than 1 */ public static int atMostOneError(CharSequence s1, CharSequence s2) { int s1Length = s1.length(); int s2Length = s2.length(); int errors = 0; /* running count of edits required */ switch(s2Length - s1Length) { /* * Strings are the same length. No single insert/delete is possible; * at most one substitution can be present. */ case 0: for (int i = 0; i < s2Length; i++) { if ((s2.charAt(i) != s1.charAt(i)) && (errors++ != 0)) { break; } } return errors; /* * Strings differ in length by 1, so we have an insertion * (and therefore cannot have any other substitutions). */ case 1: /* s2Length > s1Length */ for (int i = 0; i < s1Length; i++) { if (s2.charAt(i) != s1.charAt(i)) { for (; i < s1Length; i++) { if (s2.charAt(i + 1) != s1.charAt(i)) { return 2; } } return 1; } } return 1; /* Same as above case, with strings reversed */ case -1: /* s1Length > s2Length */ for (int i = 0; i < s2Length; i++) { if (s2.charAt(i) != s1.charAt(i)) { for (; i < s2Length; i++) { if (s2.charAt(i) != s1.charAt(i + 1)) { return 2; } } return 1; } } return 1; /* Edit distance is at least difference in lengths; more than 1 here. */ default: return 2; } } /** * Generates an GeneralEditDistance engine for a particular pattern string * based on Levenshtein distance. Caller must ensure that the * pattern does not change (consider using pattern.toString() if * necessary) as long as the generated object is to be used. * * @param pattern a string from which distance computations are desired * @return an engine for computing Levenshtein distances from that pattern */ public static GeneralEditDistance getLevenshteinDistance(CharSequence pattern) { return new Levenshtein(pattern); } private GeneralEditDistances() { } }