/* * Copyright 2012 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.errorprone.names; /** * A utility class for finding the Levenshtein edit distance between strings. The edit distance * between two strings is the number of deletions, insertions, and substitutions required to * transform the source to the target. See * {@link https://en.wikipedia.org/wiki/Levenshtein_distance}. * * @author eaftan@google.com (Eddie Aftandilian) */ public class LevenshteinEditDistance { private LevenshteinEditDistance() { /* disallow instantiation */ } /** * Returns the edit distance between two strings. * * @param source The source string. * @param target The target distance. * @return The edit distance between the source and target string. * @see #getEditDistance(String, String, boolean) */ public static int getEditDistance(String source, String target) { return getEditDistance(source, target, true); } /** * Returns the edit distance between two strings. * The algorithm used to calculate this distance has space requirements of * len(source)*len(target). * * @param source The source string. * @param target The target string * @param caseSensitive If true, case is used in comparisons and 'a' != 'A'. * @return The edit distance between the source and target strings. * @see #getEditDistance(String, String) */ public static int getEditDistance(String source, String target, boolean caseSensitive) { // Levenshtein distance algorithm int sourceLength = isEmptyOrWhitespace(source) ? 0 : source.length(); int targetLength = isEmptyOrWhitespace(target) ? 0 : target.length(); if (sourceLength == 0) { return targetLength; } if (targetLength == 0) { return sourceLength; } if (!caseSensitive) { source = source.toLowerCase(); target = target.toLowerCase(); } int[][] levMatrix = new int[sourceLength + 1][targetLength + 1]; for (int i = 0; i <= sourceLength; i++) { levMatrix[i][0] = i; } for (int i = 0; i <= targetLength; i++) { levMatrix[0][i] = i; } for (int i = 1; i <= sourceLength; i++) { char sourceI = source.charAt(i - 1); for (int j = 1; j <= targetLength; j++) { char targetJ = target.charAt(j - 1); int cost = 0; if (sourceI != targetJ) { cost = 1; } levMatrix[i][j] = Math.min(cost + levMatrix[i - 1][j - 1], Math.min(levMatrix[i - 1][j] + 1, levMatrix[i][j - 1] + 1)); } } return levMatrix[sourceLength][targetLength]; } /** * Returns a normalized edit distance between 0 and 1. This is useful if you are comparing or * aggregating distances of different pairs of strings */ public static double getNormalizedEditDistance( String source, String target, boolean caseSensitive) { if (isEmptyOrWhitespace(source) && isEmptyOrWhitespace(target)) { return 0.0; } return (double) getEditDistance(source, target, caseSensitive) / (double) getWorstCaseEditDistance(source.length(), target.length()); } /** Calculate the worst case distance between two strings with the given lengths */ public static int getWorstCaseEditDistance(int sourceLength, int targetLength) { return Math.max(sourceLength, targetLength); } /** * Determines if a string is empty or consists only of whitespace * * @param source The string to check * @return True if the string is empty or contains only whitespace, * false otherwise */ private static boolean isEmptyOrWhitespace(String source) { return source == null || source.matches("\\s*"); } }