/* * Copyright 2017 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.errorprone.names; import blogspot.software_and_algorithms.stern_library.optimization.HungarianAlgorithm; import com.google.common.collect.ImmutableList; import java.util.function.BiFunction; import java.util.stream.DoubleStream; /** * A utility class for finding the distance between two identifiers. Each identifier is split into * its constituent terms (based on camel case or underscore naming conventions). Then the edit * distance between each term is computed and the minimum cost assignment is found. */ public class TermEditDistance { private final BiFunction<String, String, Double> editDistanceFn; private final BiFunction<Integer, Integer, Double> maxDistanceFn; /** * Creates a TermEditDistance Object * * @param editDistanceFn function to compute the distance between two terms * @param maxDistanceFn function to compute the worst case distance between two terms */ public TermEditDistance( BiFunction<String, String, Double> editDistanceFn, BiFunction<Integer, Integer, Double> maxDistanceFn) { this.editDistanceFn = editDistanceFn; this.maxDistanceFn = maxDistanceFn; } public TermEditDistance() { this( (s, t) -> (double) LevenshteinEditDistance.getEditDistance(s, t, /*isCaseSensitive*/ false), (s, t) -> (double) LevenshteinEditDistance.getWorstCaseEditDistance(s, t)); } public double getNormalizedEditDistance(String source, String target) { ImmutableList<String> sourceTerms = NamingConventions.splitToLowercaseTerms(source); ImmutableList<String> targetTerms = NamingConventions.splitToLowercaseTerms(target); // costMatrix[s][t] is the edit distance between source term s and target term t double[][] costMatrix = sourceTerms .stream() .map(s -> targetTerms.stream().mapToDouble(t -> editDistanceFn.apply(s, t)).toArray()) .toArray(double[][]::new); // worstCaseMatrix[s][t] is the worst case distance between source term s and target term t double[][] worstCaseMatrix = sourceTerms .stream() .map(s -> s.length()) .map( s -> targetTerms .stream() .map(t -> t.length()) .mapToDouble(t -> maxDistanceFn.apply(s, t)) .toArray()) .toArray(double[][]::new); double[] sourceTermDeletionCosts = sourceTerms.stream().mapToDouble(s -> maxDistanceFn.apply(s.length(), 0)).toArray(); double[] targetTermAdditionCosts = targetTerms.stream().mapToDouble(s -> maxDistanceFn.apply(0, s.length())).toArray(); // this is an array of assignments of source terms to target terms. If assignments[i] contains // the value j this means that source term i has been assigned to target term j // There will be one entry in cost for each source term: // - If there are more source terms than target terms then some will be unassigned - value -1 // - If there are a fewer source terms than target terms then some target terms will not be // referenced in the array int[] assignments = new HungarianAlgorithm(costMatrix).execute(); double assignmentCost = computeCost(assignments, costMatrix, sourceTermDeletionCosts, targetTermAdditionCosts); double maxCost = computeCost(assignments, worstCaseMatrix, sourceTermDeletionCosts, targetTermAdditionCosts); return assignmentCost / maxCost; } /** * Compute the total cost of this assignment including the costs of unassigned source and target * terms. */ private static double computeCost( int[] assignments, double[][] costMatrix, double[] sourceTermDeletionCosts, double[] targetTermDeletionCosts) { // We need to sum the costs of each assigned pair, each unassigned source term, and each // unassigned target term. // Start with the total cost of _not_ using all the target terms, then when we use one we'll // remove it from this total. double totalCost = DoubleStream.of(targetTermDeletionCosts).sum(); for (int sourceTermIndex = 0; sourceTermIndex < assignments.length; sourceTermIndex++) { int targetTermIndex = assignments[sourceTermIndex]; if (targetTermIndex == -1) { // not using this source term totalCost += sourceTermDeletionCosts[sourceTermIndex]; } else { // add the cost of the assignments totalCost += costMatrix[sourceTermIndex][targetTermIndex]; // we are using this target term and so we should remove the cost of deleting it totalCost -= targetTermDeletionCosts[targetTermIndex]; } } return totalCost; } }