TermEditDistance.java example

Explorer
error-prone-master
/*
 * Copyright 2017 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.errorprone.names;

import blogspot.software_and_algorithms.stern_library.optimization.HungarianAlgorithm;
import com.google.common.collect.ImmutableList;
import java.util.function.BiFunction;
import java.util.stream.DoubleStream;

/**
 * A utility class for finding the distance between two identifiers. Each identifier is split into
 * its constituent terms (based on camel case or underscore naming conventions). Then the edit
 * distance between each term is computed and the minimum cost assignment is found.
 *
 * <p>The reported distance is the cost of that assignment divided by the worst-case cost of the
 * same assignment, where terms left unmatched on either side are charged their worst-case cost
 * against an empty term.
 */
public class TermEditDistance {

  private final BiFunction<String, String, Double> editDistanceFn;
  private final BiFunction<Integer, Integer, Double> maxDistanceFn;

  /**
   * Creates a TermEditDistance Object
   *
   * @param editDistanceFn function to compute the distance between two terms
   * @param maxDistanceFn function to compute the worst case distance between two terms, given the
   *     lengths of the two terms
   */
  public TermEditDistance(
      BiFunction<String, String, Double> editDistanceFn,
      BiFunction<Integer, Integer, Double> maxDistanceFn) {
    this.editDistanceFn = editDistanceFn;
    this.maxDistanceFn = maxDistanceFn;
  }

  /**
   * Creates a TermEditDistance that measures terms with the case-insensitive Levenshtein edit
   * distance and normalizes with the corresponding Levenshtein worst-case distance.
   */
  public TermEditDistance() {
    this(
        (s, t) -> (double) LevenshteinEditDistance.getEditDistance(s, t, /*isCaseSensitive*/ false),
        (s, t) -> (double) LevenshteinEditDistance.getWorstCaseEditDistance(s, t));
  }

  /**
   * Computes the term-wise edit distance between two identifiers, normalized by the worst-case
   * cost of the chosen term assignment.
   *
   * @param source the identifier whose terms are assigned to terms of {@code target}
   * @param target the identifier being compared against
   * @return the assignment cost divided by its worst-case cost; {@code 0.0} when both costs are
   *     zero (for example when neither identifier yields any terms). Presumably within [0, 1]
   *     provided {@code editDistanceFn} never exceeds {@code maxDistanceFn} for the same terms.
   */
  public double getNormalizedEditDistance(String source, String target) {

    ImmutableList<String> sourceTerms = NamingConventions.splitToLowercaseTerms(source);
    ImmutableList<String> targetTerms = NamingConventions.splitToLowercaseTerms(target);

    // costMatrix[s][t] is the edit distance between source term s and target term t
    double[][] costMatrix =
        sourceTerms
            .stream()
            .map(s -> targetTerms.stream().mapToDouble(t -> editDistanceFn.apply(s, t)).toArray())
            .toArray(double[][]::new);

    // worstCaseMatrix[s][t] is the worst case distance between source term s and target term t
    double[][] worstCaseMatrix =
        sourceTerms
            .stream()
            .map(
                s ->
                    targetTerms
                        .stream()
                        .mapToDouble(t -> maxDistanceFn.apply(s.length(), t.length()))
                        .toArray())
            .toArray(double[][]::new);

    // An unmatched term costs as much as the worst case against an empty (zero-length) term.
    double[] sourceTermDeletionCosts =
        sourceTerms.stream().mapToDouble(s -> maxDistanceFn.apply(s.length(), 0)).toArray();

    double[] targetTermAdditionCosts =
        targetTerms.stream().mapToDouble(t -> maxDistanceFn.apply(0, t.length())).toArray();

    // this is an array of assignments of source terms to target terms. If assignments[i] contains
    // the value j this means that source term i has been assigned to target term j
    // There will be one entry in assignments for each source term:
    // - If there are more source terms than target terms then some will be unassigned - value -1
    // - If there are fewer source terms than target terms then some target terms will not be
    //    referenced in the array
    //
    // With no source terms the cost matrix has zero rows and there is nothing to assign, so the
    // solver is skipped. NOTE(review): the bundled HungarianAlgorithm appears to read
    // costMatrix[0] to determine its column count, which would fail on a zero-row matrix - confirm.
    int[] assignments =
        sourceTerms.isEmpty() ? new int[0] : new HungarianAlgorithm(costMatrix).execute();

    double assignmentCost =
        computeCost(assignments, costMatrix, sourceTermDeletionCosts, targetTermAdditionCosts);

    double maxCost =
        computeCost(assignments, worstCaseMatrix, sourceTermDeletionCosts, targetTermAdditionCosts);

    // 0.0 / 0.0 would be NaN; when there is no cost at all the identifiers are indistinguishable.
    if (assignmentCost == 0.0 && maxCost == 0.0) {
      return 0.0;
    }
    return assignmentCost / maxCost;
  }

  /**
   * Compute the total cost of this assignment including the costs of unassigned source and target
   * terms.
   *
   * @param assignments for each source term index, the assigned target term index or -1 if the
   *     source term is unassigned
   * @param costMatrix cost of pairing each source term (row) with each target term (column)
   * @param sourceTermDeletionCosts cost charged for each source term that is left unassigned
   * @param unassignedTargetCosts cost charged for each target term that no source term uses
   * @return the sum of all pairing costs plus the costs of every unassigned source and target term
   */
  private static double computeCost(
      int[] assignments,
      double[][] costMatrix,
      double[] sourceTermDeletionCosts,
      double[] unassignedTargetCosts) {

    // We need to sum the costs of each assigned pair, each unassigned source term, and each
    // unassigned target term.

    // Start with the total cost of _not_ using all the target terms, then when we use one we'll
    // remove it from this total.
    double totalCost = DoubleStream.of(unassignedTargetCosts).sum();
    for (int sourceTermIndex = 0; sourceTermIndex < assignments.length; sourceTermIndex++) {
      int targetTermIndex = assignments[sourceTermIndex];
      if (targetTermIndex == -1) {
        // not using this source term
        totalCost += sourceTermDeletionCosts[sourceTermIndex];
      } else {
        // add the cost of the assignments
        totalCost += costMatrix[sourceTermIndex][targetTermIndex];

        // we are using this target term and so we should remove the cost of leaving it unused
        totalCost -= unassignedTargetCosts[targetTermIndex];
      }
    }
    return totalCost;
  }
}