/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering;

import java.util.ArrayList;
import java.util.List;

import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.WeightedVector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.ProjectionSearch;
import org.apache.mahout.math.neighborhood.Searcher;
import org.apache.mahout.math.neighborhood.UpdatableSearcher;
import org.apache.mahout.math.random.WeightedThing;
import org.apache.mahout.math.stats.OnlineSummarizer;

public final class ClusteringUtils {

  private ClusteringUtils() {
  }

  /**
   * Computes the summaries for the distances in each cluster.
   * @param datapoints iterable of datapoints.
   * @param centroids iterable of Centroids.
   * @param distanceMeasure distance measure used to compute the distance from a point to its closest centroid.
   * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to the cluster whose
   * index is i.
   */
  public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                                 Iterable<? extends Vector> centroids,
                                                                 DistanceMeasure distanceMeasure) {
    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
    searcher.addAll(centroids);
    List<OnlineSummarizer> summarizers = new ArrayList<>();
    if (searcher.size() == 0) {
      return summarizers;
    }
    for (int i = 0; i < searcher.size(); ++i) {
      summarizers.add(new OnlineSummarizer());
    }
    for (Vector v : datapoints) {
      Centroid closest = (Centroid) searcher.search(v, 1).get(0).getValue();
      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
      summarizer.add(distanceMeasure.distance(v, closest));
    }
    return summarizers;
  }

  /**
   * Adds up the distances from each point to its closest cluster and returns the sum.
   * @param datapoints iterable of datapoints.
   * @param centroids iterable of Centroids.
   * @return the total cost described above.
   */
  public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
    DistanceMeasure distanceMeasure = new EuclideanDistanceMeasure();
    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
    searcher.addAll(centroids);
    return totalClusterCost(datapoints, searcher);
  }
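  // A hypothetical usage sketch (names and data made up): given datapoints and Centroids with
  // indexes 0 .. k - 1, per-cluster distance statistics and the total clustering cost can be
  // obtained as:
  //
  //   List<OnlineSummarizer> summaries =
  //       ClusteringUtils.summarizeClusterDistances(points, centroids, new EuclideanDistanceMeasure());
  //   double cost = ClusteringUtils.totalClusterCost(points, centroids);
  //
  // summaries.get(i).getMean() is then the average distance from the points of cluster i to its centroid.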
  /**
   * Adds up the distances from each point to its closest cluster and returns the sum.
   * @param datapoints iterable of datapoints.
   * @param centroids searcher of Centroids.
   * @return the total cost described above.
   */
  public static double totalClusterCost(Iterable<? extends Vector> datapoints, Searcher centroids) {
    double totalCost = 0;
    for (Vector vector : datapoints) {
      totalCost += centroids.searchFirst(vector, false).getWeight();
    }
    return totalCost;
  }

  /**
   * Estimates the distance cutoff. In StreamingKMeans, the distance between two vectors divided
   * by this value is used as a probability threshold when deciding whether to form a new cluster
   * or not.
   * Small values (comparable to the minimum distance between two points) are preferred as they
   * guarantee with high likelihood that all but very close points are put in separate clusters
   * initially. The clusters themselves are actually collapsed periodically when their number goes
   * over the maximum number of clusters and the distanceCutoff is increased.
   * So, the returned value is only an initial estimate.
   * @param data the datapoints whose distance is to be estimated.
   * @param distanceMeasure the distance measure used to compute the distance between two points.
   * @return the minimum distance between any two of the given points.
   * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans#clusterInternal(Iterable, boolean)
   */
  public static double estimateDistanceCutoff(List<? extends Vector> data, DistanceMeasure distanceMeasure) {
    BruteSearch searcher = new BruteSearch(distanceMeasure);
    searcher.addAll(data);
    double minDistance = Double.POSITIVE_INFINITY;
    for (Vector vector : data) {
      // searchFirst(vector, true) skips reference vectors equal to the query, so this finds the
      // nearest distinct point. All points were already added via addAll above, so no further
      // adds are needed inside the loop.
      double closest = searcher.searchFirst(vector, true).getWeight();
      if (minDistance > 0 && closest < minDistance) {
        minDistance = closest;
      }
    }
    return minDistance;
  }

  /**
   * Estimates the distance cutoff using only the first sampleLimit points of the given data.
   * @see #estimateDistanceCutoff(List, DistanceMeasure)
   */
  public static <T extends Vector> double estimateDistanceCutoff(
      Iterable<T> data, DistanceMeasure distanceMeasure, int sampleLimit) {
    return estimateDistanceCutoff(Lists.newArrayList(Iterables.limit(data, sampleLimit)), distanceMeasure);
  }

  /**
   * Computes the Davies-Bouldin Index for a given clustering.
   * See http://en.wikipedia.org/wiki/Clustering_algorithm#Internal_evaluation
   * @param centroids list of centroids
   * @param distanceMeasure distance measure for inter-cluster distances
   * @param clusterDistanceSummaries summaries of the clusters; see summarizeClusterDistances
   * @return the Davies-Bouldin Index
   */
  public static double daviesBouldinIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
                                          List<OnlineSummarizer> clusterDistanceSummaries) {
    Preconditions.checkArgument(centroids.size() == clusterDistanceSummaries.size(),
        "Number of centroids and cluster summaries differ.");
    int n = centroids.size();
    double totalDBIndex = 0;
    // The inner loop can't be restricted to j = i + 1 .. n because the computation of the Davies-Bouldin
    // index is not really symmetric.
    // For a given cluster i, we look for the cluster j that maximizes the ratio of the sum of the average
    // distances from points in cluster i to its center and from points in cluster j to its center, to the
    // distance between cluster i and cluster j.
    // The maximization is the key issue, as the cluster that maximizes this ratio might be j for i but is NOT
    // NECESSARILY i for j.
    for (int i = 0; i < n; ++i) {
      double averageDistanceI = clusterDistanceSummaries.get(i).getMean();
      double maxDBIndex = 0;
      for (int j = 0; j < n; ++j) {
        if (i != j) {
          double dbIndex = (averageDistanceI + clusterDistanceSummaries.get(j).getMean())
              / distanceMeasure.distance(centroids.get(i), centroids.get(j));
          if (dbIndex > maxDBIndex) {
            maxDBIndex = dbIndex;
          }
        }
      }
      totalDBIndex += maxDBIndex;
    }
    return totalDBIndex / n;
  }
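  // A worked example (made-up numbers): for two clusters with mean intra-cluster distances
  // s0 = 1 and s1 = 2 whose centroids are 6 apart, each cluster's worst ratio is
  // (s0 + s1) / d(c0, c1) = (1 + 2) / 6 = 0.5, so the Davies-Bouldin index is
  // (0.5 + 0.5) / 2 = 0.5. Lower values indicate more compact, better-separated clusters.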
  /**
   * Computes the Dunn Index of a given clustering. See http://en.wikipedia.org/wiki/Dunn_index
   * @param centroids list of centroids
   * @param distanceMeasure distance measure to compute inter-centroid distance with
   * @param clusterDistanceSummaries summaries of the clusters; see summarizeClusterDistances
   * @return the Dunn Index
   */
  public static double dunnIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
                                 List<OnlineSummarizer> clusterDistanceSummaries) {
    Preconditions.checkArgument(centroids.size() == clusterDistanceSummaries.size(),
        "Number of centroids and cluster summaries differ.");
    int n = centroids.size();
    // Intra-cluster distances will come from the OnlineSummarizer, and will be the median distance (noting that
    // the median for just one value is that value).
    // A variety of metrics can be used for the intra-cluster distance, including the max distance between two
    // points, the mean distance, etc. Median distance was chosen as it is more robust to outliers and
    // characterizes the distribution of distances (from a point to the center) better.
    double maxIntraClusterDistance = 0;
    for (OnlineSummarizer summarizer : clusterDistanceSummaries) {
      if (summarizer.getCount() > 0) {
        double intraClusterDistance;
        if (summarizer.getCount() == 1) {
          intraClusterDistance = summarizer.getMean();
        } else {
          intraClusterDistance = summarizer.getMedian();
        }
        if (maxIntraClusterDistance < intraClusterDistance) {
          maxIntraClusterDistance = intraClusterDistance;
        }
      }
    }
    double minInterClusterDistance = Double.POSITIVE_INFINITY;
    for (int i = 0; i < n; ++i) {
      // Distances are symmetric, so d(i, j) = d(j, i) and the inner loop can start at i + 1.
      for (int j = i + 1; j < n; ++j) {
        double distance = distanceMeasure.distance(centroids.get(i), centroids.get(j));
        if (minInterClusterDistance > distance) {
          minInterClusterDistance = distance;
        }
      }
    }
    return minInterClusterDistance / maxIntraClusterDistance;
  }
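  // A worked example (made-up numbers): with median intra-cluster distances of 1.0, 2.0 and 1.5
  // and a smallest inter-centroid distance of 5.0, the Dunn index is 5.0 / 2.0 = 2.5.
  // Unlike the Davies-Bouldin index, higher values are better here.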
  /**
   * Computes n choose 2, the number of unordered pairs that can be picked out of n items.
   */
  public static double choose2(double n) {
    return n * (n - 1) / 2;
  }

  /**
   * Creates a confusion matrix by searching for the closest cluster of both the row clustering and column clustering
   * of a point and adding its weight to that cell of the matrix.
   * It doesn't matter which clustering is the row clustering and which is the column clustering. If they're
   * interchanged, the resulting matrix is the transpose of the original one.
   * @param rowCentroids clustering one
   * @param columnCentroids clustering two
   * @param datapoints datapoints whose closest cluster we need to find
   * @param distanceMeasure distance measure to use
   * @return the confusion matrix
   */
  public static Matrix getConfusionMatrix(List<? extends Vector> rowCentroids, List<? extends Vector> columnCentroids,
                                          Iterable<? extends Vector> datapoints, DistanceMeasure distanceMeasure) {
    Searcher rowSearcher = new BruteSearch(distanceMeasure);
    rowSearcher.addAll(rowCentroids);
    Searcher columnSearcher = new BruteSearch(distanceMeasure);
    columnSearcher.addAll(columnCentroids);

    int numRows = rowCentroids.size();
    int numCols = columnCentroids.size();
    Matrix confusionMatrix = new DenseMatrix(numRows, numCols);

    for (Vector vector : datapoints) {
      WeightedThing<Vector> closestRowCentroid = rowSearcher.search(vector, 1).get(0);
      WeightedThing<Vector> closestColumnCentroid = columnSearcher.search(vector, 1).get(0);
      int row = ((Centroid) closestRowCentroid.getValue()).getIndex();
      int column = ((Centroid) closestColumnCentroid.getValue()).getIndex();
      double vectorWeight;
      if (vector instanceof WeightedVector) {
        vectorWeight = ((WeightedVector) vector).getWeight();
      } else {
        vectorWeight = 1;
      }
      confusionMatrix.set(row, column, confusionMatrix.get(row, column) + vectorWeight);
    }

    return confusionMatrix;
  }

  /**
   * Computes the Adjusted Rand Index for a given confusion matrix.
   * @param confusionMatrix confusion matrix; not to be confused with the more restrictive ConfusionMatrix class
   * @return the Adjusted Rand Index
   */
  public static double getAdjustedRandIndex(Matrix confusionMatrix) {
    int numRows = confusionMatrix.numRows();
    int numCols = confusionMatrix.numCols();
    double rowChoiceSum = 0;
    double columnChoiceSum = 0;
    double totalChoiceSum = 0;
    double total = 0;
    for (int i = 0; i < numRows; ++i) {
      double rowSum = 0;
      for (int j = 0; j < numCols; ++j) {
        rowSum += confusionMatrix.get(i, j);
        totalChoiceSum += choose2(confusionMatrix.get(i, j));
      }
      total += rowSum;
      rowChoiceSum += choose2(rowSum);
    }
    for (int j = 0; j < numCols; ++j) {
      double columnSum = 0;
      for (int i = 0; i < numRows; ++i) {
        columnSum += confusionMatrix.get(i, j);
      }
      columnChoiceSum += choose2(columnSum);
    }
    double rowColumnChoiceSumDivTotal = rowChoiceSum * columnChoiceSum / choose2(total);
    return (totalChoiceSum - rowColumnChoiceSumDivTotal)
        / ((rowChoiceSum + columnChoiceSum) / 2 - rowColumnChoiceSumDivTotal);
  }

  /**
   * Computes the total weight of the points in the given Vector iterable.
   * @param data iterable of points
   * @return total weight
   */
  public static double totalWeight(Iterable<? extends Vector> data) {
    double sum = 0;
    for (Vector row : data) {
      Preconditions.checkNotNull(row);
      if (row instanceof WeightedVector) {
        sum += ((WeightedVector) row).getWeight();
      } else {
        sum++;
      }
    }
    return sum;
  }
}
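
// ---------------------------------------------------------------------------
// A minimal, hypothetical usage sketch (not part of the original class): the
// package-private helper below wires the utilities above into a tiny
// end-to-end evaluation. The class name, data, centroid positions and weights
// are all made up for illustration; DenseVector is referenced by its fully
// qualified name because it is not imported at the top of this file.
// ---------------------------------------------------------------------------
final class ClusteringUtilsExample {
  private ClusteringUtilsExample() {
  }

  public static void main(String[] args) {
    DistanceMeasure measure = new EuclideanDistanceMeasure();

    // Two made-up clusters of points around (0, 0) and (10, 10).
    List<Vector> points = Lists.newArrayList();
    points.add(new org.apache.mahout.math.DenseVector(new double[]{0, 0}));
    points.add(new org.apache.mahout.math.DenseVector(new double[]{1, 0}));
    points.add(new org.apache.mahout.math.DenseVector(new double[]{10, 10}));
    points.add(new org.apache.mahout.math.DenseVector(new double[]{10, 11}));

    // Centroid indexes must run from 0 to k - 1: summarizeClusterDistances and
    // getConfusionMatrix use Centroid.getIndex() to address summarizers and matrix cells.
    List<Centroid> centroids = Lists.newArrayList(
        new Centroid(0, new org.apache.mahout.math.DenseVector(new double[]{0.5, 0}), 1),
        new Centroid(1, new org.apache.mahout.math.DenseVector(new double[]{10, 10.5}), 1));

    List<OnlineSummarizer> summaries =
        ClusteringUtils.summarizeClusterDistances(points, centroids, measure);
    System.out.println("Total cost: " + ClusteringUtils.totalClusterCost(points, centroids));
    System.out.println("Davies-Bouldin: "
        + ClusteringUtils.daviesBouldinIndex(centroids, measure, summaries));
    System.out.println("Dunn: " + ClusteringUtils.dunnIndex(centroids, measure, summaries));

    // Comparing a clustering with itself yields a diagonal confusion matrix and
    // an Adjusted Rand Index of exactly 1.
    Matrix confusion = ClusteringUtils.getConfusionMatrix(centroids, centroids, points, measure);
    System.out.println("ARI: " + ClusteringUtils.getAdjustedRandIndex(confusion));
  }
}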