/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.streaming.mapreduce;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.mahout.clustering.ClusteringUtils;
import org.apache.mahout.clustering.streaming.cluster.DataUtils;
import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.FastProjectionSearch;
import org.apache.mahout.math.neighborhood.LocalitySensitiveHashSearch;
import org.apache.mahout.math.neighborhood.ProjectionSearch;
import org.apache.mahout.math.random.WeightedThing;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests the StreamingKMeans mapper, reducer, combined MapReduce pipeline and sequential driver
 * on synthetic multi-normal hypercube data, parameterized over several searcher implementations.
 */
@RunWith(Parameterized.class)
public class StreamingKMeansTestMR extends MahoutTestCase {
  private static final int NUM_DATA_POINTS = 1 << 15;
  private static final int NUM_DIMENSIONS = 8;
  private static final int NUM_PROJECTIONS = 3;
  private static final int SEARCH_SIZE = 5;
  private static final int MAX_NUM_ITERATIONS = 10;
  private static final double DISTANCE_CUTOFF = 1.0e-6;

  private static Pair<List<Centroid>, List<Centroid>> syntheticData;

  @Before
  public void setUp() {
    RandomUtils.useTestSeed();
    syntheticData = DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, 1.0e-4);
  }

  private final String searcherClassName;
  private final String distanceMeasureClassName;

  public StreamingKMeansTestMR(String searcherClassName, String distanceMeasureClassName) {
    this.searcherClassName = searcherClassName;
    this.distanceMeasureClassName = distanceMeasureClassName;
  }

  private void configure(Configuration configuration) {
    configuration.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, distanceMeasureClassName);
    configuration.setInt(StreamingKMeansDriver.SEARCH_SIZE_OPTION, SEARCH_SIZE);
    configuration.setInt(StreamingKMeansDriver.NUM_PROJECTIONS_OPTION, NUM_PROJECTIONS);
    configuration.set(StreamingKMeansDriver.SEARCHER_CLASS_OPTION, searcherClassName);
    configuration.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, 1 << NUM_DIMENSIONS);
    configuration.setInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS,
        (1 << NUM_DIMENSIONS) * (int) Math.log(NUM_DATA_POINTS));
    configuration.setFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF, (float) DISTANCE_CUTOFF);
    configuration.setInt(StreamingKMeansDriver.MAX_NUM_ITERATIONS, MAX_NUM_ITERATIONS);
    // Collapse the Centroids in the reducer.
    configuration.setBoolean(StreamingKMeansDriver.REDUCE_STREAMING_KMEANS, true);
  }

  @Parameterized.Parameters
  public static List<Object[]> generateData() {
    return Arrays.asList(new Object[][]{
        {ProjectionSearch.class.getName(), SquaredEuclideanDistanceMeasure.class.getName()},
        {FastProjectionSearch.class.getName(), SquaredEuclideanDistanceMeasure.class.getName()},
        {LocalitySensitiveHashSearch.class.getName(), SquaredEuclideanDistanceMeasure.class.getName()},
    });
  }

  @Test
  public void testHypercubeMapper() throws IOException {
    MapDriver<Writable, VectorWritable, IntWritable, CentroidWritable> mapDriver =
        MapDriver.newMapDriver(new StreamingKMeansMapper());
    configure(mapDriver.getConfiguration());
    System.out.printf("%s mapper test\n",
        mapDriver.getConfiguration().get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION));
    for (Centroid datapoint : syntheticData.getFirst()) {
      mapDriver.addInput(new IntWritable(0), new VectorWritable(datapoint));
    }
    List<org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable>> results = mapDriver.run();
    BruteSearch resultSearcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    for (org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable> result : results) {
      resultSearcher.add(result.getSecond().getCentroid());
    }
    System.out.printf("Clustered the data into %d clusters\n", results.size());
    for (Vector mean : syntheticData.getSecond()) {
      WeightedThing<Vector> closest = resultSearcher.search(mean, 1).get(0);
      assertTrue("Weight " + closest.getWeight() + " not less than 0.5", closest.getWeight() < 0.5);
    }
  }

  @Test
  public void testMapperVsLocal() throws IOException {
    // Clusters the data using the StreamingKMeansMapper.
    MapDriver<Writable, VectorWritable, IntWritable, CentroidWritable> mapDriver =
        MapDriver.newMapDriver(new StreamingKMeansMapper());
    Configuration configuration = mapDriver.getConfiguration();
    configure(configuration);
    System.out.printf("%s mapper vs local test\n",
        mapDriver.getConfiguration().get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION));

    for (Centroid datapoint : syntheticData.getFirst()) {
      mapDriver.addInput(new IntWritable(0), new VectorWritable(datapoint));
    }
    List<Centroid> mapperCentroids = Lists.newArrayList();
    for (org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable> pair : mapDriver.run()) {
      mapperCentroids.add(pair.getSecond().getCentroid());
    }

    // Clusters the data using local batch StreamingKMeans.
    StreamingKMeans batchClusterer =
        new StreamingKMeans(StreamingKMeansUtilsMR.searcherFromConfiguration(configuration),
            mapDriver.getConfiguration().getInt("estimatedNumMapClusters", -1), DISTANCE_CUTOFF);
    batchClusterer.cluster(syntheticData.getFirst());
    List<Centroid> batchCentroids = Lists.newArrayList();
    for (Vector v : batchClusterer) {
      batchCentroids.add((Centroid) v);
    }

    // Clusters the data using point by point StreamingKMeans.
    StreamingKMeans perPointClusterer =
        new StreamingKMeans(StreamingKMeansUtilsMR.searcherFromConfiguration(configuration),
            (1 << NUM_DIMENSIONS) * (int) Math.log(NUM_DATA_POINTS), DISTANCE_CUTOFF);
    for (Centroid datapoint : syntheticData.getFirst()) {
      perPointClusterer.cluster(datapoint);
    }
    List<Centroid> perPointCentroids = Lists.newArrayList();
    for (Vector v : perPointClusterer) {
      perPointCentroids.add((Centroid) v);
    }

    // Computes the cost (total sum of distances) of these different clusterings.
    double mapperCost = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), mapperCentroids);
    double localCost = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), batchCentroids);
    double perPointCost = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), perPointCentroids);
    System.out.printf("[Total cost] Mapper %f [%d] Local %f [%d] Perpoint local %f [%d];"
        + "[ratio m-vs-l %f] [ratio pp-vs-l %f]\n",
        mapperCost, mapperCentroids.size(), localCost, batchCentroids.size(),
        perPointCost, perPointCentroids.size(), mapperCost / localCost, perPointCost / localCost);

    // These ratios should be close to 1.0 and have been observed to go as low as 0.6 and as high as 1.5.
    // An allowed interval of [0.2, 1.8] seems appropriate.
    assertEquals("Mapper StreamingKMeans / Batch local StreamingKMeans total cost ratio too far from 1",
        1.0, mapperCost / localCost, 0.8);
    assertEquals("One by one local StreamingKMeans / Batch local StreamingKMeans total cost ratio too far from 1",
        1.0, perPointCost / localCost, 0.8);
  }

  @Test
  public void testHypercubeReducer() throws IOException {
    ReduceDriver<IntWritable, CentroidWritable, IntWritable, CentroidWritable> reduceDriver =
        ReduceDriver.newReduceDriver(new StreamingKMeansReducer());
    Configuration configuration = reduceDriver.getConfiguration();
    configure(configuration);
    System.out.printf("%s reducer test\n", configuration.get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION));
    StreamingKMeans clusterer =
        new StreamingKMeans(StreamingKMeansUtilsMR.searcherFromConfiguration(configuration),
            (1 << NUM_DIMENSIONS) * (int) Math.log(NUM_DATA_POINTS), DISTANCE_CUTOFF);

    long start = System.currentTimeMillis();
    clusterer.cluster(syntheticData.getFirst());
    long end = System.currentTimeMillis();
    System.out.printf("%f [s]\n", (end - start) / 1000.0);

    List<CentroidWritable> reducerInputs = Lists.newArrayList();
    int postMapperTotalWeight = 0;
    for (Centroid intermediateCentroid : clusterer) {
      reducerInputs.add(new CentroidWritable(intermediateCentroid));
      postMapperTotalWeight += intermediateCentroid.getWeight();
    }

    reduceDriver.addInput(new IntWritable(0), reducerInputs);
    List<org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable>> results = reduceDriver.run();
    testReducerResults(postMapperTotalWeight, results);
  }

  @Test
  public void testHypercubeMapReduce() throws IOException {
    MapReduceDriver<Writable, VectorWritable, IntWritable, CentroidWritable, IntWritable, CentroidWritable>
        mapReduceDriver = new MapReduceDriver<Writable, VectorWritable, IntWritable, CentroidWritable,
            IntWritable, CentroidWritable>(new StreamingKMeansMapper(), new StreamingKMeansReducer());
    Configuration configuration = mapReduceDriver.getConfiguration();
    configure(configuration);
    System.out.printf("%s full test\n", configuration.get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION));
    for (Centroid datapoint : syntheticData.getFirst()) {
      mapReduceDriver.addInput(new IntWritable(0), new VectorWritable(datapoint));
    }
    List<org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable>> results = mapReduceDriver.run();
    testReducerResults(syntheticData.getFirst().size(), results);
  }

  @Test
  public void testHypercubeMapReduceRunSequentially() throws Exception {
    Configuration configuration = getConfiguration();
    configure(configuration);
    configuration.set(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.SEQUENTIAL_METHOD);

    Path inputPath = new Path("testInput");
    Path outputPath = new Path("testOutput");
    StreamingKMeansUtilsMR.writeVectorsToSequenceFile(syntheticData.getFirst(), inputPath, configuration);

    StreamingKMeansDriver.run(configuration, inputPath, outputPath);
    testReducerResults(syntheticData.getFirst().size(), Lists.newArrayList(Iterables.transform(
        new SequenceFileIterable<IntWritable, CentroidWritable>(outputPath, configuration),
        new Function<Pair<IntWritable, CentroidWritable>,
            org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable>>() {
          @Override
          public org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable> apply(
              org.apache.mahout.common.Pair<IntWritable, CentroidWritable> input) {
            return new org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable>(
                input.getFirst(), input.getSecond());
          }
        })));
  }

  private static void testReducerResults(int totalWeight,
      List<org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable>> results) {
    int expectedNumClusters = 1 << NUM_DIMENSIONS;
    double expectedWeight = (double) totalWeight / expectedNumClusters;
    int numClusters = 0;
    int numUnbalancedClusters = 0;
    int totalReducerWeight = 0;
    for (org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable> result : results) {
      if (result.getSecond().getCentroid().getWeight() != expectedWeight) {
        System.out.printf("Unbalanced weight %f in centroid %d\n",
            result.getSecond().getCentroid().getWeight(), result.getSecond().getCentroid().getIndex());
        ++numUnbalancedClusters;
      }
      assertEquals("Final centroid index is invalid", numClusters, result.getFirst().get());
      totalReducerWeight += result.getSecond().getCentroid().getWeight();
      ++numClusters;
    }
    System.out.printf("%d clusters are unbalanced\n", numUnbalancedClusters);
    assertEquals("Invalid total weight", totalWeight, totalReducerWeight);
    assertEquals("Invalid number of clusters", 1 << NUM_DIMENSIONS, numClusters);
  }
}