TestKmeansClustering.java example

Explorer
mahout-commits-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.kmeans;

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;

public final class TestKmeansClustering extends MahoutTestCase {
  
  public static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
  
  private static final int[][] EXPECTED_NUM_POINTS = { {9}, {4, 5}, {4, 4, 1}, {1, 2, 1, 5}, {1, 1, 1, 2, 4},
      {1, 1, 1, 1, 1, 4}, {1, 1, 1, 1, 1, 2, 2}, {1, 1, 1, 1, 1, 1, 2, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}};
  
  private FileSystem fs;
  
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    Configuration conf = getConfiguration();
    fs = FileSystem.get(conf);
  }
  
  public static List<VectorWritable> getPointsWritable(double[][] raw) {
    List<VectorWritable> points = Lists.newArrayList();
    for (double[] fr : raw) {
      Vector vec = new RandomAccessSparseVector(fr.length);
      vec.assign(fr);
      points.add(new VectorWritable(vec));
    }
    return points;
  }
  
  public static List<VectorWritable> getPointsWritableDenseVector(double[][] raw) {
    List<VectorWritable> points = Lists.newArrayList();
    for (double[] fr : raw) {
      Vector vec = new DenseVector(fr.length);
      vec.assign(fr);
      points.add(new VectorWritable(vec));
    }
    return points;
  }
  
  public static List<Vector> getPoints(double[][] raw) {
    List<Vector> points = Lists.newArrayList();
    for (double[] fr : raw) {
      Vector vec = new SequentialAccessSparseVector(fr.length);
      vec.assign(fr);
      points.add(vec);
    }
    return points;
  }
  
  /**
   * Tests
   * {@link KMeansClusterer#runKMeansIteration(Iterable, Iterable, DistanceMeasure, double)}
   * ) single run convergence with a given distance threshold.
   */
  /*@Test
  public void testRunKMeansIterationConvergesInOneRunWithGivenDistanceThreshold() {
    double[][] rawPoints = { {0, 0}, {0, 0.25}, {0, 0.75}, {0, 1}};
    List<Vector> points = getPoints(rawPoints);

    ManhattanDistanceMeasure distanceMeasure = new ManhattanDistanceMeasure();
    List<Kluster> clusters = Arrays.asList(new Kluster(points.get(0), 0, distanceMeasure), new Kluster(points.get(3),
        3, distanceMeasure));

    // To converge in a single run, the given distance threshold should be
    // greater than or equal to 0.125,
    // since 0.125 will be the distance between center and centroid for the
    // initial two clusters after one run.
    double distanceThreshold = 0.25;

    boolean converged = KMeansClusterer.runKMeansIteration(points, clusters, distanceMeasure, distanceThreshold);

    Vector cluster1Center = clusters.get(0).getCenter();
    assertEquals(0, cluster1Center.get(0), EPSILON);
    assertEquals(0.125, cluster1Center.get(1), EPSILON);

    Vector cluster2Center = clusters.get(1).getCenter();
    assertEquals(0, cluster2Center.get(0), EPSILON);
    assertEquals(0.875, cluster2Center.get(1), EPSILON);

    assertTrue("KMeans iteration should be converged after a single run", converged);
  }*/

  /** Story: User wishes to run kmeans job on reference data */
  @Test
  public void testKMeansSeqJob() throws Exception {
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    List<VectorWritable> points = getPointsWritable(REFERENCE);
    
    Path pointsPath = getTestTempDirPath("points");
    Path clustersPath = getTestTempDirPath("clusters");
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
    for (int k = 1; k < points.size(); k++) {
      System.out.println("testKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Path path = new Path(clustersPath, "part-00000");
      FileSystem fs = FileSystem.get(path.toUri(), conf);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Kluster.class);
      try {
        for (int i = 0; i < k + 1; i++) {
          Vector vec = points.get(i).get();
          
          Kluster cluster = new Kluster(vec, i, measure);
          // add the center so the centroid will be correct upon output
          cluster.observe(cluster.getCenter(), 1);
          writer.append(new Text(cluster.getIdentifier()), cluster);
        }
      } finally {
        Closeables.close(writer, false);
      }
      // now run the Job
      Path outputPath = getTestTempDirPath("output" + k);
      String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
          optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001",
          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
          optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION),
          DefaultOptionCreator.SEQUENTIAL_METHOD};
      ToolRunner.run(conf, new KMeansDriver(), args);
      
      // now compare the expected clusters with actual
      Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
      int[] expect = EXPECTED_NUM_POINTS[k];
      DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
      // The key is the clusterId, the value is the weighted vector
      for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
          new Path(clusteredPointsPath, "part-m-0"), conf)) {
        collector.collect(record.getFirst(), record.getSecond());
      }
      assertEquals("clusters[" + k + ']', expect.length, collector.getKeys().size());
    }
  }
  
  /** Story: User wishes to run kmeans job on reference data (DenseVector test) */
  @Test
  public void testKMeansSeqJobDenseVector() throws Exception {
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    List<VectorWritable> points = getPointsWritableDenseVector(REFERENCE);
    
    Path pointsPath = getTestTempDirPath("points");
    Path clustersPath = getTestTempDirPath("clusters");
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
    for (int k = 1; k < points.size(); k++) {
      System.out.println("testKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Path path = new Path(clustersPath, "part-00000");
      FileSystem fs = FileSystem.get(path.toUri(), conf);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Kluster.class);
      try {
        for (int i = 0; i < k + 1; i++) {
          Vector vec = points.get(i).get();
          
          Kluster cluster = new Kluster(vec, i, measure);
          // add the center so the centroid will be correct upon output
          cluster.observe(cluster.getCenter(), 1);
          writer.append(new Text(cluster.getIdentifier()), cluster);
        }
      } finally {
        Closeables.close(writer, false);
      }
      // now run the Job
      Path outputPath = getTestTempDirPath("output" + k);
      String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
          optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001",
          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
          optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION),
          DefaultOptionCreator.SEQUENTIAL_METHOD};
      ToolRunner.run(conf, new KMeansDriver(), args);
      
      // now compare the expected clusters with actual
      Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
      int[] expect = EXPECTED_NUM_POINTS[k];
      DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
      // The key is the clusterId, the value is the weighted vector
      for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
          new Path(clusteredPointsPath, "part-m-0"), conf)) {
        collector.collect(record.getFirst(), record.getSecond());
      }
      assertEquals("clusters[" + k + ']', expect.length, collector.getKeys().size());
    }
  }
  
  /** Story: User wishes to run kmeans job on reference data */
  @Test
  public void testKMeansMRJob() throws Exception {
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    List<VectorWritable> points = getPointsWritable(REFERENCE);
    
    Path pointsPath = getTestTempDirPath("points");
    Path clustersPath = getTestTempDirPath("clusters");
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
    for (int k = 1; k < points.size(); k += 3) {
      System.out.println("testKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Path path = new Path(clustersPath, "part-00000");
      FileSystem fs = FileSystem.get(path.toUri(), conf);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Kluster.class);
      
      try {
        for (int i = 0; i < k + 1; i++) {
          Vector vec = points.get(i).get();
          
          Kluster cluster = new Kluster(vec, i, measure);
          // add the center so the centroid will be correct upon output
          cluster.observe(cluster.getCenter(), 1);
          writer.append(new Text(cluster.getIdentifier()), cluster);
        }
      } finally {
        Closeables.close(writer, false);
      }
      // now run the Job
      Path outputPath = getTestTempDirPath("output" + k);
      String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
          optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001",
          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
          optKey(DefaultOptionCreator.OVERWRITE_OPTION)};
      ToolRunner.run(getConfiguration(), new KMeansDriver(), args);
      
      // now compare the expected clusters with actual
      Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
      // assertEquals("output dir files?", 4, outFiles.length);
      int[] expect = EXPECTED_NUM_POINTS[k];
      DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
      // The key is the clusterId, the value is the weighted vector
      for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
          new Path(clusteredPointsPath, "part-m-00000"), conf)) {
        collector.collect(record.getFirst(), record.getSecond());
      }
      assertEquals("clusters[" + k + ']', expect.length, collector.getKeys().size());
    }
  }
  
  /**
   * Story: User wants to use canopy clustering to input the initial clusters
   * for kmeans job.
   */
  @Test
  public void testKMeansWithCanopyClusterInput() throws Exception {
    List<VectorWritable> points = getPointsWritable(REFERENCE);
    
    Path pointsPath = getTestTempDirPath("points");
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
    
    Path outputPath = getTestTempDirPath("output");
    // now run the Canopy job
    CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, 0.0, false);
    
    DummyOutputCollector<Text, ClusterWritable> collector1 =
        new DummyOutputCollector<Text, ClusterWritable>();

    FileStatus[] outParts = FileSystem.get(conf).globStatus(
                    new Path(outputPath, "clusters-0-final/*-0*"));
    for (FileStatus outPartStat : outParts) {
      for (Pair<Text,ClusterWritable> record :
               new SequenceFileIterable<Text,ClusterWritable>(
                 outPartStat.getPath(), conf)) {
          collector1.collect(record.getFirst(), record.getSecond());
      }
    }

    boolean got15 = false;
    boolean got43 = false;
    int count = 0;
    for (Text k : collector1.getKeys()) {
      count++;
      List<ClusterWritable> vl = collector1.getValue(k);
      assertEquals("non-singleton centroid!", 1, vl.size());
      ClusterWritable clusterWritable = vl.get(0);
      Vector v = clusterWritable.getValue().getCenter();
      assertEquals("cetriod vector is wrong length", 2, v.size());
      if ( (Math.abs(v.get(0) - 1.5) < EPSILON) 
                  && (Math.abs(v.get(1) - 1.5) < EPSILON)
                  && !got15) {
        got15 = true;
      } else if ( (Math.abs(v.get(0) - 4.333333333333334) < EPSILON) 
                  && (Math.abs(v.get(1) - 4.333333333333334) < EPSILON)
                  && !got43) {
        got43 = true;
      } else {
        fail("got unexpected center: " + v + " [" + v.getClass().toString() + ']');
      }
    }
    assertEquals("got unexpected number of centers", 2, count);

    // now run the KMeans job
    Path kmeansOutput = new Path(outputPath, "kmeans");
	  KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput,
      0.001, 10, true, 0.0, false);
    
    // now compare the expected clusters with actual
    Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");
    DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
    
    // The key is the clusterId, the value is the weighted vector
    for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
        new Path(clusteredPointsPath, "part-m-00000"), conf)) {
      collector.collect(record.getFirst(), record.getSecond());
    }
    
    for (IntWritable k : collector.getKeys()) {
      List<WeightedPropertyVectorWritable> wpvList = collector.getValue(k);
      assertTrue("empty cluster!", !wpvList.isEmpty());
      if (wpvList.get(0).getVector().get(0) <= 2.0) {
        for (WeightedPropertyVectorWritable wv : wpvList) {
          Vector v = wv.getVector();
          int idx = v.maxValueIndex();
          assertTrue("bad cluster!", v.get(idx) <= 2.0);
        }
        assertEquals("Wrong size cluster", 4, wpvList.size());
      } else {
        for (WeightedPropertyVectorWritable wv : wpvList) {
          Vector v = wv.getVector();
          int idx = v.minValueIndex();
          assertTrue("bad cluster!", v.get(idx) > 2.0);
        }
        assertEquals("Wrong size cluster", 5, wpvList.size());
      }
    }
  }
}