/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.canopy;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;

public final class TestCanopyCreation extends MahoutTestCase {
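
// The nine RAW points form two loose groups, one near (1.5, 1.5) and the
// other near (4.5, 4.5), with (3, 3) lying between them; with the T1/T2
// thresholds of 3.1/2.1 used throughout these tests, the reference
// algorithm forms three overlapping canopies from this data (see setUp()).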
private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 },
{ 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
private List<Canopy> referenceManhattan;
private final DistanceMeasure manhattanDistanceMeasure = new ManhattanDistanceMeasure();
private List<Vector> manhattanCentroids;
private List<Canopy> referenceEuclidean;
private final DistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
private List<Vector> euclideanCentroids;
private FileSystem fs;
private static List<VectorWritable> getPointsWritable() {
List<VectorWritable> points = Lists.newArrayList();
for (double[] fr : RAW) {
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
points.add(new VectorWritable(vec));
}
return points;
}
private static List<Vector> getPoints() {
List<Vector> points = Lists.newArrayList();
for (double[] fr : RAW) {
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
points.add(vec);
}
return points;
}
/**
 * Print the canopies to System.out for inspection.
 *
 * @param canopies
 *          an {@code Iterable<Canopy>} of canopies to print
 */
private static void printCanopies(Iterable<Canopy> canopies) {
for (Canopy canopy : canopies) {
System.out.println(canopy.asFormatString(null));
}
}
private static Canopy findCanopy(int key, Iterable<Canopy> canopies) {
for (Canopy c : canopies) {
if (c.getId() == key) {
return c;
}
}
return null;
}
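
/*
 * Canopy clustering makes a single pass over the input: each point is added
 * to every canopy whose center lies within the loose threshold T1 of it, and
 * a point that falls within the tight threshold T2 of an existing center is
 * removed from further consideration, so it can never seed a new canopy. The
 * reference canopies built below use T1 = 3.1 and T2 = 2.1 for both distance
 * measures.
 */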
@Override
@Before
public void setUp() throws Exception {
super.setUp();
fs = FileSystem.get(new Configuration());
referenceManhattan = CanopyClusterer.createCanopies(getPoints(),
manhattanDistanceMeasure, 3.1, 2.1);
manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan);
referenceEuclidean = CanopyClusterer.createCanopies(getPoints(),
euclideanDistanceMeasure, 3.1, 2.1);
euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean);
}
/**
* Story: User can cluster points using a ManhattanDistanceMeasure and a
* reference implementation
*/
@Test
public void testReferenceManhattan() throws Exception {
// see setUp for cluster creation
printCanopies(referenceManhattan);
assertEquals("number of canopies", 3, referenceManhattan.size());
int[] expectedNumPoints = { 4, 4, 3 };
double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 },
{ 4.666666666666667, 4.666666666666667 } };
for (int canopyIx = 0; canopyIx < referenceManhattan.size(); canopyIx++) {
Canopy testCanopy = referenceManhattan.get(canopyIx);
assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
testCanopy.getNumPoints());
double[] refCentroid = expectedCentroids[canopyIx];
Vector testCentroid = testCanopy.computeCentroid();
for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
}
}
}
/**
* Story: User can cluster points using a EuclideanDistanceMeasure and a
* reference implementation
*/
@Test
public void testReferenceEuclidean() throws Exception {
// see setUp for cluster creation
printCanopies(referenceEuclidean);
assertEquals("number of canopies", 3, referenceEuclidean.size());
int[] expectedNumPoints = { 5, 5, 3 };
double[][] expectedCentroids = { { 1.8, 1.8 }, { 4.2, 4.2 },
{ 4.666666666666667, 4.666666666666667 } };
for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) {
Canopy testCanopy = referenceEuclidean.get(canopyIx);
assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
testCanopy.getNumPoints());
double[] refCentroid = expectedCentroids[canopyIx];
Vector testCentroid = testCanopy.computeCentroid();
for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
}
}
}
/**
* Story: User can produce initial canopy centers using a
* ManhattanDistanceMeasure and a CanopyMapper which clusters input points to
* produce an output set of canopy centroid points.
*/
@Test
public void testCanopyMapperManhattan() throws Exception {
CanopyMapper mapper = new CanopyMapper();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure
.getClass().getName());
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.CF_KEY, "0");
DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter
.build(mapper, conf, writer);
mapper.setup(context);
List<VectorWritable> points = getPointsWritable();
// map the data
for (VectorWritable point : points) {
mapper.map(new Text(), point, context);
}
mapper.cleanup(context);
assertEquals("Number of map results", 1, writer.getData().size());
// now verify the output
List<VectorWritable> data = writer.getValue(new Text("centroid"));
assertEquals("Number of centroids", 3, data.size());
for (int i = 0; i < data.size(); i++) {
assertEquals("Centroid error",
manhattanCentroids.get(i).asFormatString(), data.get(i).get()
.asFormatString());
}
}
/**
 * Story: User can produce initial canopy centers using a
 * EuclideanDistanceMeasure and a CanopyMapper which clusters input points to
 * produce an output set of canopy centroid points.
 */
@Test
public void testCanopyMapperEuclidean() throws Exception {
CanopyMapper mapper = new CanopyMapper();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, euclideanDistanceMeasure
.getClass().getName());
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.CF_KEY, "0");
DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter
.build(mapper, conf, writer);
mapper.setup(context);
List<VectorWritable> points = getPointsWritable();
// map the data
for (VectorWritable point : points) {
mapper.map(new Text(), point, context);
}
mapper.cleanup(context);
assertEquals("Number of map results", 1, writer.getData().size());
// now verify the output
List<VectorWritable> data = writer.getValue(new Text("centroid"));
assertEquals("Number of centroids", 3, data.size());
for (int i = 0; i < data.size(); i++) {
assertEquals("Centroid error",
euclideanCentroids.get(i).asFormatString(), data.get(i).get()
.asFormatString());
}
}
/**
* Story: User can produce final canopy centers using a
* ManhattanDistanceMeasure and a CanopyReducer which clusters input centroid
* points to produce an output set of final canopy centroid points.
*/
@Test
public void testCanopyReducerManhattan() throws Exception {
CanopyReducer reducer = new CanopyReducer();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.ManhattanDistanceMeasure");
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.CF_KEY, "0");
DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
.build(reducer, conf, writer, Text.class, VectorWritable.class);
reducer.setup(context);
List<VectorWritable> points = getPointsWritable();
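// every mapper emits its centroids under the single key "centroid", so the
// reducer sees them all together and runs the canopy algorithm a second
// time over them to form the final canopies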
reducer.reduce(new Text("centroid"), points, context);
Set<Text> keys = writer.getKeys();
assertEquals("Number of centroids", 3, keys.size());
int i = 0;
for (Text key : keys) {
List<Canopy> data = writer.getValue(key);
assertEquals(manhattanCentroids.get(i).asFormatString()
+ " is not equal to "
+ data.get(0).computeCentroid().asFormatString(), manhattanCentroids
.get(i), data.get(0).computeCentroid());
i++;
}
}
/**
* Story: User can produce final canopy centers using a
* EuclideanDistanceMeasure and a CanopyReducer which clusters input centroid
* points to produce an output set of final canopy centroid points.
*/
@Test
public void testCanopyReducerEuclidean() throws Exception {
CanopyReducer reducer = new CanopyReducer();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.CF_KEY, "0");
DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
.build(reducer, conf, writer, Text.class, VectorWritable.class);
reducer.setup(context);
List<VectorWritable> points = getPointsWritable();
reducer.reduce(new Text("centroid"), points, context);
Set<Text> keys = writer.getKeys();
assertEquals("Number of centroids", 3, keys.size());
int i = 0;
for (Text key : keys) {
List<Canopy> data = writer.getValue(key);
assertEquals(euclideanCentroids.get(i).asFormatString()
+ " is not equal to "
+ data.get(0).computeCentroid().asFormatString(), euclideanCentroids
.get(i), data.get(0).computeCentroid());
i++;
}
}
/**
* Story: User can produce final canopy centers using a Hadoop map/reduce job
* and a ManhattanDistanceMeasure.
*/
@Test
public void testCanopyGenManhattanMR() throws Exception {
List<VectorWritable> points = getPointsWritable();
Configuration config = new Configuration();
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, config);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, config);
// now run the Canopy Driver
Path output = getTestTempDirPath("output");
CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
manhattanDistanceMeasure, 3.1, 2.1, false, false);
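// Each mapper emits centroids near (1.5, 1.5), (4, 4) and (4.67, 4.67); in
// the reducer the latter two lie within T2 of each other and merge, which
// is why the second canopy's expected center below is their mean, roughly
// (4.33, 4.33).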
// verify output from sequence file
Path path = new Path(output, "clusters-0-final/part-r-00000");
FileSystem outputFs = FileSystem.get(path.toUri(), config);
SequenceFile.Reader reader = new SequenceFile.Reader(outputFs, path, config);
try {
Writable key = new Text();
Canopy canopy = new Canopy();
assertTrue("more to come", reader.next(key, canopy));
assertEquals("1st key", "C-0", key.toString());
assertEquals("1st x value", 1.5, canopy.getCenter().get(0), EPSILON);
assertEquals("1st y value", 1.5, canopy.getCenter().get(1), EPSILON);
assertTrue("more to come", reader.next(key, canopy));
assertEquals("2nd key", "C-1", key.toString());
assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0),
EPSILON);
assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1),
EPSILON);
assertFalse("more to come", reader.next(key, canopy));
} finally {
Closeables.closeQuietly(reader);
}
}
/**
* Story: User can produce final canopy centers using a Hadoop map/reduce job
* and a EuclideanDistanceMeasure.
*/
@Test
public void testCanopyGenEuclideanMR() throws Exception {
List<VectorWritable> points = getPointsWritable();
Configuration config = new Configuration();
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, config);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, config);
// now run the Canopy Driver
Path output = getTestTempDirPath("output");
CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
euclideanDistanceMeasure, 3.1, 2.1, false, false);
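// As in the Manhattan case, the reducer merges the two upper mapper
// centroids, here (4.2, 4.2) and (4.67, 4.67), giving the expected second
// center of roughly (4.43, 4.43).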
// verify output from sequence file
Path path = new Path(output, "clusters-0-final/part-r-00000");
FileSystem outputFs = FileSystem.get(path.toUri(), config);
SequenceFile.Reader reader = new SequenceFile.Reader(outputFs, path, config);
try {
Writable key = new Text();
Canopy value = new Canopy();
assertTrue("more to come", reader.next(key, value));
assertEquals("1st key", "C-0", key.toString());
assertEquals("1st x value", 1.8, value.getCenter().get(0), EPSILON);
assertEquals("1st y value", 1.8, value.getCenter().get(1), EPSILON);
assertTrue("more to come", reader.next(key, value));
assertEquals("2nd key", "C-1", key.toString());
assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0),
EPSILON);
assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1),
EPSILON);
assertFalse("more to come", reader.next(key, value));
} finally {
Closeables.closeQuietly(reader);
}
}
/**
* Story: User can cluster a subset of the points using a ClusterMapper and a
* ManhattanDistanceMeasure.
*/
@Test
public void testClusterMapperManhattan() throws Exception {
ClusterMapper mapper = new ClusterMapper();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.ManhattanDistanceMeasure");
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>();
Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context context = DummyRecordWriter
.build(mapper, conf, writer);
mapper.setup(context);
Collection<Canopy> canopies = Lists.newArrayList();
int nextCanopyId = 0;
for (Vector centroid : manhattanCentroids) {
canopies.add(new Canopy(centroid, nextCanopyId++,
manhattanDistanceMeasure));
}
setField(mapper, "canopies", canopies);
List<VectorWritable> points = getPointsWritable();
// map the data
for (VectorWritable point : points) {
mapper.map(new Text(), point, context);
}
Map<IntWritable, List<WeightedVectorWritable>> data = writer.getData();
assertEquals("Number of map results", canopies.size(), data.size());
for (Entry<IntWritable, List<WeightedVectorWritable>> entry : data
.entrySet()) {
IntWritable key = entry.getKey();
Canopy canopy = findCanopy(key.get(), canopies);
List<WeightedVectorWritable> pts = entry.getValue();
for (WeightedVectorWritable ptDef : pts) {
assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef
.getVector()));
}
}
}
/**
* Story: User can cluster a subset of the points using a ClusterMapper and a
* EuclideanDistanceMeasure.
*/
@Test
public void testClusterMapperEuclidean() throws Exception {
ClusterMapper mapper = new ClusterMapper();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>();
Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context context = DummyRecordWriter
.build(mapper, conf, writer);
mapper.setup(context);
Collection<Canopy> canopies = Lists.newArrayList();
int nextCanopyId = 0;
for (Vector centroid : euclideanCentroids) {
canopies.add(new Canopy(centroid, nextCanopyId++,
euclideanDistanceMeasure));
}
setField(mapper, "canopies", canopies);
List<VectorWritable> points = getPointsWritable();
// map the data
for (VectorWritable point : points) {
mapper.map(new Text(), point, context);
}
Map<IntWritable, List<WeightedVectorWritable>> data = writer.getData();
assertEquals("Number of map results", canopies.size(), data.size());
for (Entry<IntWritable, List<WeightedVectorWritable>> entry : data
.entrySet()) {
IntWritable key = entry.getKey();
Canopy canopy = findCanopy(key.get(), canopies);
List<WeightedVectorWritable> pts = entry.getValue();
for (WeightedVectorWritable ptDef : pts) {
assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef
.getVector()));
}
}
}
/** Story: User can cluster points using sequential execution */
@Test
public void testClusteringManhattanSeq() throws Exception {
List<VectorWritable> points = getPointsWritable();
Configuration config = new Configuration();
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, config);
// now run the Canopy Driver in sequential mode
Path output = getTestTempDirPath("output");
CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
manhattanDistanceMeasure, 3.1, 2.1, true, true);
// verify output from sequence file
Path path = new Path(output, "clusters-0-final/part-r-00000");
int ix = 0;
for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true,
config)) {
assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), value
.getCenter());
ix++;
}
path = new Path(output, "clusteredPoints/part-m-0");
long count = HadoopUtil.countRecords(path, config);
assertEquals("number of points", points.size(), count);
}
/** Story: User can cluster points using sequential execution */
@Test
public void testClusteringEuclideanSeq() throws Exception {
List<VectorWritable> points = getPointsWritable();
Configuration config = new Configuration();
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, config);
// now run the Canopy Driver in sequential mode
Path output = getTestTempDirPath("output");
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
getTestTempDirPath("testdata").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
EuclideanDistanceMeasure.class.getName(),
optKey(DefaultOptionCreator.T1_OPTION), "3.1",
optKey(DefaultOptionCreator.T2_OPTION), "2.1",
optKey(DefaultOptionCreator.CLUSTERING_OPTION),
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
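// calling run(String[]) directly exercises the driver's own argument
// parsing, without going through ToolRunner as the MR test below does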
new CanopyDriver().run(args);
// verify output from sequence file
Path path = new Path(output, "clusters-0-final/part-r-00000");
int ix = 0;
for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true,
config)) {
assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix), value
.getCenter());
ix++;
}
path = new Path(output, "clusteredPoints/part-m-0");
long count = HadoopUtil.countRecords(path, config);
assertEquals("number of points", points.size(), count);
}
/**
* Story: User can produce final point clustering using a Hadoop map/reduce
* job and a ManhattanDistanceMeasure.
*/
@Test
public void testClusteringManhattanMR() throws Exception {
List<VectorWritable> points = getPointsWritable();
Configuration conf = new Configuration();
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, conf);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, conf);
// now run the Job
Path output = getTestTempDirPath("output");
CanopyDriver.run(conf, getTestTempDirPath("testdata"), output,
manhattanDistanceMeasure, 3.1, 2.1, true, false);
Path path = new Path(output, "clusteredPoints/part-m-00000");
long count = HadoopUtil.countRecords(path, conf);
assertEquals("number of points", points.size(), count);
}
/**
* Story: User can produce final point clustering using a Hadoop map/reduce
* job and a EuclideanDistanceMeasure.
*/
@Test
public void testClusteringEuclideanMR() throws Exception {
List<VectorWritable> points = getPointsWritable();
Configuration conf = new Configuration();
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, conf);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, conf);
// now run the job via ToolRunner and the command-line interface; the static
// CanopyDriver.run() methods used elsewhere in this class are the
// programmatic alternative
Path output = getTestTempDirPath("output");
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
getTestTempDirPath("testdata").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
EuclideanDistanceMeasure.class.getName(),
optKey(DefaultOptionCreator.T1_OPTION), "3.1",
optKey(DefaultOptionCreator.T2_OPTION), "2.1",
optKey(DefaultOptionCreator.CLUSTERING_OPTION),
optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
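// ToolRunner first strips any generic Hadoop options, then passes the
// remaining arguments on to the driver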
ToolRunner.run(new Configuration(), new CanopyDriver(), args);
Path path = new Path(output, "clusteredPoints/part-m-00000");
long count = HadoopUtil.countRecords(path, conf);
assertEquals("number of points", points.size(), count);
}
/**
* Story: User can set T3 and T4 values to be used by the reducer for its T1
* and T2 thresholds
*/
@Test
public void testCanopyReducerT3T4Configuration() throws Exception {
CanopyReducer reducer = new CanopyReducer();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.ManhattanDistanceMeasure");
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(1.1));
conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(0.1));
conf.set(CanopyConfigKeys.CF_KEY, "0");
DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
.build(reducer, conf, writer, Text.class, VectorWritable.class);
reducer.setup(context);
assertEquals(1.1, reducer.getCanopyClusterer().getT1(), EPSILON);
assertEquals(0.1, reducer.getCanopyClusterer().getT2(), EPSILON);
}
/**
 * Story: User can specify a cluster filter that prevents the mapper from
 * producing small canopies
 */
@Test
public void testCanopyMapperClusterFilter() throws Exception {
CanopyMapper mapper = new CanopyMapper();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure
.getClass().getName());
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.CF_KEY, "3");
DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter
.build(mapper, conf, writer);
mapper.setup(context);
List<VectorWritable> points = getPointsWritable();
// map the data
for (VectorWritable point : points) {
mapper.map(new Text(), point, context);
}
mapper.cleanup(context);
assertEquals("Number of map results", 1, writer.getData().size());
// now verify the output
List<VectorWritable> data = writer.getValue(new Text("centroid"));
assertEquals("Number of centroids", 2, data.size());
}
/**
 * Story: User can specify a cluster filter that sets a minimum size for
 * canopies produced by the reducer
 */
@Test
public void testCanopyReducerClusterFilter() throws Exception {
CanopyReducer reducer = new CanopyReducer();
Configuration conf = new Configuration();
conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.ManhattanDistanceMeasure");
conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
conf.set(CanopyConfigKeys.CF_KEY, "3");
DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
.build(reducer, conf, writer, Text.class, VectorWritable.class);
reducer.setup(context);
List<VectorWritable> points = getPointsWritable();
reducer.reduce(new Text("centroid"), points, context);
Set<Text> keys = writer.getKeys();
assertEquals("Number of centroids", 2, keys.size());
}
}