/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.meanshift;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.common.kernel.IKernelProfile;
import org.apache.mahout.common.kernel.TriangularKernelProfile;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
public final class TestMeanShift extends MahoutTestCase {
private Vector[] raw = null;
private final DistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
private final IKernelProfile kernelProfile = new TriangularKernelProfile();
  /**
   * Print the canopies to the test output
   *
   * @param canopies
   *          an Iterable<MeanShiftCanopy>
   */
private static void printCanopies(Iterable<MeanShiftCanopy> canopies) {
for (MeanShiftCanopy canopy : canopies) {
System.out.println(canopy.asFormatString(null));
}
}
/**
* Print a graphical representation of the clustered image points as a 10x10
* character mask
*/
private void printImage(Iterable<MeanShiftCanopy> canopies) {
char[][] out = new char[10][10];
for (int i = 0; i < out.length; i++) {
for (int j = 0; j < out[0].length; j++) {
out[i][j] = ' ';
}
}
for (MeanShiftCanopy canopy : canopies) {
int ch = 'A' + canopy.getId();
for (int pid : canopy.getBoundPoints().toList()) {
Vector pt = raw[pid];
out[(int) pt.getQuick(0)][(int) pt.getQuick(1)] = (char) ch;
}
}
for (char[] anOut : out) {
System.out.println(anOut);
}
}
private List<MeanShiftCanopy> getInitialCanopies() {
int nextCanopyId = 0;
List<MeanShiftCanopy> canopies = Lists.newArrayList();
for (Vector point : raw) {
canopies.add(new MeanShiftCanopy(point, nextCanopyId++,
euclideanDistanceMeasure));
}
return canopies;
}
@Override
@Before
public void setUp() throws Exception {
super.setUp();
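    // build a 10x10 grid of points in the x-y plane: points on the main
    // diagonal get z = 9, points on the anti-diagonal get z = 4.5 and all
    // others default to z = 0, giving the clusterer two ridges to find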
raw = new Vector[100];
for (int i = 0; i < 10; i++) {
for (int j = 0; j < 10; j++) {
int ix = i * 10 + j;
Vector v = new DenseVector(3);
v.setQuick(0, i);
v.setQuick(1, j);
if (i == j) {
v.setQuick(2, 9);
} else if (i + j == 9) {
v.setQuick(2, 4.5);
}
raw[ix] = v;
}
}
}
  /**
   * Story: User can exercise the reference implementation to verify that the
   * test data points are clustered in a reasonable manner.
   */
@Test
public void testReferenceImplementation() {
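    // clusterer arguments, reading positionally: T1 = 4.0 (outer radius),
    // T2 = 1.0 (inner radius), convergence delta = 0.5, and a trailing
    // boolean that presumably enables the points-clustering step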
MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(
new EuclideanDistanceMeasure(), new TriangularKernelProfile(), 4.0,
1.0, 0.5, true);
List<MeanShiftCanopy> canopies = Lists.newArrayList();
// add all points to the canopies
int nextCanopyId = 0;
for (Vector aRaw : raw) {
clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++,
euclideanDistanceMeasure), canopies);
}
boolean done = false;
int iter = 1;
    while (!done) { // shift canopies to their centroids until all converge
done = true;
List<MeanShiftCanopy> migratedCanopies = Lists.newArrayList();
for (MeanShiftCanopy canopy : canopies) {
done = clusterer.shiftToMean(canopy) && done;
clusterer.mergeCanopy(canopy, migratedCanopies);
}
canopies = migratedCanopies;
printCanopies(canopies);
printImage(canopies);
System.out.println(iter++);
}
}
/**
* Test the MeanShiftCanopyClusterer's reference implementation. Should
* produce the same final output as above.
*/
@Test
public void testClustererReferenceImplementation() {
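    // argument reading is an assumption based on values used elsewhere in
    // this class: 0.5 = convergence delta, 4 = T1, 1 = T2, 10 = max iterations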
Iterable<Vector> points = Lists.newArrayList(raw);
List<MeanShiftCanopy> canopies = MeanShiftCanopyClusterer.clusterPoints(
points, euclideanDistanceMeasure, kernelProfile, 0.5, 4, 1, 10);
printCanopies(canopies);
printImage(canopies);
}
  /**
   * Story: User can produce initial canopy centers using an
   * EuclideanDistanceMeasure and a MeanShiftCanopyMapper which clusters input
   * points to produce an output set of canopies.
   */
@Test
public void testCanopyMapperEuclidean() throws Exception {
MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(
euclideanDistanceMeasure, kernelProfile, 4, 1, 0.5, true);
// get the initial canopies
List<MeanShiftCanopy> canopies = getInitialCanopies();
// build the reference set
Collection<MeanShiftCanopy> refCanopies = Lists.newArrayList();
int nextCanopyId = 0;
for (Vector aRaw : raw) {
clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++,
euclideanDistanceMeasure), refCanopies);
}
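    // the Configuration below mirrors the reference clusterer's parameters so
    // the mapper's output can be compared canopy-for-canopy with refCanopies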
Configuration conf = new Configuration();
conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(MeanShiftCanopyConfigKeys.KERNEL_PROFILE_KEY,
"org.apache.mahout.common.kernel.TriangularKernelProfile");
conf.set(MeanShiftCanopyConfigKeys.T1_KEY, "4");
conf.set(MeanShiftCanopyConfigKeys.T2_KEY, "1");
conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.5");
// map the data
MeanShiftCanopyMapper mapper = new MeanShiftCanopyMapper();
DummyRecordWriter<Text, MeanShiftCanopy> mapWriter = new DummyRecordWriter<Text, MeanShiftCanopy>();
Mapper<WritableComparable<?>, MeanShiftCanopy, Text, MeanShiftCanopy>.Context mapContext = DummyRecordWriter
.build(mapper, conf, mapWriter);
mapper.setup(mapContext);
for (MeanShiftCanopy canopy : canopies) {
mapper.map(new Text(), canopy, mapContext);
}
mapper.cleanup(mapContext);
// now verify the output
assertEquals("Number of map results", 1, mapWriter.getData().size());
List<MeanShiftCanopy> data = mapWriter.getValue(new Text("0"));
assertEquals("Number of canopies", refCanopies.size(), data.size());
// add all points to the reference canopies
Map<String, MeanShiftCanopy> refCanopyMap = Maps.newHashMap();
for (MeanShiftCanopy canopy : refCanopies) {
clusterer.shiftToMean(canopy);
refCanopyMap.put(canopy.getIdentifier(), canopy);
}
// build a map of the combiner output
Map<String, MeanShiftCanopy> canopyMap = Maps.newHashMap();
for (MeanShiftCanopy d : data) {
canopyMap.put(d.getIdentifier(), d);
}
// compare the maps
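    // canopy identifiers are prefixed "MSV-" once converged, "MSC-" otherwise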
    for (Map.Entry<String, MeanShiftCanopy> entry : refCanopyMap.entrySet()) {
      MeanShiftCanopy ref = entry.getValue();
      MeanShiftCanopy canopy = canopyMap
          .get((ref.isConverged() ? "MSV-" : "MSC-") + ref.getId());
assertEquals("ids", ref.getId(), canopy.getId());
assertEquals("centers(" + ref.getIdentifier() + ')', ref.getCenter()
.asFormatString(), canopy.getCenter().asFormatString());
assertEquals("bound points", ref.getBoundPoints().toList().size(), canopy
.getBoundPoints().toList().size());
assertEquals("num bound points", ref.getNumPoints(), canopy
.getNumPoints());
}
}
  /**
   * Story: User can produce final canopy centers using an
   * EuclideanDistanceMeasure and a MeanShiftCanopyReducer which clusters input
   * centroid points to produce an output set of final canopy centroid points.
   */
@Test
public void testCanopyReducerEuclidean() throws Exception {
MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(
euclideanDistanceMeasure, kernelProfile, 4, 1, 0.5, true);
// get the initial canopies
List<MeanShiftCanopy> canopies = getInitialCanopies();
// build the mapper output reference set
Collection<MeanShiftCanopy> mapperReference = Lists.newArrayList();
int nextCanopyId = 0;
for (Vector aRaw : raw) {
clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++,
euclideanDistanceMeasure), mapperReference);
}
for (MeanShiftCanopy canopy : mapperReference) {
clusterer.shiftToMean(canopy);
}
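    // simulate a single reduce pass: merge the shifted mapper canopies, then
    // shift the merged set once more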
// build the reducer reference output set
Collection<MeanShiftCanopy> reducerReference = Lists.newArrayList();
for (MeanShiftCanopy canopy : mapperReference) {
clusterer.mergeCanopy(canopy, reducerReference);
}
for (MeanShiftCanopy canopy : reducerReference) {
clusterer.shiftToMean(canopy);
}
Configuration conf = new Configuration();
conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY,
"org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(MeanShiftCanopyConfigKeys.KERNEL_PROFILE_KEY,
"org.apache.mahout.common.kernel.TriangularKernelProfile");
conf.set(MeanShiftCanopyConfigKeys.T1_KEY, "4");
conf.set(MeanShiftCanopyConfigKeys.T2_KEY, "1");
conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.5");
conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, "output/control");
MeanShiftCanopyMapper mapper = new MeanShiftCanopyMapper();
DummyRecordWriter<Text, MeanShiftCanopy> mapWriter = new DummyRecordWriter<Text, MeanShiftCanopy>();
Mapper<WritableComparable<?>, MeanShiftCanopy, Text, MeanShiftCanopy>.Context mapContext = DummyRecordWriter
.build(mapper, conf, mapWriter);
mapper.setup(mapContext);
// map the data
for (MeanShiftCanopy canopy : canopies) {
mapper.map(new Text(), canopy, mapContext);
}
mapper.cleanup(mapContext);
assertEquals("Number of map results", 1, mapWriter.getData().size());
// now reduce the mapper output
MeanShiftCanopyReducer reducer = new MeanShiftCanopyReducer();
DummyRecordWriter<Text, MeanShiftCanopy> reduceWriter = new DummyRecordWriter<Text, MeanShiftCanopy>();
Reducer<Text, MeanShiftCanopy, Text, MeanShiftCanopy>.Context reduceContext = DummyRecordWriter
.build(reducer, conf, reduceWriter, Text.class, MeanShiftCanopy.class);
reducer.setup(reduceContext);
reducer.reduce(new Text("0"), mapWriter.getValue(new Text("0")),
reduceContext);
reducer.cleanup(reduceContext);
// now verify the output
assertEquals("Number of canopies", reducerReference.size(), reduceWriter
.getKeys().size());
// add all points to the reference canopy maps
Map<String, MeanShiftCanopy> reducerReferenceMap = Maps.newHashMap();
for (MeanShiftCanopy canopy : reducerReference) {
reducerReferenceMap.put(canopy.getIdentifier(), canopy);
}
// compare the maps
for (Map.Entry<String, MeanShiftCanopy> mapEntry : reducerReferenceMap
.entrySet()) {
MeanShiftCanopy refCanopy = mapEntry.getValue();
      List<MeanShiftCanopy> values = reduceWriter.getValue(new Text(
          (refCanopy.isConverged() ? "MSV-" : "MSC-") + refCanopy.getId()));
assertEquals("values", 1, values.size());
MeanShiftCanopy reducerCanopy = values.get(0);
assertEquals("ids", refCanopy.getId(), reducerCanopy.getId());
long refNumPoints = refCanopy.getNumPoints();
long reducerNumPoints = reducerCanopy.getNumPoints();
assertEquals("numPoints", refNumPoints, reducerNumPoints);
String refCenter = refCanopy.getCenter().asFormatString();
String reducerCenter = reducerCanopy.getCenter().asFormatString();
assertEquals("centers(" + mapEntry.getKey() + ')', refCenter,
reducerCenter);
assertEquals("bound points", refCanopy.getBoundPoints().toList().size(),
reducerCanopy.getBoundPoints().toList().size());
assertEquals("num bound points", refCanopy.getNumPoints(), reducerCanopy
.getNumPoints());
}
}
  /**
   * Story: User can produce a final point clustering using a Hadoop map/reduce
   * job and an EuclideanDistanceMeasure.
   */
@Test
public void testCanopyEuclideanMRJob() throws Exception {
Path input = getTestTempDirPath("testdata");
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
Collection<VectorWritable> points = Lists.newArrayList();
for (Vector v : raw) {
points.add(new VectorWritable(v));
}
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, conf);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, conf);
    // run the job through the driver's command-line entry point via ToolRunner
    Path output = getTestTempDirPath("output");
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
getTestTempDirPath("testdata").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
EuclideanDistanceMeasure.class.getName(),
optKey(DefaultOptionCreator.KERNEL_PROFILE_OPTION),
TriangularKernelProfile.class.getName(),
optKey(DefaultOptionCreator.T1_OPTION), "4",
optKey(DefaultOptionCreator.T2_OPTION), "1",
optKey(DefaultOptionCreator.CLUSTERING_OPTION),
optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "7",
optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
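    // the driver writes a clusters-N directory per iteration, suffixing the
    // final one "-final"; the three surviving clusters presumably correspond
    // to the two elevated diagonals and the flat background plane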
Path outPart = new Path(output, "clusters-4-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
outPart = new Path(output, "clusters-0/part-m-00000");
Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
true, conf);
// now test the initial clusters to ensure the type of their centers has
// been retained
while (iterator.hasNext()) {
MeanShiftCanopy canopy = (MeanShiftCanopy) iterator.next();
assertTrue(canopy.getCenter() instanceof DenseVector);
assertFalse(canopy.getBoundPoints().isEmpty());
}
}
  /**
   * Story: User can produce a final point clustering using the sequential
   * (in-memory) execution method and an EuclideanDistanceMeasure.
   */
@Test
public void testCanopyEuclideanSeqJob() throws Exception {
Path input = getTestTempDirPath("testdata");
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
Collection<VectorWritable> points = Lists.newArrayList();
for (Vector v : raw) {
points.add(new VectorWritable(v));
}
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, conf);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, conf);
    // run the job through the driver's command-line entry point via ToolRunner
    Path output = getTestTempDirPath("output");
    System.out.println("Output Path: " + output);
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
getTestTempDirPath("testdata").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
EuclideanDistanceMeasure.class.getName(),
optKey(DefaultOptionCreator.KERNEL_PROFILE_OPTION),
TriangularKernelProfile.class.getName(),
optKey(DefaultOptionCreator.T1_OPTION), "4",
optKey(DefaultOptionCreator.T2_OPTION), "1",
optKey(DefaultOptionCreator.CLUSTERING_OPTION),
optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "7",
optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args);
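    // the sequential driver appears to run all 7 requested iterations here,
    // hence the clusters-7-final path below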
Path outPart = new Path(output, "clusters-7-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
}
  /**
   * Story: User can produce final canopies, without running the points
   * clustering step, using a Hadoop map/reduce job and an
   * EuclideanDistanceMeasure.
   */
@Test
public void testCanopyEuclideanMRJobNoClustering() throws Exception {
Path input = getTestTempDirPath("testdata");
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
Collection<VectorWritable> points = Lists.newArrayList();
for (Vector v : raw) {
points.add(new VectorWritable(v));
}
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, conf);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, conf);
    // run the job through the driver's command-line entry point via ToolRunner
    Path output = getTestTempDirPath("output");
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
getTestTempDirPath("testdata").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
EuclideanDistanceMeasure.class.getName(),
optKey(DefaultOptionCreator.KERNEL_PROFILE_OPTION),
TriangularKernelProfile.class.getName(),
optKey(DefaultOptionCreator.T1_OPTION), "4",
optKey(DefaultOptionCreator.T2_OPTION), "1",
optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "7",
optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
Path outPart = new Path(output, "clusters-3-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
true, conf);
while (iterator.hasNext()) {
MeanShiftCanopy canopy = (MeanShiftCanopy) iterator.next();
assertTrue(canopy.getCenter() instanceof DenseVector);
assertEquals(1, canopy.getBoundPoints().size());
}
}
  /**
   * Story: User can produce final canopies, without running the points
   * clustering step, using the sequential (in-memory) execution method and an
   * EuclideanDistanceMeasure.
   */
@Test
public void testCanopyEuclideanSeqJobNoClustering() throws Exception {
Path input = getTestTempDirPath("testdata");
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
Collection<VectorWritable> points = Lists.newArrayList();
for (Vector v : raw) {
points.add(new VectorWritable(v));
}
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file1"), fs, conf);
ClusteringTestUtils.writePointsToFile(points,
getTestTempFilePath("testdata/file2"), fs, conf);
    // run the job through the driver's command-line entry point via ToolRunner
    Path output = getTestTempDirPath("output");
    System.out.println("Output Path: " + output);
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
getTestTempDirPath("testdata").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
EuclideanDistanceMeasure.class.getName(),
optKey(DefaultOptionCreator.KERNEL_PROFILE_OPTION),
TriangularKernelProfile.class.getName(),
optKey(DefaultOptionCreator.T1_OPTION), "4",
optKey(DefaultOptionCreator.T2_OPTION), "1",
optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "7",
optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args);
Path outPart = new Path(output, "clusters-7-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
true, conf);
while (iterator.hasNext()) {
MeanShiftCanopy canopy = (MeanShiftCanopy) iterator.next();
assertTrue(canopy.getCenter() instanceof DenseVector);
assertEquals(1, canopy.getBoundPoints().size());
}
}
}