/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.cdbw;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.TestClusterEvaluator;
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.dirichlet.DirichletDriver;
import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.kernel.IKernelProfile;
import org.apache.mahout.common.kernel.TriangularKernelProfile;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests for {@link CDbwEvaluator}. The first group of tests checks the four
 * reported metrics against fixed expected values on small hand-built cluster
 * sets; the second group runs several clustering drivers over generated
 * sample data and prints the metrics the evaluator reports for their output.
 */
public final class TestCDbwEvaluator extends MahoutTestCase {

  private static final double[][] REFERENCE = {
      {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};

  // Log under this test's own class so its output is attributed correctly.
  private static final Logger log = LoggerFactory.getLogger(TestCDbwEvaluator.class);

  /** Iteration count handed to RepresentativePointsDriver by the driver-based tests. */
  private static final int NUM_ITERATIONS = 10;

  private Map<Integer,List<VectorWritable>> representativePoints;

  private List<Cluster> clusters;

  private Configuration conf;

  private FileSystem fs;

  private final Collection<VectorWritable> sampleData = Lists.newArrayList();

  private List<VectorWritable> referenceData = Lists.newArrayList();

  private Path testdata;

  private Path output;

  /**
   * Prepare the Hadoop configuration and file system, the temporary input and
   * output directories, the small reference data set and the larger generated
   * sample data set.
   */
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    conf = new Configuration();
    fs = FileSystem.get(conf);
    testdata = getTestTempDirPath("testdata");
    output = getTestTempDirPath("output");
    // Create small reference data set
    referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
    // generate larger test data set for the clustering tests to chew on
    generateSamples();
  }

  /**
   * Initialize synthetic data using 4 clusters dC units from origin. Each
   * cluster gets 5 representative points: its center plus 4 points offset by
   * dP from the center along each diagonal.
   *
   * @param dC
   *          a double cluster center offset
   * @param dP
   *          a double representative point offset
   * @param measure
   *          the DistanceMeasure
   */
  private void initData(double dC, double dP, DistanceMeasure measure) {
    clusters = Lists.newArrayList();
    clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
    clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
    clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
    clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
    representativePoints = Maps.newHashMap();
    for (Cluster cluster : clusters) {
      List<VectorWritable> points = Lists.newArrayList();
      representativePoints.put(cluster.getId(), points);
      points.add(new VectorWritable(cluster.getCenter().clone()));
      points.add(new VectorWritable(cluster.getCenter().plus(
          new DenseVector(new double[] {dP, dP}))));
      points.add(new VectorWritable(cluster.getCenter().plus(
          new DenseVector(new double[] {dP, -dP}))));
      points.add(new VectorWritable(cluster.getCenter().plus(
          new DenseVector(new double[] {-dP, -dP}))));
      points.add(new VectorWritable(cluster.getCenter().plus(
          new DenseVector(new double[] {-dP, dP}))));
    }
  }

  /**
   * Generate random samples and add them to the sampleData
   *
   * @param num
   *          int number of samples to generate
   * @param mx
   *          double x-value of the sample mean
   * @param my
   *          double y-value of the sample mean
   * @param sd
   *          double standard deviation of the samples
   */
  private void generateSamples(int num, double mx, double my, double sd) {
    log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] {num, mx, my, sd});
    for (int i = 0; i < num; i++) {
      sampleData.add(new VectorWritable(new DenseVector(new double[] {
          UncommonDistributions.rNorm(mx, sd),
          UncommonDistributions.rNorm(my, sd)})));
    }
  }

  /** Fill sampleData with three batches of generated samples (1100 points in total). */
  private void generateSamples() {
    generateSamples(500, 1, 1, 3);
    generateSamples(300, 1, 0, 0.5);
    generateSamples(300, 0, 2, 0.1);
  }

  /**
   * Write the given points to the "testdata/file1" file in the test's
   * temporary directory.
   *
   * @param points
   *          the points to write
   * @throws IOException
   *           if the file cannot be written
   */
  private void writePoints(Collection<VectorWritable> points) throws IOException {
    ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, conf);
  }

  /**
   * Assert the four metrics reported by the evaluator, each within EPSILON of
   * its expected value. The metrics are queried in the order inter-cluster
   * density, separation, intra-cluster density, CDbw.
   *
   * @param evaluator
   *          the CDbwEvaluator under test
   * @param interDensity
   *          expected inter-cluster density
   * @param separation
   *          expected separation
   * @param intraDensity
   *          expected intra-cluster density
   * @param cdbw
   *          expected CDbw value
   */
  private void assertMetrics(CDbwEvaluator evaluator,
                             double interDensity,
                             double separation,
                             double intraDensity,
                             double cdbw) {
    assertEquals("inter cluster density", interDensity, evaluator.interClusterDensity(), EPSILON);
    assertEquals("separation", separation, evaluator.separation(), EPSILON);
    assertEquals("intra cluster density", intraDensity, evaluator.intraClusterDensity(), EPSILON);
    assertEquals("CDbw", cdbw, evaluator.getCDbw(), EPSILON);
  }

  /**
   * Run the representative points job over the clusters in the named
   * subdirectory of the output directory, build a CDbwEvaluator from the
   * result and print its four metrics to standard output.
   *
   * @param algorithm
   *          label printed in front of the CDbw value
   * @param clustersDirName
   *          name of the clusters directory below the output directory
   * @param measure
   *          the DistanceMeasure handed to the representative points job
   * @throws Exception
   *           if the representative points job or the evaluator setup fails
   */
  private void reportCDbw(String algorithm, String clustersDirName, DistanceMeasure measure)
    throws Exception {
    Path clustersIn = new Path(output, clustersDirName);
    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output,
        measure, NUM_ITERATIONS, true);
    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
    // now print out the Results
    System.out.println(algorithm + " CDbw = " + evaluator.getCDbw());
    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
    System.out.println("Separation = " + evaluator.separation());
  }

  @Test
  public void testCDbw0() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.25, measure);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 0.0, 20.485281374238568, 0.8, 16.388225099390855);
  }

  @Test
  public void testCDbw1() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.5, measure);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 1.2, 6.207661022496537, 0.4, 2.483064408998615);
  }

  @Test
  public void testCDbw2() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.75, measure);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 0.682842712474619, 4.0576740025245694, 0.26666666666666666,
        1.0820464006732184);
  }

  /**
   * A fifth cluster with no representative points at all is added; the
   * expected metrics are the same as for the four base clusters alone.
   *
   * @throws IOException
   */
  @Test
  public void testEmptyCluster() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.25, measure);
    Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
    clusters.add(cluster);
    List<VectorWritable> points = Lists.newArrayList();
    representativePoints.put(cluster.getId(), points);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 0.0, 20.485281374238568, 0.8, 16.388225099390855);
  }

  /**
   * A fifth cluster with a single representative point is added; the expected
   * metrics are the same as for the four base clusters alone.
   *
   * @throws IOException
   */
  @Test
  public void testSingleValueCluster() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.25, measure);
    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
    clusters.add(cluster);
    List<VectorWritable> points = Lists.newArrayList();
    points.add(new VectorWritable(cluster.getCenter().plus(
        new DenseVector(new double[] {1, 1}))));
    representativePoints.put(cluster.getId(), points);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 0.0, 20.485281374238568, 0.8, 16.388225099390855);
  }

  /**
   * Representative points extraction will duplicate the cluster center if the
   * cluster has no assigned points. These clusters should be ignored like empty
   * clusters above
   *
   * @throws IOException
   */
  @Test
  public void testAllSameValueCluster() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.25, measure);
    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
    clusters.add(cluster);
    List<VectorWritable> points = Lists.newArrayList();
    points.add(new VectorWritable(cluster.getCenter()));
    points.add(new VectorWritable(cluster.getCenter()));
    points.add(new VectorWritable(cluster.getCenter()));
    representativePoints.put(cluster.getId(), points);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 0.0, 20.485281374238568, 0.8, 16.388225099390855);
  }

  /**
   * Clustering can produce very, very tight clusters that can cause the std
   * calculation to fail. These clusters should be processed correctly.
   *
   * @throws IOException
   */
  @Test
  public void testAlmostSameValueCluster() throws IOException {
    writePoints(referenceData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.25, measure);
    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
    clusters.add(cluster);
    List<VectorWritable> points = Lists.newArrayList();
    Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL});
    // five identical points, each an independent copy of delta
    for (int i = 0; i < 5; i++) {
      points.add(new VectorWritable(delta.clone()));
    }
    representativePoints.put(cluster.getId(), points);
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    assertMetrics(evaluator, 0.0, 28.970562748477143, 1.8, 52.147012947258865);
  }

  @Test
  public void testCanopy() throws Exception {
    writePoints(sampleData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1, true, true);
    reportCDbw("Canopy", "clusters-0-final", measure);
  }

  @Test
  public void testKmeans() throws Exception {
    writePoints(sampleData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1, false, true);
    // now run the KMeans job
    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), output, measure, 0.001, 10,
        true, true);
    reportCDbw("K-Means", "clusters-2", measure);
  }

  @Test
  public void testFuzzyKmeans() throws Exception {
    writePoints(sampleData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    // now run the Canopy job to prime the Fuzzy KMeans canopies
    CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1, false, true);
    // now run the Fuzzy KMeans job
    FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), output, measure, 0.001,
        10, 2, true, true, 0, true);
    reportCDbw("Fuzzy K-Means", "clusters-4", measure);
  }

  @Test
  public void testMeanShift() throws Exception {
    writePoints(sampleData);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    IKernelProfile kernelProfile = new TriangularKernelProfile();
    MeanShiftCanopyDriver.run(conf, testdata, output, measure, kernelProfile, 2.1, 1.0, 0.001, 10,
        false, true, true);
    reportCDbw("Mean Shift", "clusters-2", measure);
  }

  @Test
  public void testDirichlet() throws Exception {
    writePoints(sampleData);
    DistributionDescription description = new DistributionDescription(
        GaussianClusterDistribution.class.getName(), DenseVector.class.getName(), null, 2);
    DirichletDriver.run(testdata, output, description, 15, 5, 1.0, true, true, 0, true);
    reportCDbw("Dirichlet", "clusters-0", new EuclideanDistanceMeasure());
  }
}