/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.clustering.fuzzykmeans; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.clustering.ClusteringTestUtils; import org.apache.mahout.clustering.kmeans.TestKmeansClustering; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.distance.EuclideanDistanceMeasure; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; import org.junit.Before; import org.junit.Test; import com.google.common.io.Closeables; public final class TestFuzzyKmeansClustering extends MahoutTestCase { private FileSystem fs; private final DistanceMeasure measure = new EuclideanDistanceMeasure(); @Override @Before public void setUp() throws Exception { super.setUp(); Configuration conf = getConfiguration(); fs = FileSystem.get(conf); } private static Vector tweakValue(Vector point) { return point.plus(0.1); } @Test public void testFuzzyKMeansSeqJob() throws Exception { List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE); Path pointsPath = getTestTempDirPath("points"); Path clustersPath = getTestTempDirPath("clusters"); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf); for (int k = 0; k < points.size(); k++) { System.out.println("testKFuzzyKMeansMRJob k= " + k); // pick k initial cluster centers at random SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersPath, "part-00000"), Text.class, SoftCluster.class); try { for (int i = 0; i < k + 1; i++) { Vector vec = tweakValue(points.get(i).get()); SoftCluster cluster = new SoftCluster(vec, i, measure); /* add the center so the centroid will be correct upon output */ cluster.observe(cluster.getCenter(), 1); // writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n'); writer.append(new Text(cluster.getIdentifier()), cluster); } } finally { Closeables.close(writer, false); } // now run the Job using the run() command line options. Path output = getTestTempDirPath("output" + k); /* FuzzyKMeansDriver.runJob(pointsPath, clustersPath, output, EuclideanDistanceMeasure.class.getName(), 0.001, 2, k + 1, 2, false, true, 0); */ String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(), optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", optKey(FuzzyKMeansDriver.M_OPTION), "2.0", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD }; FuzzyKMeansDriver.main(args); long count = HadoopUtil.countRecords(new Path(output, "clusteredPoints/part-m-0"), conf); assertTrue(count > 0); } } @Test public void testFuzzyKMeansMRJob() throws Exception { List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE); Path pointsPath = getTestTempDirPath("points"); Path clustersPath = getTestTempDirPath("clusters"); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf); for (int k = 0; k < points.size(); k++) { System.out.println("testKFuzzyKMeansMRJob k= " + k); // pick k initial cluster centers at random SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersPath, "part-00000"), Text.class, SoftCluster.class); try { for (int i = 0; i < k + 1; i++) { Vector vec = tweakValue(points.get(i).get()); SoftCluster cluster = new SoftCluster(vec, i, measure); /* add the center so the centroid will be correct upon output */ cluster.observe(cluster.getCenter(), 1); // writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n'); writer.append(new Text(cluster.getIdentifier()), cluster); } } finally { Closeables.close(writer, false); } // now run the Job using the run() command line options. Path output = getTestTempDirPath("output" + k); /* FuzzyKMeansDriver.runJob(pointsPath, clustersPath, output, EuclideanDistanceMeasure.class.getName(), 0.001, 2, k + 1, 2, false, true, 0); */ String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(), optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", optKey(FuzzyKMeansDriver.M_OPTION), "2.0", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new FuzzyKMeansDriver(), args); long count = HadoopUtil.countRecords(new Path(output, "clusteredPoints/part-m-00000"), conf); assertTrue(count > 0); } } }