/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.dirichlet;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.Model;
import org.apache.mahout.clustering.dirichlet.models.DistanceMeasureClusterDistribution;
import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.MahalanobisDistanceMeasure;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
/**
 * Tests for the Dirichlet-process clustering map/reduce pipeline. Covers the
 * {@code DirichletMapper} and {@code DirichletReducer} in isolation, a manual
 * mapper/reducer iteration loop, and end-to-end runs of {@code DirichletDriver}
 * in both sequential and mapreduce execution modes (including runs configured
 * with a {@link MahalanobisDistanceMeasure}). Sample points are synthetic 2-d
 * Gaussian draws; results are printed rather than asserted in most tests.
 */
public final class TestMapReduce extends MahoutTestCase {
// Synthetic 2-d sample points accumulated by the generate*Samples helpers.
private Collection<VectorWritable> sampleData = Lists.newArrayList();
// FileSystem handle for writing test input files; initialized in setUp().
private FileSystem fs;
// Configuration shared by file I/O and driver runs; initialized in setUp().
private Configuration conf;
/**
 * Wrap the given coordinates in a dense vector of cardinality 2 and append it
 * to {@link #sampleData}. All callers pass exactly two values.
 *
 * @param values coordinates to store; values.length must not exceed 2
 */
private void addSample(double[] values) {
Vector v = new DenseVector(2);
for (int j = 0; j < values.length; j++) {
v.setQuick(j, values[j]);
}
sampleData.add(new VectorWritable(v));
}
/**
 * Generate random samples and add them to the sampleData
 *
 * @param num
 * int number of samples to generate
 * @param mx
 * double x-value of the sample mean
 * @param my
 * double y-value of the sample mean
 * @param sd
 * double standard deviation of the samples
 */
private void generateSamples(int num, double mx, double my, double sd) {
System.out.println("Generating " + num + " samples m=[" + mx + ", " + my + "] sd=" + sd);
for (int i = 0; i < num; i++) {
// Each coordinate is an independent normal draw around the requested mean.
addSample(new double[] { UncommonDistributions.rNorm(mx, sd), UncommonDistributions.rNorm(my, sd) });
}
}
/**
 * Generate random samples with asymmetric standard deviations and add them to the sampleData
 *
 * @param num
 * int number of samples to generate
 * @param mx
 * double x-value of the sample mean
 * @param my
 * double y-value of the sample mean
 * @param sdx
 * double standard deviation in x of the samples
 * @param sdy
 * double standard deviation in y of the samples
 */
private void generateAsymmetricSamples(int num, double mx, double my, double sdx, double sdy) {
System.out.println("Generating " + num + " samples m=[" + mx + ", " + my + "] sd=[" + sdx + ", " + sdy + ']');
for (int i = 0; i < num; i++) {
addSample(new double[] { UncommonDistributions.rNorm(mx, sdx), UncommonDistributions.rNorm(my, sdy) });
}
}
// Creates a fresh Configuration and local FileSystem before each test.
@Override
@Before
public void setUp() throws Exception {
super.setUp();
conf = new Configuration();
fs = FileSystem.get(conf);
}
/** Test the basic Mapper */
@Test
public void testMapper() throws Exception {
generateSamples(10, 0, 0, 1);
// 5 initial Gaussian cluster models over 2-d vectors, alpha_0 = 1.
DirichletState state =
new DirichletState(new GaussianClusterDistribution(new VectorWritable(new DenseVector(2))), 5, 1);
DirichletMapper mapper = new DirichletMapper();
mapper.setup(state);
RecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
Mapper<WritableComparable<?>,VectorWritable,Text,VectorWritable>.Context context =
DummyRecordWriter.build(mapper, conf, writer);
// Smoke test: map every sample; no assertions are made on the output.
for (VectorWritable v : sampleData) {
mapper.map(null, v, context);
}
// Map<String, List<VectorWritable>> data = collector.getData();
// this seed happens to produce two partitions, but they work
// assertEquals("output size", 3, data.size());
}
/** Test the basic Reducer */
@Test
public void testReducer() throws Exception {
// 400 points drawn from four Gaussians centered on the corners of a 2x2 square.
generateSamples(100, 0, 0, 1);
generateSamples(100, 2, 0, 1);
generateSamples(100, 0, 2, 1);
generateSamples(100, 2, 2, 1);
DirichletState state =
new DirichletState(new GaussianClusterDistribution(new VectorWritable(new DenseVector(2))), 20, 1);
DirichletMapper mapper = new DirichletMapper();
mapper.setup(state);
DummyRecordWriter<Text, VectorWritable> mapWriter = new DummyRecordWriter<Text, VectorWritable>();
Mapper<WritableComparable<?>,VectorWritable,Text,VectorWritable>.Context mapContext =
DummyRecordWriter.build(mapper, conf, mapWriter);
for (VectorWritable v : sampleData) {
mapper.map(null, v, mapContext);
}
// Feed the captured map output through the reducer, one key at a time.
DirichletReducer reducer = new DirichletReducer();
reducer.setup(state);
RecordWriter<Text, DirichletCluster> reduceWriter = new DummyRecordWriter<Text, DirichletCluster>();
Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext =
DummyRecordWriter.build(reducer, conf, reduceWriter, Text.class, VectorWritable.class);
for (Text key : mapWriter.getKeys()) {
reducer.reduce(new Text(key), mapWriter.getValue(key), reduceContext);
}
// Posterior models computed by the reducer update the Dirichlet state;
// success here is simply completing without an exception.
Cluster[] newModels = reducer.getNewModels();
state.update(newModels);
}
/** Test the Mapper and Reducer in an iteration loop */
@Test
public void testMRIterations() throws Exception {
generateSamples(100, 0, 0, 1);
generateSamples(100, 2, 0, 1);
generateSamples(100, 0, 2, 1);
generateSamples(100, 2, 2, 1);
DirichletState state =
new DirichletState(new GaussianClusterDistribution(new VectorWritable(new DenseVector(2))), 20, 1.0);
// Collects the model array produced by each of the 10 manual iterations.
Collection<Model<VectorWritable>[]> models = Lists.newArrayList();
for (int iteration = 0; iteration < 10; iteration++) {
// Fresh mapper/reducer per iteration, each seeded with the evolving state.
DirichletMapper mapper = new DirichletMapper();
mapper.setup(state);
DummyRecordWriter<Text, VectorWritable> mapWriter = new DummyRecordWriter<Text, VectorWritable>();
Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext =
DummyRecordWriter.build(mapper, conf, mapWriter);
for (VectorWritable v : sampleData) {
mapper.map(null, v, mapContext);
}
DirichletReducer reducer = new DirichletReducer();
reducer.setup(state);
RecordWriter<Text, DirichletCluster> reduceWriter = new DummyRecordWriter<Text, DirichletCluster>();
Reducer<Text,VectorWritable, Text,DirichletCluster>.Context reduceContext =
DummyRecordWriter.build(reducer, conf, reduceWriter, Text.class, VectorWritable.class);
for (Text key : mapWriter.getKeys()) {
reducer.reduce(new Text(key), mapWriter.getValue(key), reduceContext);
}
Cluster[] newModels = reducer.getNewModels();
state.update(newModels);
models.add(newModels);
}
printModels(models, 0);
}
/**
 * Print, one row per iteration, every model whose point count exceeds the
 * given threshold. Output goes to stdout for manual inspection only.
 *
 * @param results one model array per iteration
 * @param significant minimum count (exclusive) for a model to be printed
 */
private static void printModels(Iterable<Model<VectorWritable>[]> results, int significant) {
int row = 0;
for (Model<VectorWritable>[] r : results) {
System.out.print("sample[" + row++ + "]= ");
for (int k = 0; k < r.length; k++) {
Model<VectorWritable> model = r[k];
if (model.count() > significant) {
System.out.print("m" + k + model.toString() + ", ");
}
}
System.out.println();
}
System.out.println();
}
/**
 * Print, one row per iteration, every cluster whose model count exceeds the
 * given threshold, including its (truncated) total observation count.
 *
 * @param clusters one cluster list per iteration
 * @param significant minimum count (exclusive) for a cluster to be printed
 */
private static void printResults(Iterable<List<DirichletCluster>> clusters, int significant) {
int row = 0;
for (List<DirichletCluster> r : clusters) {
System.out.print("sample[" + row++ + "]= ");
for (int k = 0; k < r.size(); k++) {
Model<VectorWritable> model = r.get(k).getModel();
if (model.count() > significant) {
int total = (int) r.get(k).getTotalCount();
System.out.print("m" + k + '(' + total + ')' + model.toString() + ", ");
}
}
System.out.println();
}
System.out.println();
}
/** Test the Mapper and Reducer using the Driver in sequential execution mode */
@Test
public void testDriverIterationsSeq() throws Exception {
generateSamples(100, 0, 0, 0.5);
generateSamples(100, 2, 0, 0.2);
generateSamples(100, 0, 2, 0.3);
generateSamples(100, 2, 2, 1);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
// Now run the driver using the run() method. Others can use runJob() as before
Integer maxIterations = 5;
DistributionDescription description =
new DistributionDescription(GaussianClusterDistribution.class.getName(),
DenseVector.class.getName(),
null,
2);
// Command-line style invocation: 20 clusters, alpha 1.0, sequential method.
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
maxIterations.toString(), optKey(DirichletDriver.ALPHA_OPTION), "1.0",
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
DirichletDriver dirichletDriver = new DirichletDriver();
dirichletDriver.setConf(conf);
dirichletDriver.run(args);
// and inspect results
Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
// Fresh local Configuration (deliberately shadows the conf field) used only
// to read the persisted cluster state back via DirichletMapper.
Configuration conf = new Configuration();
conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
// <= is intentional: reads clusters-0 (the prior) through clusters-maxIterations.
for (int i = 0; i <= maxIterations; i++) {
conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
printResults(clusters, 0);
}
/** Test the Mapper and Reducer using the Driver in mapreduce mode */
@Test
public void testDriverIterationsMR() throws Exception {
generateSamples(100, 0, 0, 0.5);
generateSamples(100, 2, 0, 0.2);
generateSamples(100, 0, 2, 0.3);
generateSamples(100, 2, 2, 1);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
// Now run the driver using the run() method. Others can use runJob() as before
Integer maxIterations = 5;
DistributionDescription description =
new DistributionDescription(GaussianClusterDistribution.class.getName(),
DenseVector.class.getName(),
null,
2);
// Same options as the sequential test but without the --method argument,
// so the driver defaults to mapreduce execution.
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
maxIterations.toString(), optKey(DirichletDriver.ALPHA_OPTION), "1.0",
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.CLUSTERING_OPTION)};
ToolRunner.run(new Configuration(), new DirichletDriver(), args);
// and inspect results
Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
// Fresh local Configuration (deliberately shadows the conf field) for state read-back.
Configuration conf = new Configuration();
conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
// <= is intentional: reads clusters-0 (the prior) through clusters-maxIterations.
for (int i = 0; i <= maxIterations; i++) {
conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
printResults(clusters, 0);
}
/** Test the Mapper and Reducer using the Driver */
@Test
public void testDriverMnRIterations() throws Exception {
// Writes four separate input files so the job sees multiple splits.
generate4Datasets();
// Now run the driver
int maxIterations = 3;
DistributionDescription description =
new DistributionDescription(GaussianClusterDistribution.class.getName(),
DenseVector.class.getName(),
null,
2);
Configuration conf = new Configuration();
// Programmatic driver invocation (static run): 20 clusters, alpha 1.0,
// runClustering=false, emitMostLikely=true, threshold 0, runSequential=false.
// NOTE(review): flag meanings inferred from the other tests' CLI options —
// confirm against DirichletDriver.run's signature.
DirichletDriver.run(conf,
getTestTempDirPath("input"),
getTestTempDirPath("output"),
description,
20,
maxIterations,
1.0,
false,
true,
0,
false);
// and inspect results
Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
// <= is intentional: reads clusters-0 (the prior) through clusters-maxIterations.
for (int i = 0; i <= maxIterations; i++) {
conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
printResults(clusters, 0);
}
/** Test the Driver in sequential execution mode using MahalanobisDistanceMeasure */
@Test
public void testDriverIterationsMahalanobisSeq() throws Exception {
generateAsymmetricSamples(100, 0, 0, 0.5, 3.0);
generateAsymmetricSamples(100, 0, 3, 0.3, 4.0);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
// Now run the driver using the run() method. Others can use runJob() as before
MahalanobisDistanceMeasure measure = new MahalanobisDistanceMeasure();
DistributionDescription description =
new DistributionDescription(DistanceMeasureClusterDistribution.class.getName(),
DenseVector.class.getName(),
MahalanobisDistanceMeasure.class.getName(),
2);
Vector meanVector = new DenseVector(new double[] { 0.0, 0.0 });
measure.setMeanVector(meanVector);
// Diagonal covariance matching the asymmetric sample generation above.
Matrix m= new DenseMatrix(new double [][] {{0.5, 0.0}, {0.0, 4.0}});
measure.setCovarianceMatrix(m);
// Persist the inverse covariance matrix where the measure expects to load it.
Path inverseCovarianceFile =
new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureInverseCovarianceFile");
conf.set("MahalanobisDistanceMeasure.inverseCovarianceFile", inverseCovarianceFile.toString());
// Local fs deliberately shadows the field: resolved from the file's URI.
FileSystem fs = FileSystem.get(inverseCovarianceFile.toUri(), conf);
MatrixWritable inverseCovarianceMatrix = new MatrixWritable(measure.getInverseCovarianceMatrix());
DataOutputStream out = fs.create(inverseCovarianceFile);
try {
inverseCovarianceMatrix.write(out);
} finally {
Closeables.closeQuietly(out);
}
// Persist the mean vector the same way.
Path meanVectorFile = new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureMeanVectorFile");
conf.set("MahalanobisDistanceMeasure.meanVectorFile", meanVectorFile.toString());
fs = FileSystem.get(meanVectorFile.toUri(), conf);
VectorWritable meanVectorWritable = new VectorWritable(meanVector);
out = fs.create(meanVectorFile);
try {
meanVectorWritable.write(out);
} finally {
Closeables.closeQuietly(out);
}
// NOTE(review): "maxtrixClass" looks like a typo for "matrixClass" — it may
// intentionally mirror the key MahalanobisDistanceMeasure reads; confirm
// against that class before changing either side.
conf.set("MahalanobisDistanceMeasure.maxtrixClass", MatrixWritable.class.getName());
conf.set("MahalanobisDistanceMeasure.vectorClass", VectorWritable.class.getName());
Integer maxIterations = 5;
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), description.getDistanceMeasure(),
optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
maxIterations.toString(), optKey(DirichletDriver.ALPHA_OPTION), "1.0",
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
DirichletDriver dirichletDriver = new DirichletDriver();
dirichletDriver.setConf(conf);
dirichletDriver.run(args);
// and inspect results
Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
// Fresh local Configuration (deliberately shadows the conf field) for state read-back.
Configuration conf = new Configuration();
conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
// <= is intentional: reads clusters-0 (the prior) through clusters-maxIterations.
for (int i = 0; i <= maxIterations; i++) {
conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
printResults(clusters, 0);
}
/** Test the Mapper and Reducer using the Driver in mapreduce mode */
@Test
public void testDriverIterationsMahalanobisMR() throws Exception {
generateAsymmetricSamples(100, 0, 0, 0.5, 3.0);
generateAsymmetricSamples(100, 0, 3, 0.3, 4.0);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
// Now run the driver using the run() method. Others can use runJob() as before
MahalanobisDistanceMeasure measure = new MahalanobisDistanceMeasure();
DistributionDescription description =
new DistributionDescription(DistanceMeasureClusterDistribution.class.getName(),
DenseVector.class.getName(),
MahalanobisDistanceMeasure.class.getName(),
2);
Vector meanVector = new DenseVector(new double[]{0.0, 0.0});
measure.setMeanVector(meanVector);
// Diagonal covariance matching the asymmetric sample generation above.
Matrix m = new DenseMatrix(new double [][] {{0.5, 0.0}, {0.0, 4.0}});
measure.setCovarianceMatrix(m);
// Persist the inverse covariance matrix where the measure expects to load it.
Path inverseCovarianceFile =
new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureInverseCovarianceFile");
conf.set("MahalanobisDistanceMeasure.inverseCovarianceFile", inverseCovarianceFile.toString());
// Local fs deliberately shadows the field: resolved from the file's URI.
FileSystem fs = FileSystem.get(inverseCovarianceFile.toUri(), conf);
MatrixWritable inverseCovarianceMatrix = new MatrixWritable(measure.getInverseCovarianceMatrix());
DataOutputStream out = fs.create(inverseCovarianceFile);
try {
inverseCovarianceMatrix.write(out);
} finally {
Closeables.closeQuietly(out);
}
// Persist the mean vector the same way.
Path meanVectorFile = new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureMeanVectorFile");
conf.set("MahalanobisDistanceMeasure.meanVectorFile", meanVectorFile.toString());
fs = FileSystem.get(meanVectorFile.toUri(), conf);
VectorWritable meanVectorWritable = new VectorWritable(meanVector);
out = fs.create(meanVectorFile);
try {
meanVectorWritable.write(out);
} finally {
Closeables.closeQuietly(out);
}
// NOTE(review): "maxtrixClass" looks like a typo for "matrixClass" — it may
// intentionally mirror the key MahalanobisDistanceMeasure reads; confirm
// against that class before changing either side.
conf.set("MahalanobisDistanceMeasure.maxtrixClass", MatrixWritable.class.getName());
conf.set("MahalanobisDistanceMeasure.vectorClass", VectorWritable.class.getName());
Integer maxIterations = 5;
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), description.getDistanceMeasure(),
optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
maxIterations.toString(), optKey(DirichletDriver.ALPHA_OPTION), "1.0",
optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.CLUSTERING_OPTION)};
// Run through ToolRunner with the prepared conf (no --method, so mapreduce mode).
Tool dirichletDriver = new DirichletDriver();
dirichletDriver.setConf(conf);
ToolRunner.run(conf, dirichletDriver, args);
// and inspect results
Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
// Fresh local Configuration (deliberately shadows the conf field) for state read-back.
Configuration conf = new Configuration();
conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
// <= is intentional: reads clusters-0 (the prior) through clusters-maxIterations.
for (int i = 0; i <= maxIterations; i++) {
conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
printResults(clusters, 0);
}
/**
 * Write four separate 500-point sample files (one per Gaussian) into the
 * test input directory, resetting {@link #sampleData} between files.
 *
 * @throws IOException if writing any of the point files fails
 */
private void generate4Datasets() throws IOException {
generateSamples(500, 0, 0, 0.5);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data1.txt"), fs, conf);
sampleData = Lists.newArrayList();
generateSamples(500, 2, 0, 0.2);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data2.txt"), fs, conf);
sampleData = Lists.newArrayList();
generateSamples(500, 0, 2, 0.3);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data3.txt"), fs, conf);
sampleData = Lists.newArrayList();
generateSamples(500, 2, 2, 1);
ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data4.txt"), fs, conf);
}
}