/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.dirichlet;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.List;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.Model;
import org.apache.mahout.clustering.dirichlet.models.DistanceMeasureClusterDistribution;
import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.MahalanobisDistanceMeasure;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;

public final class TestMapReduce extends MahoutTestCase {

  private Collection<VectorWritable> sampleData = Lists.newArrayList();

  private FileSystem fs;

  private Configuration conf;

  private void addSample(double[] values) {
    Vector v = new DenseVector(2);
    for (int j = 0; j < values.length; j++) {
      v.setQuick(j, values[j]);
    }
    sampleData.add(new VectorWritable(v));
  }

  /**
   * Generate random samples and add them to the sampleData
   *
   * @param num int number of samples to generate
   * @param mx  double x-value of the sample mean
   * @param my  double y-value of the sample mean
   * @param sd  double standard deviation of the samples
   */
  private void generateSamples(int num, double mx, double my, double sd) {
    System.out.println("Generating " + num + " samples m=[" + mx + ", " + my + "] sd=" + sd);
    for (int i = 0; i < num; i++) {
      addSample(new double[] {UncommonDistributions.rNorm(mx, sd), UncommonDistributions.rNorm(my, sd)});
    }
  }
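
  // For example, generateSamples(100, 0, 0, 1) adds 100 two-dimensional points whose
  // coordinates are independent draws from UncommonDistributions.rNorm(0, 1), i.e. a
  // spherical Gaussian cloud centered at the origin.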

  /**
   * Generate random samples with asymmetric standard deviations and add them to the sampleData
   *
   * @param num int number of samples to generate
   * @param mx  double x-value of the sample mean
   * @param my  double y-value of the sample mean
   * @param sdx double standard deviation in x of the samples
   * @param sdy double standard deviation in y of the samples
   */
  private void generateAsymmetricSamples(int num, double mx, double my, double sdx, double sdy) {
    System.out.println("Generating " + num + " samples m=[" + mx + ", " + my + "] sd=[" + sdx + ", " + sdy + ']');
    for (int i = 0; i < num; i++) {
      addSample(new double[] {UncommonDistributions.rNorm(mx, sdx), UncommonDistributions.rNorm(my, sdy)});
    }
  }

  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    conf = new Configuration();
    fs = FileSystem.get(conf);
  }

  /** Test the basic Mapper */
  @Test
  public void testMapper() throws Exception {
    generateSamples(10, 0, 0, 1);
    DirichletState state =
        new DirichletState(new GaussianClusterDistribution(new VectorWritable(new DenseVector(2))), 5, 1);
    DirichletMapper mapper = new DirichletMapper();
    mapper.setup(state);

    RecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
        DummyRecordWriter.build(mapper, conf, writer);
    for (VectorWritable v : sampleData) {
      mapper.map(null, v, context);
    }
    // Map<String, List<VectorWritable>> data = collector.getData();
    // this seed happens to produce two partitions, but they work
    // assertEquals("output size", 3, data.size());
  }

  /** Test the basic Reducer */
  @Test
  public void testReducer() throws Exception {
    generateSamples(100, 0, 0, 1);
    generateSamples(100, 2, 0, 1);
    generateSamples(100, 0, 2, 1);
    generateSamples(100, 2, 2, 1);
    DirichletState state =
        new DirichletState(new GaussianClusterDistribution(new VectorWritable(new DenseVector(2))), 20, 1);
    DirichletMapper mapper = new DirichletMapper();
    mapper.setup(state);

    DummyRecordWriter<Text, VectorWritable> mapWriter = new DummyRecordWriter<Text, VectorWritable>();
    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext =
        DummyRecordWriter.build(mapper, conf, mapWriter);
    for (VectorWritable v : sampleData) {
      mapper.map(null, v, mapContext);
    }

    DirichletReducer reducer = new DirichletReducer();
    reducer.setup(state);
    RecordWriter<Text, DirichletCluster> reduceWriter = new DummyRecordWriter<Text, DirichletCluster>();
    Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext =
        DummyRecordWriter.build(reducer, conf, reduceWriter, Text.class, VectorWritable.class);
    for (Text key : mapWriter.getKeys()) {
      reducer.reduce(new Text(key), mapWriter.getValue(key), reduceContext);
    }

    Cluster[] newModels = reducer.getNewModels();
    state.update(newModels);
  }
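
  // Taken together, the two tests above exercise one Dirichlet iteration by hand: the
  // mapper emits each input vector under the key of the cluster it was assigned to,
  // the reducer folds each key's vectors into a new posterior model, and
  // state.update(newModels) recomputes the mixture for the next pass.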

  /** Test the Mapper and Reducer in an iteration loop */
  @Test
  public void testMRIterations() throws Exception {
    generateSamples(100, 0, 0, 1);
    generateSamples(100, 2, 0, 1);
    generateSamples(100, 0, 2, 1);
    generateSamples(100, 2, 2, 1);
    DirichletState state =
        new DirichletState(new GaussianClusterDistribution(new VectorWritable(new DenseVector(2))), 20, 1.0);

    Collection<Model<VectorWritable>[]> models = Lists.newArrayList();
    for (int iteration = 0; iteration < 10; iteration++) {
      DirichletMapper mapper = new DirichletMapper();
      mapper.setup(state);
      DummyRecordWriter<Text, VectorWritable> mapWriter = new DummyRecordWriter<Text, VectorWritable>();
      Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext =
          DummyRecordWriter.build(mapper, conf, mapWriter);
      for (VectorWritable v : sampleData) {
        mapper.map(null, v, mapContext);
      }

      DirichletReducer reducer = new DirichletReducer();
      reducer.setup(state);
      RecordWriter<Text, DirichletCluster> reduceWriter = new DummyRecordWriter<Text, DirichletCluster>();
      Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext =
          DummyRecordWriter.build(reducer, conf, reduceWriter, Text.class, VectorWritable.class);
      for (Text key : mapWriter.getKeys()) {
        reducer.reduce(new Text(key), mapWriter.getValue(key), reduceContext);
      }

      Cluster[] newModels = reducer.getNewModels();
      state.update(newModels);
      models.add(newModels);
    }
    printModels(models, 0);
  }

  private static void printModels(Iterable<Model<VectorWritable>[]> results, int significant) {
    int row = 0;
    for (Model<VectorWritable>[] r : results) {
      System.out.print("sample[" + row++ + "]= ");
      for (int k = 0; k < r.length; k++) {
        Model<VectorWritable> model = r[k];
        if (model.count() > significant) {
          System.out.print("m" + k + model.toString() + ", ");
        }
      }
      System.out.println();
    }
    System.out.println();
  }

  private static void printResults(Iterable<List<DirichletCluster>> clusters, int significant) {
    int row = 0;
    for (List<DirichletCluster> r : clusters) {
      System.out.print("sample[" + row++ + "]= ");
      for (int k = 0; k < r.size(); k++) {
        Model<VectorWritable> model = r.get(k).getModel();
        if (model.count() > significant) {
          int total = (int) r.get(k).getTotalCount();
          System.out.print("m" + k + '(' + total + ')' + model.toString() + ", ");
        }
      }
      System.out.println();
    }
    System.out.println();
  }
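
  // The driver-based tests below build their argument lists the way a command line
  // would: optKey(name), a MahoutTestCase helper, yields the "--name" form of each
  // option, so the args arrays read like a shell invocation of DirichletDriver.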

  /** Test the Mapper and Reducer using the Driver in sequential execution mode */
  @Test
  public void testDriverIterationsSeq() throws Exception {
    generateSamples(100, 0, 0, 0.5);
    generateSamples(100, 2, 0, 0.2);
    generateSamples(100, 0, 2, 0.3);
    generateSamples(100, 2, 2, 1);
    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
    // Now run the driver using the run() method. Others can use runJob() as before.
    Integer maxIterations = 5;
    DistributionDescription description =
        new DistributionDescription(GaussianClusterDistribution.class.getName(),
                                    DenseVector.class.getName(),
                                    null,
                                    2);
    String[] args = {
        optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
        optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
        optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
        optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
        optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20",
        optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), maxIterations.toString(),
        optKey(DirichletDriver.ALPHA_OPTION), "1.0",
        optKey(DefaultOptionCreator.OVERWRITE_OPTION),
        optKey(DefaultOptionCreator.CLUSTERING_OPTION),
        optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD
    };
    DirichletDriver dirichletDriver = new DirichletDriver();
    dirichletDriver.setConf(conf);
    dirichletDriver.run(args);
    // and inspect results
    Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
    for (int i = 0; i <= maxIterations; i++) {
      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
    }
    printResults(clusters, 0);
  }
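
  // The mapreduce variant below runs the same five-iteration job; it omits the
  // METHOD_OPTION flag (so execution defaults to mapreduce) and launches the driver
  // through ToolRunner instead of calling run(args) directly.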

  /** Test the Mapper and Reducer using the Driver in mapreduce mode */
  @Test
  public void testDriverIterationsMR() throws Exception {
    generateSamples(100, 0, 0, 0.5);
    generateSamples(100, 2, 0, 0.2);
    generateSamples(100, 0, 2, 0.3);
    generateSamples(100, 2, 2, 1);
    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
    // Now run the driver using the run() method. Others can use runJob() as before.
    Integer maxIterations = 5;
    DistributionDescription description =
        new DistributionDescription(GaussianClusterDistribution.class.getName(),
                                    DenseVector.class.getName(),
                                    null,
                                    2);
    String[] args = {
        optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
        optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
        optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
        optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
        optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20",
        optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), maxIterations.toString(),
        optKey(DirichletDriver.ALPHA_OPTION), "1.0",
        optKey(DefaultOptionCreator.OVERWRITE_OPTION),
        optKey(DefaultOptionCreator.CLUSTERING_OPTION)
    };
    ToolRunner.run(new Configuration(), new DirichletDriver(), args);
    // and inspect results
    Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
    for (int i = 0; i <= maxIterations; i++) {
      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
    }
    printResults(clusters, 0);
  }

  /** Test the Mapper and Reducer using the Driver with four separate input datasets */
  @Test
  public void testDriverMnRIterations() throws Exception {
    generate4Datasets();
    // Now run the driver
    int maxIterations = 3;
    DistributionDescription description =
        new DistributionDescription(GaussianClusterDistribution.class.getName(),
                                    DenseVector.class.getName(),
                                    null,
                                    2);
    Configuration conf = new Configuration();
    DirichletDriver.run(conf,
                        getTestTempDirPath("input"),
                        getTestTempDirPath("output"),
                        description,
                        20,
                        maxIterations,
                        1.0,
                        false,
                        true,
                        0,
                        false);
    // and inspect results
    Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
    for (int i = 0; i <= maxIterations; i++) {
      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
    }
    printResults(clusters, 0);
  }
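
  // The two Mahalanobis tests below swap in DistanceMeasureClusterDistribution with a
  // MahalanobisDistanceMeasure. The measure is configured out-of-band: its mean vector
  // and inverse covariance matrix are serialized to files whose paths are published
  // under "MahalanobisDistanceMeasure.*" keys in the Configuration handed to the driver.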

  /** Test the Driver in sequential execution mode using MahalanobisDistanceMeasure */
  @Test
  public void testDriverIterationsMahalanobisSeq() throws Exception {
    generateAsymmetricSamples(100, 0, 0, 0.5, 3.0);
    generateAsymmetricSamples(100, 0, 3, 0.3, 4.0);
    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
    // Now run the driver using the run() method. Others can use runJob() as before.
    MahalanobisDistanceMeasure measure = new MahalanobisDistanceMeasure();
    DistributionDescription description =
        new DistributionDescription(DistanceMeasureClusterDistribution.class.getName(),
                                    DenseVector.class.getName(),
                                    MahalanobisDistanceMeasure.class.getName(),
                                    2);

    Vector meanVector = new DenseVector(new double[] {0.0, 0.0});
    measure.setMeanVector(meanVector);
    Matrix m = new DenseMatrix(new double[][] {{0.5, 0.0}, {0.0, 4.0}});
    measure.setCovarianceMatrix(m);

    Path inverseCovarianceFile =
        new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureInverseCovarianceFile");
    conf.set("MahalanobisDistanceMeasure.inverseCovarianceFile", inverseCovarianceFile.toString());
    FileSystem fs = FileSystem.get(inverseCovarianceFile.toUri(), conf);
    MatrixWritable inverseCovarianceMatrix = new MatrixWritable(measure.getInverseCovarianceMatrix());
    DataOutputStream out = fs.create(inverseCovarianceFile);
    try {
      inverseCovarianceMatrix.write(out);
    } finally {
      Closeables.closeQuietly(out);
    }

    Path meanVectorFile = new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureMeanVectorFile");
    conf.set("MahalanobisDistanceMeasure.meanVectorFile", meanVectorFile.toString());
    fs = FileSystem.get(meanVectorFile.toUri(), conf);
    VectorWritable meanVectorWritable = new VectorWritable(meanVector);
    out = fs.create(meanVectorFile);
    try {
      meanVectorWritable.write(out);
    } finally {
      Closeables.closeQuietly(out);
    }

    // keep the "maxtrixClass" spelling as-is: it needs to match the key that
    // MahalanobisDistanceMeasure looks up when it configures itself
    conf.set("MahalanobisDistanceMeasure.maxtrixClass", MatrixWritable.class.getName());
    conf.set("MahalanobisDistanceMeasure.vectorClass", VectorWritable.class.getName());

    Integer maxIterations = 5;
    String[] args = {
        optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
        optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
        optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), description.getDistanceMeasure(),
        optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
        optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20",
        optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), maxIterations.toString(),
        optKey(DirichletDriver.ALPHA_OPTION), "1.0",
        optKey(DefaultOptionCreator.OVERWRITE_OPTION),
        optKey(DefaultOptionCreator.CLUSTERING_OPTION),
        optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD
    };
    DirichletDriver dirichletDriver = new DirichletDriver();
    dirichletDriver.setConf(conf);
    dirichletDriver.run(args);
    // and inspect results
    Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
    for (int i = 0; i <= maxIterations; i++) {
      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
    }
    printResults(clusters, 0);
  }
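
  // The mapreduce variant below repeats the same measure setup verbatim; only the
  // launch path differs (ToolRunner with the shared conf, no sequential METHOD_OPTION).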

  /** Test the Driver in mapreduce mode using MahalanobisDistanceMeasure */
  @Test
  public void testDriverIterationsMahalanobisMR() throws Exception {
    generateAsymmetricSamples(100, 0, 0, 0.5, 3.0);
    generateAsymmetricSamples(100, 0, 3, 0.3, 4.0);
    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
    // Now run the driver using the run() method. Others can use runJob() as before.
    MahalanobisDistanceMeasure measure = new MahalanobisDistanceMeasure();
    DistributionDescription description =
        new DistributionDescription(DistanceMeasureClusterDistribution.class.getName(),
                                    DenseVector.class.getName(),
                                    MahalanobisDistanceMeasure.class.getName(),
                                    2);

    Vector meanVector = new DenseVector(new double[] {0.0, 0.0});
    measure.setMeanVector(meanVector);
    Matrix m = new DenseMatrix(new double[][] {{0.5, 0.0}, {0.0, 4.0}});
    measure.setCovarianceMatrix(m);

    Path inverseCovarianceFile =
        new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureInverseCovarianceFile");
    conf.set("MahalanobisDistanceMeasure.inverseCovarianceFile", inverseCovarianceFile.toString());
    FileSystem fs = FileSystem.get(inverseCovarianceFile.toUri(), conf);
    MatrixWritable inverseCovarianceMatrix = new MatrixWritable(measure.getInverseCovarianceMatrix());
    DataOutputStream out = fs.create(inverseCovarianceFile);
    try {
      inverseCovarianceMatrix.write(out);
    } finally {
      Closeables.closeQuietly(out);
    }

    Path meanVectorFile = new Path(getTestTempDirPath("mahalanobis"), "MahalanobisDistanceMeasureMeanVectorFile");
    conf.set("MahalanobisDistanceMeasure.meanVectorFile", meanVectorFile.toString());
    fs = FileSystem.get(meanVectorFile.toUri(), conf);
    VectorWritable meanVectorWritable = new VectorWritable(meanVector);
    out = fs.create(meanVectorFile);
    try {
      meanVectorWritable.write(out);
    } finally {
      Closeables.closeQuietly(out);
    }

    conf.set("MahalanobisDistanceMeasure.maxtrixClass", MatrixWritable.class.getName());
    conf.set("MahalanobisDistanceMeasure.vectorClass", VectorWritable.class.getName());

    Integer maxIterations = 5;
    String[] args = {
        optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("input").toString(),
        optKey(DefaultOptionCreator.OUTPUT_OPTION), getTestTempDirPath("output").toString(),
        optKey(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION), description.getModelFactory(),
        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), description.getDistanceMeasure(),
        optKey(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION), description.getModelPrototype(),
        optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20",
        optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), maxIterations.toString(),
        optKey(DirichletDriver.ALPHA_OPTION), "1.0",
        optKey(DefaultOptionCreator.OVERWRITE_OPTION),
        optKey(DefaultOptionCreator.CLUSTERING_OPTION)
    };
    Tool dirichletDriver = new DirichletDriver();
    dirichletDriver.setConf(conf);
    ToolRunner.run(conf, dirichletDriver, args);
    // and inspect results
    Collection<List<DirichletCluster>> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, description.toString());
    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
    for (int i = 0; i <= maxIterations; i++) {
      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
    }
    printResults(clusters, 0);
  }
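
  // Unlike the other tests, which write a single data.txt, this helper spreads the
  // four sample clouds across four input files, giving the mapreduce jobs more than
  // one input split to process.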
getTestTempFilePath("input/data3.txt"), fs, conf); sampleData = Lists.newArrayList(); generateSamples(500, 2, 2, 1); ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data4.txt"), fs, conf); } }