ClusterCountReaderTest.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.topdown.postprocessor;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.google.common.collect.Lists;

public final class ClusterCountReaderTest extends MahoutTestCase {
  
  public static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
  
  private FileSystem fs;
  private Path outputPathForCanopy;
  private Path outputPathForKMeans;
  
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    Configuration conf = new Configuration();
    fs = FileSystem.get(conf);
  }
  
  public static List<VectorWritable> getPointsWritable(double[][] raw) {
    List<VectorWritable> points = Lists.newArrayList();
    for (double[] fr : raw) {
      Vector vec = new RandomAccessSparseVector(fr.length);
      vec.assign(fr);
      points.add(new VectorWritable(vec));
    }
    return points;
  }
  
  /**
   * Story: User wants to use cluster post processor after canopy clustering and then run clustering on the
   * output clusters
   */
  @Test
  public void testGetNumberOfClusters() throws Exception {
    List<VectorWritable> points = getPointsWritable(REFERENCE);
    
    Path pointsPath = getTestTempDirPath("points");
    Configuration conf = new Configuration();
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file2"), fs, conf);
    
    outputPathForCanopy = getTestTempDirPath("canopy");
    outputPathForKMeans = getTestTempDirPath("kmeans");
    
    topLevelClustering(pointsPath, conf);
    
    int numberOfClusters = ClusterCountReader.getNumberOfClusters(outputPathForKMeans, conf);
    Assert.assertEquals(2, numberOfClusters);
    verifyThatNumberOfClustersIsCorrect(conf, new Path(outputPathForKMeans, new Path("clusteredPoints")));
    
  }
  
  private void topLevelClustering(Path pointsPath, Configuration conf) throws IOException,
                                                                      InterruptedException,
                                                                      ClassNotFoundException {
    DistanceMeasure measure = new ManhattanDistanceMeasure();
    CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, true);
    Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0'
                                                                   + Cluster.FINAL_ITERATION_SUFFIX));
    KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, true);
  }
  
  private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) {
    DummyOutputCollector<IntWritable,WeightedVectorWritable> collector =
        new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
    
    // The key is the clusterId, the value is the weighted vector
    for (Pair<IntWritable,WeightedVectorWritable> record :
         new SequenceFileIterable<IntWritable,WeightedVectorWritable>(new Path(clusteredPointsPath, "part-m-0"),
                                                                      conf)) {
      collector.collect(record.getFirst(), record.getSecond());
    }
    int clusterSize = collector.getKeys().size();
    assertEquals(2, clusterSize);
  }
  
}