/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.mahout.clustering.topdown.postprocessor; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.mahout.clustering.ClusteringTestUtils; import org.apache.mahout.clustering.WeightedVectorWritable; import org.apache.mahout.clustering.canopy.CanopyDriver; import org.apache.mahout.clustering.topdown.PathDirectory; import org.apache.mahout.common.DummyOutputCollector; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.common.Pair; import org.apache.mahout.common.distance.ManhattanDistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; import 
org.apache.mahout.math.VectorWritable;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.google.common.collect.Lists;

/**
 * Tests {@link ClusterOutputPostProcessor} in a top-down clustering scenario:
 * canopy-clusters a small reference data set, post-processes the resulting
 * top-level clusters into per-cluster directories, then runs canopy clustering
 * again on each of those directories.
 */
public final class ClusterOutputPostProcessorTest extends MahoutTestCase {

  /** Seven 2-D points forming two well-separated groups, near (1,1) and near (4.5,4.5). */
  private static final double[][] REFERENCE = {
      {1, 1}, {2, 1}, {1, 2}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};

  private FileSystem fs;
  private Path outputPath;
  private Configuration conf;

  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    // Assign the field directly. The original declared a shadowing local
    // "Configuration conf" here, which left the field null after setup; it only
    // worked because testTopDownClustering happened to reassign the field.
    conf = new Configuration();
    fs = FileSystem.get(conf);
  }

  /** Wraps each row of {@code raw} in a {@link VectorWritable} backed by a sparse vector. */
  private static List<VectorWritable> getPointsWritable(double[][] raw) {
    List<VectorWritable> points = Lists.newArrayList();
    for (double[] fr : raw) {
      Vector vec = new RandomAccessSparseVector(fr.length);
      vec.assign(fr);
      points.add(new VectorWritable(vec));
    }
    return points;
  }

  /**
   * Story: User wants to use cluster post processor after canopy clustering and then run
   * clustering on the output clusters.
   */
  @Test
  public void testTopDownClustering() throws Exception {
    List<VectorWritable> points = getPointsWritable(REFERENCE);
    Path pointsPath = getTestTempDirPath("points");
    conf = new Configuration();
    // Write the same points twice so each point occurs in two input files.
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file2"), fs, conf);
    outputPath = getTestTempDirPath("output");

    topLevelClustering(pointsPath, conf);

    Map<String,Path> postProcessedClusterDirectories = outputPostProcessing(conf);
    assertPostProcessedOutput(postProcessedClusterDirectories);

    bottomLevelClustering(postProcessedClusterDirectories);
  }

  /**
   * Dispatches to the per-cluster point assertions for the given top-level cluster
   * entry (cluster id {@code "0"} or {@code "1"}).
   */
  private void assertTopLevelCluster(Entry<String,Path> cluster) {
    String clusterId = cluster.getKey();
    Path clusterPath = cluster.getValue();
    try {
      if ("0".equals(clusterId)) {
        assertPointsInFirstTopLevelCluster(clusterPath);
      } else if ("1".equals(clusterId)) {
        assertPointsInSecondTopLevelCluster(clusterPath);
      }
    } catch (IOException e) {
      // Surface the cause in the failure message; the original swallowed all detail.
      Assert.fail("Exception occurred while asserting top level cluster: " + e.getMessage());
    }
  }

  /** Asserts every vector under {@code clusterPath} is one of the three points near (1,1). */
  private void assertPointsInFirstTopLevelCluster(Path clusterPath) throws IOException {
    List<Vector> vectorsInCluster = getVectorsInCluster(clusterPath);
    for (Vector vector : vectorsInCluster) {
      Assert.assertTrue(ArrayUtils.contains(
          new String[] {"{1:1.0,0:1.0}", "{1:1.0,0:2.0}", "{1:2.0,0:1.0}"},
          vector.asFormatString()));
    }
  }

  /** Asserts every vector under {@code clusterPath} is one of the four points near (4.5,4.5). */
  private void assertPointsInSecondTopLevelCluster(Path clusterPath) throws IOException {
    List<Vector> vectorsInCluster = getVectorsInCluster(clusterPath);
    for (Vector vector : vectorsInCluster) {
      Assert.assertTrue(ArrayUtils.contains(
          new String[] {"{1:4.0,0:4.0}", "{1:4.0,0:5.0}", "{1:5.0,0:4.0}", "{1:5.0,0:5.0}"},
          vector.asFormatString()));
    }
  }

  /**
   * Reads every vector from all sequence files matched by {@code clusterPath}.
   * Each reader is closed when drained (the original leaked the readers).
   */
  private List<Vector> getVectorsInCluster(Path clusterPath) throws IOException {
    Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(clusterPath));
    FileStatus[] listStatus = fs.listStatus(partFilePaths);
    List<Vector> vectors = new ArrayList<Vector>();
    for (FileStatus partFile : listStatus) {
      SequenceFile.Reader topLevelClusterReader =
          new SequenceFile.Reader(fs, partFile.getPath(), conf);
      try {
        Writable clusterIdAsKey = new LongWritable();
        VectorWritable point = new VectorWritable();
        while (topLevelClusterReader.next(clusterIdAsKey, point)) {
          vectors.add(point.get());
        }
      } finally {
        topLevelClusterReader.close();
      }
    }
    return vectors;
  }

  /**
   * Runs canopy clustering on each post-processed top-level cluster directory and
   * asserts the shape of the resulting bottom-level clusters.
   */
  private void bottomLevelClustering(Map<String,Path> postProcessedClusterDirectories)
      throws IOException, InterruptedException, ClassNotFoundException {
    for (Entry<String,Path> topLevelCluster : postProcessedClusterDirectories.entrySet()) {
      String clusterId = topLevelCluster.getKey();
      Path topLevelClusterPath = topLevelCluster.getValue();
      Path bottomLevelCluster = PathDirectory.getBottomLevelClusterPath(outputPath, clusterId);
      // Tighter thresholds (2.1/2.0) than the top level so only the first group splits again.
      CanopyDriver.run(conf, topLevelClusterPath, bottomLevelCluster,
          new ManhattanDistanceMeasure(), 2.1, 2.0, true, true);
      assertBottomLevelCluster(bottomLevelCluster);
    }
  }

  /** Asserts a bottom-level clustering produced either one or two clusters. */
  private void assertBottomLevelCluster(Path bottomLevelCluster) {
    Path clusteredPointsPath = new Path(bottomLevelCluster, "clusteredPoints");
    DummyOutputCollector<IntWritable,WeightedVectorWritable> collector =
        new DummyOutputCollector<IntWritable,WeightedVectorWritable>();

    // The key is the clusterId, the value is the weighted vector
    for (Pair<IntWritable,WeightedVectorWritable> record :
         new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
             new Path(clusteredPointsPath, "part-m-0"), conf)) {
      collector.collect(record.getFirst(), record.getSecond());
    }
    int clusterSize = collector.getKeys().size();
    // First top level cluster produces two more clusters, second top level cluster is not broken again
    assertTrue(clusterSize == 1 || clusterSize == 2);
  }

  /** Runs the top-level cluster assertions for every post-processed cluster directory. */
  private void assertPostProcessedOutput(Map<String,Path> postProcessedClusterDirectories) {
    for (Entry<String,Path> cluster : postProcessedClusterDirectories.entrySet()) {
      assertTopLevelCluster(cluster);
    }
  }

  /**
   * Runs the cluster output post processor over {@link #outputPath} and returns the
   * per-cluster directories it produced.
   */
  private Map<String,Path> outputPostProcessing(Configuration conf) throws IOException {
    ClusterOutputPostProcessor clusterOutputPostProcessor =
        new ClusterOutputPostProcessor(outputPath, outputPath, conf);
    clusterOutputPostProcessor.process();
    return clusterOutputPostProcessor.getPostProcessedClusterDirectories();
  }

  /** Canopy-clusters the input points into the top-level clusters under {@link #outputPath}. */
  private void topLevelClustering(Path pointsPath, Configuration conf)
      throws IOException, InterruptedException, ClassNotFoundException {
    CanopyDriver.run(conf, pointsPath, outputPath,
        new ManhattanDistanceMeasure(), 3.1, 2.1, true, true);
  }

}