/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.mahout.clustering.topdown.postprocessor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.topdown.PathDirectory;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
/**
 * Exercises {@link ClusterOutputPostProcessor} in a top-down clustering scenario: the reference points
 * are clustered with {@link CanopyDriver}, the output is post processed into one directory per top level
 * cluster, and each of those directories is then clustered again.
 */
public final class ClusterOutputPostProcessorTest extends MahoutTestCase {

  /** Two visually separate groups of 2-d points: three around (1,1) and four around (4.5,4.5). */
  private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};

  private FileSystem fs;

  private Path outputPath;

  private Configuration conf;

  /**
   * Creates the configuration shared by the whole test and the file system derived from it.
   */
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    // Assign the field (not a shadowing local) so that fs and conf always belong together and
    // conf is never null when a helper reads it.
    conf = new Configuration();
    fs = FileSystem.get(conf);
  }

  /**
   * Wraps every row of {@code raw} in a {@link VectorWritable} backed by a
   * {@link RandomAccessSparseVector} of the same length.
   *
   * @param raw the points, one coordinate array per point
   * @return the wrapped points, in the same order as {@code raw}
   */
  private static List<VectorWritable> getPointsWritable(double[][] raw) {
    List<VectorWritable> points = Lists.newArrayList();
    for (double[] fr : raw) {
      Vector vec = new RandomAccessSparseVector(fr.length);
      vec.assign(fr);
      points.add(new VectorWritable(vec));
    }
    return points;
  }

  /**
   * Story: User wants to use cluster post processor after canopy clustering and then run clustering on the
   * output clusters.
   */
  @Test
  public void testTopDownClustering() throws Exception {
    List<VectorWritable> points = getPointsWritable(REFERENCE);

    // The same points are written twice, into two separate input files.
    Path pointsPath = getTestTempDirPath("points");
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file2"), fs, conf);

    outputPath = getTestTempDirPath("output");

    topLevelClustering(pointsPath, conf);

    Map<String,Path> postProcessedClusterDirectories = outputPostProcessing(conf);

    assertPostProcessedOutput(postProcessedClusterDirectories);

    bottomLevelClustering(postProcessedClusterDirectories);
  }

  /**
   * Checks the contents of one post processed cluster directory. Only the cluster ids "0" and "1" are
   * verified; a directory with any other id is not inspected.
   *
   * @param cluster mapping of cluster id to the directory holding that cluster's points
   * @throws IOException if the cluster's files cannot be read; propagated so the underlying cause shows
   *         up in the test report
   */
  private void assertTopLevelCluster(Entry<String,Path> cluster) throws IOException {
    String clusterId = cluster.getKey();
    Path clusterPath = cluster.getValue();

    if ("0".equals(clusterId)) {
      assertPointsInFirstTopLevelCluster(clusterPath);
    } else if ("1".equals(clusterId)) {
      assertPointsInSecondTopLevelCluster(clusterPath);
    }
  }

  /**
   * Asserts that every vector stored under {@code clusterPath} is one of the three reference points
   * around (1,1).
   *
   * @param clusterPath directory of the first top level cluster
   * @throws IOException if the cluster's files cannot be read
   */
  private void assertPointsInFirstTopLevelCluster(Path clusterPath) throws IOException {
    String[] expectedVectors = {"{1:1.0,0:1.0}", "{1:1.0,0:2.0}", "{1:2.0,0:1.0}"};
    List<Vector> vectorsInCluster = getVectorsInCluster(clusterPath);
    for (Vector vector : vectorsInCluster) {
      Assert.assertTrue(ArrayUtils.contains(expectedVectors, vector.asFormatString()));
    }
  }

  /**
   * Asserts that every vector stored under {@code clusterPath} is one of the four reference points
   * around (4.5,4.5).
   *
   * @param clusterPath directory of the second top level cluster
   * @throws IOException if the cluster's files cannot be read
   */
  private void assertPointsInSecondTopLevelCluster(Path clusterPath) throws IOException {
    String[] expectedVectors = {"{1:4.0,0:4.0}", "{1:4.0,0:5.0}", "{1:5.0,0:4.0}", "{1:5.0,0:5.0}"};
    List<Vector> vectorsInCluster = getVectorsInCluster(clusterPath);
    for (Vector vector : vectorsInCluster) {
      Assert.assertTrue(ArrayUtils.contains(expectedVectors, vector.asFormatString()));
    }
  }

  /**
   * Reads all vectors from the sequence files found under {@code clusterPath}.
   *
   * @param clusterPath path (glob patterns allowed) of a post processed cluster directory
   * @return the vectors of every record in every file listed under the matching paths
   * @throws IOException if listing or reading the files fails
   */
  private List<Vector> getVectorsInCluster(Path clusterPath) throws IOException {
    Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(clusterPath));
    FileStatus[] listStatus = fs.listStatus(partFilePaths);
    List<Vector> vectors = new ArrayList<Vector>();
    for (FileStatus partFile : listStatus) {
      SequenceFile.Reader topLevelClusterReader = new SequenceFile.Reader(fs, partFile.getPath(), conf);
      try {
        Writable clusterIdAsKey = new LongWritable();
        VectorWritable point = new VectorWritable();
        while (topLevelClusterReader.next(clusterIdAsKey, point)) {
          vectors.add(point.get());
        }
      } finally {
        // Release the underlying stream even when reading fails half way through.
        topLevelClusterReader.close();
      }
    }
    return vectors;
  }

  /**
   * Runs canopy clustering on every post processed top level cluster and verifies the outcome of each run.
   *
   * @param postProcessedClusterDirectories mapping of top level cluster id to that cluster's directory
   */
  private void bottomLevelClustering(Map<String,Path> postProcessedClusterDirectories) throws IOException,
                                                                                        InterruptedException,
                                                                                        ClassNotFoundException {
    for (Entry<String,Path> topLevelCluster : postProcessedClusterDirectories.entrySet()) {
      String clusterId = topLevelCluster.getKey();
      Path topLevelclusterPath = topLevelCluster.getValue();

      Path bottomLevelCluster = PathDirectory.getBottomLevelClusterPath(outputPath, clusterId);
      // NOTE(review): 2.1 and 2.0 are presumably the canopy T1/T2 thresholds and the two flags
      // "run clustering" / "run sequential" - confirm against CanopyDriver.run.
      CanopyDriver.run(conf, topLevelclusterPath, bottomLevelCluster, new ManhattanDistanceMeasure(), 2.1,
        2.0, true, true);
      assertBottomLevelCluster(bottomLevelCluster);
    }
  }

  /**
   * Asserts that the clustered points written below {@code bottomLevelCluster} belong to either one or
   * two distinct clusters.
   *
   * @param bottomLevelCluster output directory of one bottom level clustering run
   */
  private void assertBottomLevelCluster(Path bottomLevelCluster) {
    Path clusteredPointsPath = new Path(bottomLevelCluster, "clusteredPoints");

    DummyOutputCollector<IntWritable,WeightedVectorWritable> collector =
        new DummyOutputCollector<IntWritable,WeightedVectorWritable>();

    // The key is the clusterId, the value is the weighted vector
    for (Pair<IntWritable,WeightedVectorWritable> record :
         new SequenceFileIterable<IntWritable,WeightedVectorWritable>(new Path(clusteredPointsPath, "part-m-0"),
                                                                      conf)) {
      collector.collect(record.getFirst(), record.getSecond());
    }
    int clusterSize = collector.getKeys().size();
    // First top level cluster produces two more clusters, second top level cluster is not broken again
    assertTrue(clusterSize == 1 || clusterSize == 2);
  }

  /**
   * Verifies every post processed top level cluster directory.
   *
   * @param postProcessedClusterDirectories mapping of top level cluster id to that cluster's directory
   * @throws IOException if a cluster's files cannot be read
   */
  private void assertPostProcessedOutput(Map<String,Path> postProcessedClusterDirectories) throws IOException {
    for (Entry<String,Path> cluster : postProcessedClusterDirectories.entrySet()) {
      assertTopLevelCluster(cluster);
    }
  }

  /**
   * Runs {@link ClusterOutputPostProcessor} with {@code outputPath} as both of its path arguments.
   *
   * @param conf configuration handed to the post processor
   * @return the post processed cluster directories reported by the post processor
   * @throws IOException if post processing fails
   */
  private Map<String,Path> outputPostProcessing(Configuration conf) throws IOException {
    ClusterOutputPostProcessor clusterOutputPostProcessor = new ClusterOutputPostProcessor(outputPath,
        outputPath, conf);
    clusterOutputPostProcessor.process();
    return clusterOutputPostProcessor.getPostProcessedClusterDirectories();
  }

  /**
   * Runs the first (top level) canopy clustering over the input points, writing to {@code outputPath}.
   *
   * @param pointsPath directory containing the input point files
   * @param conf configuration handed to the driver
   */
  private void topLevelClustering(Path pointsPath, Configuration conf) throws IOException,
                                                                      InterruptedException,
                                                                      ClassNotFoundException {
    // NOTE(review): 3.1 and 2.1 are presumably the canopy T1/T2 thresholds - confirm against CanopyDriver.run.
    CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, true, true);
  }

}