/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.topdown.postprocessor;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.VectorWritable;
/**
 * Post processes the output of clustering algorithms and groups the clustered points into their respective
 * clusters. Ideal for top-down clustering, but usable whenever clustering output needs to be grouped by
 * cluster.
 */
public class ClusterOutputPostProcessorDriver extends AbstractJob {

  /**
   * CLI entry point to run the clustering post processor. The input to the post processor is the output
   * path that was specified to the clustering algorithm.
   *
   * @return 0 on success, -1 if the command-line arguments could not be parsed.
   */
  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    if (parseArguments(args) == null) {
      return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    if (getConf() == null) {
      setConf(new Configuration());
    }
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
        DefaultOptionCreator.SEQUENTIAL_METHOD);
    run(input, output, runSequential);
    return 0;
  }

  /**
   * Constructor to be used by the ToolRunner.
   */
  private ClusterOutputPostProcessorDriver() {}

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new ClusterOutputPostProcessorDriver(), args);
  }

  /**
   * Post processes the output of clustering algorithms and groups them into respective clusters. Each
   * cluster's vectors are written into a directory named after its clusterId.
   *
   * @param input
   *          The output path provided to the clustering algorithm, which is to be post processed. Hint: the
   *          path of the directory containing clusters-*-final and clusteredPoints.
   * @param output
   *          The post processed data would be stored at this path.
   * @param runSequential
   *          If set to true, post processes it sequentially, else uses MapReduce. Hint: if the clustering
   *          was done sequentially, make it sequential, else vice versa.
   */
  public static void run(Path input, Path output, boolean runSequential) throws IOException,
                                                                        InterruptedException,
                                                                        ClassNotFoundException {
    if (runSequential) {
      postProcessSeq(input, output);
    } else {
      Configuration conf = new Configuration();
      postProcessMR(conf, input, output);
      movePartFilesToRespectiveDirectories(conf, output);
    }
  }

  /**
   * Process sequentially. Reads the vectors one by one, and puts them into the respective directory, named
   * after their clusterId.
   *
   * @param input
   *          The output path provided to the clustering algorithm, which is to be post processed. Hint: the
   *          path of the directory containing clusters-*-final and clusteredPoints.
   * @param output
   *          The post processed data would be stored at this path.
   */
  private static void postProcessSeq(Path input, Path output) throws IOException {
    ClusterOutputPostProcessor clusterOutputPostProcessor = new ClusterOutputPostProcessor(input, output,
        new Configuration());
    clusterOutputPostProcessor.process();
  }

  /**
   * Process as a MapReduce job. The numberOfReduceTasks is set to the number of clusters present in the
   * output, so that each cluster's vectors are written to their own part file.
   *
   * @param conf
   *          The hadoop configuration.
   * @param input
   *          The output path provided to the clustering algorithm, which is to be post processed. Hint: the
   *          path of the directory containing clusters-*-final and clusteredPoints.
   * @param output
   *          The post processed data would be stored at this path.
   * @throws InterruptedException
   *           if the job fails to complete successfully.
   */
  private static void postProcessMR(Configuration conf, Path input, Path output) throws IOException,
                                                                                InterruptedException,
                                                                                ClassNotFoundException {
    Job job = new Job(conf, "ClusterOutputPostProcessor Driver running over input: " + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(ClusterOutputPostProcessorMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setReducerClass(ClusterOutputPostProcessorReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    // One reducer per cluster so each cluster lands in exactly one part file.
    int numberOfClusters = ClusterCountReader.getNumberOfClusters(input, conf);
    job.setNumReduceTasks(numberOfClusters);
    job.setJarByClass(ClusterOutputPostProcessorDriver.class);
    FileInputFormat.addInputPath(job, new Path(input, "clusteredPoints"));
    FileOutputFormat.setOutputPath(job, output);
    if (!job.waitForCompletion(true)) {
      throw new InterruptedException("ClusterOutputPostProcessor Job failed processing " + input);
    }
  }

  /**
   * The MapReduce version of the post processor writes different clusters into different part files. This
   * method reads the part files and moves them into directories named after their clusterIds.
   *
   * @param conf
   *          The hadoop configuration.
   * @param output
   *          The post processed data would be stored at this path.
   */
  private static void movePartFilesToRespectiveDirectories(Configuration conf, Path output) throws IOException {
    FileSystem fileSystem = output.getFileSystem(conf);
    for (FileStatus fileStatus : fileSystem.listStatus(output, PathFilters.partFilter())) {
      SequenceFileIterator<Writable,Writable> it =
          new SequenceFileIterator<Writable,Writable>(fileStatus.getPath(), true, conf);
      // try/finally guarantees the underlying sequence file is closed even if
      // reading the first key or renaming the part file throws.
      try {
        // The first key of a part file is its clusterId; that names the target directory.
        if (it.hasNext()) {
          renameFile(it.next().getFirst(), fileStatus, conf);
        }
      } finally {
        it.close();
      }
    }
  }

  /**
   * Using {@link FileSystem#rename(Path, Path)} to move the file into a directory named after the cluster's
   * key.
   *
   * @param key
   *          the clusterId read from the part file; its string form becomes the directory name.
   * @param fileStatus
   *          status of the part file to be moved.
   * @param conf
   *          The hadoop configuration.
   */
  private static void renameFile(Writable key, FileStatus fileStatus, Configuration conf) throws IOException {
    Path path = fileStatus.getPath();
    FileSystem fileSystem = path.getFileSystem(conf);
    Path subDir = new Path(key.toString());
    Path renameTo = new Path(path.getParent(), subDir);
    // Create the per-cluster directory first; renaming into an existing
    // directory moves the part file inside it.
    fileSystem.mkdirs(renameTo);
    fileSystem.rename(path, renameTo);
  }
}