/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.dirichlet;
import java.io.IOException;
import java.util.List;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DirichletDriver extends AbstractJob {
public static final String STATE_IN_KEY = "org.apache.mahout.clustering.dirichlet.stateIn";
public static final String MODEL_DISTRIBUTION_KEY = "org.apache.mahout.clustering.dirichlet.modelFactory";
public static final String NUM_CLUSTERS_KEY = "org.apache.mahout.clustering.dirichlet.numClusters";
public static final String ALPHA_0_KEY = "org.apache.mahout.clustering.dirichlet.alpha_0";
public static final String EMIT_MOST_LIKELY_KEY = "org.apache.mahout.clustering.dirichlet.emitMostLikely";
public static final String THRESHOLD_KEY = "org.apache.mahout.clustering.dirichlet.threshold";
public static final String MODEL_PROTOTYPE_CLASS_OPTION = "modelPrototype";
public static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist";
public static final String ALPHA_OPTION = "alpha";
private static final Logger log = LoggerFactory.getLogger(DirichletDriver.class);
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new DirichletDriver(), args);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.numClustersOption().withRequired(true).create());
addOption(DefaultOptionCreator.overwriteOption().create());
addOption(DefaultOptionCreator.clusteringOption().create());
addOption(ALPHA_OPTION, "a0", "The alpha0 value for the DirichletDistribution. Defaults to 1.0", "1.0");
addOption(MODEL_DISTRIBUTION_CLASS_OPTION,
"md",
"The ModelDistribution class name. Defaults to GaussianClusterDistribution",
GaussianClusterDistribution.class.getName());
addOption(MODEL_PROTOTYPE_CLASS_OPTION,
"mp",
"The ModelDistribution prototype Vector class name. Defaults to RandomAccessSparseVector",
RandomAccessSparseVector.class.getName());
addOption(DefaultOptionCreator.distanceMeasureOption().withRequired(false).create());
addOption(DefaultOptionCreator.emitMostLikelyOption().create());
addOption(DefaultOptionCreator.thresholdOption().create());
addOption(DefaultOptionCreator.methodOption().create());
if (parseArguments(args) == null) {
return -1;
}
Path input = getInputPath();
Path output = getOutputPath();
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.delete(getConf(), output);
}
String modelFactory = getOption(MODEL_DISTRIBUTION_CLASS_OPTION);
String modelPrototype = getOption(MODEL_PROTOTYPE_CLASS_OPTION);
String distanceMeasure = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
int numModels = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION));
double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION));
double alpha0 = Double.parseDouble(getOption(ALPHA_OPTION));
boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
boolean runSequential =
getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
int prototypeSize = readPrototypeSize(input);
DistributionDescription description =
new DistributionDescription(modelFactory, modelPrototype, distanceMeasure, prototypeSize);
run(getConf(),
input,
output,
description,
numModels,
maxIterations,
alpha0,
runClustering,
emitMostLikely,
threshold,
runSequential);
return 0;
}
/**
* Iterate over the input vectors to produce clusters and, if requested, use the
* results of the final iteration to cluster the input vectors.
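* <p>A minimal usage sketch; the paths, cluster count, iteration count and alpha_0 below are
* hypothetical values, and the {@code null} distance measure assumes the chosen distribution
* does not require one:
* <pre>{@code
* Configuration conf = new Configuration();
* Path input = new Path("testdata/points");
* Path output = new Path("output/dirichlet");
* DistributionDescription description = new DistributionDescription(
*     GaussianClusterDistribution.class.getName(),
*     RandomAccessSparseVector.class.getName(),
*     null,                       // distance measure (assumed unused by the Gaussian distribution)
*     readPrototypeSize(input));  // cardinality of the input vectors
* run(conf, input, output, description, 10, 5, 1.0, true, true, 0.0, false);
* }</pre>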
*
* @param conf
* the Configuration to use
* @param input
* the directory Path for input points
* @param output
* the directory Path for output points
* @param description model distribution parameters
* @param numModels
* the number of models to start with
* @param maxIterations
* the maximum number of iterations
* @param alpha0
* the alpha_0 value for the DirichletDistribution
* @param runClustering
* true if the input points are to be clustered after the final iteration
* @param emitMostLikely
* if true, emit only the most likely cluster for each clustered point
* @param threshold
* when emitMostLikely is false, emit all clusters whose pdf for the point exceeds this threshold
* @param runSequential if true, execute sequentially rather than as a MapReduce job
*/
public static void run(Configuration conf,
Path input,
Path output,
DistributionDescription description,
int numModels,
int maxIterations,
double alpha0,
boolean runClustering,
boolean emitMostLikely,
double threshold,
boolean runSequential)
throws IOException, ClassNotFoundException, InterruptedException {
Path clustersOut =
buildClusters(conf, input, output, description, numModels, maxIterations, alpha0, runSequential);
if (runClustering) {
clusterData(conf,
input,
clustersOut,
new Path(output, Cluster.CLUSTERED_POINTS_DIR),
emitMostLikely,
threshold,
runSequential);
}
}
/**
* Convenience method that uses a default Configuration. Iterates over the input vectors to
* produce clusters and, if requested, uses the results of the final iteration to cluster the
* input vectors.
*
* @param input
* the directory Path for input points
* @param output
* the directory Path for output points
* @param description model distribution parameters
* @param numClusters
* the number of models to iterate over
* @param maxIterations
* the maximum number of iterations
* @param alpha0
* the alpha_0 value for the DirichletDistribution
* @param runClustering
* true if the input points are to be clustered after the final iteration
* @param emitMostLikely
* if true, emit only the most likely cluster for each clustered point
* @param threshold
* when emitMostLikely is false, emit all clusters whose pdf for the point exceeds this threshold
* @param runSequential if true, execute sequentially rather than as a MapReduce job
*/
public static void run(Path input,
Path output,
DistributionDescription description,
int numClusters,
int maxIterations,
double alpha0,
boolean runClustering,
boolean emitMostLikely,
double threshold,
boolean runSequential)
throws IOException, ClassNotFoundException, InterruptedException {
run(new Configuration(),
input,
output,
description,
numClusters,
maxIterations,
alpha0,
runClustering,
emitMostLikely,
threshold,
runSequential);
}
/**
* Creates a DirichletState object from the given arguments. Note that the modelFactory class named in the
* description is presumed to be a subclass of VectorModelDistribution that can be initialized with a
* concrete Vector prototype.
*
* @param description model distribution parameters
* @param numModels an int number of models to be created
* @param alpha0 the double alpha_0 argument to the algorithm
* @return an initialized DirichletState
*/
static DirichletState createState(DistributionDescription description, int numModels, double alpha0) {
return new DirichletState(description, numModels, alpha0);
}
/**
* Read the first input vector to determine the prototype size (cardinality) for the modelPrototype
*
* @param input the directory Path containing the input vectors
* @return the cardinality of the first vector read, or 0 if no input files are found
*/
public static int readPrototypeSize(Path input) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
int protoSize = 0;
if (status.length > 0) {
FileStatus s = status[0];
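// inspect only the first data file; the prototype cardinality is taken from its first vector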
for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), true, conf)) {
protoSize = value.get().size();
break;
}
}
return protoSize;
}
/**
* Write initial state (prior distribution) to the output path directory
* @param output the output Path
* @param stateOut the state output Path
* @param description model distribution parameters
* @param numModels the int number of models to generate
* @param alpha0 the double alpha_0 argument to the DirichletDistribution
*/
private static void writeInitialState(Path output,
Path stateOut,
DistributionDescription description,
int numModels,
double alpha0) throws IOException {
DirichletState state = createState(description, numModels, alpha0);
writeState(output, stateOut, numModels, state);
}
private static void writeState(Path output, Path stateOut, int numModels, DirichletState state) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(output.toUri(), conf);
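// write each prior cluster to its own part-<i> file under stateOut; the next iteration reads this
// directory back in as its input state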
for (int i = 0; i < numModels; i++) {
Path path = new Path(stateOut, "part-" + i);
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, DirichletCluster.class);
try {
writer.append(new Text(Integer.toString(i)), state.getClusters().get(i));
} finally {
Closeables.closeQuietly(writer);
}
}
}
/**
* Run an iteration using supplied arguments
* @param conf the Configuration to use
* @param input the directory pathname for input points
* @param stateIn the directory pathname for input state
* @param stateOut the directory pathname for output state
* @param description model distribution parameters
* @param numClusters the number of clusters
* @param alpha0 alpha_0
*/
private static void runIteration(Configuration conf,
Path input,
Path stateIn,
Path stateOut,
DistributionDescription description,
int numClusters,
double alpha0) throws IOException, InterruptedException, ClassNotFoundException {
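// pass the iteration parameters to DirichletMapper and DirichletReducer via the job configuration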
conf.set(STATE_IN_KEY, stateIn.toString());
conf.set(MODEL_DISTRIBUTION_KEY, description.toString());
conf.set(NUM_CLUSTERS_KEY, Integer.toString(numClusters));
conf.set(ALPHA_0_KEY, Double.toString(alpha0));
Job job = new Job(conf, "Dirichlet Driver running runIteration over stateIn: " + stateIn);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DirichletCluster.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(VectorWritable.class);
job.setMapperClass(DirichletMapper.class);
job.setReducerClass(DirichletReducer.class);
job.setJarByClass(DirichletDriver.class);
FileInputFormat.addInputPath(job, input);
FileOutputFormat.setOutputPath(job, stateOut);
if (!job.waitForCompletion(true)) {
throw new InterruptedException("Dirichlet Iteration failed processing " + stateIn);
}
}
/**
* Iterate over the input vectors to produce cluster directories for each iteration
* @param conf the Configuration to use
* @param input
* the directory Path for input points
* @param output
* the directory Path for output points
* @param description model distribution parameters
* @param numClusters
* the number of models to iterate over
* @param maxIterations
* the maximum number of iterations
* @param alpha0
* the alpha_0 value for the DirichletDistribution
* @param runSequential execute sequentially if true
*
* @return the Path of the final clusters directory
*/
public static Path buildClusters(Configuration conf,
Path input,
Path output,
DistributionDescription description,
int numClusters,
int maxIterations,
double alpha0,
boolean runSequential)
throws IOException, ClassNotFoundException, InterruptedException {
Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
writeInitialState(output, clustersIn, description, numClusters, alpha0);
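// the prior state is written to the initial clusters directory; each iteration below reads the
// previous state and writes a new clusters-N directory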
if (runSequential) {
clustersIn = buildClustersSeq(conf, input, output, description, numClusters, maxIterations, alpha0, clustersIn);
} else {
clustersIn = buildClustersMR(conf, input, output, description, numClusters, maxIterations, alpha0, clustersIn);
}
return clustersIn;
}
private static Path buildClustersSeq(Configuration conf,
Path input,
Path output,
DistributionDescription description,
int numClusters,
int maxIterations,
double alpha0,
Path clustersIn) throws IOException {
int iteration = 1;
while (iteration <= maxIterations) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
DirichletState state = DirichletMapper.loadState(conf,
clustersIn.toString(),
description,
alpha0,
numClusters);
List<DirichletCluster> oldModels = state.getClusters();
for (DirichletCluster oldModel : oldModels) {
oldModel.getModel().configure(conf);
}
Cluster[] newModels = (Cluster[]) state.getModelFactory().sampleFromPosterior(state.getModels());
for (Cluster newModel : newModels) {
newModel.configure(conf);
}
DirichletClusterer clusterer = new DirichletClusterer(state);
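// observe every input vector against the freshly sampled models, then recompute the models
// from the accumulated observations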
for (VectorWritable value
: new SequenceFileDirValueIterable<VectorWritable>(input,
PathType.LIST,
PathFilters.logsCRCFilter(),
conf)) {
clusterer.observe(newModels, value);
}
clusterer.updateModels(newModels);
writeState(output, clustersOut, numClusters, state);
// now point the input to the old output directory
clustersIn = clustersOut;
iteration++;
}
Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX);
FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
return finalClustersIn;
}
private static Path buildClustersMR(Configuration conf,
Path input,
Path output,
DistributionDescription description,
int numClusters,
int maxIterations,
double alpha0,
Path clustersIn)
throws IOException, InterruptedException, ClassNotFoundException {
int iteration = 1;
while (iteration <= maxIterations) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
runIteration(conf, input, clustersIn, clustersOut, description, numClusters, alpha0);
// now point the input to the old output directory
clustersIn = clustersOut;
iteration++;
}
Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX);
FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
return finalClustersIn;
}
/**
* Run the job using supplied arguments
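* <p>A minimal usage sketch; the paths below are hypothetical, and {@code clustersIn} is assumed to be
* the final clusters directory returned by {@link #buildClusters}:
* <pre>{@code
* clusterData(new Configuration(),
*     new Path("testdata/points"),                   // input vectors
*     clustersIn,                                    // final iteration state
*     new Path("output/dirichlet/clusteredPoints"),  // clustered output
*     true,                                          // emitMostLikely
*     0.0,                                           // threshold (only used when emitMostLikely is false)
*     false);                                        // run as a MapReduce job
* }</pre>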
* @param conf
* the Configuration to use
* @param input
* the directory pathname for input points
* @param stateIn
* the directory pathname for input state
* @param output
* the directory pathname for output points
* @param emitMostLikely
* if true, emit only the most likely cluster for each point
* @param threshold
* when emitMostLikely is false, emit all clusters whose pdf for the point exceeds this threshold
* @param runSequential if true, execute sequentially rather than as a MapReduce job
*/
public static void clusterData(Configuration conf,
Path input,
Path stateIn,
Path output,
boolean emitMostLikely,
double threshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
if (runSequential) {
clusterDataSeq(conf, input, stateIn, output, emitMostLikely, threshold);
} else {
clusterDataMR(conf, input, stateIn, output, emitMostLikely, threshold);
}
}
private static void clusterDataSeq(Configuration conf,
Path input,
Path stateIn,
Path output,
boolean emitMostLikely,
double threshold) throws IOException {
List<DirichletCluster> clusters = DirichletClusterMapper.loadClusters(conf, stateIn);
for (DirichletCluster cluster : clusters) {
cluster.getModel().configure(conf);
}
DirichletClusterer clusterer = new DirichletClusterer(emitMostLikely, threshold);
// iterate over all points, assigning each to its most likely cluster(s) and outputting that clustering
FileSystem fs = FileSystem.get(input.toUri(), conf);
FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
int part = 0;
for (FileStatus s : status) {
SequenceFile.Writer writer = new SequenceFile.Writer(fs,
conf,
new Path(output, "part-m-" + part++),
IntWritable.class,
WeightedVectorWritable.class);
try {
for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
clusterer.emitPointToClusters(value, clusters, writer);
}
} finally {
Closeables.closeQuietly(writer);
}
}
}
private static void clusterDataMR(Configuration conf,
Path input,
Path stateIn,
Path output,
boolean emitMostLikely,
double threshold) throws IOException, InterruptedException, ClassNotFoundException {
conf.set(STATE_IN_KEY, stateIn.toString());
conf.set(EMIT_MOST_LIKELY_KEY, Boolean.toString(emitMostLikely));
conf.set(THRESHOLD_KEY, Double.toString(threshold));
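// map-only job: DirichletClusterMapper loads the state from the configuration and emits
// (cluster id, weighted point) pairs directly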
Job job = new Job(conf, "Dirichlet Driver running clusterData over input: " + input);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(WeightedVectorWritable.class);
job.setMapperClass(DirichletClusterMapper.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setNumReduceTasks(0);
job.setJarByClass(DirichletDriver.class);
FileInputFormat.addInputPath(job, input);
FileOutputFormat.setOutputPath(job, output);
if (!job.waitForCompletion(true)) {
throw new InterruptedException("Dirichlet Clustering failed processing " + stateIn);
}
}
}