/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.clustering.dirichlet; import java.io.IOException; import java.util.List; import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.WeightedVectorWritable; import org.apache.mahout.clustering.dirichlet.models.DistributionDescription; import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.VectorWritable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class DirichletDriver extends AbstractJob { public static final String STATE_IN_KEY = "org.apache.mahout.clustering.dirichlet.stateIn"; public static final String MODEL_DISTRIBUTION_KEY = "org.apache.mahout.clustering.dirichlet.modelFactory"; public static final String NUM_CLUSTERS_KEY = "org.apache.mahout.clustering.dirichlet.numClusters"; public static final String ALPHA_0_KEY = "org.apache.mahout.clustering.dirichlet.alpha_0"; public static final String EMIT_MOST_LIKELY_KEY = "org.apache.mahout.clustering.dirichlet.emitMostLikely"; public static final String THRESHOLD_KEY = "org.apache.mahout.clustering.dirichlet.threshold"; public static final String MODEL_PROTOTYPE_CLASS_OPTION = "modelPrototype"; public static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist"; public static final String ALPHA_OPTION = "alpha"; private static final Logger log = LoggerFactory.getLogger(DirichletDriver.class); public static void main(String[] args) throws Exception { ToolRunner.run(new Configuration(), new DirichletDriver(), args); } @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.numClustersOption().withRequired(true).create()); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(DefaultOptionCreator.clusteringOption().create()); addOption(ALPHA_OPTION, "a0", "The alpha0 value for the DirichletDistribution. Defaults to 1.0", "1.0"); addOption(MODEL_DISTRIBUTION_CLASS_OPTION, "md", "The ModelDistribution class name. Defaults to GaussianClusterDistribution", GaussianClusterDistribution.class.getName()); addOption(MODEL_PROTOTYPE_CLASS_OPTION, "mp", "The ModelDistribution prototype Vector class name. Defaults to RandomAccessSparseVector", RandomAccessSparseVector.class.getName()); addOption(DefaultOptionCreator.distanceMeasureOption().withRequired(false).create()); addOption(DefaultOptionCreator.emitMostLikelyOption().create()); addOption(DefaultOptionCreator.thresholdOption().create()); addOption(DefaultOptionCreator.methodOption().create()); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } String modelFactory = getOption(MODEL_DISTRIBUTION_CLASS_OPTION); String modelPrototype = getOption(MODEL_PROTOTYPE_CLASS_OPTION); String distanceMeasure = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); int numModels = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION)); double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION)); double alpha0 = Double.parseDouble(getOption(ALPHA_OPTION)); boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); int prototypeSize = readPrototypeSize(input); DistributionDescription description = new DistributionDescription(modelFactory, modelPrototype, distanceMeasure, prototypeSize); run(getConf(), input, output, description, numModels, maxIterations, alpha0, runClustering, emitMostLikely, threshold, runSequential); return 0; } /** * Iterate over the input vectors to produce clusters and, if requested, use the * results of the final iteration to cluster the input vectors. * * @param conf * the Configuration to use * @param input * the directory Path for input points * @param output * the directory Path for output points * @param description model distribution parameters * @param maxIterations * the maximum number of iterations * @param alpha0 * the alpha_0 value for the DirichletDistribution * @param runClustering * true if clustering of points to be done after iterations * @param emitMostLikely * a boolean if true emit only most likely cluster for each point * @param threshold * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) * @param runSequential execute sequentially if true */ public static void run(Configuration conf, Path input, Path output, DistributionDescription description, int numModels, int maxIterations, double alpha0, boolean runClustering, boolean emitMostLikely, double threshold, boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException { Path clustersOut = buildClusters(conf, input, output, description, numModels, maxIterations, alpha0, runSequential); if (runClustering) { clusterData(conf, input, clustersOut, new Path(output, Cluster.CLUSTERED_POINTS_DIR), emitMostLikely, threshold, runSequential); } } /** * Convenience method provides default Configuration * Iterate over the input vectors to produce clusters and, if requested, use the * results of the final iteration to cluster the input vectors. * * @param input * the directory Path for input points * @param output * the directory Path for output points * @param description model distribution parameters * @param numClusters * the number of models to iterate over * @param maxIterations * the maximum number of iterations * @param alpha0 * the alpha_0 value for the DirichletDistribution * @param runClustering * true if clustering of points to be done after iterations * @param emitMostLikely * a boolean if true emit only most likely cluster for each point * @param threshold * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) * @param runSequential execute sequentially if true */ public static void run(Path input, Path output, DistributionDescription description, int numClusters, int maxIterations, double alpha0, boolean runClustering, boolean emitMostLikely, double threshold, boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException { run(new Configuration(), input, output, description, numClusters, maxIterations, alpha0, runClustering, emitMostLikely, threshold, runSequential); } /** * Creates a DirichletState object from the given arguments. Note that the modelFactory is presumed to be a * subclass of VectorModelDistribution that can be initialized with a concrete Vector prototype. * * @param description model distribution parameters * @param numModels an int number of models to be created * @param alpha0 the double alpha_0 argument to the algorithm * @return an initialized DirichletState */ static DirichletState createState(DistributionDescription description, int numModels, double alpha0) { return new DirichletState(description, numModels, alpha0); } /** * Read the first input vector to determine the prototype size for the modelPrototype */ public static int readPrototypeSize(Path input) throws IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(input.toUri(), conf); FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter()); int protoSize = 0; if (status.length > 0) { FileStatus s = status[0]; for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), true, conf)) { protoSize = value.get().size(); } } return protoSize; } /** * Write initial state (prior distribution) to the output path directory * @param output the output Path * @param stateOut the state output Path * @param description model distribution parameters * @param numModels the int number of models to generate * @param alpha0 the double alpha_0 argument to the DirichletDistribution */ private static void writeInitialState(Path output, Path stateOut, DistributionDescription description, int numModels, double alpha0) throws IOException { DirichletState state = createState(description, numModels, alpha0); writeState(output, stateOut, numModels, state); } private static void writeState(Path output, Path stateOut, int numModels, DirichletState state) throws IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(output.toUri(), conf); for (int i = 0; i < numModels; i++) { Path path = new Path(stateOut, "part-" + i); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, DirichletCluster.class); try { writer.append(new Text(Integer.toString(i)), state.getClusters().get(i)); } finally { Closeables.closeQuietly(writer); } } } /** * Run an iteration using supplied arguments * @param conf * @param input the directory pathname for input points * @param stateIn the directory pathname for input state * @param stateOut the directory pathname for output state * @param description model distribution parameters * @param numClusters the number of clusters * @param alpha0 alpha_0 */ private static void runIteration(Configuration conf, Path input, Path stateIn, Path stateOut, DistributionDescription description, int numClusters, double alpha0) throws IOException, InterruptedException, ClassNotFoundException { conf.set(STATE_IN_KEY, stateIn.toString()); conf.set(MODEL_DISTRIBUTION_KEY, description.toString()); conf.set(NUM_CLUSTERS_KEY, Integer.toString(numClusters)); conf.set(ALPHA_0_KEY, Double.toString(alpha0)); Job job = new Job(conf, "Dirichlet Driver running runIteration over stateIn: " + stateIn); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DirichletCluster.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VectorWritable.class); job.setMapperClass(DirichletMapper.class); job.setReducerClass(DirichletReducer.class); job.setJarByClass(DirichletDriver.class); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, stateOut); if (!job.waitForCompletion(true)) { throw new InterruptedException("Dirichlet Iteration failed processing " + stateIn); } } /** * Iterate over the input vectors to produce cluster directories for each iteration * @param conf * @param input * the directory Path for input points * @param output * the directory Path for output points * @param description model distribution parameters * @param numClusters * the number of models to iterate over * @param maxIterations * the maximum number of iterations * @param alpha0 * the alpha_0 value for the DirichletDistribution * @param runSequential execute sequentially if true * * @return the Path of the final clusters directory */ public static Path buildClusters(Configuration conf, Path input, Path output, DistributionDescription description, int numClusters, int maxIterations, double alpha0, boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException { Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); writeInitialState(output, clustersIn, description, numClusters, alpha0); if (runSequential) { clustersIn = buildClustersSeq(conf, input, output, description, numClusters, maxIterations, alpha0, clustersIn); } else { clustersIn = buildClustersMR(conf, input, output, description, numClusters, maxIterations, alpha0, clustersIn); } return clustersIn; } private static Path buildClustersSeq(Configuration conf, Path input, Path output, DistributionDescription description, int numClusters, int maxIterations, double alpha0, Path clustersIn) throws IOException { int iteration = 1; while (iteration <= maxIterations) { log.info("Iteration {}", iteration); // point the output to a new directory per iteration Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration); DirichletState state = DirichletMapper.loadState(conf, clustersIn.toString(), description, alpha0, numClusters); List<DirichletCluster> oldModels = state.getClusters(); for (DirichletCluster oldModel : oldModels) { oldModel.getModel().configure(conf); } Cluster[] newModels = (Cluster[]) state.getModelFactory().sampleFromPosterior(state.getModels()); for (Cluster newModel : newModels) { newModel.configure(conf); } DirichletClusterer clusterer = new DirichletClusterer(state); for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { clusterer.observe(newModels, value); } clusterer.updateModels(newModels); writeState(output, clustersOut, numClusters, state); // now point the input to the old output directory clustersIn = clustersOut; iteration++; } Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX); FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn); return finalClustersIn; } private static Path buildClustersMR(Configuration conf, Path input, Path output, DistributionDescription description, int numClusters, int maxIterations, double alpha0, Path clustersIn) throws IOException, InterruptedException, ClassNotFoundException { int iteration = 1; while (iteration <= maxIterations) { log.info("Iteration {}", iteration); // point the output to a new directory per iteration Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration); runIteration(conf, input, clustersIn, clustersOut, description, numClusters, alpha0); // now point the input to the old output directory clustersIn = clustersOut; iteration++; } Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX); FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn); return finalClustersIn; } /** * Run the job using supplied arguments * @param conf * * @param input * the directory pathname for input points * @param stateIn * the directory pathname for input state * @param output * the directory pathname for output points * @param emitMostLikely * a boolean if true emit only most likely cluster for each point * @param threshold * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) * @param runSequential execute sequentially if true */ public static void clusterData(Configuration conf, Path input, Path stateIn, Path output, boolean emitMostLikely, double threshold, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { if (runSequential) { clusterDataSeq(conf, input, stateIn, output, emitMostLikely, threshold); } else { clusterDataMR(conf, input, stateIn, output, emitMostLikely, threshold); } } private static void clusterDataSeq(Configuration conf, Path input, Path stateIn, Path output, boolean emitMostLikely, double threshold) throws IOException { List<DirichletCluster> clusters = DirichletClusterMapper.loadClusters(conf, stateIn); for (DirichletCluster cluster : clusters) { cluster.getModel().configure(conf); } DirichletClusterer clusterer = new DirichletClusterer(emitMostLikely, threshold); // iterate over all points, assigning each to the closest canopy and outputing that clustering FileSystem fs = FileSystem.get(input.toUri(), conf); FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter()); int part = 0; for (FileStatus s : status) { SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output, "part-m-" + part++), IntWritable.class, WeightedVectorWritable.class); try { for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) { clusterer.emitPointToClusters(value, clusters, writer); } } finally { Closeables.closeQuietly(writer); } } } private static void clusterDataMR(Configuration conf, Path input, Path stateIn, Path output, boolean emitMostLikely, double threshold) throws IOException, InterruptedException, ClassNotFoundException { conf.set(STATE_IN_KEY, stateIn.toString()); conf.set(EMIT_MOST_LIKELY_KEY, Boolean.toString(emitMostLikely)); conf.set(THRESHOLD_KEY, Double.toString(threshold)); Job job = new Job(conf, "Dirichlet Driver running clusterData over input: " + input); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(WeightedVectorWritable.class); job.setMapperClass(DirichletClusterMapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setNumReduceTasks(0); job.setJarByClass(DirichletDriver.class); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); if (!job.waitForCompletion(true)) { throw new InterruptedException("Dirichlet Clustering failed processing " + stateIn); } } }