/* Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.clustering.kmeans; import java.io.IOException; import java.util.Collection; import com.google.common.collect.Lists; import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.clustering.AbstractCluster; import org.apache.mahout.clustering.ClusterObservations; import org.apache.mahout.clustering.WeightedPropertyVectorWritable; import org.apache.mahout.clustering.WeightedVectorWritable; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.ClassUtils; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator; import org.apache.mahout.math.VectorWritable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class KMeansDriver extends AbstractJob { private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class); public static void main(String[] args) throws Exception { ToolRunner.run(new Configuration(), new KMeansDriver(), args); } @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(DefaultOptionCreator.clustersInOption() .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first") .create()); addOption(DefaultOptionCreator.numClustersOption() .withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen" + " as the Centroid and written to the clusters input path.").create()); addOption(DefaultOptionCreator.convergenceOption().create()); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(DefaultOptionCreator.clusteringOption().create()); addOption(DefaultOptionCreator.methodOption().create()); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); Path output = getOutputPath(); String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); if (measureClass == null) { measureClass = SquaredEuclideanDistanceMeasure.class.getName(); } double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, Integer .parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure); } boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( DefaultOptionCreator.SEQUENTIAL_METHOD); if (getConf() == null) { setConf(new Configuration()); } run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, runSequential); return 0; } /** * Iterate over the input vectors to produce clusters and, if requested, use the * results of the final iteration to cluster the input vectors. * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points * @param measure * the DistanceMeasure to use * @param convergenceDelta * the convergence delta value * @param maxIterations * the maximum number of iterations * @param runClustering * true if points are to be clustered after iterations are completed * @param runSequential if true execute sequential algorithm */ public static void run(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, double convergenceDelta, int maxIterations, boolean runClustering, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { // iterate until the clusters converge String delta = Double.toString(convergenceDelta); if (log.isInfoEnabled()) { log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output,measure.getClass().getName()}); log.info("convergence: {} max Iterations: {} num Reduce Tasks: {} Input Vectors: {}", new Object[] {convergenceDelta, maxIterations, VectorWritable.class.getName()}); } Path clustersOut = buildClusters(conf, input, clustersIn, output, measure, maxIterations, delta, runSequential); if (runClustering) { log.info("Clustering data"); clusterData(conf, input, clustersOut, new Path(output, AbstractCluster.CLUSTERED_POINTS_DIR), measure, delta, runSequential); } } /** * Iterate over the input vectors to produce clusters and, if requested, use the * results of the final iteration to cluster the input vectors. * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points * @param measure * the DistanceMeasure to use * @param convergenceDelta * the convergence delta value * @param maxIterations * the maximum number of iterations * @param runClustering * true if points are to be clustered after iterations are completed * @param runSequential if true execute sequential algorithm */ public static void run(Path input, Path clustersIn, Path output, DistanceMeasure measure, double convergenceDelta, int maxIterations, boolean runClustering, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { run(new Configuration(), input, clustersIn, output, measure, convergenceDelta, maxIterations, runClustering, runSequential); } /** * Iterate over the input vectors to produce cluster directories for each iteration * @param conf * the Configuration to use * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points * @param measure * the classname of the DistanceMeasure * @param maxIterations * the maximum number of iterations * @param delta * the convergence delta value * @param runSequential if true execute sequential algorithm * * @return the Path of the final clusters directory */ public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, int maxIterations, String delta, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { if (runSequential) { return buildClustersSeq(conf, input, clustersIn, output, measure, maxIterations, delta); } else { return buildClustersMR(conf, input, clustersIn, output, measure, maxIterations, delta); } } private static Path buildClustersSeq(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, int maxIterations, String delta) throws IOException { KMeansClusterer clusterer = new KMeansClusterer(measure); Collection<Cluster> clusters = Lists.newArrayList(); KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters); if (clusters.isEmpty()) { throw new IllegalStateException("Clusters is empty!"); } boolean converged = false; int iteration = 1; while (!converged && iteration <= maxIterations) { log.info("K-Means Iteration: {}", iteration); FileSystem fs = FileSystem.get(input.toUri(), conf); for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { clusterer.addPointToNearestCluster(value.get(), clusters); } converged = clusterer.testConvergence(clusters, Double.parseDouble(delta)); Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersOut, "part-r-00000"), Text.class, Cluster.class); try { for (Cluster cluster : clusters) { if (log.isDebugEnabled()) { log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] { cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null), cluster.getNumPoints(), AbstractCluster.formatVector(cluster.getRadius(), null), clustersOut.getName() }); } writer.append(new Text(cluster.getIdentifier()), cluster); } } finally { Closeables.closeQuietly(writer); } clustersIn = clustersOut; iteration++; } Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1) + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX); FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn); return finalClustersIn; } private static Path buildClustersMR(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, int maxIterations, String delta) throws IOException, InterruptedException, ClassNotFoundException { boolean converged = false; int iteration = 1; while (!converged && iteration <= maxIterations) { log.info("K-Means Iteration {}", iteration); // point the output to a new directory per iteration Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration); converged = runIteration(conf, input, clustersIn, clustersOut, measure.getClass().getName(), delta); // now point the input to the old output directory clustersIn = clustersOut; iteration++; } Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1) + "-final"); FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn); return finalClustersIn; } /** * Run the job using supplied arguments * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for input clusters * @param clustersOut * the directory pathname for output clusters * @param measureClass * the classname of the DistanceMeasure * @param convergenceDelta * the convergence delta value * * @return true if the iteration successfully runs */ private static boolean runIteration(Configuration conf, Path input, Path clustersIn, Path clustersOut, String measureClass, String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException { conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn.toString()); conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass); conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta); Job job = new Job(conf, "KMeans Driver running runIteration over clustersIn: " + clustersIn); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(ClusterObservations.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Cluster.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(KMeansMapper.class); job.setCombinerClass(KMeansCombiner.class); job.setReducerClass(KMeansReducer.class); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, clustersOut); job.setJarByClass(KMeansDriver.class); HadoopUtil.delete(conf, clustersOut); if (!job.waitForCompletion(true)) { throw new InterruptedException("K-Means Iteration failed processing " + clustersIn); } FileSystem fs = FileSystem.get(clustersOut.toUri(), conf); return isConverged(clustersOut, conf, fs); } /** * Return if all of the Clusters in the parts in the filePath have converged or not * * @param filePath * the file path to the single file containing the clusters * @return true if all Clusters are converged * @throws IOException * if there was an IO error */ private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException { for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) { SequenceFileValueIterator<Cluster> iterator = new SequenceFileValueIterator<Cluster>(part.getPath(), true, conf); while (iterator.hasNext()) { Cluster value = iterator.next(); if (!value.isConverged()) { Closeables.closeQuietly(iterator); return false; } } } return true; } /** * Run the job using supplied arguments * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for input clusters * @param output * the directory pathname for output points * @param measure * the classname of the DistanceMeasure * @param convergenceDelta * the convergence delta value * @param runSequential if true execute sequential algorithm */ public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, String convergenceDelta, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { if (log.isInfoEnabled()) { log.info("Running Clustering"); log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure}); log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName()); } if (runSequential) { clusterDataSeq(conf, input, clustersIn, output, measure); } else { clusterDataMR(conf, input, clustersIn, output, measure, convergenceDelta); } } private static void clusterDataSeq(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure) throws IOException { KMeansClusterer clusterer = new KMeansClusterer(measure); Collection<Cluster> clusters = Lists.newArrayList(); KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters); if (clusters.isEmpty()) { throw new IllegalStateException("Clusters is empty!"); } FileSystem fs = FileSystem.get(input.toUri(), conf); FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter()); int part = 0; for (FileStatus s : status) { SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output, "part-m-" + part), IntWritable.class, WeightedVectorWritable.class); try { for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) { clusterer.emitPointToNearestCluster(value.get(), clusters, writer); } } finally { Closeables.closeQuietly(writer); } } } private static void clusterDataMR(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException { conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn.toString()); conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName()); conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta); Job job = new Job(conf, "KMeans Driver running clusterData over input: " + input); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(WeightedPropertyVectorWritable.class); FileInputFormat.setInputPaths(job, input); HadoopUtil.delete(conf, output); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(KMeansClusterMapper.class); job.setNumReduceTasks(0); job.setJarByClass(KMeansDriver.class); if (!job.waitForCompletion(true)) { throw new InterruptedException("K-Means Clustering failed processing " + clustersIn); } } }