/* Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.clustering.kmeans; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.classify.ClusterClassificationDriver; import org.apache.mahout.clustering.classify.ClusterClassifier; import org.apache.mahout.clustering.iterator.ClusterIterator; import org.apache.mahout.clustering.iterator.ClusteringPolicy; import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy; import org.apache.mahout.clustering.topdown.PathDirectory; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.ClassUtils; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class KMeansDriver extends AbstractJob { private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class); public static void main(String[] args) throws Exception { ToolRunner.run(new Configuration(), new KMeansDriver(), args); } @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(DefaultOptionCreator .clustersInOption() .withDescription( "The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first").create()); addOption(DefaultOptionCreator .numClustersOption() .withDescription( "The k in k-Means. If specified, then a random selection of k Vectors will be chosen" + " as the Centroid and written to the clusters input path.").create()); addOption(DefaultOptionCreator.useSetRandomSeedOption().create()); addOption(DefaultOptionCreator.convergenceOption().create()); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(DefaultOptionCreator.clusteringOption().create()); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.outlierThresholdOption().create()); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); Path output = getOutputPath(); String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); if (measureClass == null) { measureClass = SquaredEuclideanDistanceMeasure.class.getName(); } double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); Long seed = null; if (hasOption(DefaultOptionCreator.RANDOM_SEED)) { seed = Long.parseLong(getOption(DefaultOptionCreator.RANDOM_SEED)); } clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, numClusters, measure, seed); } boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( DefaultOptionCreator.SEQUENTIAL_METHOD); double clusterClassificationThreshold = 0.0; if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); } run(getConf(), input, clusters, output, convergenceDelta, maxIterations, runClustering, clusterClassificationThreshold, runSequential); return 0; } /** * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to * cluster the input vectors. * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points * @param convergenceDelta * the convergence delta value * @param maxIterations * the maximum number of iterations * @param runClustering * true if points are to be clustered after iterations are completed * @param clusterClassificationThreshold * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors * having pdf below this value will not be clustered. * @param runSequential * if true execute sequential algorithm */ public static void run(Configuration conf, Path input, Path clustersIn, Path output, double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { // iterate until the clusters converge String delta = Double.toString(convergenceDelta); if (log.isInfoEnabled()) { log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output); log.info("convergence: {} max Iterations: {}", convergenceDelta, maxIterations); } Path clustersOut = buildClusters(conf, input, clustersIn, output, maxIterations, delta, runSequential); if (runClustering) { log.info("Clustering data"); clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential); } } /** * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to * cluster the input vectors. * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points * @param convergenceDelta * the convergence delta value * @param maxIterations * the maximum number of iterations * @param runClustering * true if points are to be clustered after iterations are completed * @param clusterClassificationThreshold * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors * having pdf below this value will not be clustered. * @param runSequential * if true execute sequential algorithm */ public static void run(Path input, Path clustersIn, Path output, double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { run(new Configuration(), input, clustersIn, output, convergenceDelta, maxIterations, runClustering, clusterClassificationThreshold, runSequential); } /** * Iterate over the input vectors to produce cluster directories for each iteration * * * @param conf * the Configuration to use * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points * @param maxIterations * the maximum number of iterations * @param delta * the convergence delta value * @param runSequential * if true execute sequential algorithm * * @return the Path of the final clusters directory */ public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output, int maxIterations, String delta, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { double convergenceDelta = Double.parseDouble(delta); List<Cluster> clusters = new ArrayList<>(); KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters); if (clusters.isEmpty()) { throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument."); } Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); ClusteringPolicy policy = new KMeansClusteringPolicy(convergenceDelta); ClusterClassifier prior = new ClusterClassifier(clusters, policy); prior.writeToSeqFiles(priorClustersPath); if (runSequential) { ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations); } else { ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations); } return output; } /** * Run the job using supplied arguments * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for input clusters * @param output * the directory pathname for output points * @param clusterClassificationThreshold * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors * having pdf below this value will not be clustered. * @param runSequential * if true execute sequential algorithm */ public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { if (log.isInfoEnabled()) { log.info("Running Clustering"); log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output); } ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn); ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY), clusterClassificationThreshold, true, runSequential); } }