/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.canopy;

import java.io.IOException;
import java.util.Collection;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CanopyDriver extends AbstractJob {

  public static final String DEFAULT_CLUSTERED_POINTS_DIRECTORY = "clusteredPoints";

  private static final Logger log = LoggerFactory.getLogger(CanopyDriver.class);

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new CanopyDriver(), args);
  }
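
  /*
   * A minimal embedding sketch (the paths and threshold values below are
   * assumptions for illustration, not part of this driver): callers can skip
   * the command line entirely and invoke the static run() entry point with an
   * explicit DistanceMeasure, e.g.
   *
   *   Configuration conf = new Configuration();
   *   DistanceMeasure measure = new EuclideanDistanceMeasure();
   *   CanopyDriver.run(conf, new Path("testdata/points"), new Path("output"),
   *       measure, 3.1, 2.1, true, false);
   *
   * which builds canopies from the vectors under testdata/points and then
   * clusters those vectors, all in a single call.
   */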

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.t3Option().create());
    addOption(DefaultOptionCreator.t4Option().create());
    addOption(DefaultOptionCreator.clusterFilterOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.clusteringOption().create());
    addOption(DefaultOptionCreator.methodOption().create());

    if (parseArguments(args) == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    Configuration conf = getConf();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(conf, output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    double t3 = t1;
    if (hasOption(DefaultOptionCreator.T3_OPTION)) {
      t3 = Double.parseDouble(getOption(DefaultOptionCreator.T3_OPTION));
    }
    double t4 = t2;
    if (hasOption(DefaultOptionCreator.T4_OPTION)) {
      t4 = Double.parseDouble(getOption(DefaultOptionCreator.T4_OPTION));
    }
    int clusterFilter = 0;
    if (hasOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION)) {
      clusterFilter = Integer.parseInt(getOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION));
    }
    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
        .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass,
        DistanceMeasure.class);

    run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter,
        runClustering, runSequential);
    return 0;
  }

  /**
   * Build a directory of Canopy clusters from the input arguments and, if
   * requested, cluster the input vectors using these clusters.
   * 
   * @param conf
   *          the Configuration
   * @param input
   *          the Path to the directory containing input vectors
   * @param output
   *          the Path for all output directories
   * @param measure
   *          the DistanceMeasure
   * @param t1
   *          the double T1 distance metric
   * @param t2
   *          the double T2 distance metric
   * @param t3
   *          the reducer's double T1 distance metric
   * @param t4
   *          the reducer's double T2 distance metric
   * @param clusterFilter
   *          the minimum canopy size output by the mappers
   * @param runClustering
   *          cluster the input vectors if true
   * @param runSequential
   *          execute sequentially if true
   */
  public static void run(Configuration conf, Path input, Path output,
      DistanceMeasure measure, double t1, double t2, double t3, double t4,
      int clusterFilter, boolean runClustering, boolean runSequential)
      throws IOException, InterruptedException, ClassNotFoundException {
    Path clustersOut = buildClusters(conf, input, output, measure, t1, t2, t3,
        t4, clusterFilter, runSequential);
    if (runClustering) {
      clusterData(conf, input, clustersOut, output, measure, t1, t2,
          runSequential);
    }
  }

  /**
   * Convenience method to provide backward compatibility.
   */
  public static void run(Configuration conf, Path input, Path output,
      DistanceMeasure measure, double t1, double t2, boolean runClustering,
      boolean runSequential) throws IOException, InterruptedException,
      ClassNotFoundException {
    run(conf, input, output, measure, t1, t2, t1, t2, 0, runClustering,
        runSequential);
  }

  /**
   * Convenience method that creates a new Configuration(). Builds a directory
   * of Canopy clusters from the input arguments and, if requested, clusters
   * the input vectors using these clusters.
   * 
   * @param input
   *          the Path to the directory containing input vectors
   * @param output
   *          the Path for all output directories
   * @param t1
   *          the double T1 distance metric
   * @param t2
   *          the double T2 distance metric
   * @param runClustering
   *          cluster the input vectors if true
   * @param runSequential
   *          execute sequentially if true
   */
  public static void run(Path input, Path output, DistanceMeasure measure,
      double t1, double t2, boolean runClustering, boolean runSequential)
      throws IOException, InterruptedException, ClassNotFoundException {
    run(new Configuration(), input, output, measure, t1, t2, runClustering,
        runSequential);
  }
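
  /*
   * Note on thresholds (a summary of canopy semantics added here, not from the
   * original Javadoc): canopy clustering expects T1 > T2. A point within T2 of
   * a canopy center is bound to that canopy and removed from further
   * consideration; a point within T1 is added to the canopy but may still seed
   * or join other canopies. T3/T4 let the reduce side re-cluster with different
   * thresholds than the mappers and, as run() above shows, default to T1/T2
   * when not given.
   */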

  /**
   * Convenience method for backwards compatibility.
   */
  public static Path buildClusters(Configuration conf, Path input,
      Path output, DistanceMeasure measure, double t1, double t2,
      int clusterFilter, boolean runSequential) throws IOException,
      InterruptedException, ClassNotFoundException {
    return buildClusters(conf, input, output, measure, t1, t2, t1, t2,
        clusterFilter, runSequential);
  }

  /**
   * Build a directory of Canopy clusters from the input vectors and other
   * arguments. Run sequential or mapreduce execution as requested.
   * 
   * @param conf
   *          the Configuration to use
   * @param input
   *          the Path to the directory containing input vectors
   * @param output
   *          the Path for all output directories
   * @param measure
   *          the DistanceMeasure
   * @param t1
   *          the double T1 distance metric
   * @param t2
   *          the double T2 distance metric
   * @param t3
   *          the reducer's double T1 distance metric
   * @param t4
   *          the reducer's double T2 distance metric
   * @param clusterFilter
   *          the int minimum size of canopies produced
   * @param runSequential
   *          a boolean indicating whether to run the sequential (reference)
   *          algorithm
   * @return the canopy output directory Path
   */
  public static Path buildClusters(Configuration conf, Path input,
      Path output, DistanceMeasure measure, double t1, double t2, double t3,
      double t4, int clusterFilter, boolean runSequential) throws IOException,
      InterruptedException, ClassNotFoundException {
    log.info("Build Clusters Input: {} Out: {} Measure: {} t1: {} t2: {}",
        new Object[] { input, output, measure, t1, t2 });
    if (runSequential) {
      return buildClustersSeq(input, output, measure, t1, t2, clusterFilter);
    } else {
      return buildClustersMR(conf, input, output, measure, t1, t2, t3, t4,
          clusterFilter);
    }
  }

  /**
   * Build a directory of Canopy clusters from the input vectors and other
   * arguments. Run sequential execution.
   * 
   * @param input
   *          the Path to the directory containing input vectors
   * @param output
   *          the Path for all output directories
   * @param measure
   *          the DistanceMeasure
   * @param t1
   *          the double T1 distance metric
   * @param t2
   *          the double T2 distance metric
   * @param clusterFilter
   *          the int minimum size of canopies produced
   * @return the canopy output directory Path
   */
  private static Path buildClustersSeq(Path input, Path output,
      DistanceMeasure measure, double t1, double t2, int clusterFilter)
      throws IOException {
    CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
    Collection<Canopy> canopies = Lists.newArrayList();
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(input.toUri(), conf);

    for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
        input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
      clusterer.addPointToCanopies(vw.get(), canopies);
    }

    Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0'
        + Cluster.FINAL_ITERATION_SUFFIX);
    Path path = new Path(canopyOutputDir, "part-r-00000");
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
        Text.class, Canopy.class);
    try {
      for (Canopy canopy : canopies) {
        canopy.computeParameters();
        if (log.isDebugEnabled()) {
          log.debug("Writing Canopy:{} center:{} numPoints:{} radius:{}",
              new Object[] {
                  canopy.getIdentifier(),
                  AbstractCluster.formatVector(canopy.getCenter(), null),
                  canopy.getNumPoints(),
                  AbstractCluster.formatVector(canopy.getRadius(), null) });
        }
        if (canopy.getNumPoints() > clusterFilter) {
          writer.append(new Text(canopy.getIdentifier()), canopy);
        }
      }
    } finally {
      Closeables.closeQuietly(writer);
    }
    return canopyOutputDir;
  }
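
  /*
   * Output layout sketch: both the sequential and the mapreduce paths write
   * canopies beneath a directory derived from Cluster.CLUSTERS_DIR and
   * Cluster.FINAL_ITERATION_SUFFIX, conventionally resolving to:
   *
   *   <output>/clusters-0-final/part-r-00000   // SequenceFile<Text, Canopy>
   *
   * Each record maps a canopy identifier to the canopy itself. In the
   * sequential path above, canopies with numPoints <= clusterFilter are
   * dropped at write time; the mapreduce path instead passes the filter to
   * the job via CanopyConfigKeys.CF_KEY.
   */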

  /**
   * Build a directory of Canopy clusters from the input vectors and other
   * arguments. Run mapreduce execution.
   * 
   * @param conf
   *          the Configuration
   * @param input
   *          the Path to the directory containing input vectors
   * @param output
   *          the Path for all output directories
   * @param measure
   *          the DistanceMeasure
   * @param t1
   *          the double T1 distance metric
   * @param t2
   *          the double T2 distance metric
   * @param t3
   *          the reducer's double T1 distance metric
   * @param t4
   *          the reducer's double T2 distance metric
   * @param clusterFilter
   *          the int minimum size of canopies produced
   * @return the canopy output directory Path
   */
  private static Path buildClustersMR(Configuration conf, Path input,
      Path output, DistanceMeasure measure, double t1, double t2, double t3,
      double t4, int clusterFilter) throws IOException, InterruptedException,
      ClassNotFoundException {
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass()
        .getName());
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
    conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(t3));
    conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(t4));
    conf.set(CanopyConfigKeys.CF_KEY, String.valueOf(clusterFilter));

    Job job = new Job(conf, "Canopy Driver running buildClusters over input: "
        + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(CanopyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setReducerClass(CanopyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Canopy.class);
    job.setNumReduceTasks(1);
    job.setJarByClass(CanopyDriver.class);

    FileInputFormat.addInputPath(job, input);
    Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0'
        + Cluster.FINAL_ITERATION_SUFFIX);
    FileOutputFormat.setOutputPath(job, canopyOutputDir);
    if (!job.waitForCompletion(true)) {
      throw new InterruptedException("Canopy Job failed processing " + input);
    }
    return canopyOutputDir;
  }
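
  /*
   * Design note (an explanation added here, not from the original source):
   * buildClustersMR forces a single reducer so that every mapper's candidate
   * canopy centers are merged into one global canopy set; the reducer
   * re-clusters those centers using the t3/t4 thresholds passed through the
   * job Configuration above.
   */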

  public static void clusterData(Configuration conf, Path points,
      Path canopies, Path output, DistanceMeasure measure, double t1,
      double t2, boolean runSequential) throws IOException,
      InterruptedException, ClassNotFoundException {
    if (runSequential) {
      clusterDataSeq(points, canopies, output, measure, t1, t2);
    } else {
      clusterDataMR(conf, points, canopies, output, measure, t1, t2);
    }
  }

  private static void clusterDataSeq(Path points, Path canopies, Path output,
      DistanceMeasure measure, double t1, double t2) throws IOException {
    CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
    Collection<Canopy> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();

    for (Canopy value : new SequenceFileDirValueIterable<Canopy>(canopies,
        PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
      clusters.add(value);
    }

    // Iterate over all points, assigning each to the closest canopy and
    // outputting that clustering
    FileSystem fs = FileSystem.get(points.toUri(), conf);
    FileStatus[] status = fs.listStatus(points, PathFilters.logsCRCFilter());
    Path outPath = new Path(output, DEFAULT_CLUSTERED_POINTS_DIRECTORY);
    int part = 0;
    for (FileStatus s : status) {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(),
          conf);
      // Increment part per input file so successive files do not overwrite
      // the same output part file
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(
          outPath, "part-m-" + part++), IntWritable.class,
          WeightedVectorWritable.class);
      try {
        Writable key = ClassUtils.instantiateAs(reader.getKeyClassName(),
            Writable.class);
        VectorWritable vw = ClassUtils.instantiateAs(
            reader.getValueClassName(), VectorWritable.class);
        while (reader.next(key, vw)) {
          Canopy closest = clusterer.findClosestCanopy(vw.get(), clusters);
          writer.append(new IntWritable(closest.getId()),
              new WeightedVectorWritable(1, vw.get()));
          vw = ClassUtils.instantiateAs(reader.getValueClassName(),
              VectorWritable.class);
        }
      } finally {
        Closeables.closeQuietly(reader);
        Closeables.closeQuietly(writer);
      }
    }
  }

  private static void clusterDataMR(Configuration conf, Path points,
      Path canopies, Path output, DistanceMeasure measure, double t1,
      double t2) throws IOException, InterruptedException,
      ClassNotFoundException {
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass()
        .getName());
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
    conf.set(CanopyConfigKeys.CANOPY_PATH_KEY, canopies.toString());

    Job job = new Job(conf, "Canopy Driver running clusterData over input: "
        + points);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(ClusterMapper.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WeightedVectorWritable.class);
    job.setNumReduceTasks(0);
    job.setJarByClass(CanopyDriver.class);

    FileInputFormat.addInputPath(job, points);
    Path outPath = new Path(output, DEFAULT_CLUSTERED_POINTS_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.delete(conf, outPath);

    if (!job.waitForCompletion(true)) {
      throw new InterruptedException("Canopy Clustering failed processing "
          + canopies);
    }
  }
}
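
// Clustered-points output sketch (a summary of the code above, not from the
// original source): when runClustering is enabled, both clusterDataSeq and
// clusterDataMR write <output>/clusteredPoints/part-* as
// SequenceFile<IntWritable, WeightedVectorWritable> records, mapping each
// canopy id to the input vectors assigned to it (each with weight 1).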