package edu.umd.cloud9.example.clustering;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * Command-line driver that builds a random univariate Gaussian mixture model, draws points
 * from it, initializes a second model from those points (optionally seeded by k-means),
 * writes the points and the initial model to the configured Hadoop {@link FileSystem}, and
 * finally runs expectation-maximization in-process.
 *
 * <p>Options:
 * <ul>
 *   <li>{@code -output <path>} (required): directory that receives the {@code points} and
 *       {@code cluster0} files. Any existing content at that path is deleted recursively.</li>
 *   <li>{@code -points <num>}: number of points to draw (default 100000).</li>
 *   <li>{@code -components <num>}: number of mixture components (default 3).</li>
 *   <li>{@code -initializeWithKMeans}: seed the initial model with k-means cluster means.</li>
 *   <li>{@code -help}: print usage and exit.</li>
 * </ul>
 */
public class LocalClusteringDriver {
  private static final Random RANDOM = new Random();

  private static final String POINTS = "points";
  private static final String COMPONENTS = "components";
  private static final String KMEANS = "initializeWithKMeans";
  private static final String HELP = "help";
  private static final String OUTPUT = "output";

  private static final int DEFAULT_NUM_COMPONENTS = 3;
  private static final int DEFAULT_NUM_POINTS = 100000;

  /** Charset used for the text files written under the output path. */
  private static final String OUTPUT_CHARSET = "UTF-8";

  /**
   * Program entry point.
   *
   * <p>Prints usage and terminates the JVM with status {@code -1} when the command line cannot
   * be parsed, when {@code -output} is missing, when {@code -help} is given, or when a numeric
   * option is not a positive integer. A failure while writing the output files is reported on
   * standard error and does not prevent the EM run.
   *
   * @param args command-line arguments, see the class documentation
   */
  public static void main(String[] args) {
    Options options = buildOptions();

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      // Without a parsed command line there is nothing to work with; previously execution
      // continued and dereferenced a null CommandLine.
      System.err.println("Error parsing command line: " + exp.getMessage());
      printUsageAndExit(options, args);
      return;
    }

    if (!cmdline.hasOption(OUTPUT) || cmdline.hasOption(HELP)) {
      printUsageAndExit(options, args);
      return;
    }

    int numComponents;
    int numPoints;
    try {
      numComponents = positiveIntOption(cmdline, COMPONENTS, DEFAULT_NUM_COMPONENTS);
      numPoints = positiveIntOption(cmdline, POINTS, DEFAULT_NUM_POINTS);
    } catch (IllegalArgumentException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      printUsageAndExit(options, args);
      return;
    }
    String output = cmdline.getOptionValue(OUTPUT);

    System.out.println(output);
    System.out.println("Number of points: " + numPoints);
    System.out.println("Number of components in mixture: " + numComponents);

    UnivariateGaussianMixtureModel sourceModel = randomSourceModel(numComponents);
    System.out.println("Initial mixture model:\n" + sourceModel + "\n");

    // Draw points from initial mixture model and compute the n clusters
    Point[] points = sourceModel.drawRandomPoints(numPoints);
    UnivariateGaussianMixtureModel learnedModel =
        initializeModel(points, numComponents, cmdline.hasOption(KMEANS));

    try {
      writeInitialState(output, points, numPoints, learnedModel, numComponents);
    } catch (IOException exp) {
      // Best effort: the local EM run below does not read these files, so keep going.
      System.err.println("Failed to write initial state under " + output + ": "
          + exp.getMessage());
      exp.printStackTrace();
    }

    System.out.println("** Ready to run EM **\n");
    System.out.println("Initial mixture model:\n" + learnedModel + "\n");
    learnedModel = ExpectationMaximization.run(points, learnedModel);
    System.out.println("Mixure model estimated using EM: \n" + learnedModel + "\n");
  }

  /** Builds the set of command-line options understood by this driver. */
  @SuppressWarnings({ "static-access" })
  private static Options buildOptions() {
    Options options = new Options();
    options.addOption(new Option(KMEANS, "initialize with k-means"));
    options.addOption(new Option(HELP, "display help options"));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("number of points to draw (default: " + DEFAULT_NUM_POINTS + ")")
        .create(POINTS));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("number of mixture components (default: " + DEFAULT_NUM_COMPONENTS + ")")
        .create(COMPONENTS));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("result path").create(OUTPUT));
    return options;
  }

  /** Echoes the raw arguments, prints the usage text and terminates the JVM with status -1. */
  private static void printUsageAndExit(Options options, String[] args) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(LocalClusteringDriver.class.getName(), options);
    System.exit(-1);
  }

  /**
   * Reads an integer option, falling back to {@code defaultValue} when it is absent.
   *
   * @throws IllegalArgumentException if the supplied value is not an integer or is less than 1
   */
  private static int positiveIntOption(CommandLine cmdline, String name, int defaultValue) {
    if (!cmdline.hasOption(name)) {
      return defaultValue;
    }
    String raw = cmdline.getOptionValue(name);
    int value;
    try {
      value = Integer.parseInt(raw);
    } catch (NumberFormatException exp) {
      throw new IllegalArgumentException(
          "option -" + name + " expects an integer but got '" + raw + "'", exp);
    }
    if (value < 1) {
      throw new IllegalArgumentException(
          "option -" + name + " must be a positive integer but got " + value);
    }
    return value;
  }

  /**
   * Creates a mixture model whose components have randomly chosen parameters and weights.
   * For each component the first parameter is an integer in [0, 100), the second a float in
   * [0, 3), and the raw weight an integer in [1, 10]; weights are normalized before returning.
   */
  private static UnivariateGaussianMixtureModel randomSourceModel(int numComponents) {
    UnivariateGaussianMixtureModel model = new UnivariateGaussianMixtureModel(numComponents);
    for (int i = 0; i < numComponents; i++) {
      PVector param = new PVector(2);
      param.array[0] = RANDOM.nextInt(100);
      param.array[1] = RANDOM.nextFloat() * 3;
      model.param[i] = param;
      model.weight[i] = RANDOM.nextInt(10) + 1;
    }
    model.normalizeWeights();
    return model;
  }

  /**
   * Produces the starting model for EM, either from k-means cluster means or directly from
   * the points and the requested number of components.
   */
  private static UnivariateGaussianMixtureModel initializeModel(Point[] points,
      int numComponents, boolean useKMeans) {
    if (!useKMeans) {
      return ExpectationMaximization.initialize(points, numComponents);
    }

    System.out.println("Running k-means to initialize clusters...");
    List<Point>[] clusters = KMeans.run(points, numComponents);

    double[] means = new double[numComponents];
    int cnt = 0;
    for (List<Point> cluster : clusters) {
      double sum = 0.0;
      for (Point p : cluster) {
        sum += p.value;
      }
      // NOTE(review): an empty cluster makes this 0.0 / 0, i.e. NaN; confirm whether
      // KMeans.run can return empty clusters.
      means[cnt] = sum / cluster.size();
      cnt++;
    }

    System.out.println("Cluster means: " + Arrays.toString(means) + "\n");
    return ExpectationMaximization.initialize(points, means);
  }

  /**
   * Recursively deletes {@code output} and then writes two text files beneath it:
   * {@code points} (one point value per line) and {@code cluster0} (one line per component:
   * index, weight and the two component parameters, separated by single spaces).
   *
   * @throws IOException if the file system cannot be obtained or a file cannot be written
   */
  private static void writeInitialState(String output, Point[] points, int numPoints,
      UnivariateGaussianMixtureModel model, int numComponents) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    fs.delete(new Path(output), true);

    FSDataOutputStream pointFile = fs.create(new Path(output + "/points"));
    try {
      for (int i = 0; i < numPoints; i++) {
        pointFile.write((Double.toString(points[i].value) + "\n").getBytes(OUTPUT_CHARSET));
      }
      pointFile.flush();
    } finally {
      // Close even when a write fails so the stream is not leaked.
      pointFile.close();
    }

    FSDataOutputStream clusterFile = fs.create(new Path(output + "/cluster0"));
    try {
      for (int i = 0; i < numComponents; i++) {
        String line = i + " " + Double.toString(model.weight[i]) + " "
            + model.param[i].array[0] + " " + model.param[i].array[1] + "\n";
        clusterFile.write(line.getBytes(OUTPUT_CHARSET));
      }
      clusterFile.flush();
    } finally {
      clusterFile.close();
    }
  }
}