package mia.clustering.ch10;

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.CosineDistanceMeasure;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class MyDistanceNewsClustering {

  public static void main(String[] args) throws Exception {
    // Vectorization parameters.
    int minSupport = 5;      // minimum term frequency across the corpus
    int minDf = 5;           // minimum document frequency
    int maxDFPercent = 99;   // prune terms occurring in more than 99% of documents
    int maxNGramSize = 1;    // unigrams only
    int minLLRValue = 50;    // log-likelihood threshold; only relevant when maxNGramSize > 1
    int reduceTasks = 1;
    int chunkSize = 200;     // dictionary chunk size, in megabytes
    int norm = -1;           // -1 disables normalization of the TF-IDF vectors
    boolean sequentialAccessOutput = true;

    String inputDir = "inputDir";
    File inputDirFile = new File(inputDir);
    if (!inputDirFile.exists()) {
      inputDirFile.mkdir();
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    /*
     * inputDir must contain a SequenceFile of (document ID, document text)
     * pairs before this job runs, written along these lines:
     *
     * SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
     *     new Path(inputDir, "documents.seq"), Text.class, Text.class);
     * for (Document d : Database) {
     *   writer.append(new Text(d.getID()), new Text(d.contents()));
     * }
     * writer.close();
     */

    String outputDir = "myDistanceNewsClusters";
    HadoopUtil.delete(conf, new Path(outputDir));

    // Tokenize the raw documents with the default Lucene analyzer.
    Path tokenizedPath = new Path(outputDir,
        DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    DefaultAnalyzer analyzer = new DefaultAnalyzer();
    DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer
        .getClass().asSubclass(Analyzer.class), tokenizedPath, conf);

    // Build term-frequency vectors from the tokenized documents.
    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
        new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue,
        2, true, reduceTasks, chunkSize, sequentialAccessOutput, false);

    // Weight the term-frequency vectors by TF-IDF.
    TFIDFConverter.processTfIdf(
        new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
        new Path(outputDir), conf, chunkSize, minDf, maxDFPercent, norm,
        true, sequentialAccessOutput, false, reduceTasks);

    Path vectorsFolder = new Path(outputDir, "tfidf-vectors");
    Path centroids = new Path(outputDir, "centroids");
    Path clusterOutput = new Path(outputDir, "clusters");

    // Choose 20 random documents as the initial cluster centers. Note that
    // the seed generator is handed a CosineDistanceMeasure here, while the
    // k-means run below uses the custom MyDistanceMeasure.
    RandomSeedGenerator.buildRandom(conf, vectorsFolder, centroids, 20,
        new CosineDistanceMeasure());

    // Run k-means with the custom distance measure (a sketch of which
    // appears at the end of this file): convergence delta 0.01, at most
    // 20 iterations, emit the clustered points (true), run as MapReduce
    // rather than sequentially (false).
    KMeansDriver.run(conf, vectorsFolder, centroids, clusterOutput,
        new MyDistanceMeasure(), 0.01, 20, true, false);

    // Read the clustered points back and print each one with its cluster ID.
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
        clusterOutput, Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf);
    IntWritable key = new IntWritable();
    WeightedVectorWritable value = new WeightedVectorWritable();
    while (reader.next(key, value)) {
      System.out.println(value + " belongs to cluster " + key);
    }
    reader.close();
  }
}
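
/*
 * MyDistanceMeasure is referenced above but not defined in this file; in the
 * book's sample code it presumably lives elsewhere in the mia.clustering.ch10
 * package. The class below is only a minimal sketch of what such a custom
 * measure could look like, not the book's actual implementation: it damps
 * Euclidean distance with log1p, which preserves the metric properties
 * (log1p is increasing, concave, and zero at zero) while shrinking the
 * influence of far-away points. Extending the concrete
 * EuclideanDistanceMeasure reuses Mahout's Parametered plumbing instead of
 * implementing DistanceMeasure from scratch; fully qualified names keep the
 * sketch self-contained. Remove this class if the real MyDistanceMeasure is
 * already on the classpath.
 */
class MyDistanceMeasure
    extends org.apache.mahout.common.distance.EuclideanDistanceMeasure {

  @Override
  public double distance(org.apache.mahout.math.Vector v1,
                         org.apache.mahout.math.Vector v2) {
    // Hypothetical example: log-damped Euclidean distance.
    return Math.log1p(super.distance(v1, v2));
  }

  @Override
  public double distance(double centroidLengthSquare,
                         org.apache.mahout.math.Vector centroid,
                         org.apache.mahout.math.Vector v) {
    // Keep the optimized centroid overload consistent with the pairwise one.
    return Math.log1p(super.distance(centroidLengthSquare, centroid, v));
  }
}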