package mia.clustering.ch09; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.mahout.clustering.kmeans.Cluster; import org.apache.mahout.clustering.kmeans.KMeansClusterer; import org.apache.mahout.common.distance.CosineDistanceMeasure; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; public class KMeansClustering { public static void main(String args[]) throws Exception { String inputDir = "reuters"; int k = 25; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); String vectorsFolder = inputDir + "/tfidf-vectors"; SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(vectorsFolder + "/part-r-00000"), conf); List<Vector> points = new ArrayList<Vector>(); Text key = new Text(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { points.add(value.get()); } System.out.println(points.size()); reader.close(); List<Vector> randomPoints = RandomPointsUtil.chooseRandomPoints(points, k); List<Cluster> clusters = new ArrayList<Cluster>(); System.out.println(randomPoints.size()); int clusterId = 0; for (Vector v : randomPoints) { clusters.add(new Cluster(v, clusterId++, new CosineDistanceMeasure())); } List<List<Cluster>> finalClusters = KMeansClusterer.clusterPoints(points, clusters, new CosineDistanceMeasure(), 10, 0.01); for (Cluster cluster : finalClusters.get(finalClusters.size() - 1)) { System.out.println("Cluster id: " + cluster.getId() + " center: " + cluster.getCenter().asFormatString()); } } }