package edu.fudan.ml.cluster; import java.util.ArrayList; import java.util.Iterator; import edu.fudan.ml.types.Instance; import edu.fudan.ml.types.sv.HashSparseVector; public class Kmeans { int k; private final double TOL = 0.0; public HashSparseVector[] centroids = null; private HashSparseVector[] newCentroids = null; private ArrayList<Instance>[] assignedClusters = null; private ArrayList<Instance>[] newClusters = null; private float[] clusterQualities = null; private float[] newQualities = null; int maxIterations = 10; /** * Creates a new instance of Kmeans * * @param k */ public Kmeans (int k) { this.k = k; this.centroids = new HashSparseVector[k]; this.assignedClusters = new ArrayList[k]; this.clusterQualities = new float[k]; this.newCentroids = new HashSparseVector[k]; this.newClusters = new ArrayList[k]; this.newQualities = new float[k]; } /** * 计算类中心 * * @param insts * * @return */ private HashSparseVector calculateCentroid (ArrayList<Instance> insts) { HashSparseVector centroid = new HashSparseVector(); Iterator i = insts.iterator(); while (i.hasNext()) { Instance d = (Instance) i.next(); centroid.plus((HashSparseVector) d.getData()); } centroid.scaleDivide(insts.size()); return centroid; } /** * 类内方差 * * @param docs * @param centroid * * @return */ private float calculateClusterQuality (ArrayList<Instance> docs, HashSparseVector centroid) { float quality = 0.0f; HashSparseVector c = centroid; for (int i = 0; i < docs.size(); ++i) { Instance doc = docs.get(i); quality += c.distanceEuclidean((HashSparseVector) doc.getData()); } return quality; } /** * 总体方差 * * @param docs * @param centroid * * @return */ private double calculatePartitionQuality (ArrayList<Instance>[] docs, HashSparseVector[] centroid) { double quality = 0.0; for (int i = 0; i < docs.length; ++i) { quality += this.calculateClusterQuality(docs[i], centroid[i]); } return quality; } /** * 聚类 * @param insts */ public void cluster (ArrayList<Instance> insts) { System.out.println("Initial centers"); for(int i=0;i<k;i++){ assignedClusters[i] = new ArrayList<Instance>(); } for(int i=0;i<insts.size();i++){ assignedClusters[i%k].add(insts.get(i)); } for(int i=0;i<k;i++){ centroids[i] = calculateCentroid(assignedClusters[i]); clusterQualities[i] = calculateClusterQuality(assignedClusters[i], centroids[i]); } for (int numChanged = 0, itr = 0; (numChanged > 0) || (itr == 0); ++itr) { numChanged = 0; while (true) { int numReassigned = doBatchKmeans(); System.out.println("After an iteration of Batch K-Means, " + numReassigned + " documents were moved."); double oldQuality = 0.0; double newQuality = 0.0; for (int b = 0; b < this.centroids.length; ++b) { oldQuality += this.clusterQualities[b]; newQuality += this.newQualities[b]; } double qualityDelta = oldQuality - newQuality; System.out.println("Change in quality is: " + qualityDelta); if (qualityDelta < this.TOL) { System.out.println( "Benefit of change is below tolerance... Switching to incremental...\n"); break; } if (numReassigned == 0) { System.out.println( "Batch K-Means has made no changes! Switching to incremental...\n"); break; } // We like the new results. Let's make them authoritative for (int kk = 0; kk < this.assignedClusters.length; ++kk) { this.assignedClusters[kk] = this.newClusters[kk]; this.centroids[kk] = this.newCentroids[kk]; this.clusterQualities[kk] = this.newQualities[kk]; } numChanged = numReassigned; // Record the fact we made a change! } double qual = 0.0; for (int i = 0; i < this.clusterQualities.length; ++i) { qual += this.clusterQualities[i]; } System.out.println("Quality of partition generated by Batch K-Means: " + qual); } System.out.println("Batch K-Means Complete!\n"); } /** * Performs one iteration of batch k-means. Returns the number of documents that * were moved during this iteration. This method also updates the global variables * newClusters[] and newCentroids[] to the values. It's up to the caller to copy these * over the current assignedClusters[] and centroids[] arrays if desired. Initial centroids of * each initial cluster must be built in the constructor. * * @return */ private int doBatchKmeans () { System.out.println("\nBegining a new iteration of K-Means..."); int numReassigned = 0; /* Clear records for incremental k-means */ for (int i = 0; i < this.centroids.length; ++i) { this.newClusters[i] = new ArrayList<Instance>(); this.newCentroids[i] = new HashSparseVector(); this.newQualities[i] = 0.0f; } for (int clusterNum = 0; clusterNum < this.centroids.length; ++clusterNum) { // iterate over clusters for (int docNum = 0; docNum < this.assignedClusters[clusterNum].size(); ++docNum) { // iterate over docs /* * Store the document the loops have selected in the 'doc' variable. * Store is vector in the 'docVec' variable for easy access. */ Instance doc = this.assignedClusters[clusterNum].get(docNum); HashSparseVector docVec = (HashSparseVector) doc.getData(); int bestClusterNum = clusterNum; // Assume we are already in the best cluster. double distanceToCurrentCentroid = this.centroids[clusterNum].distanceEuclidean(docVec); double squareDistanceOfBestCluster = distanceToCurrentCentroid; for (int i = 0; i < this.centroids.length; ++i) { double distance = 0.0; // see which centroid is closest to docVec if (clusterNum == i) { // We know the distance in its' current cluster. distance = distanceToCurrentCentroid; } else { distance = this.centroids[i].distanceEuclidean(docVec); } if (distance < squareDistanceOfBestCluster) { squareDistanceOfBestCluster = distance; bestClusterNum = i; } } if (bestClusterNum != clusterNum) { // we moved a document! ++numReassigned; } this.newClusters[bestClusterNum].add(doc); this.newCentroids[bestClusterNum].plus(docVec); } } // Calculate the centroids of the clusters for (int i = 0; i < newClusters.length; ++i) { this.newCentroids[i].scaleDivide(this.newClusters[i].size()); this.newQualities[i] = this.calculateClusterQuality(this.newClusters[i], this.newCentroids[i]); System.out.println("new cluster " + i + " Viarances: " + this.newQualities[i] + " Num: "+ newClusters[i].size()); } return (numReassigned); } }