/*
 * Kmeans.java example
 *
 * Explorer
 * fudannlp-master
 */
package edu.fudan.ml.cluster;

import java.util.ArrayList;
import java.util.Iterator;

import edu.fudan.ml.types.Instance;
import edu.fudan.ml.types.sv.HashSparseVector;

/**
 * Batch k-means clustering of {@link Instance}s whose data are
 * {@link HashSparseVector}s.
 *
 * <p>Usage: construct with the desired number of clusters, call
 * {@link #cluster(ArrayList)}, then read the resulting {@link #centroids}.
 * Progress is reported on {@code System.out}.
 */
public class Kmeans {
	/** Number of clusters requested at construction time. */
	int k;
	/** A batch pass is rejected when it lowers the summed quality by less than this amount. */
	private static final double TOL = 0.0;
	/** Centroid of each accepted cluster; filled in by {@link #cluster(ArrayList)}. */
	public HashSparseVector[] centroids = null;
	/** Candidate centroids produced by the most recent batch pass. */
	private HashSparseVector[] newCentroids = null;
	/** Members of each accepted cluster. */
	private ArrayList<Instance>[] assignedClusters = null;
	/** Candidate cluster memberships produced by the most recent batch pass. */
	private ArrayList<Instance>[] newClusters = null;
	/** Summed member-to-centroid distance of each accepted cluster (lower is tighter). */
	private float[] clusterQualities = null;
	/** Summed member-to-centroid distance of each candidate cluster. */
	private float[] newQualities = null;
	/**
	 * NOTE(review): this field is declared but never consulted by this class;
	 * clustering runs until a batch pass stops improving the partition.
	 * Confirm whether an iteration cap was intended before relying on it.
	 */
	int maxIterations = 10;

	/**
	 * Creates a new instance of Kmeans.
	 *
	 * @param k number of clusters; must be positive
	 * @throws IllegalArgumentException if {@code k} is zero or negative
	 */
	@SuppressWarnings("unchecked")    // Java cannot create generic arrays; only ArrayList<Instance> is ever stored in them
	public Kmeans (int k) {
		if (k <= 0) {
			// Fail here rather than later with a modulo-by-zero or a negative array size.
			throw new IllegalArgumentException("k must be positive, got " + k);
		}
		this.k = k;
		this.centroids = new HashSparseVector[k];
		this.assignedClusters = new ArrayList[k];
		this.clusterQualities = new float[k];
		this.newCentroids = new HashSparseVector[k];
		this.newClusters = new ArrayList[k];
		this.newQualities = new float[k];
	}

	/**
	 * Computes the centroid (component-wise mean) of a cluster.
	 *
	 * @param insts members of the cluster; each instance's data must be a
	 *              {@link HashSparseVector}
	 *
	 * @return the mean of the member vectors, or a freshly constructed (empty)
	 *         vector when the cluster has no members
	 */
	private HashSparseVector calculateCentroid (ArrayList<Instance> insts) {
		HashSparseVector centroid = new HashSparseVector();

		for (Instance inst : insts) {
			centroid.plus((HashSparseVector) inst.getData());
		}

		// An empty cluster has nothing to average; skip the division so the
		// centroid is never divided by a zero member count.
		if (!insts.isEmpty()) {
			centroid.scaleDivide(insts.size());
		}

		return centroid;
	}

	/**
	 * Within-cluster scatter: the sum of {@code distanceEuclidean} between the
	 * centroid and every member of the cluster.
	 *
	 * @param docs     members of the cluster
	 * @param centroid centroid to measure against
	 *
	 * @return the summed distance; {@code 0} for an empty cluster
	 */
	private float calculateClusterQuality (ArrayList<Instance> docs,
			HashSparseVector centroid) {
		float quality = 0.0f;

		for (Instance doc : docs) {
			quality += centroid.distanceEuclidean((HashSparseVector) doc.getData());
		}

		return quality;
	}

	/**
	 * Total scatter of a partition: the sum of
	 * {@link #calculateClusterQuality} over all clusters.
	 *
	 * @param docs     members of each cluster
	 * @param centroid centroid of each cluster, index-aligned with {@code docs}
	 *
	 * @return the summed quality of all clusters
	 */
	private double calculatePartitionQuality (ArrayList<Instance>[] docs,
			HashSparseVector[] centroid) {
		double quality = 0.0;

		for (int i = 0; i < docs.length; ++i) {
			quality += this.calculateClusterQuality(docs[i], centroid[i]);
		}
		return quality;
	}

	/**
	 * Clusters the given instances into {@code k} groups.
	 *
	 * <p>Instances are first dealt round-robin into the clusters (instance
	 * {@code i} goes to cluster {@code i % k}). Batch passes are then repeated,
	 * each one accepted only if it moved at least one instance and did not
	 * worsen the summed quality. The first rejected pass ends the run; the
	 * last accepted state stays in {@link #centroids}.
	 *
	 * @param insts instances to cluster; each instance's data must be a
	 *              {@link HashSparseVector}
	 * @throws NullPointerException if {@code insts} is {@code null}
	 */
	public void cluster (ArrayList<Instance> insts) {
		if (insts == null) {
			// Reject before touching any internal state.
			throw new NullPointerException("insts must not be null");
		}

		System.out.println("Initial centers");
		for (int i = 0; i < k; i++) {
			assignedClusters[i] = new ArrayList<Instance>();
		}
		for (int i = 0; i < insts.size(); i++) {
			assignedClusters[i % k].add(insts.get(i));
		}
		for (int i = 0; i < k; i++) {
			centroids[i] = calculateCentroid(assignedClusters[i]);
			clusterQualities[i] = calculateClusterQuality(assignedClusters[i], centroids[i]);
		}

		// A rejected pass leaves assignedClusters/centroids untouched, so
		// running another pass afterwards would reproduce the same rejected
		// candidate. One loop that stops at the first rejection is sufficient.
		while (true) {

			int numReassigned = doBatchKmeans();

			System.out.println("After an iteration of Batch K-Means, " +
					numReassigned + " documents were moved.");

			double qualityDelta = sum(this.clusterQualities) - sum(this.newQualities);

			System.out.println("Change in quality is: " + qualityDelta);

			if (qualityDelta < TOL) {
				System.out.println(
						"Benefit of change is below tolerance... Switching to incremental...\n");

				break;
			}

			if (numReassigned == 0) {
				System.out.println(
						"Batch K-Means has made no changes! Switching to incremental...\n");

				break;
			}

			// We like the new results. Let's make them authoritative
			for (int kk = 0; kk < this.assignedClusters.length; ++kk) {
				this.assignedClusters[kk] = this.newClusters[kk];
				this.centroids[kk] = this.newCentroids[kk];
				this.clusterQualities[kk] = this.newQualities[kk];
			}
		}

		System.out.println("Quality of partition generated by Batch K-Means: " +
				sum(this.clusterQualities));

		System.out.println("Batch K-Means Complete!\n");

	}

	/**
	 * Adds up per-cluster qualities in double precision.
	 *
	 * @param values per-cluster quality values
	 *
	 * @return the sum of all entries
	 */
	private static double sum (float[] values) {
		double total = 0.0;

		for (float value : values) {
			total += value;
		}
		return total;
	}

	/**
	 * Performs one iteration of batch k-means. Returns the number of documents that
	 * were moved during this iteration. This method also fills newClusters[],
	 * newCentroids[] and newQualities[] with the candidate partition. It's up to the
	 * caller to copy these over the current assignedClusters[], centroids[] and
	 * clusterQualities[] arrays if desired. The current clusters and centroids must
	 * already have been initialised, which {@link #cluster(ArrayList)} does before
	 * the first call.
	 *
	 * @return the number of documents whose nearest centroid was not the
	 *         centroid of their current cluster
	 */
	private int doBatchKmeans () {

		System.out.println("\nBegining a new iteration of K-Means...");

		int numReassigned = 0;

		/* Start from empty candidate clusters, centroids and qualities */

		for (int i = 0; i < this.centroids.length; ++i) {
			this.newClusters[i] = new ArrayList<Instance>();
			this.newCentroids[i] = new HashSparseVector();
			this.newQualities[i] = 0.0f;
		}

		for (int clusterNum = 0; clusterNum < this.centroids.length; ++clusterNum) {    // iterate over clusters
			for (Instance doc : this.assignedClusters[clusterNum]) {    // iterate over docs
				HashSparseVector docVec = (HashSparseVector) doc.getData();

				// Assume the document is already in the best cluster; another
				// cluster wins only when it is strictly closer.
				int bestClusterNum = clusterNum;
				double bestDistance = this.centroids[clusterNum].distanceEuclidean(docVec);

				for (int i = 0; i < this.centroids.length; ++i) {
					if (i == clusterNum) {
						continue;    // already measured; it can never beat itself
					}

					double distance = this.centroids[i].distanceEuclidean(docVec);

					if (distance < bestDistance) {
						bestDistance = distance;
						bestClusterNum = i;
					}
				}

				if (bestClusterNum != clusterNum) {    // we moved a document!
					++numReassigned;
				}

				this.newClusters[bestClusterNum].add(doc);
				this.newCentroids[bestClusterNum].plus(docVec);    // running sum, averaged below
			}
		}

		// Calculate the centroids of the clusters
		for (int i = 0; i < newClusters.length; ++i) {
			// A cluster that received no documents keeps its empty sum; dividing
			// by a zero member count is skipped.
			if (!this.newClusters[i].isEmpty()) {
				this.newCentroids[i].scaleDivide(this.newClusters[i].size());
			}

			this.newQualities[i] = this.calculateClusterQuality(this.newClusters[i],
					this.newCentroids[i]);

			System.out.println("new cluster " + i + " Viarances: " +
					this.newQualities[i] + " Num: "+ newClusters[i].size());
		}

		return (numReassigned);
	}

}