FastKMeans.java example

Explorer
rapidminer-vega-master
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.clustering.clusterer;

import java.util.ArrayList;
import java.util.List;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorCapability;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.CentroidClusterModel;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.learner.CapabilityProvider;
import com.rapidminer.operator.ports.metadata.CapabilityPrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
import com.rapidminer.tools.math.similarity.numerical.EuclideanDistance;


/**
 * This operator represents an implementation of k-means. This operator will create a cluster attribute if not present
 * yet.
 * 
 * The implementation is according to paper of C. Elkan:
 * - Using the Triangle Inequality to Accelerate k-Means -
 * Proceedings of the Twentieth International Conference on Machine Learning (ICML-2003), Washington DC, 2003
 * 
 * @author Alexander Arimond
 */

public class FastKMeans extends RMAbstractClusterer implements CapabilityProvider{
	/** The parameter name for "the maximal number of clusters" */
	public static final String PARAMETER_K = "k";

	/**
	 * The parameter name for "the maximal number of runs of the k method with random initialization that are
	 * performed"
	 */
	public static final String PARAMETER_MAX_RUNS = "max_runs";

	/** The parameter name for "the maximal number of iterations performed for one run of the k method" */
	public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";

	public FastKMeans(OperatorDescription description) {
		super(description);
		
		getExampleSetInputPort().addPrecondition(new CapabilityPrecondition(this, getExampleSetInputPort()));
	}

	@Override
	public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
		int k = getParameterAsInt(PARAMETER_K);
		int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
		int maxRuns = getParameterAsInt(PARAMETER_MAX_RUNS);
		DistanceMeasure measure = new EuclideanDistance();
		measure.init(exampleSet);

		// checking and creating ids if necessary
		Tools.checkAndCreateIds(exampleSet);

		// additional checks
		Tools.onlyNonMissingValues(exampleSet, "KMeans");
		if (exampleSet.size() < k) {
			throw new UserError(this, 142, k);
		}

		// extracting attribute names
		Attributes attributes = exampleSet.getAttributes();
		ArrayList<String> attributeNames = new ArrayList<String>(attributes.size());
		for (Attribute attribute : attributes)
			attributeNames.add(attribute.getName());
		
		RandomGenerator generator = RandomGenerator.getRandomGenerator(this);
		double minimalIntraClusterDistance = Double.POSITIVE_INFINITY;
		CentroidClusterModel bestModel = null;
		int[] bestAssignments = null;

		for (int iter = 0; iter < maxRuns; iter++) {

			checkForStop();
			CentroidClusterModel model = new CentroidClusterModel(exampleSet, k, attributeNames, measure, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));
			
			// init centroids by assigning one single, unique example!
			int i = 0;
			for (Integer index : generator.nextIntSetWithRange(0, exampleSet.size(), k)) {
				model.assignExample(i, getAsDoubleArray(exampleSet.getExample(index), attributes));
				i++;
			}
			model.finishAssign();
			
			// auxiliary data structures according to paper
			final double [][] l = new double 	[exampleSet.size()][k];
			final double []   u = new double 	[exampleSet.size()];
			final boolean []  r = new boolean	[exampleSet.size()];
			
			final double [][] m_old = new double [k][attributes.size()]; // needed for step 4
			final double []	s = new double [k];
			
			final int[] centroidAssignments = new int[exampleSet.size()];

			final DistanceMatrix centroidDistances = new DistanceMatrix(k);
			computeClusterDistances(centroidDistances, s, model,  measure);

			// initialization step (has many distance calculations)
			int x = 0;
			for (Example example : exampleSet) {
				double[] exampleValues = getAsDoubleArray(example, attributes);
				double nearestDistance = measure.calculateDistance(model.getCentroidCoordinates(0), exampleValues);
				l[x][0] = nearestDistance;
				int nearestIndex = 0;
				for (int centroidIndex = 1; centroidIndex < k; centroidIndex++) {
					if (centroidDistances.get(nearestIndex, centroidIndex) >= 2 * nearestDistance) continue;
					final double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidIndex), exampleValues);
					l[x][centroidIndex] = distance;
					if (distance < nearestDistance) {
						nearestDistance = distance;
						nearestIndex = centroidIndex;
					}
				}
				centroidAssignments[x] = nearestIndex;
				u[x] = nearestDistance;
				r[x] = false;
				x++;
			}

			// optimization steps (repeat until convergence)
			boolean stable = false;
			for (int step = 0; (step < maxOptimizationSteps) && !stable; step++) {
				
				// step 1.
				computeClusterDistances(centroidDistances, s, model, measure);

				int avoidedSamples = 0;
				x = 0;
				for (Example example : exampleSet) {
					final double [] exampleValue = getAsDoubleArray(example, attributes);

					// step 2.
					if ( u[x] <= s[centroidAssignments[x]]){
						// nothing
						avoidedSamples ++;
					} else {
						// step 3.
						for (int c = 0; c < k; c ++){
							if (c != centroidAssignments[x]  // (i)
							    && u[x] > l[x][c] 			// (ii)
							    && u[x] > 0.5 * centroidDistances.get(centroidAssignments[x], c) // (iii)
							    ){
								//step 3a.
								final double d_x_c;   // d(x,c(x))
								if (r[x]){
									d_x_c = measure.calculateDistance(exampleValue, model.getCentroidCoordinates(centroidAssignments[x]));
									l[x][centroidAssignments[x]] = d_x_c;
									u[x] = d_x_c;
									r[x] = false;
								} else {
									d_x_c = u[x];
								}
								// step 3b.
								if (d_x_c > l[x][c] && d_x_c > 0.5 * centroidDistances.get(centroidAssignments[x], c)){
									final double d_x_c_new = measure.calculateDistance(exampleValue, model.getCentroidCoordinates(c)); // d(x,c)
									l[x][c] = d_x_c_new;
									if (d_x_c_new < d_x_c) {
										centroidAssignments[x] = c;
										u[x] = d_x_c_new;
									}
								}
							}
						}

					}
					model.assignExample(centroidAssignments[x], exampleValue);
					x++;
				}
				
				// step 4
				// first store old c
				for (int c=0; c <k ; c++){
					m_old[c] = model.getCentroidCoordinates(c);
				}
				// then compute the m(c) - here this is same as step 7
				stable = model.finishAssign();
				
				// compute all d(c,m(c))
				final double [] mean_distances = new double [k];
				for (int c = 0; c < k; c++){
					mean_distances [c] = measure.calculateDistance(m_old[c], model.getCentroidCoordinates(c));
				}
				
				// step 5 & 6
				for (x=0; x < exampleSet.size(); x++){
					// step 5
					for (int c = 0; c < k; c++){
						final double d = l[x][c] - mean_distances[c]; 
						if (d > 0)
							l[x][c] = d;
						else
							l[x][c] = 0;
					}
					// step 6
					u[x] = u[x] + mean_distances[centroidAssignments[x]];
					r[x] = true;
				}

			}
			// assessing quality of this model
			double distanceSum = 0;
			i = 0;
			for (Example example : exampleSet) {
				double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidAssignments[i]), getAsDoubleArray(example, attributes));
				distanceSum += distance * distance;
				i++;
			}
			if (distanceSum < minimalIntraClusterDistance) {
				bestModel = model;
				minimalIntraClusterDistance = distanceSum;
				bestAssignments = centroidAssignments;
			}
		}
		bestModel.setClusterAssignments(bestAssignments, exampleSet);

		if (addsClusterAttribute()) {
			Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
			exampleSet.getExampleTable().addAttribute(cluster);
			exampleSet.getAttributes().setCluster(cluster);
			int i = 0;
			for (Example example : exampleSet) {
				example.setValue(cluster, "cluster_" + bestAssignments[i]);
				i++;
			}
		}

		return bestModel;
	}

	// this is for step 1 of the paper algorithm
	private void computeClusterDistances(DistanceMatrix centroidDistances, double[] s, 
			CentroidClusterModel model,  DistanceMeasure measure) {
		for (int i = 0; i < model.getNumberOfClusters(); i++){
			s[i] = Double.POSITIVE_INFINITY;
		}
		for (int i = 0; i < model.getNumberOfClusters(); i++){
			for (int j = i+1; j < model.getNumberOfClusters(); j++){
				final double d = measure.calculateDistance(model.getCentroidCoordinates(i), model.getCentroidCoordinates(j));
				if (d < s[i]){
					s[i] = d;
				}
				if (d < s[j]){
					s[j] = d;
				}
				centroidDistances.set(i, j, d);
			}
		}
		for (int i = 0; i < model.getNumberOfClusters(); i++){
			s[i] = 0.5 * s[i];
		}
	}

	private double[] getAsDoubleArray(Example example, Attributes attributes) {
		double[] values = new double[attributes.size()];
		int i = 0;
		for (Attribute attribute : attributes) {
			values[i] = example.getValue(attribute);
			i++;
		}
		return values;
	}

	@Override
	public Class<? extends ClusterModel> getClusterModelClass() {
		return CentroidClusterModel.class;
	}

	@Override
	public boolean supportsCapability(OperatorCapability capability) {
		switch (capability) {
		case BINOMINAL_ATTRIBUTES:
		case POLYNOMINAL_ATTRIBUTES:
			return false;
		default:
			return true;
		}
	}

	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be detected.", 2, Integer.MAX_VALUE, 2, false));
		types.add(new ParameterTypeInt(PARAMETER_MAX_RUNS, "The maximal number of runs of k-Means with random initialization that are performed.", 1, Integer.MAX_VALUE, 10, false));
		types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS, "The maximal number of iterations performed for one run of k-Means.", 1, Integer.MAX_VALUE, 100, false));
		types.addAll(RandomGenerator.getRandomGeneratorParameters(this));
		return types;
	}
}