// KernelKMeans.java example
//
// Explorer
// rapidminer-vega-master
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.clustering.clusterer;

import java.util.ArrayList;
import java.util.List;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.kernels.Kernel;


/**
 * This operator is an implementation of kernel k means. Kernel K Means uses kernels to estimate distance between 
 * objects and clusters. Because of the nature of kernels it is necessary to sum over all elements of a cluster to 
 * calculate one distance. So this algorithm is quadratic in number of examples and returns NO CentroidClusterModel,
 * as its older brother KMeans does.
 * This operator will create a cluster attribute if not present yet.
 * 
 * @author Sebastian Land
 */
public class KernelKMeans extends RMAbstractClusterer {

	/** The parameter name for "the maximal number of clusters" */
	public static final String PARAMETER_K = "k";

	/** The parameter name for "the decision if exampleweights should be used " */
	public static final String PARAMETER_USE_WEIGHTS = "use_weights";

	/** The parameter name for "the maximal number of iterations performed for one run of the k method" */
	public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";

	public KernelKMeans(OperatorDescription description) {
		super(description);
	}

	@Override
	public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {		
		int k = getParameterAsInt(PARAMETER_K);
		int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
		boolean useExampleWeights = getParameterAsBoolean(PARAMETER_USE_WEIGHTS);
		Kernel kernel = Kernel.createKernel(this);

		// checking and creating ids if necessary
		Tools.checkAndCreateIds(exampleSet);

		// additional checks
		Tools.onlyNonMissingValues(exampleSet, "KernelKMeans");

		if (exampleSet.size() < k) {
			throw new UserError(this, 142, k);
		}


		// extracting attribute names
		Attributes attributes = exampleSet.getAttributes();
		ArrayList<String> attributeNames = new ArrayList<String>(attributes.size());
		for (Attribute attribute: attributes)
			attributeNames.add(attribute.getName());
		Attribute weightAttribute = attributes.getWeight();

		RandomGenerator generator = RandomGenerator.getRandomGenerator(this);

		ClusterModel model = new ClusterModel(exampleSet, k, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));
		// init centroids
		int[] clusterAssignments = new int[exampleSet.size()];

		for (int i = 0; i < exampleSet.size(); i++) {
			clusterAssignments[i] = generator.nextIntInRange(0, k);
		}

		// run optimization steps
		boolean stable = false;
		for (int step = 0; (step < maxOptimizationSteps) && !stable; step++) {
			// checking for stop
			checkForStop();

			// calculating cluster kernel properties
			double[] clusterWeights = new double[k];
			double[] clusterKernelCorrection = new double[k];
			int i = 0;
			for (Example firstExample: exampleSet) {
				double firstExampleWeight = (useExampleWeights)? firstExample.getValue(weightAttribute) : 1d;
				double[] firstExampleValues = getAsDoubleArray(firstExample, attributes);
				clusterWeights[clusterAssignments[i]] += firstExampleWeight;
				int j = 0;
				for (Example secondExample: exampleSet) {
					if (clusterAssignments[i] == clusterAssignments[j]) {
						double secondExampleWeight =  (useExampleWeights)? secondExample.getValue(weightAttribute) : 1d;
						clusterKernelCorrection[clusterAssignments[i]] += firstExampleWeight * secondExampleWeight * kernel.calculateDistance(firstExampleValues, getAsDoubleArray(secondExample, attributes));
					}
					j++;	
				}
				i++;
			}
			for (int z = 0; z < k; z++) {
				clusterKernelCorrection[z] /= clusterWeights[z] * clusterWeights[z];
			}

			// assign examples to new centroids
			int[] newClusterAssignments = new int[exampleSet.size()];
			i = 0;
			for (Example example: exampleSet) {
				double[] exampleValues = getAsDoubleArray(example, attributes);
				double exampleKernelValue = kernel.calculateDistance(exampleValues, exampleValues);
				double nearestDistance = Double.POSITIVE_INFINITY;
				int nearestIndex = 0;
				for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
					double distance = 0; 
					// iterating over all examples in cluster to get kernel distance
					int j = 0;
					for (Example clusterExample: exampleSet) {
						if (clusterAssignments[j] == clusterIndex) {
							distance += ((useExampleWeights)? clusterExample.getValue(weightAttribute) : 1d) * kernel.calculateDistance(getAsDoubleArray(clusterExample, attributes), exampleValues);
						}
						j++;
					}
					distance *= (-2d) / clusterWeights[clusterIndex];
					// copy in outer loop
					distance += exampleKernelValue;
					distance += clusterKernelCorrection[clusterIndex];
					if (distance < nearestDistance) {
						nearestDistance = distance;
						nearestIndex = clusterIndex;
					}
				}
				newClusterAssignments[i] = nearestIndex;
				i++;
			}

			// finishing assignment
			stable = true;
			for (int j = 0; j < exampleSet.size() && stable; j++)
				stable &= newClusterAssignments[j] == clusterAssignments[j];
			clusterAssignments = newClusterAssignments;
		}

		// setting last clustering into model
		model.setClusterAssignments(clusterAssignments, exampleSet);

		if (addsClusterAttribute()) {
			Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
			exampleSet.getExampleTable().addAttribute(cluster);
			exampleSet.getAttributes().setCluster(cluster);
			int i = 0;
			for (Example example: exampleSet) {
				example.setValue(cluster, "cluster_" + clusterAssignments[i]);
				i++;
			}
		}
		return model;
	}

	private double[] getAsDoubleArray(Example example, Attributes attributes) {
		double[] values = new double[attributes.size()];
		int i = 0;
		for (Attribute attribute: attributes) {
			values[i] = example.getValue(attribute);
			i++;
		}
		return values;
	}

	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();	
		types.add(new ParameterTypeBoolean(PARAMETER_USE_WEIGHTS, "Indicates if the weight attribute should be used.", false, false));
		types.add(new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be detected.", 2, Integer.MAX_VALUE, 2, false));
		types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS, "The maximal number of iterations performed for one run of k-Means.", 1, Integer.MAX_VALUE, 100, false));

		types.addAll(RandomGenerator.getRandomGeneratorParameters(this));

		types.addAll(Kernel.getParameters(this));
		return types;
	}
}