/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.clustering.clusterer;
import java.util.ArrayList;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.kernels.Kernel;
/**
* This operator is an implementation of kernel k-means. Kernel k-means uses kernels to estimate the distance between
* objects and clusters. Because of the nature of kernels it is necessary to sum over all elements of a cluster to
* calculate one distance. Hence this algorithm is quadratic in the number of examples and, unlike its older brother
* KMeans, it does NOT return a CentroidClusterModel.
* This operator will create a cluster attribute if not present yet.
*
* @author Sebastian Land
*/
public class KernelKMeans extends RMAbstractClusterer {

    /** The parameter name for "the number of clusters which should be detected" */
    public static final String PARAMETER_K = "k";

    /** The parameter name for "the decision if example weights should be used" */
    public static final String PARAMETER_USE_WEIGHTS = "use_weights";

    /** The parameter name for "the maximal number of iterations performed for one run of the k method" */
    public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";

    public KernelKMeans(OperatorDescription description) {
        super(description);
    }

    /**
     * Clusters the given example set with kernel k-means.
     * <p>
     * Every example starts in a randomly drawn cluster. In each optimization step the kernel
     * distance of every example to every cluster is evaluated as
     * <code>k(x,x) - 2 * sum_j(w_j * k(x_j,x)) / W_c + sum_ij(w_i * w_j * k(x_i,x_j)) / W_c^2</code>,
     * where the sums run over the members of cluster <code>c</code> and <code>W_c</code> is the
     * summed weight of that cluster. Each example is then moved to its nearest cluster. The
     * iteration ends when no assignment changes or when the maximal number of steps is reached.
     * <p>
     * If the use of weights is requested but the example set carries no weight attribute, every
     * example is treated as having weight 1.
     *
     * @param exampleSet the examples to cluster
     * @return a cluster model holding the final assignment of examples to clusters
     * @throws OperatorException a {@link UserError} with code 142 is raised if the example set
     *             contains fewer examples than clusters were requested; the invoked checks and
     *             parameter look-ups may raise further errors
     */
    @Override
    public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
        int k = getParameterAsInt(PARAMETER_K);
        int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
        boolean weightsRequested = getParameterAsBoolean(PARAMETER_USE_WEIGHTS);
        Kernel kernel = Kernel.createKernel(this);

        // checking and creating ids if necessary
        Tools.checkAndCreateIds(exampleSet);

        // additional checks
        Tools.onlyNonMissingValues(exampleSet, "KernelKMeans");
        if (exampleSet.size() < k) {
            throw new UserError(this, 142, k);
        }

        Attributes attributes = exampleSet.getAttributes();
        Attribute weightAttribute = attributes.getWeight();
        // Weights can only be used if a weight attribute exists; reading values through a
        // missing attribute would fail, so fall back to unit weights in that case.
        boolean useExampleWeights = weightsRequested && (weightAttribute != null);

        RandomGenerator generator = RandomGenerator.getRandomGenerator(this);
        ClusterModel model = new ClusterModel(exampleSet, k, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));

        // init: every example starts in a randomly chosen cluster
        int numberOfExamples = exampleSet.size();
        int[] clusterAssignments = new int[numberOfExamples];
        for (int i = 0; i < numberOfExamples; i++) {
            clusterAssignments[i] = generator.nextIntInRange(0, k);
        }

        // Values, weights and self kernel values do not change between optimization steps, so
        // they are extracted once instead of once per pair of examples in every step.
        double[][] exampleValues = new double[numberOfExamples][];
        double[] exampleWeights = new double[numberOfExamples];
        double[] selfKernelValues = new double[numberOfExamples];
        int exampleIndex = 0;
        for (Example example : exampleSet) {
            double[] values = getAsDoubleArray(example, attributes);
            exampleValues[exampleIndex] = values;
            exampleWeights[exampleIndex] = useExampleWeights ? example.getValue(weightAttribute) : 1d;
            selfKernelValues[exampleIndex] = kernel.calculateDistance(values, values);
            exampleIndex++;
        }

        // run optimization steps
        boolean stable = false;
        for (int step = 0; (step < maxOptimizationSteps) && !stable; step++) {
            // checking for stop
            checkForStop();

            // calculating cluster kernel properties
            int[] clusterSizes = new int[k];
            double[] clusterWeights = new double[k];
            double[] clusterKernelCorrection = new double[k];
            for (int i = 0; i < numberOfExamples; i++) {
                int cluster = clusterAssignments[i];
                clusterSizes[cluster]++;
                clusterWeights[cluster] += exampleWeights[i];
                for (int j = 0; j < numberOfExamples; j++) {
                    if (clusterAssignments[j] == cluster) {
                        clusterKernelCorrection[cluster] += exampleWeights[i] * exampleWeights[j] * kernel.calculateDistance(exampleValues[i], exampleValues[j]);
                    }
                }
            }
            for (int cluster = 0; cluster < k; cluster++) {
                // an empty cluster has no correction term; it is skipped during assignment
                if (clusterSizes[cluster] > 0) {
                    clusterKernelCorrection[cluster] /= clusterWeights[cluster] * clusterWeights[cluster];
                }
            }

            // assign examples to their nearest cluster
            int[] newClusterAssignments = new int[numberOfExamples];
            for (int i = 0; i < numberOfExamples; i++) {
                // One pass over all examples collects the weighted kernel sums of all clusters
                // at once. The summation order inside each cluster is the iteration order of
                // the examples.
                double[] weightedKernelSums = new double[k];
                for (int j = 0; j < numberOfExamples; j++) {
                    weightedKernelSums[clusterAssignments[j]] += exampleWeights[j] * kernel.calculateDistance(exampleValues[j], exampleValues[i]);
                }

                double nearestDistance = Double.POSITIVE_INFINITY;
                int nearestIndex = 0;
                for (int cluster = 0; cluster < k; cluster++) {
                    if (clusterSizes[cluster] == 0) {
                        // no members, hence no defined distance: never a candidate
                        continue;
                    }
                    double distance = weightedKernelSums[cluster] * ((-2d) / clusterWeights[cluster]);
                    distance += selfKernelValues[i];
                    distance += clusterKernelCorrection[cluster];
                    if (distance < nearestDistance) {
                        nearestDistance = distance;
                        nearestIndex = cluster;
                    }
                }
                newClusterAssignments[i] = nearestIndex;
            }

            // finishing assignment: stable if no example changed its cluster
            stable = true;
            for (int j = 0; j < numberOfExamples && stable; j++) {
                stable = newClusterAssignments[j] == clusterAssignments[j];
            }
            clusterAssignments = newClusterAssignments;
        }

        // setting last clustering into model
        model.setClusterAssignments(clusterAssignments, exampleSet);

        if (addsClusterAttribute()) {
            Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
            exampleSet.getExampleTable().addAttribute(cluster);
            exampleSet.getAttributes().setCluster(cluster);
            int i = 0;
            for (Example example : exampleSet) {
                example.setValue(cluster, "cluster_" + clusterAssignments[i]);
                i++;
            }
        }
        return model;
    }

    /**
     * Copies the values of the given attributes of one example into a new array.
     *
     * @param example the example to read from
     * @param attributes the attributes to read, in their iteration order
     * @return a new array with one value per attribute
     */
    private double[] getAsDoubleArray(Example example, Attributes attributes) {
        double[] values = new double[attributes.size()];
        int i = 0;
        for (Attribute attribute : attributes) {
            values[i] = example.getValue(attribute);
            i++;
        }
        return values;
    }

    /**
     * Extends the parameters of the super class by the weight switch, the number of clusters,
     * the maximal number of optimization steps, and the random generator and kernel parameters.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeBoolean(PARAMETER_USE_WEIGHTS, "Indicates if the weight attribute should be used.", false, false));
        types.add(new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be detected.", 2, Integer.MAX_VALUE, 2, false));
        types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS, "The maximal number of iterations performed for one run of k-Means.", 1, Integer.MAX_VALUE, 100, false));
        types.addAll(RandomGenerator.getRandomGeneratorParameters(this));
        types.addAll(Kernel.getParameters(this));
        return types;
    }
}