/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.clustering.clusterer;

import java.util.ArrayList;
import java.util.List;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorCapability;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.CentroidClusterModel;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.learner.CapabilityProvider;
import com.rapidminer.operator.ports.metadata.CapabilityPrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
import com.rapidminer.tools.math.similarity.DistanceMeasureHelper;
import com.rapidminer.tools.math.similarity.DistanceMeasures;

import de.dfki.madm.operator.KMeanspp;


/**
 * This operator represents an implementation of k-means. This operator will create a cluster
 * attribute if not present yet.
 *
 * The implementation is according to paper of C. Elkan: - Using the Triangle Inequality to
 * Accelerate k-Means - Proceedings of the Twentieth International Conference on Machine Learning
 * (ICML-2003), Washington DC, 2003
 *
 * @author Alexander Arimond
 */
public class FastKMeans extends RMAbstractClusterer implements CapabilityProvider {

	/** The parameter name for "the maximal number of clusters" */
	public static final String PARAMETER_K = "k";

	private DistanceMeasureHelper measureHelper = new DistanceMeasureHelper(this);

	private DistanceMeasure presetMeasure = null;

	/**
	 * The parameter name for "the maximal number of runs of the k method with random
	 * initialization that are performed"
	 */
	public static final String PARAMETER_MAX_RUNS = "max_runs";

	/**
	 * Whether the initial centroids are chosen via {@link KMeanspp}. The value is refreshed from
	 * the operator parameter {@link KMeanspp#PARAMETER_USE_KPP} at the beginning of every
	 * {@link #generateClusterModel(ExampleSet)} call; reading it only once at construction time
	 * would ignore any later change of the parameter.
	 */
	boolean kpp;

	/**
	 * The parameter name for "the maximal number of iterations performed for one run of the k
	 * method"
	 */
	public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";

	/**
	 * Overrides the measure specified by the operator parameters. If set to null, parameters will
	 * be used again to determine the measure.
	 *
	 * @param me
	 *            the measure to use, or {@code null} to fall back to the operator parameters
	 */
	public void setPresetMeasure(DistanceMeasure me) {
		this.presetMeasure = me;
	}

	/**
	 * Creates the operator and registers a capability precondition on the example set input
	 * port.
	 *
	 * @param description
	 *            the operator description handed to the super class
	 */
	public FastKMeans(OperatorDescription description) {
		super(description);
		getExampleSetInputPort().addPrecondition(new CapabilityPrecondition(this, getExampleSetInputPort()));
	}

	/**
	 * Runs k-means {@code max_runs} times and returns the model with the smallest sum of squared
	 * distances between the examples and their assigned centroids. Each run follows the
	 * numbered steps of Elkan's paper (see class comment).
	 *
	 * @param exampleSet
	 *            the examples to cluster; ids are created if necessary and, if
	 *            {@code addsClusterAttribute()} holds, a nominal "cluster" attribute is added
	 * @return the best {@link CentroidClusterModel} found
	 * @throws OperatorException
	 *             if a parameter cannot be read, the measure cannot be initialized, the example
	 *             set contains missing values, or it has fewer than {@code k} examples (user
	 *             error 142)
	 */
	@Override
	public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
		int k = getParameterAsInt(PARAMETER_K);
		int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
		int maxRuns = getParameterAsInt(PARAMETER_MAX_RUNS);
		boolean addAsLabel = getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL);
		boolean removeUnlabeled = getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED);
		// read the seeding strategy now, so that the current parameter value is honored
		kpp = getParameterAsBoolean(KMeanspp.PARAMETER_USE_KPP);

		DistanceMeasure measure;
		if (presetMeasure != null) {
			measure = presetMeasure;
			measure.init(exampleSet);
		} else {
			measure = measureHelper.getInitializedMeasure(exampleSet);
		}

		// init operator progress
		getProgress().setTotal(maxRuns);

		// checking and creating ids if necessary
		Tools.checkAndCreateIds(exampleSet);

		// additional checks
		Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this, new String[0]);
		if (exampleSet.size() < k) {
			throw new UserError(this, 142, k);
		}

		// extracting attribute names
		Attributes attributes = exampleSet.getAttributes();
		ArrayList<String> attributeNames = new ArrayList<String>(attributes.size());
		for (Attribute attribute : attributes) {
			attributeNames.add(attribute.getName());
		}

		RandomGenerator generator = RandomGenerator.getRandomGenerator(this);
		double minimalIntraClusterDistance = Double.POSITIVE_INFINITY;
		CentroidClusterModel bestModel = null;
		int[] bestAssignments = null;

		for (int iter = 0; iter < maxRuns; iter++) {
			CentroidClusterModel model = new CentroidClusterModel(exampleSet, k, attributeNames, measure, addAsLabel,
					removeUnlabeled);

			// init centroids by assigning one single, unique example!
			assignInitialCentroids(model, exampleSet, attributes, measure, generator, k);
			model.finishAssign();

			// auxiliary data structures according to paper
			final int numberOfExamples = exampleSet.size();
			// l(x,c): lower bound on the distance between example x and centroid c
			final double[][] lowerBounds = new double[numberOfExamples][k];
			// u(x): upper bound on the distance between example x and its assigned centroid
			final double[] upperBounds = new double[numberOfExamples];
			// r(x): true if u(x) was only adjusted in step 6 and has not been recomputed since
			final boolean[] upperBoundOutdated = new boolean[numberOfExamples];
			// centroids before the update, needed for step 4
			final double[][] oldCentroids = new double[k][attributes.size()];
			// s(c): half the distance from centroid c to its nearest other centroid
			final double[] halfNearestCentroidDistance = new double[k];
			final int[] centroidAssignments = new int[numberOfExamples];
			final DistanceMatrix centroidDistances = new DistanceMatrix(k);

			computeClusterDistances(centroidDistances, halfNearestCentroidDistance, model, measure);

			// initialization step (has many distance calculations)
			int x = 0;
			for (Example example : exampleSet) {
				double[] exampleValues = getAsDoubleArray(example, attributes);
				double nearestDistance = measure.calculateDistance(model.getCentroidCoordinates(0), exampleValues);
				lowerBounds[x][0] = nearestDistance;
				int nearestIndex = 0;
				for (int centroidIndex = 1; centroidIndex < k; centroidIndex++) {
					// this centroid is skipped without a distance calculation; its lower bound
					// keeps the array default of 0
					if (centroidDistances.get(nearestIndex, centroidIndex) >= 2 * nearestDistance) {
						continue;
					}
					final double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidIndex),
							exampleValues);
					lowerBounds[x][centroidIndex] = distance;
					if (distance < nearestDistance) {
						nearestDistance = distance;
						nearestIndex = centroidIndex;
					}
				}
				centroidAssignments[x] = nearestIndex;
				upperBounds[x] = nearestDistance;
				upperBoundOutdated[x] = false;
				x++;
			}

			// optimization steps (repeat until convergence)
			boolean stable = false;
			for (int step = 0; step < maxOptimizationSteps && !stable; step++) {
				// step 1.
				computeClusterDistances(centroidDistances, halfNearestCentroidDistance, model, measure);

				x = 0;
				for (Example example : exampleSet) {
					final double[] exampleValue = getAsDoubleArray(example, attributes);

					// step 2. examples with u(x) <= s(c(x)) keep their centroid; the negated
					// comparison is used so that the original branch selection is kept exactly
					if (!(upperBounds[x] <= halfNearestCentroidDistance[centroidAssignments[x]])) {
						// step 3. centroidAssignments[x] may change inside this loop, so it is
						// deliberately re-read from the array on every use
						for (int c = 0; c < k; c++) {
							if (c != centroidAssignments[x] // (i)
									&& upperBounds[x] > lowerBounds[x][c] // (ii)
									&& upperBounds[x] > 0.5 * centroidDistances.get(centroidAssignments[x], c) // (iii)
							) {
								// step 3a.
								final double distanceToAssigned; // d(x,c(x))
								if (upperBoundOutdated[x]) {
									distanceToAssigned = measure.calculateDistance(exampleValue,
											model.getCentroidCoordinates(centroidAssignments[x]));
									lowerBounds[x][centroidAssignments[x]] = distanceToAssigned;
									upperBounds[x] = distanceToAssigned;
									upperBoundOutdated[x] = false;
								} else {
									distanceToAssigned = upperBounds[x];
								}

								// step 3b.
								if (distanceToAssigned > lowerBounds[x][c] && distanceToAssigned > 0.5
										* centroidDistances.get(centroidAssignments[x], c)) {
									final double distanceToCandidate = measure.calculateDistance(exampleValue,
											model.getCentroidCoordinates(c)); // d(x,c)
									lowerBounds[x][c] = distanceToCandidate;
									if (distanceToCandidate < distanceToAssigned) {
										centroidAssignments[x] = c;
										upperBounds[x] = distanceToCandidate;
									}
								}
							}
						}
					}

					model.assignExample(centroidAssignments[x], exampleValue);
					x++;
				}

				// step 4
				// first store old c
				// NOTE(review): this keeps references, not copies; it relies on finishAssign()
				// not modifying arrays returned earlier by getCentroidCoordinates() - confirm
				// against CentroidClusterModel
				for (int c = 0; c < k; c++) {
					oldCentroids[c] = model.getCentroidCoordinates(c);
				}

				// then compute the m(c) - here this is same as step 7
				stable = model.finishAssign();

				// compute all d(c,m(c))
				final double[] centroidShifts = new double[k];
				for (int c = 0; c < k; c++) {
					centroidShifts[c] = measure.calculateDistance(oldCentroids[c], model.getCentroidCoordinates(c));
				}

				// step 5 & 6
				for (x = 0; x < numberOfExamples; x++) {
					// step 5: lower bounds shrink by the centroid shift but never below 0
					for (int c = 0; c < k; c++) {
						final double reduced = lowerBounds[x][c] - centroidShifts[c];
						lowerBounds[x][c] = reduced > 0 ? reduced : 0;
					}

					// step 6: upper bound grows by the shift of the assigned centroid
					upperBounds[x] = upperBounds[x] + centroidShifts[centroidAssignments[x]];
					upperBoundOutdated[x] = true;
				}
			}

			// assessing quality of this model
			double distanceSum = sumOfSquaredDistances(exampleSet, attributes, model, measure, centroidAssignments);

			// The first run is always accepted, so that a result exists even if every run yields
			// a sum that is NaN or infinite (previously this ended in a NullPointerException).
			if (bestModel == null || distanceSum < minimalIntraClusterDistance) {
				bestModel = model;
				bestAssignments = centroidAssignments;
				// never keep NaN as reference value, no later run could compare smaller than it
				if (!Double.isNaN(distanceSum)) {
					minimalIntraClusterDistance = distanceSum;
				}
			}

			getProgress().step();
		}

		bestModel.setClusterAssignments(bestAssignments, exampleSet);

		if (addsClusterAttribute()) {
			Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
			exampleSet.getExampleTable().addAttribute(cluster);
			exampleSet.getAttributes().setCluster(cluster);
			int i = 0;
			for (Example example : exampleSet) {
				example.setValue(cluster, "cluster_" + bestAssignments[i]);
				i++;
			}
		}

		getProgress().complete();

		return bestModel;
	}

	/**
	 * Assigns exactly one example to each of the {@code k} clusters of the given model. The
	 * examples are chosen by {@link KMeanspp} if {@link #kpp} is set, otherwise they are drawn
	 * from the random generator. The caller is responsible for calling
	 * {@code model.finishAssign()} afterwards.
	 */
	private void assignInitialCentroids(CentroidClusterModel model, ExampleSet exampleSet, Attributes attributes,
			DistanceMeasure measure, RandomGenerator generator, int k) throws OperatorException {
		int centroidIndex = 0;
		if (kpp) {
			KMeanspp kmpp = new KMeanspp(getOperatorDescription(), k, exampleSet, measure, generator);
			for (int id : kmpp.getStart()) {
				model.assignExample(centroidIndex, getAsDoubleArray(exampleSet.getExample(id), attributes));
				centroidIndex++;
			}
		} else {
			for (Integer index : generator.nextIntSetWithRange(0, exampleSet.size(), k)) {
				model.assignExample(centroidIndex, getAsDoubleArray(exampleSet.getExample(index), attributes));
				centroidIndex++;
			}
		}
	}

	/**
	 * Sums the squared distances between every example and the centroid it is assigned to.
	 * {@code assignments} is indexed in iteration order of the example set.
	 */
	private double sumOfSquaredDistances(ExampleSet exampleSet, Attributes attributes, CentroidClusterModel model,
			DistanceMeasure measure, int[] assignments) {
		double distanceSum = 0;
		int i = 0;
		for (Example example : exampleSet) {
			double distance = measure.calculateDistance(model.getCentroidCoordinates(assignments[i]),
					getAsDoubleArray(example, attributes));
			distanceSum += distance * distance;
			i++;
		}
		return distanceSum;
	}

	/**
	 * Step 1 of the paper algorithm: stores the distance of every centroid pair {@code (i, j)}
	 * with {@code i < j} in {@code centroidDistances} and writes to {@code s[c]} half the
	 * distance from centroid {@code c} to its nearest other centroid.
	 */
	private void computeClusterDistances(DistanceMatrix centroidDistances, double[] s, CentroidClusterModel model,
			DistanceMeasure measure) {
		final int numberOfClusters = model.getNumberOfClusters();
		for (int i = 0; i < numberOfClusters; i++) {
			s[i] = Double.POSITIVE_INFINITY;
		}
		for (int i = 0; i < numberOfClusters; i++) {
			for (int j = i + 1; j < numberOfClusters; j++) {
				final double d = measure.calculateDistance(model.getCentroidCoordinates(i),
						model.getCentroidCoordinates(j));
				if (d < s[i]) {
					s[i] = d;
				}
				if (d < s[j]) {
					s[j] = d;
				}
				centroidDistances.set(i, j, d);
			}
		}
		for (int i = 0; i < numberOfClusters; i++) {
			s[i] = 0.5 * s[i];
		}
	}

	/**
	 * Returns the values of the given example for all given attributes, in the iteration order
	 * of {@code attributes}.
	 */
	private double[] getAsDoubleArray(Example example, Attributes attributes) {
		double[] values = new double[attributes.size()];
		int i = 0;
		for (Attribute attribute : attributes) {
			values[i] = example.getValue(attribute);
			i++;
		}
		return values;
	}

	@Override
	public Class<? extends ClusterModel> getClusterModelClass() {
		return CentroidClusterModel.class;
	}

	/**
	 * Reports every capability as supported except binominal and polynominal attributes.
	 */
	@Override
	public boolean supportsCapability(OperatorCapability capability) {
		switch (capability) {
			case BINOMINAL_ATTRIBUTES:
			case POLYNOMINAL_ATTRIBUTES:
				return false;
			default:
				return true;
		}
	}

	/**
	 * Extends the parameters of the super class by k, the k-means++ switch, the distance measure
	 * parameters, the number of runs and optimization steps, and the random generator
	 * parameters.
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be detected.", 2,
				Integer.MAX_VALUE, 2, false));
		types.add(new ParameterTypeBoolean(KMeanspp.PARAMETER_USE_KPP, KMeanspp.SHORT_DESCRIPTION, false));

		for (ParameterType type : DistanceMeasures.getParameterTypes(this)) {
			// compare the key by content; reference equality only works by accident when both
			// strings happen to be the same instance
			if (DistanceMeasures.PARAMETER_MEASURE_TYPES.equals(type.getKey())) {
				// NOTE(review): 2 presumably selects the numerical measures - confirm against
				// DistanceMeasures
				type.setDefaultValue(2);
			}
			types.add(type);
		}

		types.add(new ParameterTypeInt(PARAMETER_MAX_RUNS,
				"The maximal number of runs of k-Means with random initialization that are performed.", 1,
				Integer.MAX_VALUE, 10, false));
		types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS,
				"The maximal number of iterations performed for one run of k-Means.", 1, Integer.MAX_VALUE, 100,
				false));
		types.addAll(RandomGenerator.getRandomGeneratorParameters(this));
		return types;
	}
}