/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.clustering.clusterer;

import java.util.ArrayList;
import java.util.List;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorCapability;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.CentroidClusterModel;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.learner.CapabilityProvider;
import com.rapidminer.operator.ports.metadata.DistanceMeasurePrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
import com.rapidminer.tools.math.similarity.DistanceMeasureHelper;
import com.rapidminer.tools.math.similarity.DistanceMeasures;

/**
 * This operator represents an implementation of k-medoids. This operator will create a cluster attribute if not present
 * yet.
 *
 * @author Sebastian Land
 */
public class KMedoids extends RMAbstractClusterer implements CapabilityProvider {

    /** The parameter name for "the maximal number of clusters" */
    public static final String PARAMETER_K = "k";

    /**
     * The parameter name for "the maximal number of runs of the k method with random initialization that are
     * performed"
     */
    public static final String PARAMETER_MAX_RUNS = "max_runs";

    /** The parameter name for "the maximal number of iterations performed for one run of the k method" */
    public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";

    /** Resolves the distance measure selected through this operator's parameters. */
    private DistanceMeasureHelper measureHelper = new DistanceMeasureHelper(this);

    /**
     * Creates the operator and registers a {@link DistanceMeasurePrecondition} on the example set input port.
     *
     * @param description the operator description handed to the super class
     */
    public KMedoids(OperatorDescription description) {
        super(description);
        getExampleSetInputPort().addPrecondition(new DistanceMeasurePrecondition(getExampleSetInputPort(), this));
    }

    /**
     * Reports whether the given capability is supported. Support for nominal and numerical attributes depends on the
     * type of the currently selected distance measure; labels, weighted examples and missing values are always
     * reported as supported, everything else is not.
     *
     * @param capability the capability to check
     * @return <code>true</code> if the capability is supported with the current measure type
     */
    @Override
    public boolean supportsCapability(OperatorCapability capability) {
        int measureType = DistanceMeasures.MIXED_MEASURES_TYPE;
        try {
            measureType = measureHelper.getSelectedMeasureType();
        } catch (Exception ignored) {
            // Deliberate best effort: if the measure type cannot be determined,
            // fall back to the mixed measures type assigned above.
        }
        switch (capability) {
            case BINOMINAL_ATTRIBUTES:
            case POLYNOMINAL_ATTRIBUTES:
                return (measureType == DistanceMeasures.MIXED_MEASURES_TYPE)
                        || (measureType == DistanceMeasures.NOMINAL_MEASURES_TYPE);
            case NUMERICAL_ATTRIBUTES:
                return (measureType == DistanceMeasures.MIXED_MEASURES_TYPE)
                        || (measureType == DistanceMeasures.DIVERGENCES_TYPE)
                        || (measureType == DistanceMeasures.NUMERICAL_MEASURES_TYPE);
            case POLYNOMINAL_LABEL:
            case BINOMINAL_LABEL:
            case NUMERICAL_LABEL:
            case WEIGHTED_EXAMPLES:
            case MISSING_VALUES:
                return true;
            default:
                return false;
        }
    }

    /**
     * Runs k-medoids <code>max_runs</code> times with random initial medoids and returns the model of the run with
     * the smallest sum of squared distances between each example and the centroid it was assigned to. If
     * {@link #addsClusterAttribute()} holds, a nominal attribute named "cluster" is added to the example set and
     * filled with "cluster_&lt;index&gt;" values of the best run.
     *
     * @param exampleSet the examples to cluster; ids are created if necessary
     * @return the best {@link CentroidClusterModel} found over all runs
     * @throws OperatorException if a parameter or the measure cannot be resolved, if the non-missing-values check
     *             fails, if the operator is stopped, or (as {@link UserError} 142) if the example set contains
     *             fewer than k examples
     */
    @Override
    public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
        int k = getParameterAsInt(PARAMETER_K);
        int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
        int maxRuns = getParameterAsInt(PARAMETER_MAX_RUNS);
        DistanceMeasure measure = measureHelper.getInitializedMeasure(exampleSet);

        // checking and creating ids if necessary
        Tools.checkAndCreateIds(exampleSet);

        // additional checks
        Tools.onlyNonMissingValues(exampleSet, "KMedoids");
        if (exampleSet.size() < k) {
            throw new UserError(this, 142, k);
        }

        // extracting attribute names
        Attributes attributes = exampleSet.getAttributes();
        ArrayList<String> attributeNames = new ArrayList<String>(attributes.size());
        for (Attribute attribute : attributes) {
            attributeNames.add(attribute.getName());
        }

        RandomGenerator generator = RandomGenerator.getRandomGenerator(this);
        double minimalIntraClusterDistance = Double.POSITIVE_INFINITY;
        CentroidClusterModel bestModel = null;
        int[] bestAssignments = null;

        // Scratch buffer for the example currently being looked at. getAsDoubleArray() fills and RETURNS the
        // buffer it is given, so a vector that has to survive another getAsDoubleArray() call needs its own buffer.
        double[] values = new double[attributes.size()];
        // Separate scratch buffer for the medoid candidate, so that reading member examples into 'values' does not
        // overwrite the candidate it is compared against.
        double[] medoidBuffer = new double[attributes.size()];

        for (int iter = 0; iter < maxRuns; iter++) {
            checkForStop();
            CentroidClusterModel model = new CentroidClusterModel(exampleSet, k, attributeNames, measure,
                    getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL),
                    getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));

            // init centroids with k randomly drawn examples
            int i = 0;
            for (Integer index : generator.nextIntSetWithRange(0, exampleSet.size(), k)) {
                model.assignExample(i, getAsDoubleArray(exampleSet.getExample(index), attributes, values));
                i++;
            }
            model.finishAssign();

            // run optimization steps until the model reports stability or the step limit is reached
            int[] centroidAssignments = new int[exampleSet.size()];
            boolean stable = false;
            for (int step = 0; (step < maxOptimizationSteps) && !stable; step++) {
                checkForStop();

                // assign examples to their nearest centroid (ties keep the lower centroid index)
                i = 0;
                for (Example example : exampleSet) {
                    double[] exampleValues = getAsDoubleArray(example, attributes, values);
                    double nearestDistance = measure.calculateDistance(model.getCentroidCoordinates(0), exampleValues);
                    int nearestIndex = 0;
                    for (int centroidIndex = 1; centroidIndex < k; centroidIndex++) {
                        double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidIndex),
                                exampleValues);
                        if (distance < nearestDistance) {
                            nearestDistance = distance;
                            nearestIndex = centroidIndex;
                        }
                    }
                    centroidAssignments[i] = nearestIndex;
                    i++;
                }

                // for each cluster pick the example minimizing the summed distance to the cluster's members
                for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
                    double[] bestMedoidValues = new double[attributes.size()];
                    double bestDistanceSum = Double.POSITIVE_INFINITY;
                    for (Example medoid : exampleSet) {
                        // calculate intra cluster distance if this example is used as medoid
                        double distanceSum = 0;
                        double[] medoidValues = getAsDoubleArray(medoid, attributes, medoidBuffer);
                        int j = 0;
                        for (Example example : exampleSet) {
                            // add only if in current cluster
                            if (centroidAssignments[j] == clusterIndex) {
                                distanceSum += measure.calculateDistance(
                                        getAsDoubleArray(example, attributes, values), medoidValues);
                            }
                            j++;
                        }
                        if (distanceSum < bestDistanceSum) {
                            bestDistanceSum = distanceSum;
                            // Copy instead of keeping the reference: medoidBuffer is overwritten by the next
                            // candidate.
                            System.arraycopy(medoidValues, 0, bestMedoidValues, 0, bestMedoidValues.length);
                        }
                    }
                    // Hand the best point to the model as the only example assigned to this centroid
                    // (original note: "using average of one").
                    model.getCentroid(clusterIndex).assignExample(bestMedoidValues);
                }
                stable = model.finishAssign();
            }

            // assessing quality of this model: sum of squared distances to the assigned centroid
            double distanceSum = 0;
            i = 0;
            for (Example example : exampleSet) {
                double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidAssignments[i]),
                        getAsDoubleArray(example, attributes, values));
                distanceSum += distance * distance;
                i++;
            }
            // the isInfinite check makes sure the first run is always accepted
            if (distanceSum < minimalIntraClusterDistance || Double.isInfinite(minimalIntraClusterDistance)) {
                bestModel = model;
                minimalIntraClusterDistance = distanceSum;
                bestAssignments = centroidAssignments;
            }
        }

        bestModel.setClusterAssignments(bestAssignments, exampleSet);

        if (addsClusterAttribute()) {
            Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
            exampleSet.getExampleTable().addAttribute(cluster);
            exampleSet.getAttributes().setCluster(cluster);
            int i = 0;
            for (Example example : exampleSet) {
                example.setValue(cluster, "cluster_" + bestAssignments[i]);
                i++;
            }
        }
        return bestModel;
    }

    /**
     * Writes the example's values for the given attributes, in iteration order, into <code>values</code>.
     * <p>
     * The returned array is the very same instance as <code>values</code>; no copy is made. Callers that need the
     * result to outlive a later call with the same buffer must copy it or use a different buffer.
     *
     * @param example the example to read from
     * @param attributes the attributes whose values are read
     * @param values the target buffer; must hold at least as many entries as <code>attributes</code> iterates over
     * @return the <code>values</code> array that was passed in
     */
    private double[] getAsDoubleArray(Example example, Attributes attributes, double[] values) {
        int i = 0;
        for (Attribute attribute : attributes) {
            values[i] = example.getValue(attribute);
            i++;
        }
        return values;
    }

    /**
     * @return the class of the cluster model produced by this operator, {@link CentroidClusterModel}
     */
    @Override
    public Class<? extends ClusterModel> getClusterModelClass() {
        return CentroidClusterModel.class;
    }

    /**
     * Extends the inherited parameter types by <code>k</code>, <code>max_runs</code>,
     * <code>max_optimization_steps</code>, the random generator parameters and the distance measure parameters.
     *
     * @return the list of parameter types of this operator
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be detected.", 2,
                Integer.MAX_VALUE, 2, false));
        types.add(new ParameterTypeInt(PARAMETER_MAX_RUNS,
                "The maximal number of runs of k-Medoids with random initialization that are performed.", 1,
                Integer.MAX_VALUE, 10, false));
        types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS,
                "The maximal number of iterations performed for one run of k-Medoids.", 1, Integer.MAX_VALUE, 100,
                false));
        types.addAll(RandomGenerator.getRandomGeneratorParameters(this));
        types.addAll(DistanceMeasures.getParameterTypes(this));
        return types;
    }
}