/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.validation.clustering; import java.util.List; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ValueDouble; import com.rapidminer.operator.clustering.CentroidClusterModel; import com.rapidminer.operator.performance.EstimatedPerformance; import com.rapidminer.operator.performance.PerformanceCriterion; import com.rapidminer.operator.performance.PerformanceVector; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.OutputPort; import com.rapidminer.operator.ports.metadata.GenerateNewMDRule; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.operator.ports.metadata.PassThroughOrGenerateRule; import com.rapidminer.operator.ports.metadata.SimplePrecondition; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.tools.math.similarity.DistanceMeasure; import com.rapidminer.tools.math.similarity.divergences.SquaredEuclideanDistance; import com.rapidminer.tools.math.similarity.numerical.EuclideanDistance; /** * An evaluator for centroid based clustering methods. The average within cluster distance is calculated * by averaging the distance between the centroid and all examples of a cluster. * * @author Sebastian Land, Michael Wurst, Ingo Mierswa * */ public class CentroidBasedEvaluator extends Operator { public static final String PARAMETER_MAIN_CRITERION = "main_criterion"; public static final String PARAMETER_MAIN_CRITERION_ONLY = "main_criterion_only"; public static final String PARAMETER_NORMALIZE = "normalize"; public static final String PARAMETER_MAXIMIZE = "maximize"; private double avgWithinClusterDistance; private double daviesBouldin; public static final String[] CRITERIA_LIST = { "Avg. within centroid distance", "Davies Bouldin" }; public static final String[] CRITERIA_LIST_SHORT = { "avg_within_distance", "DaviesBouldin" }; private InputPort exampleSetInput = getInputPorts().createPort("example set", ExampleSet.class); private InputPort clusterModelInput = getInputPorts().createPort("cluster model", CentroidClusterModel.class); private InputPort performanceInput = getInputPorts().createPort("performance"); private OutputPort performanceOutput = getOutputPorts().createPort("performance"); private OutputPort exampleSetOutput = getOutputPorts().createPort("example set"); private OutputPort clusterModelOutput = getOutputPorts().createPort("cluster model"); /** * Constructor for ClusterDensityEvaluator. */ public CentroidBasedEvaluator(OperatorDescription description) { super(description); performanceInput.addPrecondition(new SimplePrecondition(performanceInput, new MetaData(PerformanceVector.class), false)); getTransformer().addRule(new PassThroughOrGenerateRule(performanceInput, performanceOutput, new MetaData(PerformanceVector.class))); getTransformer().addPassThroughRule(exampleSetInput, exampleSetOutput); getTransformer().addPassThroughRule(clusterModelInput, clusterModelOutput); getTransformer().addRule(new GenerateNewMDRule(performanceOutput, PerformanceVector.class)); addValue(new ValueDouble(CRITERIA_LIST_SHORT[0], CRITERIA_LIST[0], false) { @Override public double getDoubleValue() { return avgWithinClusterDistance; } }); addValue(new ValueDouble(CRITERIA_LIST_SHORT[1], CRITERIA_LIST[1], false) { @Override public double getDoubleValue() { return daviesBouldin; } }); } @Override public boolean shouldAutoConnect(OutputPort port) { if (port == clusterModelOutput) { return getParameterAsBoolean("keep_cluster_model"); } else if (port == exampleSetOutput) { return getParameterAsBoolean("keep_example_set"); } else { return super.shouldAutoConnect(port); } } @Override public void doWork() throws OperatorException { CentroidClusterModel clusterModel = clusterModelInput.getData(); ExampleSet exampleSet = exampleSetInput.getData(); PerformanceVector performance = performanceInput.getDataOrNull(); if (performance == null) { performance = new PerformanceVector(); } int mainCriterionIndex = getParameterAsInt(PARAMETER_MAIN_CRITERION); boolean onlyMainCriterion = getParameterAsBoolean(PARAMETER_MAIN_CRITERION_ONLY); double multFactor = -1.0; if (getParameterAsBoolean(PARAMETER_MAXIMIZE)) multFactor = 1.0; double divisionFactor = 1.0; if (getParameterAsBoolean(PARAMETER_NORMALIZE)) divisionFactor = exampleSet.getAttributes().size(); // Average Squared within cluster distance 0 double[] averageWithinDistance = getAverageWithinDistance(clusterModel, exampleSet); avgWithinClusterDistance = averageWithinDistance[clusterModel.getNumberOfClusters()]; PerformanceCriterion withinClusterDist = new EstimatedPerformance(CRITERIA_LIST[0], (multFactor * avgWithinClusterDistance) / divisionFactor, 1, false); if ((mainCriterionIndex == 0) || !onlyMainCriterion) performance.addCriterion(withinClusterDist); for (int i = 0; i < clusterModel.getNumberOfClusters(); i++) { PerformanceCriterion withinDistance = new EstimatedPerformance(CRITERIA_LIST[0] + "_cluster_" + clusterModel.getCluster(i).getClusterId(), (multFactor * averageWithinDistance[i]) / divisionFactor, 1, false); if ((mainCriterionIndex == 0) || !onlyMainCriterion) performance.addCriterion(withinDistance); } // Davies Bouldin 1 daviesBouldin = getDaviesBouldin(clusterModel, exampleSet); PerformanceCriterion daviesBouldinCriterion = new EstimatedPerformance(CRITERIA_LIST[1], (multFactor * daviesBouldin) / divisionFactor, 1, false); if ((mainCriterionIndex == 1) || !onlyMainCriterion) performance.addCriterion(daviesBouldinCriterion); performance.setMainCriterionName(CRITERIA_LIST[mainCriterionIndex]); performanceOutput.deliver(performance); exampleSetOutput.deliver(exampleSet); clusterModelOutput.deliver(clusterModel); } private double[] getAverageWithinDistance(CentroidClusterModel model, ExampleSet exampleSet) throws OperatorException { DistanceMeasure measure = new SquaredEuclideanDistance(); measure.init(exampleSet); int numberOfClusters = model.getNumberOfClusters(); // counting distances within double[] result = new double[numberOfClusters + 1]; int[] clusterSizes = new int[numberOfClusters]; int[] clusterIndices = model.getClusterAssignments(exampleSet); int i = 0; for (Example example: exampleSet) { clusterSizes[clusterIndices[i]]++; result[clusterIndices[i]] += measure.calculateDistance(example, model.getCentroidCoordinates(clusterIndices[i])); i++; } // averaging by cluster sizes and sum over all int totalSize = 0; for (i = 0; i < numberOfClusters; i++) { result[numberOfClusters] += result[i]; result[i] /= clusterSizes[i]; totalSize += clusterSizes[i]; } result[numberOfClusters] /= totalSize; return result; } private double getDaviesBouldin(CentroidClusterModel model, ExampleSet exampleSet) throws OperatorException { DistanceMeasure measure = new EuclideanDistance(); measure.init(exampleSet); int numberOfClusters = model.getNumberOfClusters(); // counting distances within double[] withinClusterDistance = new double[numberOfClusters]; int[] clusterSizes = new int[numberOfClusters]; int[] clusterIndices = model.getClusterAssignments(exampleSet); int i = 0; for (Example example: exampleSet) { clusterSizes[clusterIndices[i]]++; withinClusterDistance[clusterIndices[i]] += measure.calculateDistance(example, model.getCentroidCoordinates(clusterIndices[i])); i++; } // averaging by cluster sizes and sum over all for (i = 0; i < numberOfClusters; i++) { withinClusterDistance[i] /= clusterSizes[i]; } double result = 0.0; for (i = 0; i < numberOfClusters; i++) { double max = Double.NEGATIVE_INFINITY; for (int j = 0; j < numberOfClusters; j++) if (i != j) { double val = (withinClusterDistance[i] + withinClusterDistance[j]) / measure.calculateDistance(model.getCentroidCoordinates(i), model.getCentroidCoordinates(j)); if (val > max) max = val; } result = result + max; } return result / model.getNumberOfClusters(); } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.add(new ParameterTypeCategory(PARAMETER_MAIN_CRITERION, "The main criterion to use", CRITERIA_LIST, 0, false)); types.add(new ParameterTypeBoolean(PARAMETER_MAIN_CRITERION_ONLY, "return the main criterion only", false)); types.add(new ParameterTypeBoolean(PARAMETER_NORMALIZE, "divide the criterion by the number of features", false)); types.add(new ParameterTypeBoolean(PARAMETER_MAXIMIZE, "do not multiply the result by minus one", false)); return types; } }