/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.validation.clustering; import java.util.List; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Tools; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.InputDescription; import com.rapidminer.operator.MissingIOObjectException; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.learner.clustering.ClusterModel; import com.rapidminer.operator.learner.clustering.ClustererPreconditions; import com.rapidminer.operator.learner.clustering.FlatClusterModel; import com.rapidminer.operator.learner.clustering.IdUtils; import com.rapidminer.operator.performance.EstimatedPerformance; import com.rapidminer.operator.performance.PerformanceCriterion; import com.rapidminer.operator.performance.PerformanceVector; import com.rapidminer.tools.IterationArrayList; /** * This operator evaluates the quality of a flat cluster model based on given class labels using an entropy based measure. * * @author Michael Wurst * @version $Id: ClusterEntropyEvaluator.java,v 1.8 2008/09/12 10:31:53 tobiasmalbrecht Exp $ * */ public class ClusterEntropyEvaluator extends Operator { private int numClasses; public ClusterEntropyEvaluator(OperatorDescription description) { super(description); } public Class<?>[] getInputClasses() { return new Class[] { ClusterModel.class, ExampleSet.class }; } public Class<?>[] getOutputClasses() { return new Class[] { PerformanceVector.class }; } public InputDescription getInputDescription(Class cls) { if (ClusterModel.class.isAssignableFrom(cls)) { return new InputDescription(cls, false, true); } if (ExampleSet.class.isAssignableFrom(cls)) { return new InputDescription(cls, false, true); } return super.getInputDescription(cls); } public IOObject[] apply() throws OperatorException { ClusterModel clusterModel = getInput(ClusterModel.class); ExampleSet es = getInput(ExampleSet.class); if (!(clusterModel instanceof FlatClusterModel)) { throw new UserError(this, 122, "flat cluster model"); } FlatClusterModel cm = (FlatClusterModel)clusterModel; Tools.hasNominalLabels(es); Tools.checkAndCreateIds(es); ClustererPreconditions.isNonEmpty(cm); PerformanceVector performance = null; es.remapIds(); try { performance = getInput(PerformanceVector.class); } catch (MissingIOObjectException e) { // If no performance vector is available create a new one } if (performance == null) performance = new PerformanceVector(); numClasses = es.getAttributes().getLabel().getMapping().getValues().size(); double entr = entropy(cm, es); PerformanceCriterion entropyCriterion = new EstimatedPerformance("Entropy", entr, 1, false); performance.addCriterion(entropyCriterion); return new IOObject[] { performance }; } private double entropy(FlatClusterModel cm, ExampleSet es) { double totalEntropy = 0.0; int numObjs = 0; for (int i = 0; i < cm.getNumberOfClusters(); i++) { double clusterEntropy = 0.0; int[] count = new int[numClasses]; for (int k = 0; k < numClasses; k++) count[k] = 0; List<String> idsInCluster = new IterationArrayList<String>(cm.getClusterAt(i).getObjects()); for (int j = 0; j < idsInCluster.size(); j++) { Example ex = IdUtils.getExampleFromId(es, idsInCluster.get(j)); int index = (int) ex.getLabel() - 1; if ((index < numClasses) && (index >= 0)) count[index]++; else logWarning("Class index out of bound"); } if (idsInCluster.size() > 0) for (int k = 0; k < numClasses; k++) if (count[k] > 0) clusterEntropy = clusterEntropy - (((double) count[k]) / ((double) idsInCluster.size())) * Math.log(((double) count[k]) / ((double) idsInCluster.size())); totalEntropy = totalEntropy + clusterEntropy * idsInCluster.size(); numObjs = numObjs + idsInCluster.size(); } return totalEntropy / numObjs; } }