/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.validation.clustering;
import java.util.List;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.InputDescription;
import com.rapidminer.operator.MissingIOObjectException;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ValueDouble;
import com.rapidminer.operator.learner.clustering.CentroidBasedClusterModel;
import com.rapidminer.operator.learner.clustering.ClusterModel;
import com.rapidminer.operator.learner.clustering.ClustererPreconditions;
import com.rapidminer.operator.learner.clustering.IdUtils;
import com.rapidminer.operator.performance.EstimatedPerformance;
import com.rapidminer.operator.performance.PerformanceCriterion;
import com.rapidminer.operator.performance.PerformanceVector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.tools.IterationArrayList;
/**
 * An evaluator for centroid based clustering methods.
 *
 * <p>Consumes a {@link ClusterModel} (which must be a {@link CentroidBasedClusterModel}) and an
 * {@link ExampleSet} and delivers a {@link PerformanceVector} containing up to two kinds of
 * criteria:</p>
 * <ul>
 * <li><b>AVD</b> ("avg_within_distance"): the mean squared distance of the cluster members from
 * their centroid, reported over all clusters and additionally per cluster,</li>
 * <li><b>DB</b> ("DaviesBouldin"): the Davies-Bouldin index as computed by
 * {@link #getDaviesBouldin(CentroidBasedClusterModel, ExampleSet)}.</li>
 * </ul>
 *
 * <p>Each reported value is multiplied by -1 unless the parameter {@value #PARAMETER_MAXIMIZE}
 * is set, and divided by the number of attributes of the example set if the parameter
 * {@value #PARAMETER_NORMALIZE} is set.</p>
 *
 * @author Michael Wurst, Ingo Mierswa
 * @version $Id: CentroidBasedEvaluator.java,v 1.7 2008/07/07 07:06:45 ingomierswa Exp $
 *
 */
public class CentroidBasedEvaluator extends Operator {

    /** Parameter key: index into {@link #CRITERIA_LIST} selecting the main criterion. */
    public static final String PARAMETER_MAIN_CRITERION = "main_criterion";

    /** Parameter key: if set, only the main criterion is added to the performance vector. */
    public static final String PARAMETER_MAIN_CRITERION_ONLY = "main_criterion_only";

    /** Parameter key: if set, each value is divided by the number of attributes. */
    public static final String PARAMETER_NORMALIZE = "normalize";

    /** Parameter key: if set, values are not multiplied by minus one. */
    public static final String PARAMETER_MAXIMIZE = "maximize";

    /** Long names of the supported criteria; the index is the value of the main criterion parameter. */
    public static final String[] CRITERIA_LIST = { "avg_within_distance", "DaviesBouldin" };

    /** Short names of the supported criteria, index-aligned with {@link #CRITERIA_LIST}. */
    public static final String[] CRITERIA_LIST_SHORT = { "AVD", "DB" };

    /** Overall average squared within cluster distance of the last {@link #apply()} call (raw, unscaled). */
    private double avgWithinClusterDistance;

    /** Davies-Bouldin index of the last {@link #apply()} call (raw, unscaled). */
    private double daviesBouldin;

    /**
     * Constructor for CentroidBasedEvaluator. Registers both criteria as operator values so that
     * the raw results of the last evaluation can be queried under their short names.
     *
     * @param description the operator description handed on to {@link Operator}
     */
    public CentroidBasedEvaluator(OperatorDescription description) {
        super(description);
        // NOTE(review): the meaning of the trailing boolean is defined by ValueDouble - confirm there.
        addValue(new ValueDouble(CRITERIA_LIST_SHORT[0], CRITERIA_LIST[0], false) {
            public double getDoubleValue() {
                return avgWithinClusterDistance;
            }
        });
        addValue(new ValueDouble(CRITERIA_LIST_SHORT[1], CRITERIA_LIST[1], false) {
            public double getDoubleValue() {
                return daviesBouldin;
            }
        });
    }

    /**
     * Returns the same custom input description for cluster models and example sets and defers to
     * the superclass for every other class.
     *
     * @param cls the input class the description is requested for
     * @return the input description for the given class
     */
    public InputDescription getInputDescription(Class cls) {
        // NOTE(review): the two flags presumably mean "not kept by default, but configurable via
        // a parameter" - confirm against InputDescription.
        boolean clusterModelOrExampleSet =
                ClusterModel.class.isAssignableFrom(cls) || ExampleSet.class.isAssignableFrom(cls);
        if (clusterModelOrExampleSet) {
            return new InputDescription(cls, false, true);
        }
        return super.getInputDescription(cls);
    }

    /**
     * @return the required input classes: an example set and a cluster model
     */
    public Class<?>[] getInputClasses() {
        return new Class<?>[] { ExampleSet.class, ClusterModel.class };
    }

    /**
     * @return the delivered output classes: a performance vector
     */
    public Class<?>[] getOutputClasses() {
        return new Class<?>[] { PerformanceVector.class };
    }

    /**
     * Evaluates the given centroid based cluster model on the given example set.
     *
     * <p>Both criteria are always computed (and stored for the operator values registered in the
     * constructor); the parameters only decide which of them are added to the performance vector.
     * If a performance vector is available as input the criteria are added to it, otherwise a new
     * one is created.</p>
     *
     * @return an array containing only the performance vector
     * @throws OperatorException if the cluster model is not centroid based (user error 122) or if
     *         one of the called framework methods fails
     */
    public IOObject[] apply() throws OperatorException {
        ClusterModel clusterModel = getInput(ClusterModel.class);
        if (!(clusterModel instanceof CentroidBasedClusterModel)) {
            throw new UserError(this, 122, "centroid based cluster model");
        }
        CentroidBasedClusterModel cm = (CentroidBasedClusterModel) clusterModel;

        ExampleSet es = getInput(ExampleSet.class);
        es.remapIds();

        ClustererPreconditions.hasClusters(cm);
        ClustererPreconditions.isNonEmpty(cm);

        PerformanceVector performance = null;
        try {
            performance = getInput(PerformanceVector.class);
        } catch (MissingIOObjectException e) {
            // Deliberately ignored: if no performance vector is available a new one is created below.
        }
        if (performance == null) {
            performance = new PerformanceVector();
        }

        int mainCriterionIndex = getParameterAsInt(PARAMETER_MAIN_CRITERION);
        boolean returnMainCriterionOnly = getParameterAsBoolean(PARAMETER_MAIN_CRITERION_ONLY);

        // Values are negated by default; "maximize" keeps their original sign.
        double multFactor = getParameterAsBoolean(PARAMETER_MAXIMIZE) ? 1.0 : -1.0;

        // Optional normalization by the number of attributes.
        double divisionFactor = 1.0;
        if (getParameterAsBoolean(PARAMETER_NORMALIZE)) {
            divisionFactor = es.getAttributes().size();
        }

        int numberOfClusters = cm.getNumberOfClusters();

        // Criterion 0: average squared within cluster distance (overall and per cluster).
        double[] avgWithinDistances = calcAvgWithinClusterDistance(cm, es);
        avgWithinClusterDistance = avgWithinDistances[numberOfClusters];
        if ((mainCriterionIndex == 0) || !returnMainCriterionOnly) {
            // NOTE(review): the trailing arguments (1, false) are passed on unchanged; their
            // meaning is defined by EstimatedPerformance.
            performance.addCriterion(new EstimatedPerformance(
                    CRITERIA_LIST_SHORT[0],
                    (multFactor * avgWithinClusterDistance) / divisionFactor, 1, false));
            for (int i = 0; i < numberOfClusters; i++) {
                String criterionName = CRITERIA_LIST_SHORT[0] + "_cluster_" + cm.getClusterAt(i).getId();
                performance.addCriterion(new EstimatedPerformance(
                        criterionName,
                        (multFactor * avgWithinDistances[i]) / divisionFactor, 1, false));
            }
        }

        // Criterion 1: Davies-Bouldin index.
        daviesBouldin = getDaviesBouldin(cm, es);
        if ((mainCriterionIndex == 1) || !returnMainCriterionOnly) {
            performance.addCriterion(new EstimatedPerformance(
                    CRITERIA_LIST_SHORT[1],
                    (multFactor * daviesBouldin) / divisionFactor, 1, false));
        }

        performance.setMainCriterionName(CRITERIA_LIST_SHORT[mainCriterionIndex]);
        return new IOObject[] { performance };
    }

    /**
     * Collects the distance of every member of one cluster from the centroid of that cluster.
     *
     * @param cm the cluster model providing the cluster members and the distance function
     * @param es the example set the member ids are resolved against
     * @param clusterIndex the index of the cluster
     * @return one distance per member, in the order delivered by the cluster
     */
    private double[] distancesFromCentroid(CentroidBasedClusterModel cm, ExampleSet es, int clusterIndex) {
        List<String> memberIds = new IterationArrayList<String>(cm.getClusterAt(clusterIndex).getObjects());
        double[] distances = new double[memberIds.size()];
        for (int j = 0; j < distances.length; j++) {
            distances[j] = cm.getDistanceFromCentroid(clusterIndex, IdUtils.getExampleFromId(es, memberIds.get(j)));
        }
        return distances;
    }

    /**
     * Calculates the average squared distance of the cluster members from their centroid.
     *
     * @param cm the cluster model to evaluate
     * @param es the example set the member ids are resolved against
     * @return an array of length {@code numberOfClusters + 1}: entry {@code i} holds the mean
     *         squared distance within cluster {@code i}, the last entry holds the mean squared
     *         distance over the members of all clusters. Clusters without members (and a model
     *         without any member) yield 0.0 instead of NaN, in line with the handling of empty
     *         clusters in {@link #getDaviesBouldin(CentroidBasedClusterModel, ExampleSet)}.
     */
    private double[] calcAvgWithinClusterDistance(CentroidBasedClusterModel cm, ExampleSet es) {
        int numberOfClusters = cm.getNumberOfClusters();
        double[] result = new double[numberOfClusters + 1];
        int totalCount = 0;
        double totalSum = 0.0;
        for (int i = 0; i < numberOfClusters; i++) {
            double[] distances = distancesFromCentroid(cm, es, i);
            double sumForCluster = 0.0;
            for (double distance : distances) {
                double squared = distance * distance;
                totalSum = totalSum + squared;
                sumForCluster = sumForCluster + squared;
            }
            totalCount += distances.length;
            // Guard against 0.0 / 0, which would silently produce NaN for an empty cluster.
            result[i] = (distances.length > 0) ? sumForCluster / distances.length : 0.0;
        }
        result[numberOfClusters] = (totalCount > 0) ? totalSum / totalCount : 0.0;
        return result;
    }

    /**
     * Calculates the Davies-Bouldin index: the average over all clusters {@code i} of the maximum
     * over all other clusters {@code j} of {@code (s_i + s_j) / d(i, j)}, where {@code s_i} is the
     * mean (non-squared) distance of the members of cluster {@code i} from its centroid (0.0 for
     * an empty cluster) and {@code d(i, j)} is the centroid distance reported by the model.
     *
     * <p>NOTE(review): for a model with a single cluster there is no other cluster to compare
     * with, so the maximum stays at negative infinity and so does the result; a centroid distance
     * of zero leads to an infinite or NaN ratio. Both cases are left as they are.</p>
     *
     * @param cm the cluster model to evaluate
     * @param es the example set the member ids are resolved against
     * @return the Davies-Bouldin index
     */
    private double getDaviesBouldin(CentroidBasedClusterModel cm, ExampleSet es) {
        int numberOfClusters = cm.getNumberOfClusters();

        // Mean distance of the members from their centroid, per cluster.
        double[] withinClusterDistance = new double[numberOfClusters];
        for (int i = 0; i < numberOfClusters; i++) {
            double[] distances = distancesFromCentroid(cm, es, i);
            double sum = 0.0;
            for (double distance : distances) {
                sum = sum + distance;
            }
            withinClusterDistance[i] = (distances.length > 0) ? sum / distances.length : 0.0;
        }

        // For each cluster take the worst (largest) similarity ratio to any other cluster.
        double sumOfMaxima = 0.0;
        for (int i = 0; i < numberOfClusters; i++) {
            double max = Double.NEGATIVE_INFINITY;
            for (int j = 0; j < numberOfClusters; j++) {
                if (i == j) {
                    continue;
                }
                double ratio = (withinClusterDistance[i] + withinClusterDistance[j]) / cm.getCentroidDistance(i, j);
                if (ratio > max) {
                    max = ratio;
                }
            }
            sumOfMaxima = sumOfMaxima + max;
        }
        return sumOfMaxima / numberOfClusters;
    }

    /**
     * Adds the parameters of this operator (main criterion, main criterion only, normalize,
     * maximize) to the parameter types of the superclass.
     *
     * @return the list of parameter types
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeCategory(PARAMETER_MAIN_CRITERION, "The main criterion to use", CRITERIA_LIST, 0));
        types.add(new ParameterTypeBoolean(PARAMETER_MAIN_CRITERION_ONLY, "return the main criterion only", false));
        types.add(new ParameterTypeBoolean(PARAMETER_NORMALIZE, "divide the criterion by the number of features", false));
        types.add(new ParameterTypeBoolean(PARAMETER_MAXIMIZE, "do not multiply the result by minus one", false));
        return types;
    }
}