/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.validation.clustering;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.clustering.Cluster;
import com.rapidminer.operator.learner.clustering.ClusterIterator;
import com.rapidminer.operator.learner.clustering.ClusterNode;
import com.rapidminer.operator.learner.clustering.HierarchicalClusterModel;
import com.rapidminer.operator.performance.EstimatedPerformance;
import com.rapidminer.operator.performance.PerformanceCriterion;
import com.rapidminer.operator.performance.PerformanceVector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.tools.IterationArrayList;
/**
 * Compares two hierarchical cluster models. For every cluster of one model the best matching
 * cluster of the other model is searched in terms of f-measure; the (optionally size-weighted)
 * average f-measure of these best matches is the overall cluster model similarity.
 *
 * <p>The result is delivered as a {@link PerformanceVector} with a single criterion named
 * <code>f-measure</code>. If one of the models or its root node is missing, the criterion value
 * is {@link Double#NaN}.</p>
 *
 * @author Michael Wurst
 * @version $Id: BestMatchClusterModelSimilarity.java,v 1.8 2008/09/12 10:31:55 tobiasmalbrecht Exp $
 *
 */
public class BestMatchClusterModelSimilarity extends Operator {

    /** Boolean parameter: weight each best match by the number of objects in the cluster's subtree. */
    public static final String PARAMETER_WEIGHT_CLUSTERS = "weight_clusters";

    /** Boolean parameter: exchange the roles of the two input cluster models. */
    public static final String PARAMETER_SWITCH = "switch";

    /** Boolean parameter: average the similarity of both comparison directions. */
    public static final String PARAMETER_SYMMETRIC = "symmetric";

    /** Name of the delivered criterion; used for both the regular and the failure result. */
    private static final String CRITERION_NAME = "f-measure";

    public BestMatchClusterModelSimilarity(OperatorDescription description) {
        super(description);
    }

    /**
     * Fetches two hierarchical cluster models from the input, compares them and returns the
     * similarity wrapped in a performance vector.
     *
     * @return an array holding exactly one {@link PerformanceVector}
     * @throws OperatorException if the input cannot be retrieved
     */
    public IOObject[] apply() throws OperatorException {
        // The first model retrieved plays the role of "cm2", the second one the role of "cm1".
        HierarchicalClusterModel cm2 = getInput(HierarchicalClusterModel.class);
        HierarchicalClusterModel cm1 = getInput(HierarchicalClusterModel.class);

        if (getParameterAsBoolean(PARAMETER_SWITCH)) {
            HierarchicalClusterModel swap = cm1;
            cm1 = cm2;
            cm2 = swap;
        }

        // Short-circuit evaluation guarantees getRootNode() is only called on non-null models.
        if ((cm1 == null) || (cm2 == null) || (cm1.getRootNode() == null) || (cm2.getRootNode() == null)) {
            logWarning("Could not compare cm, one of them is null");
            return new IOObject[] { createResult(Double.NaN) };
        }

        log("Reference cluster model has root node label " + cm1.getRootNode().getDescription());

        double performance;
        if (getParameterAsBoolean(PARAMETER_SYMMETRIC)) {
            performance = (bestMatchSimilarity(cm1, cm2) + bestMatchSimilarity(cm2, cm1)) / 2;
        } else {
            performance = bestMatchSimilarity(cm1, cm2);
        }

        log("sim:" + performance);
        return new IOObject[] { createResult(performance) };
    }

    // NOTE(review): apply() retrieves two HierarchicalClusterModel inputs but only one is
    // declared here - confirm against the framework's input checking whether the class
    // should be listed twice.
    public Class<?>[] getInputClasses() {
        return new Class[] { HierarchicalClusterModel.class };
    }

    public Class<?>[] getOutputClasses() {
        return new Class[] { PerformanceVector.class };
    }

    /** Wraps the given value into a performance vector holding the single f-measure criterion. */
    private PerformanceVector createResult(double value) {
        PerformanceVector result = new PerformanceVector();
        PerformanceCriterion criterion = new EstimatedPerformance(CRITERION_NAME, value, 1, false);
        result.addCriterion(criterion);
        return result;
    }

    /**
     * Averages, over all clusters of <code>resultModel</code> that directly contain objects, the
     * f-measure of the best matching cluster in <code>referenceModel</code>.
     *
     * @param referenceModel the model whose clusters serve as match candidates
     * @param resultModel the model whose clusters are matched one by one
     * @return the (optionally weighted) mean best-match f-measure, or {@link Double#NaN} if no
     *         cluster of <code>resultModel</code> contributed
     */
    private double bestMatchSimilarity(HierarchicalClusterModel referenceModel, HierarchicalClusterModel resultModel) {
        List<Cluster> resultClusters = new IterationArrayList<Cluster>(new ClusterIterator(resultModel));
        List<Cluster> referenceClusters = new IterationArrayList<Cluster>(new ClusterIterator(referenceModel));

        // The item totals are only compared to warn about models built on different item sets.
        Set<String> resultItems = new HashSet<String>(new IterationArrayList<String>(resultModel.getRootNode().getObjectsInSubtree()));
        Set<String> referenceItems = new HashSet<String>(new IterationArrayList<String>(referenceModel.getRootNode().getObjectsInSubtree()));
        if (resultItems.size() != referenceItems.size()) {
            logWarning("Number of items in both cluster models is not the same");
        }

        // Build the item set of every candidate once instead of once per pairwise comparison.
        List<Set<String>> referenceItemSets = new ArrayList<Set<String>>(referenceClusters.size());
        for (Cluster referenceCluster : referenceClusters) {
            referenceItemSets.add(itemsOf(referenceCluster));
        }

        boolean weightBySize = getParameterAsBoolean(PARAMETER_WEIGHT_CLUSTERS);
        double sum = 0.0;
        int counter = 0;

        for (Cluster cluster : resultClusters) {
            ClusterNode node = (ClusterNode) cluster;
            // Only nodes that directly hold objects take part in the comparison.
            if (node.getNumberOfObjects() <= 0) {
                continue;
            }

            Set<String> nodeItems = itemsOf(node);

            // F-measures are never negative, so 0.0 is a safe lower bound even without candidates.
            double best = 0.0;
            for (Set<String> candidateItems : referenceItemSets) {
                double value = fmeasure(nodeItems, candidateItems);
                if (value > best) {
                    best = value;
                }
            }

            if (weightBySize) {
                int weight = node.getNumberOfObjectsInSubtree();
                sum += ((double) weight) * best;
                counter += weight;
            } else {
                sum += best;
                counter++;
            }
        }

        // Nothing to average: report "undefined" explicitly (same value 0.0 / 0 would yield).
        if (counter == 0) {
            return Double.NaN;
        }
        return sum / counter;
    }

    /**
     * Collects the object ids of a cluster into a set. For a {@link ClusterNode} the objects of
     * the whole subtree are used, otherwise only the cluster's own objects.
     */
    private static Set<String> itemsOf(Cluster cluster) {
        if (cluster instanceof ClusterNode) {
            return new HashSet<String>(new IterationArrayList<String>(((ClusterNode) cluster).getObjectsInSubtree()));
        }
        return new HashSet<String>(new IterationArrayList<String>(cluster.getObjects()));
    }

    /**
     * Computes the f-measure (harmonic mean of precision and recall) between two item sets.
     *
     * @param items the items of the cluster being matched; precision is measured against its size
     * @param candidateItems the items of the candidate cluster; recall is measured against its size
     * @return a value in [0, 1]; 0.0 if either set is empty or the sets do not overlap
     */
    private static double fmeasure(Set<String> items, Set<String> candidateItems) {
        if (items.isEmpty() || candidateItems.isEmpty()) {
            return 0.0;
        }

        // The number of shared items is the hit count for both precision and recall.
        int hits = 0;
        for (String item : items) {
            if (candidateItems.contains(item)) {
                hits++;
            }
        }
        if (hits == 0) {
            return 0.0;
        }

        double pr = ((double) hits) / ((double) items.size());
        double re = ((double) hits) / ((double) candidateItems.size());
        return 2 * ((re * pr) / (re + pr));
    }

    /** Adds the three boolean parameters of this operator to the inherited parameter list. */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeBoolean(PARAMETER_WEIGHT_CLUSTERS, "should the result clusters be weighted by the fraction of items they contain", true));
        types.add(new ParameterTypeBoolean(PARAMETER_SWITCH, "switch the both cluster models", false));
        types.add(new ParameterTypeBoolean(PARAMETER_SYMMETRIC, "build the average of a two-way comparison", false));
        return types;
    }
}