BestMatchClusterModelSimilarity.java example

Explorer
ComplexRapidMiner-master
- operator
- src
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.validation.clustering;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.clustering.Cluster;
import com.rapidminer.operator.learner.clustering.ClusterIterator;
import com.rapidminer.operator.learner.clustering.ClusterNode;
import com.rapidminer.operator.learner.clustering.HierarchicalClusterModel;
import com.rapidminer.operator.performance.EstimatedPerformance;
import com.rapidminer.operator.performance.PerformanceCriterion;
import com.rapidminer.operator.performance.PerformanceVector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.tools.IterationArrayList;


/**
 * Compares two cluster models by searching for each concept a best matching one in the compared cluster model in terms of f-measure. The average f-measure of the best matches is then the overall cluster model similarity.
 * 
 * @author Michael Wurst
 * @version $Id: BestMatchClusterModelSimilarity.java,v 1.8 2008/09/12 10:31:55 tobiasmalbrecht Exp $
 * 
 */
public class BestMatchClusterModelSimilarity extends Operator {
	
	public static final String PARAMETER_WEIGHT_CLUSTERS = "weight_clusters";

	public BestMatchClusterModelSimilarity(OperatorDescription description) {
		super(description);
	}

	public IOObject[] apply() throws OperatorException {
		HierarchicalClusterModel cm2 = getInput(HierarchicalClusterModel.class);
		HierarchicalClusterModel cm1 = getInput(HierarchicalClusterModel.class);
		
		
		if(getParameterAsBoolean("switch")) {
			HierarchicalClusterModel cm3 = cm1;
			cm1 = cm2;
			cm2 = cm3;
		}
		
		if ((cm1 == null) || (cm2 == null)) {
			PerformanceVector pv = new PerformanceVector();
			PerformanceCriterion pc = new EstimatedPerformance("f_measure", Double.NaN, 1, false);
			pv.addCriterion(pc);
			logWarning("Could not compare cm, one of them is null");
			return new IOObject[] { pv };
		}

		if ((cm1.getRootNode() == null) || (cm2.getRootNode() == null)) {
			PerformanceVector pv = new PerformanceVector();
			PerformanceCriterion pc = new EstimatedPerformance("f_measure", Double.NaN, 1, false);
			pv.addCriterion(pc);
			logWarning("Could not compare cm, one of them is null");
			return new IOObject[] { pv };
		}

		log("Reference cluster model has root node label " + cm1.getRootNode().getDescription());
		PerformanceVector pv = new PerformanceVector();
		double performance = 0.0;
		if(getParameterAsBoolean("symmetric"))
			performance = (bestMatchSimilarity(cm1, cm2) + bestMatchSimilarity(cm2, cm1))/2;
		else
			performance = bestMatchSimilarity(cm1, cm2);

		PerformanceCriterion pc = new EstimatedPerformance("f-measure", performance, 1, false);
		pv.addCriterion(pc);
		log("sim:" + performance);
		return new IOObject[] { pv };
	}

	public Class<?>[] getInputClasses() {
		return new Class[] { HierarchicalClusterModel.class };
	}

	public Class<?>[] getOutputClasses() {
		return new Class[] { PerformanceVector.class };
	}

	private double bestMatchSimilarity(HierarchicalClusterModel referenceModel, HierarchicalClusterModel resultModel) {
		HierarchicalClusterModel cm2 = referenceModel;
		HierarchicalClusterModel cm1 = resultModel;

		List<Cluster> clusterVector1 = new IterationArrayList<Cluster>(new ClusterIterator(cm1));
		List<Cluster> clusterVector2 = new IterationArrayList<Cluster>(new ClusterIterator(cm2));

		Set<String> items1Set = new HashSet<String>(new IterationArrayList<String>(cm1.getRootNode().getObjectsInSubtree()));
		int totalNumItems1 = items1Set.size();

		Set<String> items2Set = new HashSet<String>(new IterationArrayList<String>(cm2.getRootNode().getObjectsInSubtree()));
		int totalNumItems2 = items2Set.size();

		int totalNumItems = totalNumItems1;

		if (totalNumItems1 != totalNumItems2)
			logWarning("Number of items in both cluster models is not the same");

		double sum = 0.0;
		int counter = 0;

		for (int i = 0; i < clusterVector1.size(); i++) {
			ClusterNode cl1 = (ClusterNode) clusterVector1.get(i);
			int numObjsInCl1 = cl1.getNumberOfObjectsInSubtree();
			if (cl1.getNumberOfObjects() > 0) {
				double max = Double.NEGATIVE_INFINITY;
				for (int j = 0; j < clusterVector2.size(); j++) {
					double v = fmeasure(cl1, clusterVector2.get(j), totalNumItems);
					if (v > max)
						max = v;
				}
				
				if (getParameterAsBoolean(PARAMETER_WEIGHT_CLUSTERS)) {
					sum = sum + ((double) numObjsInCl1)*max;
					counter = counter + numObjsInCl1;
				}
				else {
					sum = sum + max;
					counter++;
				}
				
			}
		}
			return sum / counter;
	}

	private double fmeasure(Cluster c1, Cluster c2, int n) {
		Set s1 = null;
		if (c1 instanceof ClusterNode)
			s1 = new HashSet<String>(new IterationArrayList<String>(((ClusterNode) c1).getObjectsInSubtree()));
		else
			s1 = new HashSet<String>(new IterationArrayList<String>(c1.getObjects()));

		Set s2 = null;
		if (c2 instanceof ClusterNode)
			s2 = new HashSet<String>(new IterationArrayList<String>(((ClusterNode) c2).getObjectsInSubtree()));
		else
			s2 = new HashSet<String>(new IterationArrayList<String>(c2.getObjects()));

		if ((s1.size() == 0) || (s2.size() == 0))
			return 0.0;

		int prHits = 0;
		int reHits = 0;

		Iterator it = s1.iterator();
		while (it.hasNext())
			if (s2.contains(it.next()))
				prHits++;

		Iterator it2 = s2.iterator();
		while (it2.hasNext())
			if (s1.contains(it2.next()))
				reHits++;

		if ((reHits == 0) && (prHits == 0))
			return 0.0;

		double pr = ((double) prHits) / ((double) s1.size());
		double re = ((double) reHits) / ((double) s2.size());

		return 2 * ((re * pr) / (re + pr));
	}

	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeBoolean(PARAMETER_WEIGHT_CLUSTERS, "should the result clusters be weighted by the fraction of items they contain", true));
		types.add(new ParameterTypeBoolean("switch", "switch the both cluster models", false));
		types.add(new ParameterTypeBoolean("symmetric","build the average of a two-way comparison", false));
		return types;
	}

}