/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.clustering.hierarchical.upgma;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.clustering.ClusterModel;
import com.rapidminer.operator.learner.clustering.hierarchical.AbstractHierarchicalClusterer;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;

/**
 * This operator generates a tree, each node of which represents a cluster. UPGMA stands for Unweighted Pair Group
 * Method using Arithmetic Means. Since the way cluster distances are calculated can be specified using parameters,
 * this name is slightly misleading. Unfortunately, the name of the algorithm changes depending on the parameters
 * used. <br/>
 * Starting with initial clusters of size 1, the algorithm unites the two clusters with minimal distance, forming a
 * new tree node. This is iterated until there is only one cluster left, which forms the root of the tree. <br/>
 * This operator does not generate a special cluster attribute and does not modify the input example set at all, since
 * it generates too many clusters. The tree generated by this operator is considered the interesting result of the
 * algorithm.
 * <p>
 * NOTE(review): {@link #getParameterTypes()} nevertheless registers the inherited add-cluster-attribute parameter;
 * how that parameter is consumed is not visible in this class and should be confirmed against the superclass.
 * </p>
 *
 * @author Simon Fischer, Ingo Mierswa
 * @version $Id: UPGMA.java,v 1.10 2008/09/12 10:29:51 tobiasmalbrecht Exp $
 */
public class UPGMA extends AbstractHierarchicalClusterer {

    /** The parameter name for &quot;Specifies the way the distance of two examples is calculated.&quot; */
    public static final String PARAMETER_DISTANCE_MEASURE = "distance_measure";

    /** The parameter name for &quot;Specifies the way the distance of two clusters is calculated.&quot; */
    public static final String PARAMETER_CLUSTER_DISTANCE_MEASURE = "cluster_distance_measure";

    /**
     * Creates the operator from its description.
     *
     * @param description the operator description handed on to the superclass
     */
    public UPGMA(OperatorDescription description) {
        super(description);
    }

    /**
     * Builds the hierarchical cluster tree for the given example set.
     * <p>
     * One cluster is created per row of the distance matrix. Then, repeatedly, the pair of clusters with the smallest
     * distance is joined under a new tree node until a single cluster remains; its tree becomes the model.
     * </p>
     *
     * @param es the example set to cluster
     * @return a {@link UPGMAHierarchicalClusterModel} wrapping the tree of the last remaining cluster
     * @throws OperatorException if a parameter cannot be read, if the distance matrix is empty, or if no pair of
     *             clusters with a distance below positive infinity exists (e.g. all remaining distances are NaN or
     *             infinite)
     */
    public ClusterModel createClusterModel(ExampleSet es) throws OperatorException {
        ClusterDistanceMeasure clusterDistanceMeasure = DistanceMeasure
                .createClusterDistanceMeasure(getParameterAsInt(PARAMETER_CLUSTER_DISTANCE_MEASURE));
        DistanceMatrix distanceMatrix = DistanceMeasure.createDistanceMeasure(getParameterAsInt(PARAMETER_DISTANCE_MEASURE))
                .calculateDistanceMatrix(es);

        List<Cluster> clusters = createInitialClusters(distanceMatrix);
        if (clusters.isEmpty()) {
            // without this guard the final clusters.get(0) fails with an IndexOutOfBoundsException
            throw new OperatorException("UPGMA: cannot build a cluster tree because the distance matrix is empty.");
        }

        while (clusters.size() > 1) {
            // find the two clusters i, j (j before i in the list) with minimal distance d;
            // the strict comparison keeps the first minimal pair encountered
            Cluster clusterI = null;
            Cluster clusterJ = null;
            double d = Double.POSITIVE_INFINITY;
            for (int i = 0; i < clusters.size(); i++) {
                Cluster tempI = clusters.get(i);
                for (int j = 0; j < i; j++) {
                    Cluster tempJ = clusters.get(j);
                    double distance = tempI.getDistance(tempJ.getIndex());
                    if (distance < d) {
                        d = distance;
                        clusterI = tempI;
                        clusterJ = tempJ;
                    }
                }
            }

            if (clusterI == null || clusterJ == null) {
                // happens when every remaining pairwise distance is NaN or infinite;
                // previously this surfaced as a NullPointerException a few lines below
                throw new OperatorException("UPGMA: cannot merge the remaining " + clusters.size()
                        + " clusters because no pair has a finite distance (distances are NaN or infinite).");
            }

            // join both sub-trees under a new node placed at height d / 2; the values handed to the Tree
            // constructor are the differences between the new height and the heights of the sub-trees
            double newHeight = d / 2;
            Tree t1 = clusterI.getTree();
            double h1 = t1.getHeight();
            Tree t2 = clusterJ.getTree();
            double h2 = t2.getHeight();
            // the node name is the new height rounded to two decimals
            Tree newTree = new Tree("" + ((double) (Math.round(newHeight * 100)) / 100), t1, newHeight - h1, t2, newHeight - h2);
            newTree.setHeight(newHeight);
            clusterI.setTree(newTree);
            clusters.remove(clusterJ);

            // recalculate the distances between the united cluster (kept in clusterI) and all remaining clusters;
            // this must happen before clusterI.union(clusterJ) so that both clusters are still passed in their
            // pre-union state
            for (Cluster clusterL : clusters) {
                if (clusterI.getIndex() != clusterL.getIndex()) {
                    double newDistance = clusterDistanceMeasure.calculateUnionDistance(
                            clusterI.getDistance(clusterL.getIndex()),
                            clusterJ.getDistance(clusterL.getIndex()),
                            clusterI, clusterJ, clusterL);
                    clusterL.setDistance(clusterI.getIndex(), newDistance);
                    clusterI.setDistance(clusterL.getIndex(), newDistance);
                } else {
                    // distance of the united cluster to itself
                    clusterI.setDistance(clusterL.getIndex(), Double.POSITIVE_INFINITY);
                }
            }
            clusterI.union(clusterJ);
        }

        UPGMAHierarchicalClusterModel result = new UPGMAHierarchicalClusterModel(clusters.get(0).getTree(), es);
        return result;
    }

    /** Creates one initial cluster per row of the distance matrix, each holding a copy of that row's distances. */
    private List<Cluster> createInitialClusters(DistanceMatrix distanceMatrix) {
        int dimension = distanceMatrix.getDimension();
        List<Cluster> clusters = new ArrayList<Cluster>(dimension);
        for (int i = 0; i < dimension; i++) {
            double[] distances = new double[dimension];
            for (int j = 0; j < dimension; j++) {
                distances[j] = distanceMatrix.getDistance(i, j);
            }
            clusters.add(new Cluster(distanceMatrix.getName(i), distances, i));
        }
        return clusters;
    }

    /**
     * Returns the parameter types of the superclass extended by the example distance measure, the cluster distance
     * measure and the boolean add-cluster-attribute parameter.
     *
     * @return the list obtained from the superclass with the three additional parameter types appended
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterType type = new ParameterTypeCategory(PARAMETER_DISTANCE_MEASURE,
                "Specifies the way the distance of two examples is calculated.", DistanceMeasure.TYPE_NAMES,
                DistanceMeasure.TYPE_EUCLIDIAN);
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeCategory(PARAMETER_CLUSTER_DISTANCE_MEASURE,
                "Specifies the way the distance of two clusters is calculated.", DistanceMeasure.CLUSTER_TYPE_NAMES,
                DistanceMeasure.TYPE_AVERAGE);
        type.setExpert(false);
        types.add(type);

        types.add(new ParameterTypeBoolean(PARAMETER_ADD_CLUSTER_ATTRIBUTE,
                "if true, a cluster id is generated as new special attribute ", true));
        return types;
    }
}