/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.learner.clustering.hierarchical;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.learner.clustering.DefaultCluster;
import com.rapidminer.operator.learner.clustering.DefaultClusterNode;
import com.rapidminer.operator.learner.clustering.FlatCrispClusterModel;
import com.rapidminer.operator.learner.clustering.HierarchicalClusterModel;
import com.rapidminer.operator.learner.clustering.IdUtils;
import com.rapidminer.operator.learner.clustering.SimpleHierarchicalClusterModel;
import com.rapidminer.operator.learner.clustering.hierarchical.clustersimilarity.ClusterSimilarity;
import com.rapidminer.operator.similarity.SimilarityMeasure;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeStringCategory;
import com.rapidminer.parameter.Parameters;
import com.rapidminer.tools.ClassNameMapper;
import com.rapidminer.tools.IterationArrayList;
/**
* This class performs generic agglomerative clustering based on a set of ids and a similarity measure. The algorithm implemented here is currently
* very simple and not very efficient (cubic).
*
* @author Michael Wurst
* @version $Id: AgglomerativeClusterer.java,v 1.7 2008/09/12 10:30:07 tobiasmalbrecht Exp $
*/
public class AgglomerativeClusterer {

    /** The parameter name for "the cluster similarity criterion (class) to use" */
    public static final String PARAMETER_MODE = "mode";

    /** Fully qualified class names of the selectable cluster similarity criteria. */
    private static final String[] MODES = new String[] {
        "com.rapidminer.operator.learner.clustering.hierarchical.clustersimilarity.SingleLink",
        "com.rapidminer.operator.learner.clustering.hierarchical.clustersimilarity.CompleteLink"
    };

    /** Used to list the short names of the criteria in {@link #MODES} and to instantiate a criterion by name. */
    private static final ClassNameMapper MODE_MAP = new ClassNameMapper(MODES);

    /**
     * Merges the examples of the given example set bottom-up until at most <code>k</code> clusters remain.
     * Every distinct example id starts as its own leaf node; in each step the two remaining clusters with
     * the highest similarity are merged into a new node whose weight is that similarity.
     *
     * @param es the examples to cluster; only their ids (as delivered by {@link IdUtils}) are used
     * @param sim the similarity measure used to compare two ids
     * @param csim the criterion used to derive the similarity between a merged cluster and another cluster
     * @param k the number of clusters at which merging stops
     * @return an array with one slot per distinct id; a slot is <code>null</code> if its cluster was merged
     *         into another one, otherwise it holds the root node of a remaining cluster
     * @throws OperatorException if one of the used components signals an error
     */
    public DefaultClusterNode[] cluster(ExampleSet es, SimilarityMeasure sim, ClusterSimilarity csim, int k) throws OperatorException {
        // Collect the distinct ids of all examples
        Set<String> distinctIds = new HashSet<String>();
        Iterator<Example> reader = es.iterator();
        while (reader.hasNext()) {
            distinctIds.add(IdUtils.getIdFromExample(reader.next()));
        }
        List<String> ids = new ArrayList<String>(distinctIds);
        final int numObjs = ids.size();

        // Initialize the pairwise similarity matrix (including the diagonal)
        double[][] d = new double[numObjs][numObjs];
        for (int i = 0; i < numObjs; i++) {
            for (int j = 0; j < numObjs; j++) {
                d[i][j] = sim.similarity(ids.get(i), ids.get(j));
            }
        }

        // Initialize one leaf node per id
        DefaultClusterNode[] nodes = new DefaultClusterNode[numObjs];
        for (int i = 0; i < numObjs; i++) {
            String objId = ids.get(i);
            DefaultClusterNode leaf = new DefaultClusterNode(objId);
            leaf.addObject(objId);
            // the self similarity has already been computed for the diagonal of the matrix
            leaf.setWeight(d[i][i]);
            nodes[i] = leaf;
        }

        // Main loop: each iteration merges one pair of clusters
        for (int numClusters = numObjs; numClusters > k; numClusters--) {
            // find the most similar pair among the remaining clusters
            double max = Double.NEGATIVE_INFINITY;
            int x = -1;
            int y = -1;
            for (int i = 0; i < numObjs; i++) {
                if (nodes[i] == null) {
                    continue;
                }
                for (int j = i + 1; j < numObjs; j++) {
                    if ((nodes[j] != null) && (d[i][j] > max)) {
                        max = d[i][j];
                        x = i;
                        y = j;
                    }
                }
            }

            if ((x < 0) || (y < 0)) {
                // No mergeable pair is left. Nothing would change in the remaining
                // iterations either, so stop instead of rescanning the matrix.
                break;
            }

            // Update the matrix: row/column x now describes the merged cluster
            for (int i = 0; i < numObjs; i++) {
                if (nodes[i] != null) {
                    d[x][i] = csim.similarity(d[x][i], d[y][i], nodes[x], nodes[y], nodes[i]);
                    d[i][x] = d[x][i];
                }
            }

            // Merge the two clusters; the new node takes slot x, slot y is retired
            DefaultClusterNode newNode = new DefaultClusterNode("id " + (numClusters + numObjs));
            addSubNode(newNode, nodes[x]);
            addSubNode(newNode, nodes[y]);
            newNode.setWeight(max);
            nodes[x] = newNode;
            nodes[y] = null;
        }
        return nodes;
    }

    /**
     * Builds a hierarchical cluster model by merging down to a single cluster and then folding sub clusters
     * with fewer than <code>minItems</code> items into their parents.
     *
     * @param es the examples to cluster
     * @param sim the similarity measure used to compare two ids
     * @param csim the criterion used to derive the similarity between clusters
     * @param minItems the minimal number of items a sub cluster must contain to be kept as a separate node
     * @return the resulting model; if the example set yields no ids at all, no root node is set on the model
     * @throws OperatorException if the clustering fails
     */
    public HierarchicalClusterModel clusterHierarchical(ExampleSet es, SimilarityMeasure sim, ClusterSimilarity csim, int minItems)
            throws OperatorException {
        DefaultClusterNode[] nodes = cluster(es, sim, csim, 1);

        // the first remaining node is used as the root of the hierarchy
        DefaultClusterNode root = null;
        for (int i = 0; (i < nodes.length) && (root == null); i++) {
            if (nodes[i] != null) {
                root = nodes[i];
            }
        }

        SimpleHierarchicalClusterModel result = new SimpleHierarchicalClusterModel();
        if (root != null) {
            // an empty example set produces no nodes; there is nothing to aggregate then
            aggregateSmallClusters(root, minItems);
            result.setRootNode(root);
        }
        return result;
    }

    /**
     * Builds a flat cluster model with one cluster per node remaining after merging down to <code>k</code>
     * clusters. Each cluster is named <code>"id " + i</code>, where <code>i</code> is the index of its node
     * in the array returned by {@link #cluster(ExampleSet, SimilarityMeasure, ClusterSimilarity, int)}. The
     * smallest weight among the remaining nodes is stored in the model property <code>min_similarity</code>;
     * it stays at positive infinity if no node remains.
     *
     * @param es the examples to cluster
     * @param sim the similarity measure used to compare two ids
     * @param csim the criterion used to derive the similarity between clusters
     * @param k the number of clusters at which merging stops
     * @return the flat cluster model
     * @throws OperatorException if the clustering fails
     */
    public FlatCrispClusterModel clusterFlat(ExampleSet es, SimilarityMeasure sim, ClusterSimilarity csim, int k) throws OperatorException {
        DefaultClusterNode[] nodes = cluster(es, sim, csim, k);
        double minSimilarity = Double.POSITIVE_INFINITY;
        FlatCrispClusterModel flatResult = new FlatCrispClusterModel();
        for (int i = 0; i < nodes.length; i++) {
            if (nodes[i] == null) {
                continue;
            }
            if (nodes[i].getWeight() < minSimilarity) {
                minSimilarity = nodes[i].getWeight();
            }
            DefaultCluster cl = new DefaultCluster("id " + i);
            Iterator<String> it = nodes[i].getObjectsInSubtree();
            while (it.hasNext()) {
                cl.addObject(it.next());
            }
            flatResult.addCluster(cl);
        }
        flatResult.setProperty("min_similarity", minSimilarity);
        return flatResult;
    }

    /**
     * Recursively folds sub trees with fewer than <code>minSize</code> items into their parent node. Inner
     * nodes are expected to have exactly two sub nodes, as produced by
     * {@link #cluster(ExampleSet, SimilarityMeasure, ClusterSimilarity, int)}.
     * <ul>
     * <li>If both sub trees are too small but together with the node's own items reach <code>minSize</code>,
     * all sub nodes are removed and their items are attached directly to the node.</li>
     * <li>If both are too small and the total is still below <code>minSize</code>, the node is left untouched
     * so that a node further up can absorb it.</li>
     * <li>If exactly one sub tree is too small, it is absorbed into the node.</li>
     * </ul>
     *
     * @param cn the node whose sub tree is processed
     * @param minSize the minimal number of items a sub tree must contain to be kept
     * @return all items in the sub tree of <code>cn</code>
     */
    private List<String> aggregateSmallClusters(DefaultClusterNode cn, int minSize) {
        // snapshot of the node's own items, taken before any item is added below
        List<String> localItems = new IterationArrayList<String>(cn.getObjects());
        if (cn.getNumberOfSubNodes() == 0) {
            return localItems;
        }

        List<String> itemsLeft = aggregateSmallClusters((DefaultClusterNode) cn.getSubNodeAt(0), minSize);
        List<String> itemsRight = aggregateSmallClusters((DefaultClusterNode) cn.getSubNodeAt(1), minSize);
        boolean leftTooSmall = itemsLeft.size() < minSize;
        boolean rightTooSmall = itemsRight.size() < minSize;

        if (leftTooSmall && rightTooSmall) {
            if (itemsLeft.size() + itemsRight.size() + localItems.size() >= minSize) {
                while (cn.getNumberOfSubNodes() > 0) {
                    cn.removeSubNodeAt(0);
                }
                for (String id : itemsLeft) {
                    cn.addObject(id);
                }
                for (String id : itemsRight) {
                    cn.addObject(id);
                }
            }
        } else if (leftTooSmall) {
            absorbSubNode(cn, 0, itemsLeft);
        } else if (rightTooSmall) {
            absorbSubNode(cn, 1, itemsRight);
        }

        localItems.addAll(itemsLeft);
        localItems.addAll(itemsRight);
        return localItems;
    }

    /**
     * Attaches the given items directly to <code>parent</code>, lowers the parent's weight to the weight of
     * the sub node at <code>index</code> if that weight is smaller, and removes that sub node.
     */
    private void absorbSubNode(DefaultClusterNode parent, int index, List<String> items) {
        for (String id : items) {
            parent.addObject(id);
        }
        if (parent.getSubNodeAt(index).getWeight() < parent.getWeight()) {
            parent.setWeight(parent.getSubNodeAt(index).getWeight());
        }
        parent.removeSubNodeAt(index);
    }

    /** Adds <code>subNode</code> as a sub node of <code>node</code>. */
    private void addSubNode(DefaultClusterNode node, DefaultClusterNode subNode) {
        node.addSubNode(subNode);
    }

    /**
     * Instantiates the cluster similarity criterion named by the parameter {@link #PARAMETER_MODE}.
     *
     * @param parameters the parameters to read the criterion name from
     * @return the instantiated criterion
     * @throws UserError if the parameter cannot be read or the criterion cannot be instantiated
     */
    public static ClusterSimilarity resolveClusterSimilarity(Parameters parameters) throws UserError {
        String csimClassName = (String) parameters.getParameter(PARAMETER_MODE);
        return (ClusterSimilarity) MODE_MAP.getInstantiation(csimClassName);
    }

    /**
     * Creates the non-expert parameter {@link #PARAMETER_MODE}. It offers the short class names of the
     * available criteria and uses the first one as default.
     *
     * @return the parameter type describing the cluster similarity criterion
     */
    public static ParameterType createClusterSimilarityParameter() {
        String[] shortNames = MODE_MAP.getShortClassNames();
        ParameterType p = new ParameterTypeStringCategory(PARAMETER_MODE, "the cluster similarity criterion (class) to use", shortNames,
                shortNames[0]);
        p.setExpert(false);
        return p;
    }
}