/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.clustering.clusterer;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorCapability;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.learner.CapabilityProvider;
import com.rapidminer.operator.ports.metadata.DistanceMeasurePrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
import com.rapidminer.tools.math.similarity.DistanceMeasureHelper;
import com.rapidminer.tools.math.similarity.DistanceMeasures;
/**
* This operator provides the DBScan cluster algorithm. If no id attribute is present, the operator will create one.
* @author Sebastian Land
*/
public class DBScan extends RMAbstractClusterer implements CapabilityProvider {
private static final String PARAMETER_EPSILON = "epsilon";
private static final String PARAMETER_MIN_POINTS= "min_points";
private DistanceMeasureHelper measureHelper = new DistanceMeasureHelper(this);
public DBScan(OperatorDescription description) {
super(description);
getExampleSetInputPort().addPrecondition(new DistanceMeasurePrecondition(getExampleSetInputPort(), this));
}
@Override
public boolean supportsCapability(OperatorCapability capability) {
int measureType = DistanceMeasures.MIXED_MEASURES_TYPE;
try {
measureType = measureHelper.getSelectedMeasureType();
} catch (Exception e) {
}
switch (capability) {
case BINOMINAL_ATTRIBUTES:
case POLYNOMINAL_ATTRIBUTES:
return (measureType == DistanceMeasures.MIXED_MEASURES_TYPE) ||
(measureType == DistanceMeasures.NOMINAL_MEASURES_TYPE);
case NUMERICAL_ATTRIBUTES:
return (measureType == DistanceMeasures.MIXED_MEASURES_TYPE) ||
(measureType == DistanceMeasures.DIVERGENCES_TYPE) ||
(measureType == DistanceMeasures.NUMERICAL_MEASURES_TYPE);
case POLYNOMINAL_LABEL:
case BINOMINAL_LABEL:
case NUMERICAL_LABEL:
case WEIGHTED_EXAMPLES:
case MISSING_VALUES:
return true;
default:
return false;
}
}
@Override
public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
DistanceMeasure measure = measureHelper.getInitializedMeasure(exampleSet);
double epsilon = getParameterAsDouble(PARAMETER_EPSILON);
int minPoints = getParameterAsInt(PARAMETER_MIN_POINTS);
// checking and creating ids if necessary
Tools.checkAndCreateIds(exampleSet);
// additional checks
Tools.onlyNonMissingValues(exampleSet, "DBScan");
// extracting attribute names
Attributes attributes = exampleSet.getAttributes();
ArrayList<String> attributeNames = new ArrayList<String>(attributes.size());
for (Attribute attribute: attributes)
attributeNames.add(attribute.getName());
boolean[] visited = new boolean[exampleSet.size()];
boolean[] noised = new boolean[exampleSet.size()];
int[] clusterAssignments = new int[exampleSet.size()];
int i = 0;
int clusterIndex = 1;
for (Example example: exampleSet) {
checkForStop();
if (!visited[i]) {
Queue<Integer> centerNeighbourhood = getNeighbourhood(example, exampleSet, measure, epsilon);
if (centerNeighbourhood.size() < minPoints) {
noised[i] = true;
} else {
// then its center point of a cluster. Assign example to new cluster
clusterAssignments[i] = clusterIndex;
// expanding cluster within density borders
while (centerNeighbourhood.size() > 0) {
int currentIndex = centerNeighbourhood.poll().intValue();
Example currentExample = exampleSet.getExample(currentIndex);
// assigning example to current cluster
clusterAssignments[currentIndex] = clusterIndex;
visited[currentIndex] = true;
// appending own neighbourhood to queue
Queue<Integer> neighbourhood = getNeighbourhood(currentExample, exampleSet, measure, epsilon);
if (neighbourhood.size() >= minPoints) {
// then this neighbor of center is also a center of the cluster
while (neighbourhood.size() > 0) {
int neighbourIndex = neighbourhood.poll().intValue();
if (!visited[neighbourIndex]) {
if (!noised[neighbourIndex]) {
// if its not noised, then it might be center of cluster! So append to queue
centerNeighbourhood.add(neighbourIndex);
}
clusterAssignments[neighbourIndex] = clusterIndex;
visited[neighbourIndex] = true;
}
}
}
}
// step to next cluster
clusterIndex++;
}
}
i++;
}
ClusterModel model = new ClusterModel(exampleSet, Math.max(clusterIndex, 1), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));
model.setClusterAssignments(clusterAssignments, exampleSet);
if (addsClusterAttribute()) {
Attribute cluster = AttributeFactory.createAttribute(Attributes.CLUSTER_NAME, Ontology.NOMINAL);
exampleSet.getExampleTable().addAttribute(cluster);
exampleSet.getAttributes().setCluster(cluster);
i = 0;
for (Example example: exampleSet) {
example.setValue(cluster, "cluster_" + clusterAssignments[i]);
i++;
}
}
return model;
}
private LinkedList<Integer> getNeighbourhood(Example centerExample, ExampleSet exampleSet, DistanceMeasure measure, double epsilon) {
LinkedList<Integer> neighbourhood = new LinkedList<Integer>();
int i = 0;
for (Example example: exampleSet) {
double distance = measure.calculateDistance(centerExample, example);
if (distance < epsilon)
neighbourhood.add(i);
i++;
}
return neighbourhood;
}
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = new LinkedList<ParameterType>();
types.add(new ParameterTypeDouble(PARAMETER_EPSILON, "Specifies the size of neighbourhood.", 0, Double.POSITIVE_INFINITY, 1, false));
types.add(new ParameterTypeInt(PARAMETER_MIN_POINTS, "The minimal number of points forming a cluster.", 1, Integer.MAX_VALUE, 5, false));
types.addAll(super.getParameterTypes());
types.addAll(DistanceMeasures.getParameterTypes(this));
return types;
}
}