package com.rapidminer.operator.learner.clustering.clusterer.uncertain;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.InputDescription;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.clustering.ClusterModel;
import com.rapidminer.operator.learner.clustering.FlatClusterModel;
import com.rapidminer.operator.learner.clustering.IdUtils;
import com.rapidminer.operator.learner.clustering.clusterer.AbstractDensityBasedClusterer;
import com.rapidminer.operator.similarity.SimilarityMeasure;
import com.rapidminer.operator.similarity.attributebased.uncertain.SimpleProbabilityDensityFunction;
import com.rapidminer.operator.uncertain.AbstractSampleStrategy;
import com.rapidminer.operator.uncertain.SimpleSampling;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
/**
* Implements the DBSCAN^EA algorithm.
*
* @author Michael Huber, Peter B Volk
* @see com.rapidminer.operator.learner.clustering.clusterer.DBScanClustering
* @see com.rapidminer.operator.learner.clustering.clusterer.uncertain.FDBScanClustering
* @see com.rapidminer.operator.learner.clustering.clusterer.ClusteringAggregation
*/
public class DBScanEAClustering extends AbstractDensityBasedClusterer {
protected ExampleSet es;
private double maxDistance = 0.2;
private static final String MAX_DISTANCE_NAME = "max_distance";
//NOTE: While not having a pdf for each element, this global fuzziness is used.
private double globalFuzziness = 0.0;
private static final String GLOBAL_UNCERTAINTY = "global_uncertainty";
private double lambda = 0.5;
private static final String LAMBDA = "lambda";
//NOTE: sampleRate defines the amount of samples per Object
private int sampleRate = 5;
private static final String SAMPLE_RATE = "sample_rate";
private static final String ABSOLUTE_ERROR = "Absolute error";
protected AbstractSampleStrategy sampleStrategy;
protected Map<String, Double[][]> sampleCache;
public DBScanEAClustering(OperatorDescription description) {
super(description);
}
/**
* Creates a <code>ClusterModel</code> using <code>doClustering</code>.
*
* @param es dataset that is clustered
* @return <code>ClusterModel</code> the clustering of the given dataset
*/
public ClusterModel createClusterModel(ExampleSet es) throws OperatorException {
this.es = es;
lambda = getParameterAsDouble(LAMBDA);
//System.out.println("[Performance Antje] lambda: " + lambda);
globalFuzziness = getParameterAsDouble(GLOBAL_UNCERTAINTY);
maxDistance = getParameterAsDouble(MAX_DISTANCE_NAME);
sampleStrategy = new SimpleSampling();
sampleStrategy.setSampleRate(this.sampleRate);
sampleCache = new HashMap<String, Double[][]>();
FlatClusterModel result = doClustering(es);
return result;
}
/**
* Calculates the neighborhood of the given core object
* and returns a list of IDs of all found objects.
*
* @param es Dataset of Elements to be processed.
* @param id ID of the core object.
* @return All IDs of Elements in the epsilon-neighborhood.
*/
protected List<String> getNeighbours(ExampleSet es, String id) {
List<String> result = new LinkedList<String>();
for (int i = 0; i < getIds().size(); i++) {
String id2 = getIds().get(i);
double v = lambdaDistance(id, id2, this.lambda);
if (v <= maxDistance)
result.add(id2);
}
return result;
}
public double lambdaDistance(String id1, String id2, double lambda) {
double minDist = minDistance(id1, id2);
double diff = maxDistance(id1, id2) - minDist;
double lDist = diff * (1 - lambda) + minDist;
return lDist;
}
public double minDistance(String id1, String id2) {
double dist = Double.MAX_VALUE;
Double [][] e1 = getSamples(id1);
Double [][] e2 = getSamples(id2);
int max_dimensions = e1[0].length;
double[] a = new double[max_dimensions];
double[] b = new double[max_dimensions];
for(int i=0; i<sampleRate; i++) {
for(int j=0; j<sampleRate; j++) {
for(int d=0; d<max_dimensions; d++) {
a[d] = e1[i][d];
b[d] = e2[j][d];
}
if(distance(a, b) < dist) {
dist = distance(a, b);
}
}
}
return dist;
}
public double maxDistance(String id1, String id2) {
double dist = Double.MIN_VALUE;
Double [][] e1 = getSamples(id1);
Double [][] e2 = getSamples(id2);
int max_dimensions = e1[0].length;
double[] a = new double[max_dimensions];
double[] b = new double[max_dimensions];
for(int i=0; i<sampleRate; i++) {
for(int j=0; j<sampleRate; j++) {
for(int d=0; d < max_dimensions; d++) {
a[d] = e1[i][d];
b[d] = e2[j][d];
}
if(distance(a, b) > dist) {
dist = distance(a, b);
}
}
}
return dist;
}
/**
* Calculates the distance using the euclidean distance measurement.
*
* @param e1 measure starting point
* @param e2 measure ending point
*/
public double distance(double e1, double e2) {
if ((Double.isNaN(e1)) || (Double.isNaN(e2))) {
return Double.NaN;
}
return Math.sqrt((e1 - e2) * (e1 - e2));
}
/**
* Calculates the distance of n-dimensional vectors using the
* euclidean distance measurement.
*
* @param e1 n-dimensional starting vector
* @param e2 n-dimensional ending vector
*/
public double distance(double[] e1, double[] e2) {
double sum = 0.0;
int counter = 0;
for (int i = 0; i < e1.length; i++) {
if ((!Double.isNaN(e1[i])) && (!Double.isNaN(e2[i]))) {
sum = sum + (e1[i] - e2[i]) * (e1[i] - e2[i]);
counter++;
}
}
double d = Math.sqrt(sum);
if (counter > 0)
return d;
else
return Double.NaN;
}
protected Double[][] getSamples(String id) {
if(!sampleCache.containsKey(id)) {
Example ex = IdUtils.getExampleFromId(es, id);
sampleStrategy.setPdf(new SimpleProbabilityDensityFunction(globalFuzziness,getParameterAsBoolean(ABSOLUTE_ERROR)));
sampleStrategy.setValue(getValues(ex));
Double[][] res = sampleStrategy.getSamples();
sampleCache.put(id, res);
return res;
}
return sampleCache.get(id);
}
private double[] getValues(Example e) {
if (e == null)
return null;
double[] values = new double[e.getAttributes().size()];
int index = 0;
for (Attribute attribute : e.getAttributes())
values[index++] = e.getValue(attribute);
return values;
}
public InputDescription getInputDescription(Class cls) {
if (SimilarityMeasure.class.isAssignableFrom(cls)) {
return new InputDescription(cls, false, true);
} else {
return super.getInputDescription(cls);
}
}
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
ParameterType p1;
//@ANTJE p1 = new ParameterTypeDouble(MAX_DISTANCE_NAME, "maximal distance", 0.0, Double.POSITIVE_INFINITY, 0.8);
p1 = new ParameterTypeString(MAX_DISTANCE_NAME, "maximal distance","0.8");
//@END ANTJE
p1.setExpert(false);
types.add(p1);
ParameterType p2;
p2 = new ParameterTypeDouble(GLOBAL_UNCERTAINTY, "global fuzzyness", 0.0, Double.POSITIVE_INFINITY, 0.0);
p2.setDescription("Global fuzziness describes by which amount the values from the example set " +
"could fluctuate: i.e. plus/minus the given value. ");
p2.setExpert(false);
types.add(p2);
ParameterType p3;
p3 = new ParameterTypeInt(SAMPLE_RATE, "sample rate", 0, Integer.MAX_VALUE, 5);
p3.setDescription("Sample Rate sets the number of samples that are taken from each element.");
p3.setExpert(false);
types.add(p3);
ParameterType p4;
p2 = new ParameterTypeBoolean(ABSOLUTE_ERROR, "Specifies if the error is an absolute error",true);
p2.setExpert(false);
types.add(p2);
//@ANTJE performanceTestOnly:
p4 = new ParameterTypeString(LAMBDA, "lambda","0.5");
//p4 = new ParameterTypeDouble(LAMBDA, "lambda", 0, 1, 0.5);
//end @ANTJE
p4.setDescription("The range of this parameter spans from an extremly optimistic (1) " +
"to an extemly pessimistic (0) cluster strategy.");
p4.setExpert(false);
types.add(p4);
//types.add(SimilarityUtil.generateSimilarityParameter());
return types;
}
}