package com.rapidminer.operator.learner.clustering.clusterer.uncertain; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.operator.InputDescription; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.learner.clustering.ClusterModel; import com.rapidminer.operator.learner.clustering.FlatClusterModel; import com.rapidminer.operator.learner.clustering.IdUtils; import com.rapidminer.operator.learner.clustering.clusterer.AbstractDensityBasedClusterer; import com.rapidminer.operator.similarity.SimilarityMeasure; import com.rapidminer.operator.similarity.attributebased.Matrix; import com.rapidminer.operator.similarity.SimilarityUtil; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.operator.similarity.attributebased.uncertain.AbstractProbabilityDensityFunction; import com.rapidminer.operator.similarity.attributebased.uncertain.SimpleProbabilityDensityFunction; import com.rapidminer.operator.uncertain.AbstractSampleStrategy; import com.rapidminer.operator.uncertain.SimpleSampling; /** * Implements the FDBSCAN algorithm. * * @author Michael Huber, Peter B. Volk * @see com.rapidminer.operator.learner.clustering.clusterer.DBScanClustering * @see com.rapidminer.operator.learner.clustering.clusterer.uncertain.DBScanEAClustering * @see com.rapidminer.operator.learner.clustering.clusterer.ClusteringAggregation */ public class FDBScanClustering extends AbstractDensityBasedClusterer { private ExampleSet es; private double maxDistance = 0.2; private static final String ABSOLUTE_ERROR = "Absolute error"; private static final String MAX_DISTANCE_NAME = "max_distance"; //NOTE: While not having a pdf for each element, this global fuzziness is used. private double globalFuzziness = 0.0; private static final String GLOBAL_FUZZINESS = "global_fuzziness"; //sampleRate defines the amount of samples per Object private int sampleRate = 10; private static final String SAMPLE_RATE = "sample_rate"; private AbstractSampleStrategy sampleStrategy; //HashMap that assigns an array of samples to each element private Map<String, Double[][]> sampleCache; //HashMap that assigns a pdf to each element private Map<String, AbstractProbabilityDensityFunction> pdfCache; //HashMap that assigns a Minimum Bounding Rectangle (MBR) to each element //private Map<String, MinimumBoundingRectangle> boundingBoxes; private Map<String, Double> coreObjectList; public FDBScanClustering(OperatorDescription description) { super(description); pdfCache = new HashMap<String, AbstractProbabilityDensityFunction>(); sampleCache = new HashMap<String, Double[][]>(); //boundingBoxes = new HashMap<String, MinimumBoundingRectangle>(); coreObjectList = new HashMap<String, Double>(); sampleStrategy = new SimpleSampling(); } /** * Creates a <code>ClusterModel</code> using <code>doClustering</code>. * * @param es dataset that is clustered * @return <code>ClusterModel</code> the clustering of the given dataset */ public ClusterModel createClusterModel(ExampleSet es) throws OperatorException { this.es = es; maxDistance = getParameterAsDouble(MAX_DISTANCE_NAME); globalFuzziness = getParameterAsDouble(GLOBAL_FUZZINESS); FlatClusterModel result = doClustering(es); return result; } /** * Calculates the epsilon-neighborhood of the given core object * and returns a list of IDs of all found objects. * * @param es Dataset of Elements to be processed. * @param id ID of the core object. * @return All IDs of Elements in the epsilon-neighborhood. */ protected List<String> getNeighbours(ExampleSet es, String id) { List<String> preselection = new LinkedList<String>(); for (int i = 0; i < getIds().size(); i++) { String id2 = getIds().get(i); if (isReachable(id, id2)) preselection.add(id2); } if(!isCoreObject(id, preselection)) { //gibt eine leere Liste zur�ck, da die Kernobjektbedingung nicht zutrifft return new LinkedList<String>(); } // In der Matrixmethode wurden bereits die Core-Object-Wahrscheinlichkeiten berechnet. // Hier wird nur P^reach berechnet und dann damit multipliziert... List<String> result = new LinkedList<String>(); double coreProbability = getCoreObjectProbability(id); for (int i = 0; i < preselection.size(); i++) { String id2 = preselection.get(i); //Es wird die Wahrscheinlichkeit ausgerechnet (P^reach = P^core * P^entfernung) if ((coreProbability * similarity(id, id2)) > 0.5) result.add(id2); } return result; } //Berechnet die Wahrscheinlichkeit, dass die zwei Objekte "epsilon-Nachbarn" sind. public double similarity(String id1, String id2) { double prob = 0; Double [][] e1 = getSamples(id1); Double [][] e2 = getSamples(id2); int max_dimensions = e1.length; double[] a = new double[max_dimensions]; double[] b = new double[max_dimensions]; for(int i=0; i<sampleRate; i++) { for(int j=0; j<sampleRate; j++) { for(int d=0; d<max_dimensions; d++) { a[d] = e1[i][d]; b[d] = e2[j][d]; } if(distance(a, b) <= maxDistance) { prob++; } } } prob = prob / (sampleRate*sampleRate); return prob; } //Pr�ft ob mindestens ein Sample innerhalb der Epsilon-Umgebung liegt. public boolean isReachable(String id1, String id2) { boolean reachable = false; //NOTE: Wenn diese Methode umgeschrieben wird, muss sampleCache delegiert werden! //TODO: Bounding Box hernehmen und checken, ob Element �berhaupt in Frage kommt //Falls es die nicht gibt aus samples erstellen //Falls es die samples noch nicht gibt, die mit element und pdf erstellen /* for (int i = 0; i < sampleRate; i++) { MinimumBoundingRectangle mbr1 = getBoundingBox(id1); MinimumBoundingRectangle mbr2 = getBoundingBox(id2); if (minimalDistance(mbr1, mbr2) < maxDistance) reachable = true; } */ reachable = true; return reachable; } public double getCoreObjectProbability(String id) { return coreObjectList.get(id); } public boolean isCoreObject(String id, List<String> preselection) { //Core-Object Pr�fung. Soweit m�glich werden "ge-cache-te" Daten/Infos verwedet... if(coreObjectList.containsKey(id)) { return true; } Double[][] sample = getSamples(id); int max_dimensions = sample.length; //Pr�fung auf core object fand noch nicht statt //jetzt zu Cache hinzuf�gen (coreObjectList) Matrix m = new Matrix(sampleRate); m.reset(1); //1, weil das CoreObject mitgez�hlt wird double[] a = new double[max_dimensions]; double[] b = new double[max_dimensions]; //Hier wird die Matrix erstellt. Folien S. 72 for(int k=0; k<preselection.size(); k++) { Double[][] tempSample = getSamples(preselection.get(k)); for(int i=0; i<sampleRate; i++) { //Sample-Index f�r Element for(int j=0; j<sampleRate; j++) { //Sample-Index f�r Preselection-Elemente //folgendes Statement ist nur zum Umschreiben der Information for(int d=0; d<max_dimensions; d++) { a[d] = sample[i][d]; b[d] = tempSample[i][d]; } if(distance(a, b) <= maxDistance) { m.inc(i, j); } } } } //Check, ob core object oder nicht: double prob = 0; for(int i=0; i<sampleRate; i++) { for(int j=0; j<sampleRate; j++) { if(m.getValue(i, j) >= minPts) { prob++; } } } prob = prob / (sampleRate*sampleRate); if(prob > 0.5) { coreObjectList.put(id, prob); return true; } return false; } /** * Calculates the distance using the euclidean distance measurement. * * @param e1 measure starting point * @param e2 measure ending point */ public double distance(double e1, double e2) { if ((Double.isNaN(e1)) || (Double.isNaN(e2))) { return Double.NaN; } return Math.sqrt((e1 - e2) * (e1 - e2)); } /** * Calculates the distance of n-dimensional vectors using the * euclidean distance measurement. * * @param e1 n-dimensional starting vector * @param e2 n-dimensional ending vector */ public double distance(double[] e1, double[] e2) { double sum = 0.0; int counter = 0; for (int i = 0; i < e1.length; i++) { if ((!Double.isNaN(e1[i])) && (!Double.isNaN(e2[i]))) { sum = sum + (e1[i] - e2[i]) * (e1[i] - e2[i]); counter++; } } double d = Math.sqrt(sum); if (counter > 0) return d; else return Double.NaN; } // public MinimumBoundingRectangle getBoundingBox(String id) { // return boundingBoxes.get(id); // } // public double minimalDistance(MinimumBoundingRectangle mbr1, MinimumBoundingRectangle mbr2) { // //Finde jeweils ein Element in jeder BoundingBox, die sich am n�hsten sind // double r; // double min1, max1, min2, max2; // // //NOTE: Dies sollte eigentlich schon durch isSimilarityDefined gepr�ft worden sein. // if(mbr1.getDimension() != mbr2.getDimension()) { // return Double.NaN; // } // //Pr�fung jeder Dimension auf Minimum // for(int i=0; i<mbr1.getDimension(); i++) { // min1 = mbr1.getMinimumValue(i); // max1 = mbr1.getMaximumValue(i); // min2 = mbr2.getMinimumValue(i); // max2 = mbr2.getMaximumValue(i); // // //r = // } // return 0; // // //Benutze die angegebene Distanz-Funktion um die Entfernung zu messen // //double dist = nestedSim...; // //return dist; // } //TODO: getSamples in eigene Klasse SampleCache delegieren. //Wenn isReachable() ge�ndert wird, muss sie umgeschrieben werden. protected Double[][] getSamples(String id) { if(!sampleCache.containsKey(id)) { Example ex = IdUtils.getExampleFromId(es, id); sampleStrategy.setPdf(new SimpleProbabilityDensityFunction(globalFuzziness,getParameterAsBoolean(ABSOLUTE_ERROR))); sampleStrategy.setValue(getValues(ex)); Double res[][] = sampleStrategy.getSamples(); sampleCache.put(id, res); return res; } return sampleCache.get(id); } private double[] getValues(Example e) { if (e == null) return null; double[] values = new double[e.getAttributes().size()]; int index = 0; for (Attribute attribute : e.getAttributes()) values[index++] = e.getValue(attribute); return values; } public InputDescription getInputDescription(Class cls) { if (SimilarityMeasure.class.isAssignableFrom(cls)) { return new InputDescription(cls, false, true); } else { return super.getInputDescription(cls); } } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType p1; p1 = new ParameterTypeDouble(MAX_DISTANCE_NAME, "maximal distance", 0.0, Double.POSITIVE_INFINITY, 0.8); p1.setExpert(false); types.add(p1); ParameterType p2; p2 = new ParameterTypeDouble(GLOBAL_FUZZINESS, "global fuzzyness", 0.0, Double.POSITIVE_INFINITY, 0.0); p2.setDescription("Global fuzziness describes by which amount the values from the example set " + "could fluctuate: i.e. plus/minus the given value. "); p2.setExpert(false); types.add(p2); p2 = new ParameterTypeBoolean(ABSOLUTE_ERROR, "Specifies if the error is an absolute error",true); p2.setExpert(false); types.add(p2); ParameterType p3; p3 = new ParameterTypeInt(SAMPLE_RATE, "sample rate", 0, Integer.MAX_VALUE, 5); p3.setDescription("Sample Rate sets the number of samples that are taken from each element."); p3.setExpert(false); types.add(p3); ParameterType pmeasure; pmeasure = SimilarityUtil.generateSimilarityParameter(); pmeasure.setExpert(true); pmeasure.setDescription("nested distance measure"); types.add(pmeasure); return types; } }