/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.outlier; import com.rapidminer.operator.Operator; import com.rapidminer.operator.ProcessStoppedException; import java.util.Enumeration; import java.util.ListIterator; import java.util.Vector; /** * SearchSpace is a class for building a room full of SearchObjects (see class definition) and * provides various methods to place those objects into the SearchSpace (by associating those * Objects to the list of objects in the SearchSpace) as well as to do some Outlier Tests on those * Objects. * * @author Stephan Deutsch, Ingo Mierswa */ public class SearchSpace { /** * This variable holds the number of dimensions for the Searchroom. As * {@link SearchObject#dimensions} hold their own number of dimensions per instance of that * class, the dimensions of the SearchSpace and its associated SearchObjects must not assumed to * be equal. E.g. a SearchObject can have more or fewer dimensions than the SearchSpace. * Consistency checks should be performed as necessary, but not as mandatory. */ private int dimensions; /** * The list of SearchObjects in the SearchSpace (as a Vector class). * */ private Vector<SearchObject> listOfObjects; /** * holds the minimum value of the dimensions of all SearchObjects in the SearchSpace and is * updated automatically as SearchObjects are added to the SearchSpace. This is to provide * meta-data for statistical analysis over the SearchSpace. */ private double[] minimumVectorValue; /** * holds the maximum value of the dimensions of all SearchObjects in the SearchSpace and is * updated automatically as SearchObjects are added to the SearchSpace. This is to provide * meta-data for statistical analysis over the SearchSpace. */ private double[] maximumVectorValue; /** * Holds the range (interval) value of the dimensions of all SearchObjects in the SearchSpace * and is updated automatically as SearchObjects are added to the SearchSpace. This is to * provide meta-data for statistical analysis over the SearchSpace. */ private double[] rangeVectorValue; /** * The lower bound for a potential MinPts search (e.g. a LOF search). */ // private int minPtsLowerBound; /** * The upper bound for a potential MinPts search (e.g. a LOF search). */ // private int minPtsUpperBound; /** * This constructor creates a SearchSpace with (integer) <i>dim</i> dimensions and initializes * all fields in the instance of that Class with zero values where appropriate. */ public SearchSpace(int dim) { this.dimensions = dim; this.createListOfObjects(); // create a list of Objects mapped to this.listOfObjects // initialize the SearchRooms Vektor parameters by creating arrays and filling them with // zeros this.minimumVectorValue = new double[dim]; this.maximumVectorValue = new double[dim]; this.rangeVectorValue = new double[dim]; for (int i = 0; i < this.dimensions; i++) { this.minimumVectorValue[i] = 0; this.maximumVectorValue[i] = 0; this.rangeVectorValue[i] = 0; } } /** * This constructor creates a SearchSpace with (integer) <i>2</i> dimensions as a default and * initializes all fields in the instance of that Class with zero values where appropriate. */ public SearchSpace() { // construct a searchroom with at least 2 dimensions this(2); } /** * This constructor creates a SearchSpace with (integer) <i>dim</i> dimensions and initializes * all fields in the instance of that Class with zero values where appropriate. */ public SearchSpace(int dim, int minptslb, int minptsub) { this(dim); // this.minPtsLowerBound = minptslb; // this.minPtsUpperBound = minptsub; } /** * Returns the (integer) number of objects in the Searchroom (associated with it via * {@link #addObject(SearchObject)} to the room) as an integer value as we overall do not expect * the searchroom to hold more than 2 billion objects. * */ public int getNumberOfObjects() { return this.listOfObjects.size(); } /** * Sets the minimum value of all SearchObjects in a SearchSpace to a value for a dimension dim. * * @param dim * @param value */ void setMinimumVectorValue(int dim, double value) { this.minimumVectorValue[dim] = value; } /** * Returns the minimum value of all SearchObjects in a SearchSpace for a dimension dim. * * @param dim */ double getMinimumVectorValue(int dim) { return this.minimumVectorValue[dim]; } /** * Sets the maximum value of all SearchObjects in a SearchSpace to a value for a dimension dim. * * @param dim * @param value */ void setMaximumVectorValue(int dim, double value) { this.maximumVectorValue[dim] = value; } /** * Returns the maximum value of all SearchObjects in a SearchSpace for a dimension dim. * * @param dim */ double getMaximumVectorValue(int dim) { return this.maximumVectorValue[dim]; } /** * Sets the range value (maximum - minimum) of all SearchObjects in a SearchSpace to a value for * a dimension dim. * * @param dim * @param value */ void setRangeVectorValue(int dim, double value) { this.rangeVectorValue[dim] = value; } /** * Returns the range value (maximum - minimum) of all SearchObjects in a SearchSpace for a * dimension dim. * * @param dim */ double getRangeVectorValue(int dim) { return this.rangeVectorValue[dim]; } /** * <p> * Sets the number of dimensions for the SearchSpace to dim. * </p> * <p> * <em>Attention</em>: This is a value that the SearchSpace keeps for the purpose of consistency * checks for all SearchObjects (as each SearchObject has its own number of dimensions and not * all the dimensions of the SearchObjects need to be the same - to give implementation * freedom). * </p> * * @param dim */ public void setDimensions(int dim) { this.dimensions = dim; } /** * Returns the number of dimensions of the SearchSpace. */ public int getDimensions() { return this.dimensions; } /** * Creates a listOfObjects (e.g. a new Vector Class instance within the SearchSpace) and is used * by a constructor. * */ void createListOfObjects() { this.listOfObjects = new Vector<>(); } /** Delivers the list of objects. */ public Vector<SearchObject> getSearchObjects() { return listOfObjects; } /** * This method returns the outlierstatus of the Searchobject (element at index i) in the * SearchSpace from the Searchroom's listOfObjects. * * @param i * @return the boolean outlier status */ public boolean getSearchObjectOutlierStatus(int i) { SearchObject so = this.listOfObjects.elementAt(i); return so.getOutlierStatus(); } /** * This adds a SearchObject to the SearchSpace. * * <p> * It prints a warning to STDOUT in case the dimensions of the SearchObject and SearchSpace are * incompatible, but as the SearchSpace can perform some operations over SearchObjects with * different dimensions, this is not a showstopper. * * <p> * The method also automatically updates the min/max/range information the SearchSpace knows for * itself. * * @param objectToAdd */ public void addObject(SearchObject objectToAdd) { this.listOfObjects.addElement(objectToAdd); // add the object of type SearchObject to the // SearchSpace for (int i = 0; i < this.getDimensions(); i++) { if (this.getMinimumVectorValue(i) > objectToAdd.getVektor(i)) { this.setMinimumVectorValue(i, objectToAdd.getVektor(i)); } if (this.getMaximumVectorValue(i) < objectToAdd.getVektor(i)) { this.setMaximumVectorValue(i, objectToAdd.getVektor(i)); } this.setRangeVectorValue(i, this.getMaximumVectorValue(i) - this.getMinimumVectorValue(i)); } } /** * This method returns a SearchObject with the i-th index in the listOfObjects; the result has * to be casted to SearchObject (Vector Class speciality, as it returns only a JAVA Object Class * object). This is better than to access the listOfObjects directly, but sadly I do not use it * consistently. Maybe in the cleaning-up, this will be changed. * * @param index */ public SearchObject getObject(int index) { return this.listOfObjects.elementAt(index); } /** * This method returns an Enumeration of all SearchObjects from a SearchSpace. */ public Enumeration<SearchObject> getObjects() { return this.listOfObjects.elements(); } /** * Checks the dimensional integrity of the Searchroom and returns an array if int values for * each object with 0 for equal dimensions of room and object, -1 for less dimensions in the * room than object thinks it has and +1 for more dimensions in the room than object has. * * <p> * Method prints to the STDOUT a message on whether the overall integrity is given (all objects * have the same dimensions as the searchroom. ATTN: this checks only those objects in the * search room, e.g. which have been added to it using {@link #addObject(SearchObject)}. */ int[] dimensionsIntegrityCheck() { SearchObject sobject; int number = this.getNumberOfObjects(); int[] range = new int[number]; int checker = 0; for (int i = 0; i < number; i++) { sobject = this.listOfObjects.elementAt(i); // cast this to SearchObject from Vector // class...?? if (sobject.getDimensions() != this.dimensions) { if (sobject.getDimensions() < this.dimensions) { range[i] = 1; } range[i] = -1; } else { range[i] = 0; } checker = checker + range[i]; } return range; } /** * This method resets the Outlier Status for all Objects in the Search room to have a clean * start or to have a new identification of outliers with a separate method. As this zeros all * boolean outlier statuses of all objects associated to this Searchroom and also zeros all * outlier smooth factors, a current status list should be drawn down and stored somewhere * before using this method. * * ATTN: As this only uses references to Objects associated to a Searchroom, in case more than * one Searchroom uses a (fraction) range of objects, this might override the results from other * detections for those objects. But it is encouraged to associate objects to only one * SearchSpace and use duplications of objects with similar vektors in other SearchRooms. */ public void resetOutlierStatus() { SearchObject sobject; for (int i = 0; i < this.getNumberOfObjects(); i++) { sobject = this.listOfObjects.elementAt(i); // cast this to SearchObject from Vector // class...?? sobject.setOutlierStatus(false); sobject.setOutlierFactor(0); } } /** * BruteForce Radius Search to determine the outlier status of an object rObject of the type * SearchObject this method takes d and p as parameters acc. to distance based DB(p,D)-Outlier * (Knorr, Ng) and identifies an object as being an outlier, if more than a proportion p of the * objects is more than distance D from rObject away. * * The simplest approach is to make a radius search for rObject and compare its distance to all * other objects step by step with D (in this case d). If more than M = N(1-p) objects are * within d, than rObject is not an Outlier, else it is. Although this is an approach with * O(N^2) for all objects (it is O(N) for rObject), this prunes the search as soon as more than * M objects are within d from rObject to get some improvement. */ public void radiusODSearch(double d, double p, SearchObject rObject, int kindOfDistance) { int number = this.getNumberOfObjects(); // set N (number) to number of Objects in Search // Room long m = Math.round(number * (1 - p)); // set M for Objects in Search room int counter = 0; // counter for objects within radius distance d for (int i = 0; i < number; i++) { // search through the whole list if (rObject.getDistance(this.listOfObjects.elementAt(i), kindOfDistance) < d) { counter = counter + 1; // increase counter if Object(i) is within d from rObject if (counter > m) { break; // prune if we already have more than m objects within d from rObject } } } if (counter > m) { // ok, probably not the best way, but works rObject.setOutlierStatus(false); } else { rObject.setOutlierStatus(true); } /* * as we expect to have a radius search for all objects, we store the outlier status in * rObject and after the overall search simply ask all objects whether they are thinking * they are now outliers or not :-) */ } /** * This method invokes the class method radiusODSearch on all objects in the SearchSpace * (associated to this Searchroom via the listOfObjects vektor). radiusODSearch does a brute * force distance Outlier test based on the parameters d and p for DB(p,d)-Outliers acc. to * Knorr and Ng's approach to unify statistical Outlier tests. The result of the Outliertest is * stored in the Objects themselves, e.g. each SearchObject knows its Outlier status (set * recently, e.g. by this search) and can tell it by using the SearchObject's class method * getOutlierStatus() (see there!) * * Added feature: prints progress on STDOUT for each 10% segment (app.) one hash "#" is printed * to show progress if brute force should hit complexity boundaries (e.g. with a lot of * dimensions as well as lots of objects). This also prints the parameters d and p and N for * better understanding * */ public void allRadiusSearch(double d, double p, int kindOfDistance) { int n = this.getNumberOfObjects(); int segment = 10; for (int i = 0; i < n; i++) { this.radiusODSearch(d, p, this.listOfObjects.elementAt(i), kindOfDistance); // invoke on // all // objects // in list if (100 * i / n > segment) { segment = segment + 10; } } } /** * Returns the average distances measures for the objects in the SearchSpace, calculating: * * <p> * mean distance * <p> * standard deviation * <p> * variance * * The calculation is time consuming and should only be invoked if the data set is parsed for * the first time (to get a feeling on it for statistical choices of parameters p and d for e.g. * DB(p,d)-Outliers). It parses the objects matrix upper half to build an array of distances * between objects (without doubling and without the distances of objects to themselves) which * should be (n^2-n)/2 distances of value. * * @return double[3] of mean, variance and standard deviation */ public double[] getAverageDistanceMeasures(int kindOfDistance) { double meanDistance = 0; // mean distance between objects in the SearchSpace double standardDeviationOfDistance = 0; // standard deviation of objects in the SearchSpace double varianceOfDistance = 0; // variance of distance in the SearchSpace double distance = 0; double sumOfDistance = 0; double[] distances = new double[(this.getNumberOfObjects() * this.getNumberOfObjects() - this.getNumberOfObjects()) / 2]; double counter = 0; // counts number of distances SearchObject so; // reference to a searchObject // first we have to calculate the mean distance between objects for (int i = 0; i < this.getNumberOfObjects(); i++) { so = this.listOfObjects.elementAt(i); for (int j = i; j < this.getNumberOfObjects(); j++) { if (i != j) { distance = so.getDistance(this.listOfObjects.elementAt(j), kindOfDistance); sumOfDistance = sumOfDistance + distance; distances[(int) counter] = distance; counter = counter + 1; } } } meanDistance = sumOfDistance / counter; // now lets get the variance for (int k = 0; k < counter; k++) { varianceOfDistance = varianceOfDistance + Math.pow((distances[k] - meanDistance), 2); } varianceOfDistance = varianceOfDistance / counter; standardDeviationOfDistance = Math.sqrt(varianceOfDistance); double distMeasures[] = { meanDistance, varianceOfDistance, standardDeviationOfDistance }; return distMeasures; } /** * Returns the average LOF measures for the objects in the SearchSpace, calculating: * * <p> * mean LOF * <p> * standard deviation * <p> * variance * * * @return double[3] of mean, variance and standard deviation */ public double[] getAverageLOFMeasures() { double meanLOF = 0; // mean LOF of objects in the SearchSpace double standardDeviationOfLOF = 0; // standard deviation of Lof of objects in the // SearchSpace double varianceOfLOF = 0; // variance of LOF in the SearchSpace double sumOfLOF = 0; // calculation variable SearchObject so; // reference to a searchObject // first we have to calculate the mean LOF of all objects for (int i = 0; i < this.getNumberOfObjects(); i++) { so = this.listOfObjects.elementAt(i); sumOfLOF += so.getOutlierFactor(); } meanLOF = sumOfLOF / this.getNumberOfObjects(); // now lets get the variance for (int k = 0; k < this.getNumberOfObjects(); k++) { so = this.listOfObjects.elementAt(k); varianceOfLOF = varianceOfLOF + Math.pow((so.getOutlierFactor() - meanLOF), 2); } varianceOfLOF = varianceOfLOF / this.getNumberOfObjects(); // and the standard deviation standardDeviationOfLOF = Math.sqrt(varianceOfLOF); double lofMeasures[] = { meanLOF, varianceOfLOF, standardDeviationOfLOF }; return lofMeasures; } /** * This method returns the maximum Outlier Factor of all SearchObjects in the SearchSpace. Attn: * Due to initializing, the outlier factors should be greater or equal to zero. */ public double getMaximumOutlierFactor() { double maxOutlierFactor = 0; for (int i = 0; i < this.getNumberOfObjects(); i++) { SearchObject so = this.getObject(i); if (maxOutlierFactor < so.getOutlierFactor()) { maxOutlierFactor = so.getOutlierFactor(); } } return maxOutlierFactor; } /** * <p> * This method processes a sequential search over the SearchSpace for a SearchObject so (named p * here to be in line with the literature). * </p> * * <p> * As a result of the search a structure of k-distance-Containers is build and listed within the * SearchObject. Each container for a distance of an object or a number of objects o in relation * to p is filled with all the objects within that distance. The containers are sorted in a * linked list in the SearchObject by increasing distance. Just imagine it like p being a * submarine sending a ping and listing all echos in radiuses (=distance) with the echos stored * in a band (=container) if they are on the same radius. * </p> * * @param so */ public void findKdistanceContainers(SearchObject so, int kindOfDistance) { SearchObject obj; // an iterator reference for the i-th object double distance; // the distance between the so object and the obj object ListIterator<KdistanceContainer> li; // an iterator over the list of containers for so KdistanceContainer container; // a reference for a container out of so's container list int index; // index to know where we are in the list, as we use a while loop boolean added; // flag on whether we already added an obj to the/a container for (int i = 0; i < this.getNumberOfObjects(); i++) { // for all objects in the SearchSpace obj = this.listOfObjects.elementAt(i); // let obj be the i-th object if (obj == so) { // if the obj is the so-object, then do not look at it continue; // get to the next object i+1 } // else do all the useful stuff distance = so.getDistance(obj, kindOfDistance); // the distance between so and i-th // object /** * the process now should be as follows: (1) get an iterator over all kd containers of * object so starting at the beginning (2) iterate over that container list until you * find one (a) of equal distance (b) of greater distance (3) in case of (a) insert the * obj into the container with the same distance (4) in case of (b) create a new * container with that distance and add the obj into it (5) in case there has not been a * container with equal distance or a container with greater distance, create a * container at the end of the list. * * This works, because each time we walk from left to right through the container list * and add an object into an existing container / or we add it into a new container in * the list or at the end, by creating a list of containers with sorted growing * distances: (cont(d1), cont(d2), ... cont(dn), with d1 < d2 < ... < dn) * */ li = so.getKdContainerListIterator(); // we are getting an iterator on the container // list index = -1; // we set our counting index at element zero (after the first ++) added = false; // we have not yet added any obj to a container while (li.hasNext()) { // as long as there are containers in so's list container = li.next(); // take the next container from the list index++; // and increase the parallel indexing accordingly if (container.getDistance() == distance) { // if the distances are equal, do (3) container.addObject(obj, distance); added = true; // we added one obj to the container break; // and want to leave the while-loop } if (container.getDistance() > distance) { // if there's a container with greater // distance KdistanceContainer newcontainer = new KdistanceContainer(so); // create a new // container so.addKdContainer(index, newcontainer); // and add him at the index point // (shifting the remainder of the list // right) newcontainer.addObject(obj, distance); // add the obj to the new container in // the list added = true; // we added one obj to the container break; // and want to leave the while loop } } // else we continue to walk through the container list with the iterator if (!added) { // if we have not yet added a container, one has to go to the end of the // list KdistanceContainer newcontainer = new KdistanceContainer(so); // create one so.addKdContainer(newcontainer); // add it at the end of so's container list newcontainer.addObject(obj, distance); // add the obj to the container } // all cases (3), (4) and (5) have either been handled } // continue with the for-loop and take the next object from the Searchroom } /** * Finds and fills all K distance containers for all objects in the Search Room by invoking the * process of finding all k distance containers for one Search Object. * * @param kindOfDistance * @param operator * if this is NOT <code>null</code>, will call {@link Operator#checkForStop()}. * @throws ProcessStoppedException * only if the the operator parameter was not <code>null</code> and a stop request * was issued */ public void findAllKdContainers(int kindOfDistance, Operator operator) throws ProcessStoppedException { for (int i = 0; i < this.getNumberOfObjects(); i++) { if (operator != null) { operator.checkForStop(); } this.findKdistanceContainers(this.listOfObjects.elementAt(i), kindOfDistance); } } /** * <p> * Some deeper magic to compute all the LOFs for the objects in the searchroom up to MinPtsUB = * kMax! The LOF output is only done up from kMin! * </p> * * <p> * This one is heavily documented in the source, so if you are interested on how it is done, * have a look at the source for the method. * </p> * * @param kMin * @param kMax * @param operator * if this is NOT <code>null</code>, will call {@link Operator#checkForStop()}. * @throws ProcessStoppedException * only if the the operator parameter was not <code>null</code> and a stop request * was issued */ public void computeLOF(int kMin, int kMax, Operator operator) throws ProcessStoppedException { /* * What we do in this step is (1) to scan the k-distance containers for all objects to find * the k-distances for that object and to store it in the object's array * * (2) to compute the k-lrd for each object, we need the k-distance for each object, * therefore this has to be a separate loop, looking exactly the same... * * (3) to compute the k-LOFs for each object, we take the average relation of the k-lrd of * the objects in p's k-neighbourhood and the k-lrd of p. */ int sumCardinality; // count up the container contents int k; // counter for the k-steps (e.g. finding the k-distances double sumdistance; // sumdistance for the r-distance summing up for lrd calculation double lrd; // lrd value (placeholder) double lof; // lof value (placeholder) // (1) for all objects in the search room for (int i = 0; i < this.getNumberOfObjects(); i++) { if (operator != null) { operator.checkForStop(); } SearchObject so = this.listOfObjects.elementAt(i); // get the object p (so) sumCardinality = 0; // set the value to zero for each new object browse k = 1; // for each object start k at 1 for 1-distance // for this object so now browse through its containers ListIterator<KdistanceContainer> li = so.getKdContainerListIterator(); // first get an // iterator // over the // container list // iterate over the container list while (li.hasNext() && k <= kMax) { // for all containers in the list KdistanceContainer container = li.next(); // get the container sumCardinality = sumCardinality + container.getNumberOfObjects(); // add container // objects to # // in distance /* * we have to find a solution to push the items in a zero-distance container * (contains all objects in the same spot (which each have a zero-distance container * with all the respective objects in the same spot)) to the next k-distance, * because the second condition for k-distance is: at most k-1 items (not counting * p) should be < distance than k-distance. For items in zero distance, this cannot * be true, because only p should be in 0-distance of itself -> thus the 1-distance * has to be the next distance, making the following situation: at least 1 object is * <= 1-distance; at most 0 objects without p < 1-distance. */ // if (container.getDistance() != 0) { while (k <= sumCardinality && k <= kMax) { so.setKDistance(k, container.getDistance()); // the k-distance is the container // distance so.setCardN(k, sumCardinality); k++; // increase k } // } } // all containers iterated } // all objects conducted // (2) for all objects in the SearchSpace for (int i = 0; i < this.getNumberOfObjects(); i++) { if (operator != null) { operator.checkForStop(); } SearchObject so = this.listOfObjects.elementAt(i); // get an object sumCardinality = 0; // set the value to zero for each new object browse k = 1; // for each object start k at 1 for 1-distance sumdistance = 0; // for this object now browse again through its containers ListIterator<KdistanceContainer> li = so.getKdContainerListIterator(); // first get an iterator over the container list // we look to compute the local k-reachability density, which is the reciprocal of // the average k-reachability-distance for the object in its k-neighbourhood // it is calculated by taking the maximum of the k-distance of each object of the // neighbourhood // and the distance between the object and the objects in the neighbourhood and // averaging it /* * The good thing is, that the lrd_k(p) = 1 / ( sum_kn(p) r-distance_k(p,o) / card_k(p) * ) meaning that the k-lrd is the reciprocal of the average of the k-r-distances of p's * k-neighbourhood (containing all the objects o). * * As the k+1 neighbourhood contains all the k-neighbourhood, we can do this in a loop * and while we iterate through the loop, we only need to increase the sum of the * k-r-distances and the cardinality of the neighbourhoods to sequentially calculate the * k-lrds step by step. * * We only have to look, that for a k-distance = k+1-distance, of course the lrd is the * same and we cannot increase the bespoken numbers in this case, but just copy the lrd. */ while (li.hasNext() && k <= kMax) { // for all containers in the list until MinPtsUB is // reached KdistanceContainer container = li.next(); // get the container /** * now that we have the container, in this container is a number of objects. We add * this number to the increasing number of the sum of objects in the containers * looked at so far, so that we have the number of objects in all the containers * until the container with this distance in the loop, this equals the cardinality * of the set of objects within k-distance for the given k in this part of the loop. * We need this to get the average r-distance to compute the lrd. */ sumCardinality = sumCardinality + container.getNumberOfObjects(); // ok, now // increase the // cardinality /* * now we look into the container and for each object o in the container, we choose * the reachability-distance. This is the maximum of the k-distance of o (we get * this by asking the object of its k-distance using the k iteration value from the * loop) and the actual distance between so and o. We get this from the container, * as all o's in the container have container's distance to so (so we do not need to * compute it again, which can be time consuming depending on the dimensions of the * objects). * * Afterwards we add the l-reachability distance of the object o to the sumdistance. * * As last step, we calculate the lrd by using the cardinality of all objects in * k-neighbour- hood for so (sumCardinality) as a divisor to the sum of reachability * distances. Of this we take the reciprocal and store it in lrd_k for so. */ boolean calcLRD = false; // in each container we want to compute the lrd, so reset // the trigger on whether we already have the lrd, we don't (yet) lrd = 0; // initialize with zero to be sure (we can than see mistakes) while (k <= sumCardinality && k <= kMax) { // of course, we stop as we reach // MinPtsUB // as the lrd_k is the same for all k-distances with the same objects, we only // need compute once if (!calcLRD) { ListIterator<SearchObject> lobj = container.getListIterator(); // get an // iterator // for the // container while (lobj.hasNext()) { // and iterate over it SearchObject sobj = lobj.next(); // get the object o // (sobj) // now increase the sum of reachability distances with the rd of sobj sumdistance = sumdistance + Math.max(container.getDistance(), sobj.getKDistance(k)); } lrd = 1 / (sumdistance / sumCardinality); calcLRD = true; // set, that we now have an lrd calculated, so do not do it // again } so.setLRD(k, lrd); // and can here set it for the k-distance (k) k++; // increase k to the next distance } // now we have sorted through all steps in k-distances which can be made with one // container // as we have to remember, that k-distance can be k+1 distance in some cases, // etc... } // now we have finished with the container } // (3) for all objects in the search room for (int i = 0; i < this.getNumberOfObjects(); i++) { if (operator != null) { operator.checkForStop(); } SearchObject so = this.listOfObjects.elementAt(i); // get the object p (so) sumCardinality = 0; // set the value to zero for each new object browse k = 1; // for each object start k at 1 for 1-distance // sumlrdrelations = 0; // set the sum of the lrd(o)/lrd(p) to zero double[] sumlrdratio = new double[kMax + 1]; // store all growing sumlrd ratios in this // array for (int u = 0; u <= kMax; u++) { sumlrdratio[u] = 0; } // for this object so now browse through its containers ListIterator<KdistanceContainer> li = so.getKdContainerListIterator(); // first get an // iterator // over the // container list // iterate over the container list while (li.hasNext() && k <= kMax) { // for all containers in the list KdistanceContainer container = li.next(); // get the container sumCardinality = sumCardinality + container.getNumberOfObjects(); // add container // objects to # // in distance boolean calcLOF = false; // for each container's object list calculate the LOF only // once, not yet calc'ed lof = 0; // set lof to zero for the time being while (k <= sumCardinality && k <= kMax) { if (!calcLOF) { // if we haven't calculated the LOF yet, we should do it ListIterator<SearchObject> lobj = container.getListIterator(); // get an // iterator // over the // container while (lobj.hasNext()) { SearchObject sobj = lobj.next(); // get the next object // from the container for (int j = 1; j <= kMax; j++) { // explaination for this see below... double lrd2 = so.getLRD(j); double lrd3 = sobj.getLRD(j); if (!(Double.isInfinite(lrd2) || Double.isInfinite(lrd3))) { // for a huge number of duplicates the k-lrd becomes infinite. // In this case we need to skip the sum-step because it is // mathematically undefined. sumlrdratio[j] = sumlrdratio[j] + lrd3 / lrd2; } } // sumlrdrelations = sumlrdrelations + sobj.getlrd(k)/so.getlrd(k); // this has been taken out, because it has been wrong approach // left in as a remembering } // lof = sumlrdrelations / sumCardinality; // this has been taken out, because it has been wrong approach // left in as a remembering /* * This has been changed, because we need to take the respective lrds for * all the objects in the MinPts-neighbourhood, but with the _MinPts_ index * for _all_ and not step by step growing indices. Hence we compute the * lrd_upsumming relations (lrd_MinPts(o)/lrd_MinPts(p)) in the growing loop * for all MinPts's and store them in sumlrdratio[MinPts] and take the LOF * from that by dividing through |N_MinPts(p)| cardinality (which is the * step by step summed up from the containers. */ lof = sumlrdratio[k] / sumCardinality; calcLOF = true; } so.setLOF(k, lof); // set the k-LOF for so to lof (we keep the k-LOFs, to // analyse e.g. if (k >= kMin && so.getOutlierFactor() <= lof) { so.setOutlierFactor(lof); // if this k-LOF is maximal, set ooutlier status // to this... // but only take those into account for k-dists > kMin! } k++; // increase k } } // all containers iterated } // all objects conducted } /** * This function computes the D^k_n Outliers according to Ramaswamy, Rastogi and Shim which * computes the top-n D^k-Outliers, the outliers (= objects) with the maximum distance to the * k-th nearest neighbors. * * Please be aware that this function requires the findAllKdContainers method has to be run * first, else it will simply stop or will not work. * * @param dk * @param n * @param operator * if this is NOT <code>null</code>, will call {@link Operator#checkForStop()}. * @throws ProcessStoppedException * only if the the operator parameter was not <code>null</code> and a stop request * was issued */ public void computeDKN(int dk, int n, Operator operator) throws ProcessStoppedException { Vector<SearchObject> listofDKNcandidates = new Vector<>(); int minDKNdistindex = 0; double minD = 0; int sumCardinality; int k; int kMax = dk; // do not look for k-distances over k double minDistInList = 0; // the smallest distance in the candidates list /* * This has three steps: (1) get the k-distances from the containerinformation (like in LOF) * and store it in the SearchObjects (2) browse through all the SearchObjects in the room * and sort those with the max dk-distance into the candidates (3) push the information in * the candidates list into the SearchObjects Outlier status variables (fields) */ /* * First (like in LOF algorithm), get the real k-distances from the containers and store the * information in the kdistance-Vektor of each SearchObject. */ for (int i = 0; i < this.getNumberOfObjects(); i++) { if (operator != null) { operator.checkForStop(); } SearchObject so = this.listOfObjects.elementAt(i); // get the object p (so) sumCardinality = 0; // set the value to zero for each new object browse k = 1; // for each object start k at 1 for 1-distance // for this object so now browse through its containers ListIterator<KdistanceContainer> li = so.getKdContainerListIterator(); // first get an iterator over the container list // iterate over the container list while (li.hasNext() && k <= kMax) { // for all containers in the list KdistanceContainer container = li.next(); // get the container sumCardinality = sumCardinality + container.getNumberOfObjects(); // add container // objects to # // in distance /* * we have to find a solution to push the items in a zero-distance container * (contains all objects in the same spot (which each have a zero-distance container * with all the respective objects in the same spot)) to the next k-distance, * because the second condition for k-distance is: at most k-1 items (not counting * p) should be < distance than k-distance. For items in zero distance, this cannot * be true, because only p should be in 0-distance of itself -> thus the 1-distance * has to be the next distance, making the following situation: at least 1 object is * <= 1-distance; at most 0 objects without p < 1-distance. */ // if (container.getDistance() != 0) { while (k <= sumCardinality && k <= kMax) { so.setKDistance(k, container.getDistance()); // the k-distance is the container // distance so.setCardN(k, sumCardinality); k++; // increase k } // } } // all containers iterated } // all objects conducted /* * In the second step, get the actual list of DKN candidates from the k-distances of all the * SearchObjects. */ for (int i = 0; i < this.getNumberOfObjects(); i++) { if (operator != null) { operator.checkForStop(); } // get the next SearchObject SearchObject so = this.listOfObjects.elementAt(i); // 1. if the candidates list is empty, simply add the element to the candidates list if (listofDKNcandidates.size() == 0) { listofDKNcandidates.add(so); // add the candidate to the list } else { // 2.1 if there are already elements in the list, check if it is no more than n // elements if (listofDKNcandidates.size() <= n + 1) { listofDKNcandidates.add(so); // add the candidate to the list } else { // 2.2 if the list is already full and only if the new candidate has more // distance if (so.getKDistance(dk) > minDistInList) { listofDKNcandidates.remove(minDKNdistindex); // remove the candidate with // minimal distance in list listofDKNcandidates.add(so); // add the new candidate to the list } } } // 3. iterate through the candidates list to find the actual smallest distance and the // respective index for (int j = 0; j < listofDKNcandidates.size(); j++) { SearchObject sobj = listofDKNcandidates.elementAt(j); // get the reference to the // candidate minD = sobj.getKDistance(dk); // set minD to candidates' distance if (j == 0) { // if first in list, simply initialize with candidates values minDistInList = minD; // for minimal distance minDKNdistindex = j; // and the index of the minimal distance } else { // if not first in list, we have initialized data to compare with if (minDistInList > minD) { // if actual candidate's distance is smaller minDistInList = minD; // set this to be the new minimal distance minDKNdistindex = j; // and set the index of that minimal distance } } } // now we know what the minDistInList is and have the new minDKNdistindex } // now we get the next SearchObject of the for loop (see above) /* * In the last step, mark all SearchObjects in the top-n List as the Outliers. Maybe later * enhance by sorting and rank the top-n... */ for (int z = 0; z < listofDKNcandidates.size(); z++) { SearchObject sobj2 = listofDKNcandidates.elementAt(z); sobj2.setOutlierStatus(true); } } // end of computeDKN method }