/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.outlier;

import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

/**
 * Representation of a single object of the data set inside the outlier operators.
 * A <tt>SearchObject</tt> stores the coordinates of the object (one double value per
 * dimension), a label, the results of outlier tests (boolean status and numeric factor)
 * and the per-k bookkeeping used by the LOF computation (k-distances, local
 * reachability densities, neighbourhood cardinalities and LOF values). It also offers
 * several distance measures to other <tt>SearchObject</tt>s.
 *
 * <p>The per-k arrays are only allocated by
 * {@link #SearchObject(int, String, int, int)}; on objects created with one of the
 * other constructors the per-k accessors ({@link #setKDistance(int, double)},
 * {@link #getLRD(int)}, ...) fail with a <tt>NullPointerException</tt>.</p>
 *
 * @author Stephan Deutsch, Ingo Mierswa
 * @version $Id: SearchObject.java,v 1.5 2008/05/09 19:22:55 ingomierswa Exp $
 */
public class SearchObject {

	/** Distance kind: squared euclidian distance. */
	private static final int SQUARED = 0;

	/** Distance kind: euclidian (metric) distance; also the fallback for unknown kinds. */
	private static final int EUCLIDIAN = 1;

	/** Distance kind: cosine of the angle between the two vectors. */
	private static final int COSINE = 2;

	/** Distance kind: 1 - cosine. */
	private static final int INV_COSINE = 3;

	/** Distance kind: angle between the two vectors in radians. */
	private static final int ANGLE_RADIANT = 4;

	/** Number of dimensions of the object; always equals the length of {@link #vector}. */
	private int dimensions;

	/**
	 * The coordinates of the object (a vector in the mathematical sense, not a
	 * <tt>java.util.Vector</tt>), one value per dimension at indices 0..dimensions-1.
	 */
	private double vector[];

	/** Result of a yes/no outlier test, once such a test has been conducted. */
	private boolean outlierStatus;

	/** Result of a graded outlier test (such as LOF, CBLOF, etc.). */
	private double outlierFactor;

	/** The label of the object to differentiate it in further analysis. */
	private String label;

	/**
	 * <p>List of k-distance containers of this object. The containers are meant to be
	 * kept in ascending order of distance; each container holds the objects which have
	 * the same distance from this object (i.e. lie on the same radius around it).</p>
	 *
	 * <p>This data structure is the basis for computing the k-distance neighbourhoods,
	 * which require at least a certain number of objects within a given distance and at
	 * most a certain smaller number of objects within a smaller distance.</p>
	 */
	private final List<KdistanceContainer> listOfkDContainers;

	/**
	 * The k-distances of the object, e.g. kDistance[2] is the 2-distance. Index 0 is
	 * not used, therefore the array has (upper MinPts bound + 1) entries.
	 */
	private double[] kDistance;

	/**
	 * The local reachability densities of the object, e.g. lrd[2] is the density for
	 * k = 2. Index 0 is not used, therefore the array has (upper MinPts bound + 1)
	 * entries.
	 */
	private double[] lrd;

	/**
	 * The cardinalities of the k-neighbourhoods of the object, e.g. cardN[2] is the
	 * number of objects in the 2-neighbourhood (|N_k(p)|). Index 0 is not used,
	 * therefore the array has (upper MinPts bound + 1) entries.
	 */
	private int[] cardN;

	/**
	 * <p>The LOFs of the object for MinPts = k, e.g. localOutlierFactor[3] is the LOF
	 * for MinPts = 3. Index 0 is not used, therefore the array has
	 * (upper MinPts bound + 1) entries.</p>
	 *
	 * <p>Usually the maximum of all LOF[MinPts] between the lower and the upper MinPts
	 * bound is chosen as the result of the LOF check; that value is expected to be
	 * stored via {@link #setOutlierFactor(double)}.</p>
	 */
	private double[] localOutlierFactor;

	/**
	 * The upper bound for MinPts this object was created with. It determines the size
	 * of the per-k arrays and is otherwise not read inside this class.
	 */
	private int minPtsUpperBound;

	/**
	 * Creates a new <tt>SearchObject</tt> with <i>dim</i> dimensions and the label
	 * <i>l</i>. All coordinates are zero, the outlier status is <tt>false</tt> and the
	 * outlier factor is zero. The per-k arrays are <em>not</em> allocated by this
	 * constructor.
	 *
	 * @param dim number of dimensions
	 * @param l label of the object
	 */
	public SearchObject(int dim, String l) {
		this.dimensions = dim;
		// a freshly allocated array is already filled with zeros
		this.vector = new double[dim];
		// assigned directly: overridable setters must not be called from a constructor
		this.outlierStatus = false;
		this.outlierFactor = 0;
		this.label = l;
		this.listOfkDContainers = new LinkedList<KdistanceContainer>();
	}

	/**
	 * Creates a 2-dimensional <tt>SearchObject</tt> with the label
	 * <i>not labeled object</i>. All coordinates are zero, the outlier status is
	 * <tt>false</tt> and the outlier factor is zero. The class does not provide
	 * sufficient consistency checks to entirely rely on default construction.
	 */
	public SearchObject() {
		this(2, "not labeled object");
	}

	/**
	 * Creates a new <tt>SearchObject</tt> with <i>dim</i> dimensions and the label
	 * <i>l</i> and allocates the per-k arrays (k-distance, lrd, cardinality, LOF) with
	 * <i>minptsub + 1</i> zero-initialized entries each, so that the indices
	 * 1..minptsub can be used directly. All coordinates are zero, the outlier status is
	 * <tt>false</tt> and the outlier factor is zero.
	 *
	 * @param dim number of dimensions
	 * @param l label of the object
	 * @param minptslb lower bound for MinPts; currently not used
	 * @param minptsub upper bound for MinPts
	 */
	public SearchObject(int dim, String l, int minptslb, int minptsub) {
		this(dim, l);
		int entries = minptsub + 1; // index 0 stays unused
		this.cardN = new int[entries];
		this.kDistance = new double[entries];
		this.lrd = new double[entries];
		this.localOutlierFactor = new double[entries];
		this.minPtsUpperBound = minptsub;
	}

	/**
	 * <p>Changes the number of dimensions of the object. The coordinates which exist in
	 * both the old and the new range are kept.</p>
	 *
	 * <p><em>Attention</em>: If the new number of dimensions is smaller than the old
	 * one, the surplus coordinates are dropped. If it is larger, the additional
	 * coordinates are zero and should afterwards be set properly with
	 * {@link #setVektor(int, double)}.</p>
	 *
	 * <p>The safest way to change the dimensions of an object is to create a new one
	 * with the new dimensions and to copy all relevant data.</p>
	 *
	 * @param dim the new number of dimensions
	 */
	public void setDimensions(int dim) {
		// allocate first, so that the object stays untouched if the allocation fails
		double[] resized = new double[dim];
		System.arraycopy(this.vector, 0, resized, 0, Math.min(this.dimensions, dim));
		this.vector = resized;
		this.dimensions = dim;
	}

	/**
	 * Returns the number of dimensions of the object.
	 */
	public int getDimensions() {
		return this.dimensions;
	}

	/**
	 * Sets the label of the object to <i>l</i>.
	 */
	public void setLabel(String l) {
		this.label = l;
	}

	/**
	 * Returns the label of the object (e.g. its "name" for other purposes).
	 */
	public String getLabel() {
		return this.label;
	}

	/**
	 * Sets the coordinate of the object in dimension <i>dim</i> to <i>value</i>. By
	 * calling this method for each dimension the complete vector can be set.
	 *
	 * @param dim index of the dimension (0-based)
	 * @param value the new coordinate
	 */
	public void setVektor(int dim, double value) {
		this.vector[dim] = value;
	}

	/**
	 * Returns the coordinate of the object in dimension <i>dim</i>.
	 *
	 * @param dim index of the dimension (0-based)
	 */
	public double getVektor(int dim) {
		return this.vector[dim];
	}

	/**
	 * Stores the result of a yes/no outlier test (e.g. DB(p,D) outliers and others).
	 *
	 * @param status the outlier status
	 */
	public void setOutlierStatus(boolean status) {
		this.outlierStatus = status;
	}

	/**
	 * Returns the stored boolean outlier status. The value is only meaningful after an
	 * outlier test has set it; the initial value is <tt>false</tt>.
	 */
	public boolean getOutlierStatus() {
		return this.outlierStatus;
	}

	/**
	 * Stores a graded outlier result, such as a local outlier factor.
	 *
	 * @param factor the outlier factor
	 */
	public void setOutlierFactor(double factor) {
		this.outlierFactor = factor;
	}

	/**
	 * Returns the outlier factor of the object.
	 */
	public double getOutlierFactor() {
		return this.outlierFactor;
	}

	/**
	 * <p>Returns the euclidian (metric) distance between this object and
	 * <i>toObject</i>, i.e. the length of the difference of the two vectors.</p>
	 *
	 * <p>If the two objects differ in their number of dimensions, only the first
	 * min(dimensions) coordinates are taken into account. ATTENTION: This yields a
	 * different distance than padding the object with fewer dimensions with zeros
	 * would.</p>
	 *
	 * <p>Callers are expected to perform an integrity check of the dimensions before
	 * relying on the distances, e.g. {@link SearchSpace#dimensionsIntegrityCheck()}.</p>
	 *
	 * @param toObject the object to which the distance is measured
	 */
	public double getDistanceEuclidian(SearchObject toObject) {
		return Math.sqrt(squaredDistanceTo(toObject));
	}

	/**
	 * Returns the distance between this object and <i>toObject</i> according to
	 * <i>kindOfDistance</i>:
	 *
	 * <ul>
	 * <li>0 (SQUARED): the squared euclidian distance, SUM_i((x_i - y_i)^2)</li>
	 * <li>1 (EUCLIDIAN): the euclidian distance, SQUARE_ROOT(SUM_i((x_i - y_i)^2)),
	 * same result as {@link #getDistanceEuclidian(SearchObject)}</li>
	 * <li>2 (COSINE): the cosine of the angle between the two vectors,
	 * SUM_i(x_i * y_i) / (SQUARE_ROOT(SUM_i(x_i^2)) * SQUARE_ROOT(SUM_i(y_i^2)))</li>
	 * <li>3 (INV_COSINE): 1 - cosine, which grows with the angle</li>
	 * <li>4 (ANGLE_RADIANT): the angle between the two vectors (seen from the origin)
	 * in radians within [0 ; pi], i.e. the arc cosine of the cosine</li>
	 * </ul>
	 *
	 * <p>Any other value of <i>kindOfDistance</i> silently falls back to EUCLIDIAN.</p>
	 *
	 * <p>While the euclidian distance measures the norm of the difference of the two
	 * vectors, the cosine based kinds only depend on the angle between them. As the
	 * cosine falls from 1 to 0 while the angle grows from 0 to pi/2, larger angles
	 * have smaller cosine values; 1 - cosine reverses this, but not linearly. The
	 * angle in radians is the linear alternative. Which kind is appropriate depends
	 * on the application.</p>
	 *
	 * <p>The cosine is clamped to [-1 ; 1] before it is used, since rounding can push
	 * it marginally outside of this range (e.g. for parallel vectors), which would
	 * otherwise result in NaN for the angle. If one of the two vectors has length
	 * zero, the cosine is undefined and the cosine based kinds return NaN.</p>
	 *
	 * <p>If the two objects differ in their number of dimensions, only the first
	 * min(dimensions) coordinates are taken into account.</p>
	 *
	 * @param toObject the object to which the distance is measured
	 * @param kindOfDistance the kind of distance as listed above
	 */
	public double getDistance(SearchObject toObject, int kindOfDistance) {
		switch (kindOfDistance) {
			case SQUARED:
				return squaredDistanceTo(toObject);
			case COSINE:
				return cosineTo(toObject);
			case INV_COSINE:
				return 1 - cosineTo(toObject);
			case ANGLE_RADIANT:
				// Math.acos delivers the angle in radians
				return Math.acos(cosineTo(toObject));
			case EUCLIDIAN:
			default:
				// unknown kinds are treated as EUCLIDIAN
				return Math.sqrt(squaredDistanceTo(toObject));
		}
	}

	/**
	 * Sums up the squared coordinate differences to <i>other</i> over the dimensions
	 * both objects have in common.
	 */
	private double squaredDistanceTo(SearchObject other) {
		int commonDimensions = Math.min(this.dimensions, other.getDimensions());
		double sum = 0;
		for (int i = 0; i < commonDimensions; i++) {
			double difference = this.getVektor(i) - other.getVektor(i);
			sum += difference * difference;
		}
		return sum;
	}

	/**
	 * Computes the cosine of the angle between this vector and the vector of
	 * <i>other</i> over the dimensions both objects have in common, clamped to
	 * [-1 ; 1]. NaN (zero-length vector) is passed through unchanged.
	 */
	private double cosineTo(SearchObject other) {
		int commonDimensions = Math.min(this.dimensions, other.getDimensions());
		double dotProduct = 0;
		double squaredNormThis = 0;
		double squaredNormOther = 0;
		for (int i = 0; i < commonDimensions; i++) {
			double x = this.getVektor(i);
			double y = other.getVektor(i);
			dotProduct += x * y;
			squaredNormThis += x * x;
			squaredNormOther += y * y;
		}
		double cosine = dotProduct / (Math.sqrt(squaredNormThis) * Math.sqrt(squaredNormOther));
		// Math.min / Math.max return NaN if an argument is NaN, so NaN survives the clamp
		return Math.max(-1.0, Math.min(1.0, cosine));
	}

	/**
	 * Adds a new KdContainer for this object at position <i>index</i> of the
	 * container list.
	 *
	 * @param index position in the container list
	 */
	public void addKdContainer(int index) {
		this.listOfkDContainers.add(index, new KdistanceContainer(this));
	}

	/**
	 * Adds a new KdContainer for this object at position <i>index</i> of the
	 * container list and sets the distance value of the container to <i>dist</i>.
	 *
	 * @param index position in the container list
	 * @param dist distance value of the new container
	 */
	public void addKdContainer(int index, double dist) {
		KdistanceContainer container = new KdistanceContainer(this);
		container.setDistance(dist);
		this.listOfkDContainers.add(index, container);
	}

	/**
	 * Adds an existing KdContainer to the container list at position <i>index</i>.
	 *
	 * @param index position in the container list
	 * @param kd the container to add
	 */
	public void addKdContainer(int index, KdistanceContainer kd) {
		this.listOfkDContainers.add(index, kd);
	}

	/**
	 * Adds an existing KdContainer at the end of the container list.
	 *
	 * @param kd the container to add
	 */
	public void addKdContainer(KdistanceContainer kd) {
		this.listOfkDContainers.add(kd);
	}

	/**
	 * Adds a new KdContainer for this object at the end of the container list.
	 */
	public void addKdContainer() {
		this.listOfkDContainers.add(new KdistanceContainer(this));
	}

	/**
	 * Returns a ListIterator over the list of containers of this object. The iterator
	 * operates directly on the internal list.
	 */
	public ListIterator<KdistanceContainer> getKdContainerListIterator() {
		return this.listOfkDContainers.listIterator();
	}

	/**
	 * Sets the k-distance of the object for <i>k</i> to <i>dist</i>.
	 *
	 * @param k the k of the k-distance
	 * @param dist the distance value
	 */
	public void setKDistance(int k, double dist) {
		this.kDistance[k] = dist;
	}

	/**
	 * Returns the k-distance of the object for <i>k</i>.
	 *
	 * @param k the k of the k-distance
	 */
	public double getKDistance(int k) {
		return this.kDistance[k];
	}

	/**
	 * Sets the local reachability density of the object for <i>k</i> to
	 * <i>lrdvalue</i>.
	 *
	 * @param k the k of the density
	 * @param lrdvalue the density value
	 */
	public void setLRD(int k, double lrdvalue) {
		this.lrd[k] = lrdvalue;
	}

	/**
	 * Returns the local reachability density of the object for <i>k</i>.
	 *
	 * @param k the k of the density
	 */
	public double getLRD(int k) {
		return this.lrd[k];
	}

	/**
	 * Sets the cardinality of the k-neighbourhood (|N_k(p)|) of the object for
	 * <i>k</i> to <i>card</i>.
	 *
	 * @param k the k of the neighbourhood
	 * @param card the cardinality
	 */
	public void setCardN(int k, int card) {
		this.cardN[k] = card;
	}

	/**
	 * Returns the cardinality of the k-neighbourhood (|N_k(p)|) of the object for
	 * <i>k</i>.
	 *
	 * @param k the k of the neighbourhood
	 */
	public int getCardN(int k) {
		return this.cardN[k];
	}

	/**
	 * Sets the k-LOF of the object for <i>k</i> to <i>lof</i>.
	 *
	 * @param k the MinPts value
	 * @param lof the local outlier factor
	 */
	public void setLOF(int k, double lof) {
		this.localOutlierFactor[k] = lof;
	}

	/**
	 * Returns the k-LOF of the object for <i>k</i>.
	 *
	 * @param k the MinPts value
	 */
	public double getLOF(int k) {
		return this.localOutlierFactor[k];
	}
}