/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.outlier;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
/**
 * The SearchObject class creates SearchObjects which handle the representation of objects from the
 * test data set in the core of the outlier operators. Such an object is able to store all relevant
 * coordinates, dimensions, etc. for an object (e.g. from an Example from a RapidMiner ExampleSet)
 * as well as perform various operations, such as computing distances to other objects.
 *
 * <p>
 * The per-k arrays (k-distance, local reachability density, neighbourhood cardinality and LOF) are
 * only allocated by {@link #SearchObject(int, String, int, int)}. Objects created through the other
 * constructors leave them {@code null}, so the corresponding accessors must not be used on them.
 * </p>
 *
 * @author Stephan Deutsch, Ingo Mierswa
 */
public class SearchObject {

    /** Distance selector: squared value of the euclidian distance. */
    private static final int DISTANCE_SQUARED = 0;

    /** Distance selector: euclidian (metric) distance; also the fallback for unknown selectors. */
    private static final int DISTANCE_EUCLIDIAN = 1;

    /** Distance selector: cosine of the angle between the two vectors. */
    private static final int DISTANCE_COSINE = 2;

    /** Distance selector: 1 - cosine of the angle between the two vectors. */
    private static final int DISTANCE_INV_COSINE = 3;

    /** Distance selector: angle between the two vectors in radians. */
    private static final int DISTANCE_ANGLE_RADIANT = 4;

    /**
     * Number of dimensions of the <tt>SearchObject</tt>; always equal to the length of
     * {@link #vector}.
     */
    private int dimensions;

    /**
     * The vector of the object (in the mathematical sense, not a java.util.Vector): one double
     * value per dimension, indices {@code 0 .. dimensions-1}.
     */
    private double[] vector;

    /**
     * The boolean outlier status the object holds after a yes/no outlier test has been conducted.
     */
    private boolean outlierStatus;

    /**
     * The outlier factor as a double value in case the object shall know the results of
     * non-trivial outlier tests (such as LOF, CBLOF, etc.)
     */
    private double outlierFactor;

    /**
     * The label of the object to differentiate it in further analysis.
     */
    private String label;

    /**
     * <p>
     * List (linked list) of k-distance object containers for an object. In this list, in ascending
     * order, the subsets of objects are stored which are in the same distance from the
     * SearchObject.
     * </p>
     *
     * <p>
     * This is (sort of) representing the objects on a radius around the SearchObject; objects at
     * the same distance (on the same radius) are stored in the same container. This data structure
     * is very important to compute the actual k-distance neighbourhoods afterwards, as these
     * neighbourhoods require a certain (at least) number of objects to be within a given distance
     * and a certain smaller (= at most) number of objects to be within a smaller distance.
     * </p>
     *
     * <p>
     * The list itself does not enforce any ordering; containers are stored at whatever position
     * the caller passes to the {@code addKdContainer} methods.
     * </p>
     */
    private final List<KdistanceContainer> listOfkDContainers;

    /**
     * The array of all k-distances of an object, e.g. kDistance[2] represents the 2-distance. It is
     * allocated with {@code minPtsUpperBound + 1} entries so that index k can be used directly;
     * index 0 is allocated but not meant to be used.
     */
    private double[] kDistance;

    /**
     * The array of all local reachability densities of an object, e.g. lrd[2] represents the
     * 2-density. It is allocated with {@code minPtsUpperBound + 1} entries so that index k can be
     * used directly; index 0 is allocated but not meant to be used.
     */
    private double[] lrd;

    /**
     * The array of all cardinalities of the k-neighbourhoods of an object, e.g. cardN[2] represents
     * the number of objects in the 2-neighbourhood (|N_k(p)|). It is allocated with
     * {@code minPtsUpperBound + 1} entries so that index k can be used directly; index 0 is
     * allocated but not meant to be used.
     */
    private int[] cardN;

    /**
     * <p>
     * The array of all LOFs of an object for MinPts=k, e.g. localOutlierFactor[3] represents the
     * LOF for MinPts=3. It is allocated with {@code minPtsUpperBound + 1} entries so that index k
     * can be used directly; index 0 is allocated but not meant to be used.
     * </p>
     *
     * <p>
     * Please be aware that usually for the MinPts-LOF check the maximum LOF will be chosen over all
     * LOF[MinPts] between the lower and the upper MinPts bound. That value is meant to be stored in
     * the SearchObject's outlier factor (see {@link #setOutlierFactor(double)}).
     * </p>
     */
    private double[] localOutlierFactor;

    // A lower bound for MinPts is accepted by the constructor but intentionally not stored.
    // private int minPtsLowerBound;

    /**
     * An upper bound for MinPts for the SearchObject; determines the size of the per-k arrays.
     */
    private int minPtsUpperBound;

    /**
     * Creates a new <tt>SearchObject</tt> with <i>dim</i> dimensions and the label <i>l</i>. Every
     * vector component is 0, the outlier status is false and the outlier factor is 0. The per-k
     * arrays are not allocated by this constructor.
     *
     * @param dim
     *            number of dimensions of the object's vector
     * @param l
     *            label of the object
     * @throws NegativeArraySizeException
     *             if <i>dim</i> is negative
     */
    public SearchObject(int dim, String l) {
        this.dimensions = dim;
        // Java zero-initializes new arrays, so no explicit fill is required.
        this.vector = new double[dim];
        // Assigned directly (not via the setters) so that no overridable method runs on a
        // partially constructed object.
        this.outlierStatus = false;
        this.outlierFactor = 0;
        this.label = l;
        this.listOfkDContainers = new LinkedList<KdistanceContainer>();
    }

    /**
     * Creates a new <tt>SearchObject</tt> with <i>2</i> dimensions and the label <i>not labeled
     * object</i>. Every vector component is 0, the outlier status is false and the outlier factor
     * is 0. This is only a convenience default; the class does not provide sufficient consistency
     * checks to entirely rely on default construction, so prefer the explicit constructors.
     */
    public SearchObject() {
        this(2, "not labeled object");
    }

    /**
     * Creates a new <tt>SearchObject</tt> with <i>dim</i> dimensions, the label <i>l</i> and a
     * MinPts range. Every vector component is 0, the outlier status is false and the outlier factor
     * is 0. The per-k arrays (k-distance, lrd, cardinality, LOF) are allocated with
     * {@code minptsub + 1} zero entries each.
     *
     * @param dim
     *            number of dimensions of the object's vector
     * @param l
     *            label of the object
     * @param minptslb
     *            lower bound of the MinPts range; currently not stored or used
     * @param minptsub
     *            upper bound of the MinPts range; the largest valid k for the per-k accessors
     * @throws NegativeArraySizeException
     *             if <i>dim</i> is negative or <i>minptsub</i> is less than -1
     */
    public SearchObject(int dim, String l, int minptslb, int minptsub) {
        this(dim, l);
        this.minPtsUpperBound = minptsub;
        // Index k is used directly (1..minptsub), hence one extra slot; index 0 stays unused.
        final int size = this.minPtsUpperBound + 1;
        // New arrays are zero-initialized by the JVM, so no explicit fill is required.
        this.cardN = new int[size];
        this.kDistance = new double[size];
        this.lrd = new double[size];
        this.localOutlierFactor = new double[size];
    }

    /**
     * <p>
     * Changes the number of dimensions of the object and carries the values of the old vector over
     * into the new vector.
     * </p>
     *
     * <p>
     * <em>Attention</em>: If the new dimension number is less than the old number, only the first
     * <i>dim</i> values are kept. If the new vector has more dimensions, all old values are kept
     * and the additional ones are 0. Those should afterwards be set with
     * {@link #setVektor(int, double)} in a proper manner.
     * </p>
     *
     * <p>
     * The safest way to change the dimensions of an object is to create a new one with the new
     * dimensions and to copy the vector values and all other relevant data and to initialize the
     * additional dimensions with the proper values.
     * </p>
     *
     * @param dim
     *            the new number of dimensions
     * @throws NegativeArraySizeException
     *             if <i>dim</i> is negative; the object is left unchanged in that case
     */
    public void setDimensions(int dim) {
        // Allocate before touching any field so that an invalid dim cannot leave the object with
        // a dimension count that disagrees with its vector.
        final double[] resized = new double[dim];
        System.arraycopy(this.vector, 0, resized, 0, Math.min(this.dimensions, dim));
        this.vector = resized;
        this.dimensions = dim;
    }

    /**
     * Returns the number of dimensions of the object.
     *
     * @return the number of dimensions
     */
    public int getDimensions() {
        return this.dimensions;
    }

    /**
     * Sets the label of the object to <i>l</i>.
     *
     * @param l
     *            the new label
     */
    public void setLabel(String l) {
        this.label = l;
    }

    /**
     * Returns the label of the object (e.g. its "name" for other purposes).
     *
     * @return the label
     */
    public String getLabel() {
        return this.label;
    }

    /**
     * Sets the vector component for dimension <i>dim</i> to <i>value</i>; by calling this method
     * for every dimension the whole vector of an object can be set.
     *
     * @param dim
     *            zero-based index of the dimension
     * @param value
     *            the new value for that dimension
     * @throws ArrayIndexOutOfBoundsException
     *             if <i>dim</i> is outside {@code 0 .. getDimensions()-1}
     */
    public void setVektor(int dim, double value) {
        this.vector[dim] = value;
    }

    /**
     * Returns the value of the object's vector for dimension <i>dim</i>.
     *
     * @param dim
     *            zero-based index of the dimension
     * @return the value stored for that dimension
     * @throws ArrayIndexOutOfBoundsException
     *             if <i>dim</i> is outside {@code 0 .. getDimensions()-1}
     */
    public double getVektor(int dim) {
        return this.vector[dim];
    }

    /**
     * Sets a boolean outlier status for the object to store the results of outlier tests with a
     * yes/no outlier state (e.g. DB(p,D) outliers and others).
     *
     * @param status
     *            the new outlier status
     */
    public void setOutlierStatus(boolean status) {
        this.outlierStatus = status;
    }

    /**
     * Returns the boolean outlier status of the object. The status is only meaningful after an
     * outlier test has stored it via {@link #setOutlierStatus(boolean)}; until then it is the
     * initial value {@code false}.
     *
     * @return the stored outlier status
     */
    public boolean getOutlierStatus() {
        return this.outlierStatus;
    }

    /**
     * Sets a (double) outlier <i>factor</i> to store smooth outlier status information, such as
     * local outlier factors and others.
     *
     * @param factor
     *            the new outlier factor
     */
    public void setOutlierFactor(double factor) {
        this.outlierFactor = factor;
    }

    /**
     * Returns the outlier factor of the object.
     *
     * @return the stored outlier factor
     */
    public double getOutlierFactor() {
        return this.outlierFactor;
    }

    /**
     * <p>
     * Returns the euclidian (metric) distance between two SearchObjects, i.e. the length of the
     * difference of the two objects' vectors.
     * </p>
     *
     * <p>
     * If the objects differ in their number of dimensions, only the first
     * {@code min(dimensions)} components of both are considered, so a higher dimensional object is
     * treated as if it had only as many dimensions as the other one. ATTENTION: This - of course -
     * yields a different distance than padding the lower dimensional object with zeros.
     * </p>
     *
     * <p>
     * It would be expected that an integrity check would be performed before using the distance
     * functions from any functions utilizing this distance. E.g.
     * {@link SearchSpace#dimensionsIntegrityCheck()} provides such an integrity check for a search
     * room's dimensions (although that function does not check object to object integrity
     * separately).
     * </p>
     *
     * @param toObject
     *            the object to which the distance is measured
     * @return the euclidian distance over the common dimensions
     */
    public double getDistanceEuclidian(SearchObject toObject) {
        return Math.sqrt(squaredDistanceTo(toObject));
    }

    /**
     * Returns the distance between this object and <i>toObject</i> according to the selector
     * <i>kindOfDistance</i>:
     * <ul>
     * <li>0 - SQUARED: the squared value of the euclidian distance</li>
     * <li>1 - EUCLIDIAN: the euclidian (metric) distance</li>
     * <li>2 - COSINE: the cosine of the angle between the two vectors</li>
     * <li>3 - INV_COSINE: 1 - cosine</li>
     * <li>4 - ANGLE_RADIANT: the angle between the two vectors (arc cosine), in radians within
     * [0 ; pi]</li>
     * </ul>
     *
     * <p>
     * This method generalizes {@link #getDistanceEuclidian(SearchObject)}, which is only capable of
     * computing the EUCLIDIAN distance. Any other selector value silently falls back to EUCLIDIAN;
     * no warning is emitted.
     * </p>
     *
     * <p>
     * As in {@link #getDistanceEuclidian(SearchObject)}, only the first {@code min(dimensions)}
     * components of both vectors are considered.
     * </p>
     *
     * <p>
     * The measures are computed as follows:
     * </p>
     * <p>
     * d_euclidian(X,Y) = SQUARE_ROOT(SUM_i((x_i - y_i)^2)) and
     * </p>
     * <p>
     * d_cosine(X,Y) = SUM_i(x_i * y_i) / (SQUARE_ROOT(SUM_i(x_i^2)) * SQUARE_ROOT(SUM_i(y_i^2)))
     * </p>
     * <p>
     * In other words, while the euclidian distance measures the metric distance between two
     * vectors (the norm of their difference), the cosine distance measures the cosine of the angle
     * between the two vectors. The cosine distance is used especially for measuring the similarity
     * between texts represented by their vectorized term structure (e.g. TF/IDF) for the purpose
     * of Information Retrieval.
     * </p>
     * <p>
     * The cosine decreases while the angle grows, so larger angles yield smaller values. The
     * inverted cosine (1 - cos) turns this around so that growing angles yield growing values, but
     * not linearly in the angle. The angle in radians finally gives a distance that grows linearly
     * with the angle between the objects.
     * </p>
     * <p>
     * The computed cosine is clamped to [-1 ; 1] to compensate floating point rounding, which
     * otherwise could produce NaN angles or slightly negative inverted cosines for (nearly)
     * parallel vectors. If one of the vectors has length zero over the common dimensions the
     * cosine is undefined and the three cosine based measures return NaN.
     * </p>
     * <p>
     * Overall, the user should decide which kind of distance is to be used depending on the actual
     * application, as some distance measures can have very surprising effects if used in the wrong
     * way.
     * </p>
     *
     * @param toObject
     *            the object to which the distance is measured
     * @param kindOfDistance
     *            selector of the distance measure (0..4, see above); other values mean EUCLIDIAN
     * @return the distance according to the selected measure
     */
    public double getDistance(SearchObject toObject, int kindOfDistance) {
        switch (kindOfDistance) {
            case DISTANCE_SQUARED:
                return squaredDistanceTo(toObject);
            case DISTANCE_COSINE:
                return cosineTo(toObject);
            case DISTANCE_INV_COSINE:
                return 1 - cosineTo(toObject);
            case DISTANCE_ANGLE_RADIANT:
                // Math.acos returns the angle in radians within [0 ; pi]
                return Math.acos(cosineTo(toObject));
            case DISTANCE_EUCLIDIAN:
            default:
                // unknown selectors fall back to the euclidian distance
                return Math.sqrt(squaredDistanceTo(toObject));
        }
    }

    /**
     * Returns the number of dimensions this object and <i>other</i> have in common, i.e. the
     * smaller of the two dimension counts.
     */
    private int commonDimensions(SearchObject other) {
        return Math.min(this.dimensions, other.getDimensions());
    }

    /**
     * Returns the sum of squared component differences to <i>other</i> over the common dimensions.
     */
    private double squaredDistanceTo(SearchObject other) {
        final int limit = commonDimensions(other);
        double sum = 0;
        for (int i = 0; i < limit; i++) {
            final double diff = this.getVektor(i) - other.getVektor(i);
            sum += diff * diff;
        }
        return sum;
    }

    /**
     * Returns the cosine of the angle between this vector and the vector of <i>other</i> over the
     * common dimensions, clamped to [-1 ; 1]; NaN if either vector has zero length.
     */
    private double cosineTo(SearchObject other) {
        final int limit = commonDimensions(other);
        double dotProduct = 0;
        double squaredNormThis = 0;
        double squaredNormOther = 0;
        for (int i = 0; i < limit; i++) {
            final double x = this.getVektor(i);
            final double y = other.getVektor(i);
            dotProduct += x * y;
            squaredNormThis += x * x;
            squaredNormOther += y * y;
        }
        final double cosine = dotProduct / (Math.sqrt(squaredNormThis) * Math.sqrt(squaredNormOther));
        // Rounding can push the quotient marginally outside [-1 ; 1]; Math.min/max keep NaN as NaN.
        return Math.max(-1.0, Math.min(1.0, cosine));
    }

    /**
     * Adds a new KdContainer for this SearchObject at position <i>index</i> in the container list.
     *
     * @param index
     *            position in the container list at which the new container is inserted
     * @throws IndexOutOfBoundsException
     *             if <i>index</i> is negative or greater than the current list size
     */
    public void addKdContainer(int index) {
        this.listOfkDContainers.add(index, new KdistanceContainer(this));
    }

    /**
     * Adds a new KdContainer for this SearchObject at position <i>index</i> in the container list
     * and sets the distance value of the container to <i>dist</i>.
     *
     * @param index
     *            position in the container list at which the new container is inserted
     * @param dist
     *            distance value stored in the new container
     * @throws IndexOutOfBoundsException
     *             if <i>index</i> is negative or greater than the current list size
     */
    public void addKdContainer(int index, double dist) {
        final KdistanceContainer container = new KdistanceContainer(this);
        container.setDistance(dist);
        this.listOfkDContainers.add(index, container);
    }

    /**
     * Adds an existing KdContainer to the container list at position <i>index</i>.
     *
     * @param index
     *            position in the container list at which the container is inserted
     * @param kd
     *            the container to insert
     * @throws IndexOutOfBoundsException
     *             if <i>index</i> is negative or greater than the current list size
     */
    public void addKdContainer(int index, KdistanceContainer kd) {
        this.listOfkDContainers.add(index, kd);
    }

    /**
     * Adds an existing KdContainer at the end of the container list.
     *
     * @param kd
     *            the container to append
     */
    public void addKdContainer(KdistanceContainer kd) {
        this.listOfkDContainers.add(kd);
    }

    /**
     * Adds a new KdContainer for this SearchObject at the end of the container list.
     */
    public void addKdContainer() {
        this.listOfkDContainers.add(new KdistanceContainer(this));
    }

    /**
     * Returns a ListIterator over the list of containers of the SearchObject, positioned at the
     * start of the list.
     *
     * @return a list iterator over the containers
     */
    public ListIterator<KdistanceContainer> getKdContainerListIterator() {
        return this.listOfkDContainers.listIterator();
    }

    /**
     * Sets the k-distance of the SearchObject for <i>k</i> to <i>dist</i>. Requires an object
     * created with {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the k to store the distance for, at most the MinPts upper bound
     * @param dist
     *            the k-distance
     */
    public void setKDistance(int k, double dist) {
        this.kDistance[k] = dist;
    }

    /**
     * Returns the k-distance of the SearchObject for <i>k</i>. Requires an object created with
     * {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the k to read the distance for, at most the MinPts upper bound
     * @return the stored k-distance
     */
    public double getKDistance(int k) {
        return this.kDistance[k];
    }

    /**
     * Sets the local reachability density of the SearchObject for <i>k</i> to <i>lrdvalue</i>.
     * Requires an object created with {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the k to store the density for, at most the MinPts upper bound
     * @param lrdvalue
     *            the local reachability density
     */
    public void setLRD(int k, double lrdvalue) {
        this.lrd[k] = lrdvalue;
    }

    /**
     * Returns the local reachability density of the SearchObject for <i>k</i>. Requires an object
     * created with {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the k to read the density for, at most the MinPts upper bound
     * @return the stored local reachability density
     */
    public double getLRD(int k) {
        return this.lrd[k];
    }

    /**
     * Sets the cardinality of the k-neighbourhood (|N_k(p)|) of the SearchObject for <i>k</i> to
     * <i>card</i>. Requires an object created with {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the k to store the cardinality for, at most the MinPts upper bound
     * @param card
     *            the number of objects in the k-neighbourhood
     */
    public void setCardN(int k, int card) {
        this.cardN[k] = card;
    }

    /**
     * Returns the cardinality of the k-neighbourhood (|N_k(p)|) of the SearchObject for <i>k</i>.
     * Requires an object created with {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the k to read the cardinality for, at most the MinPts upper bound
     * @return the stored cardinality
     */
    public int getCardN(int k) {
        return this.cardN[k];
    }

    /**
     * Sets the k-LOF of the SearchObject for <i>k</i> to <i>lof</i>. Requires an object created
     * with {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the MinPts value to store the LOF for, at most the MinPts upper bound
     * @param lof
     *            the local outlier factor
     */
    public void setLOF(int k, double lof) {
        this.localOutlierFactor[k] = lof;
    }

    /**
     * Returns the k-LOF of the SearchObject for <i>k</i>. Requires an object created with
     * {@link #SearchObject(int, String, int, int)}.
     *
     * @param k
     *            the MinPts value to read the LOF for, at most the MinPts upper bound
     * @return the stored local outlier factor
     */
    public double getLOF(int k) {
        return this.localOutlierFactor[k];
    }
}