/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.outlier;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Set;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
import com.rapidminer.tools.math.similarity.DistanceMeasures;
/**
* <p>
* This operator performs a Class Outlier Factor (COF) search. The COF (Class Outliers) method searches for
* observations (objects) that arouse suspicion, taking into account the class labels, according to the definition
* of Class Outlier by Hewaihi and Saad in "A comparative Study of Outlier Mining and Class Outlier Mining", CS Letters,
* Vol 1, No 1 (2009), and "Class Outliers Mining: Distance-Based Approach", International Journal of Intelligent
* Systems and Technologies, Vol. 2, No. 1, pp 55-68, 2007.
* </p>
* <p>
* It detects rare / exceptional / suspicious cases with respect to a group of similar cases. The main key factors of
* computing COF are the probability of the instance's class among its neighbors' classes, the deviation of the
* instance from the instances of the same class, and the distance between the instance and its k nearest neighbors.
* </p>
*
* <p>
* The main concept of the ECODB (Enhanced Class Outlier - Distance Based) algorithm is to rank each instance in the
* dataset D given the parameters N (top N class outliers) and K (the number of nearest neighbors). The rank of each
* instance is found using the formula (COF = PCL(T,K) - norm(deviation(T)) + norm(kDist(T))), where PCL(T,K) is the
* probability of the class label of the instance T with respect to the class labels of its K nearest neighbors, and
* norm(Deviation(T)) and norm(KDist(T)) are the normalized values of Deviation(T) and KDist(T) respectively; their
* values fall into the range [0 - 1]. Deviation(T) is how much the instance T deviates from instances of the same
* class, and is computed by summing the distances between the instance T and every instance belonging to the same
* class as the instance. KDist(T) is the summation of distances between the instance T and its K nearest neighbors.
* </p>
*
* <p>
* The ECODB algorithm maintains a list of only the instances of the top N class outliers. The less is the value of COF
* of an instance, the higher is the priority of the instance to be a class outlier.
* </p>
*
* <p>
* The operator supports mixed Euclidean distance. The operator takes an example set and passes it on with a boolean
* top-n COF outlier status in a new boolean-valued special outlier attribute indicating true (outlier) and false (no
* outlier), and another special attribute "COF Factor" which measures the degree of being a Class Outlier for an object.
* </p>
*
* @author Motaz K. Saad
*/
public class EcodbOperator extends AbstractOutlierDetection {

    /** The parameter name for "Specifies the k value for the k-th nearest neighbours to be the analyzed." */
    public static final String PARAMETER_NUMBER_OF_NEIGHBORS = "number_of_neighbors";

    /** The parameter name for "The number of top-n Class Outliers to be looked for." */
    public static final String PARAMETER_NUMBER_OF_Class_OUTLIERS = "number_of_class_outliers";

    /** Name of the binominal attribute that flags an example as class outlier. */
    private static final String OUTLIER_ATTRIBUTE_NAME = "Outlier";

    /** Name (and special role name) of the real-valued attribute that receives the COF value. */
    private static final String COF_ATTRIBUTE_NAME = "COF Factor";

    /** Nominal value written to the outlier attribute for examples that are not among the top n. */
    private static final String VALUE_FALSE = "false";

    /** Nominal value written to the outlier attribute for the top-n class outliers. */
    private static final String VALUE_TRUE = "true";

    public EcodbOperator(OperatorDescription description) {
        super(description);
    }

    /**
     * Runs the class outlier search on the given example set.
     * <p>
     * The method reads the two integer parameters, initializes the configured distance measure, verifies that a
     * nominal label exists, adds the "Outlier" and "COF Factor" attributes to the example set, and then delegates
     * the actual computation to {@link COFObject}. Every example is first marked as "false" with a COF value of
     * positive infinity; afterwards the examples selected as top-n class outliers are marked "true" and receive
     * their COF value.
     * </p>
     *
     * @param eSet the example set to analyze; it is modified in place
     * @return the same example set instance, extended by the two new special attributes
     * @throws OperatorException if a parameter cannot be read or the distance measure cannot be created; a
     *         {@link UserError} with code 105 if no label is present and with code 101 if the label is not nominal
     */
    @Override
    public ExampleSet apply(ExampleSet eSet) throws OperatorException {
        int k = getParameterAsInt(PARAMETER_NUMBER_OF_NEIGHBORS);
        int n = getParameterAsInt(PARAMETER_NUMBER_OF_Class_OUTLIERS);

        // The measure is initialized before the result attributes are added, as in the original flow.
        DistanceMeasure measure = DistanceMeasures.createMeasure(this);
        measure.init(eSet);

        checkNominalLabel(eSet);

        // Class outlier flag (true or false); "false" is mapped first, then "true".
        Attribute outlierAttribute = AttributeFactory.createAttribute(OUTLIER_ATTRIBUTE_NAME, Ontology.BINOMINAL);
        outlierAttribute.getMapping().mapString(VALUE_FALSE);
        outlierAttribute.getMapping().mapString(VALUE_TRUE);
        eSet.getExampleTable().addAttribute(outlierAttribute);

        // Class outlier factor (COF) attribute.
        Attribute cofAttribute = AttributeFactory.createAttribute(COF_ATTRIBUTE_NAME, Ontology.REAL);
        eSet.getExampleTable().addAttribute(cofAttribute);

        // Register both attributes as special attributes of the example set.
        eSet.getAttributes().setOutlier(outlierAttribute);
        eSet.getAttributes().setSpecialAttribute(cofAttribute, COF_ATTRIBUTE_NAME);

        // Reset all examples: flag "false" and COF positive infinity.
        double falseIndex = outlierAttribute.getMapping().mapString(VALUE_FALSE);
        for (Example example : eSet) {
            example.setValue(outlierAttribute, falseIndex);
            example.setValue(cofAttribute, Double.POSITIVE_INFINITY);
        }

        ArrayList<COFObject> cofObjects = createCofObjects(eSet);
        PriorityQueue<COFObject> topOutliers = rankTopOutliers(cofObjects, k, n, measure);

        // Write outlier status and COF value back for the selected examples only.
        double trueIndex = outlierAttribute.getMapping().mapString(VALUE_TRUE);
        for (COFObject outlier : topOutliers) {
            Example example = eSet.getExample(outlier.getId());
            example.setValue(outlierAttribute, trueIndex);
            example.setValue(cofAttribute, outlier.getCOF());
        }
        return eSet;
    }

    /**
     * Verifies that the example set has a label attribute and that this label is nominal.
     *
     * @param eSet the example set to check
     * @throws UserError with code 105 if there is no label, with code 101 if the label is not nominal
     */
    private void checkNominalLabel(ExampleSet eSet) throws UserError {
        Attribute label = eSet.getAttributes().getLabel();
        if (label == null) {
            throw new UserError(this, 105);
        }
        if (!label.isNominal()) {
            throw new UserError(this, 101, eSet.getName(), label.getName());
        }
    }

    /**
     * Converts every example into a {@link COFObject} holding its attribute values as a double array, its label
     * value, an initial COF of positive infinity and its position in the example set as id.
     * <p>
     * The value array is filled from the attributes obtained by iterating {@code eSet.getAttributes()}
     * (presumably the regular, non-special attributes - confirm against the Attributes API).
     * </p>
     *
     * @param eSet the example set to convert
     * @return one COF object per example, in iteration order of the example set
     */
    private static ArrayList<COFObject> createCofObjects(ExampleSet eSet) {
        List<Attribute> valueAttributes = new ArrayList<Attribute>(eSet.getAttributes().size());
        for (Attribute attribute : eSet.getAttributes()) {
            valueAttributes.add(attribute);
        }

        // Kept as ArrayList because it is handed to COFObject.computeCOF, whose signature is not visible here.
        ArrayList<COFObject> cofObjects = new ArrayList<COFObject>();
        int id = 0;
        for (Example example : eSet) {
            double[] values = new double[valueAttributes.size()];
            int column = 0;
            for (Attribute attribute : valueAttributes) {
                values[column++] = example.getValue(attribute);
            }
            cofObjects.add(new COFObject(values, example.getLabel(), Double.POSITIVE_INFINITY, id++));
        }
        return cofObjects;
    }

    /**
     * Computes the COF of every object, keeps at most {@code n} of them in a priority queue and recomputes the COF
     * of the kept objects with the observed minimum / maximum deviation and k-distance.
     * <p>
     * NOTE(review): the queue uses the natural ordering of {@link COFObject}, which is not visible here. The
     * selection only keeps the n smallest COF values if that ordering places the largest COF at the head of the
     * queue - confirm against COFObject.compareTo.
     * </p>
     *
     * @param cofObjects all objects of the data set
     * @param k the number of nearest neighbors passed to {@code computeCOF}
     * @param n the maximum number of objects to keep
     * @param measure the initialized distance measure
     * @return the queue of selected objects after their COF has been recomputed
     */
    private static PriorityQueue<COFObject> rankTopOutliers(ArrayList<COFObject> cofObjects, int k, int n,
            DistanceMeasure measure) {
        double maxKDist = Double.NEGATIVE_INFINITY;
        double minKDist = Double.POSITIVE_INFINITY;
        double maxDev = Double.NEGATIVE_INFINITY;
        double minDev = Double.POSITIVE_INFINITY;

        // Phase 1: compute the COF of every object and track the value ranges used for normalization.
        // Plain comparisons are used on purpose: unlike Math.max/min they leave the bounds untouched for NaN.
        for (COFObject cofObject : cofObjects) {
            cofObject.computeCOF(cofObjects, k, measure);

            double kDist = cofObject.getKDist();
            if (kDist > maxKDist) {
                maxKDist = kDist;
            }
            if (kDist < minKDist) {
                minKDist = kDist;
            }

            double deviation = cofObject.getDeviation();
            if (deviation > maxDev) {
                maxDev = deviation;
            }
            if (deviation < minDev) {
                minDev = deviation;
            }
        }

        // Keep at most n objects: once the queue is full, a candidate replaces the head if its COF is smaller.
        PriorityQueue<COFObject> topOutliers = new PriorityQueue<COFObject>();
        for (COFObject candidate : cofObjects) {
            if (topOutliers.size() < n) {
                topOutliers.offer(candidate);
            } else if (candidate.getCOF() < topOutliers.peek().getCOF()) {
                topOutliers.remove();
                topOutliers.offer(candidate);
            }
        }

        // Phase 2: recompute the COF of the kept objects using the normalization bounds from phase 1.
        for (COFObject outlier : topOutliers) {
            outlier.recomputeCOF(minDev, maxDev, minKDist, maxKDist);
        }
        return topOutliers;
    }

    /**
     * Returns the parameters of the super class followed by the number of neighbors (default 7, minimum 1), the
     * number of class outliers (default 10, minimum 1) and the parameters of the distance measure.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        // The description texts state the default and minimum actually passed to ParameterTypeInt below.
        types.add(new ParameterTypeInt(PARAMETER_NUMBER_OF_NEIGHBORS,
                "Specifies the k value for the k-th nearest neighbours to be the analyzed. (default value is 7, minimum 1)",
                1, Integer.MAX_VALUE, 7, false));
        types.add(new ParameterTypeInt(PARAMETER_NUMBER_OF_Class_OUTLIERS,
                "The number of top-n Class Outliers to be looked for. (default value is 10, minimum 1)",
                1, Integer.MAX_VALUE, 10, false));
        types.addAll(DistanceMeasures.getParameterTypes(this));
        return types;
    }

    /**
     * Returns the two nominal values, "true" and "false", that this operator writes to the outlier attribute.
     */
    @Override
    protected Set<String> getOutlierValues() {
        HashSet<String> values = new HashSet<String>();
        values.add(VALUE_TRUE);
        values.add(VALUE_FALSE);
        return values;
    }

    /**
     * Returns the resource consumption estimator looked up for this operator class and its input port.
     */
    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), EcodbOperator.class, null);
    }
}