/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.outlier;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
import com.rapidminer.tools.math.container.Range;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;


/**
 * <p>
 * This operator performs a LOF outlier search. LOF outliers or outliers with a local outlier factor
 * per object are density based outliers according to Breunig, Kriegel, et al.
 * </p>
 *
 * <p>
 * The approach to find those outliers is based on measuring the density of objects and its relation
 * to each other (referred to as local reachability density). Based on the average ratio of the
 * local reachability density of an object and its k-nearest neighbours (e.g. the objects in its
 * k-distance neighbourhood), a local outlier factor (LOF) is computed. The approach takes a
 * parameter MinPts (actually specifying the "k") and it uses the maximum LOFs for objects in a
 * MinPts range (lower bound and upper bound to MinPts).
 * </p>
 *
 * <p>
 * Currently, the operator offers the squared distance, the cosine distance, the inverted cosine
 * distance and the angle in addition to the usual Euclidean distance; the function is selected by
 * the corresponding parameter. In the first step, the objects are grouped into containers. For each
 * object, using a radius screening of all other objects, all the available distances between that
 * object and another object (or group of objects) on the (same) radius given by the distance are
 * associated with a container. That container then has the distance information as well as the
 * list of objects within that distance (usually only a few) and the information, how many objects
 * are in the container.
 * </p>
 *
 * <p>
 * In the second step, three things are done: (1) The containers for each object are counted in
 * ascending order according to the cardinality of the object list within the container (= that
 * distance) to find the k-distances for each object and the objects in that k-distance (all objects
 * in all the subsequent containers with a smaller distance). (2) Using this information, the local
 * reachability densities are computed by using the maximum of the actual distance and the
 * k-distance for each object pair (object and objects in k-distance) and averaging it by the
 * cardinality of the k-neighbourhood and then taking the reciprocal value. (3) The LOF is computed
 * for each MinPts value in the range (actually for all up to upper bound) by averaging the ratio
 * between the MinPts-local reachability-density of all objects in the k-neighbourhood and the
 * object itself. The maximum LOF in the MinPts range is passed as final LOF to each object.
 * </p>
 *
 * <p>
 * Afterwards LOFs are added as values for a special real-valued outlier attribute in the example
 * set which the operator will return.
 * </p>
 *
 * @author Stephan Deutsch, Ingo Mierswa
 */
public class LOFOutlierOperator extends AbstractOutlierDetection {

    /** The parameter name for "The lower bound for MinPts for the Outlier test " */
    public static final String PARAMETER_MINIMAL_POINTS_LOWER_BOUND = "minimal_points_lower_bound";

    /** The parameter name for "The upper bound for MinPts for the Outlier test " */
    public static final String PARAMETER_MINIMAL_POINTS_UPPER_BOUND = "minimal_points_upper_bound";

    /**
     * The parameter name for "choose which distance function will be used for calculating the
     * distance between two objects"
     */
    public static final String PARAMETER_DISTANCE_FUNCTION = "distance_function";

    /**
     * Category labels of the distance function parameter. The selected category index is handed
     * on to the search space unchanged, so the order of the entries must not be altered.
     */
    private static final String[] DISTANCE_FUNCTIONS = { "euclidian distance", "squared distance", "cosine distance",
            "inverted cosine distance", "angle" };

    public LOFOutlierOperator(OperatorDescription description) {
        super(description);
    }

    /**
     * This method implements the main functionality of the Operator but can be considered as a sort
     * of wrapper to pass the RapidMiner operator chain data deeper into the SearchSpace class, so
     * do not expect a lot of things happening here.
     *
     * <p>
     * Every example is copied into a {@link SearchObject} (one coordinate per regular attribute),
     * the search space computes the LOF values, and the result is written back into a newly created
     * outlier attribute of the given example set.
     * </p>
     *
     * @param eSet
     *            the example set to analyse; it is extended by the outlier attribute
     * @return the same example set instance, now carrying the outlier attribute
     * @throws OperatorException
     *             if a parameter cannot be read or the process is stopped while the examples are
     *             collected
     */
    @Override
    public ExampleSet apply(ExampleSet eSet) throws OperatorException {
        int minPtsLB = getParameterAsInt(PARAMETER_MINIMAL_POINTS_LOWER_BOUND);
        int minPtsUB = getParameterAsInt(PARAMETER_MINIMAL_POINTS_UPPER_BOUND);
        int kindOfDistance = getParameterAsInt(PARAMETER_DISTANCE_FUNCTION);

        // sanity check of the entered parameters: if the bounds were entered the wrong way round,
        // swap them to have a sensible set of parameters
        int minPtsLowerBound = Math.min(minPtsLB, minPtsUB);
        int minPtsUpperBound = Math.max(minPtsLB, minPtsUB);

        // create a new SearchSpace for the LOF-Outlier search
        int searchSpaceDimension = eSet.getAttributes().size();
        SearchSpace sr = new SearchSpace(searchSpaceDimension, minPtsLowerBound, minPtsUpperBound + 1);

        Attribute[] regularAttributes = eSet.getAttributes().createRegularAttributeArray();

        // now read through the Examples of the ExampleSet
        int counter = 0;
        for (Example example : eSet) {
            // Create a search object with an id-like label and the MinPts range, so that its
            // arrays are initialized. The upper bound is passed as "+ 1" exactly like for the
            // SearchSpace above, because the LOF is computed for every k up to and including the
            // upper bound.
            // NOTE(review): presumably SearchObject sizes its per-k arrays with this value -
            // confirm against SearchObject.
            SearchObject so = new SearchObject(searchSpaceDimension, "object" + counter, minPtsLowerBound,
                    minPtsUpperBound + 1);
            counter++;

            int i = 0;
            for (Attribute attribute : regularAttributes) {
                so.setVektor(i++, example.getValue(attribute));
            }
            sr.addObject(so); // finally add the search object to the search room
            checkForStop();
        }

        // set all Outlier Factors to ZERO to be sure
        sr.resetOutlierStatus();

        // find all Containers for the LOF first
        sr.findAllKdContainers(kindOfDistance, this);

        // perform the LOF-Outlier search
        sr.computeLOF(minPtsLowerBound, minPtsUpperBound, this);

        // create the outlier attribute and register it in its special role
        Attribute outlierAttribute = AttributeFactory.createAttribute(Attributes.OUTLIER_NAME, Ontology.REAL);
        eSet.getExampleTable().addAttribute(outlierAttribute);
        eSet.getAttributes().setOutlier(outlierAttribute);

        // The search objects were added in iteration order, so the n-th example corresponds to the
        // n-th search object.
        counter = 0;
        for (Example example : eSet) {
            SearchObject sobj = sr.getSearchObjects().elementAt(counter);
            example.setValue(outlierAttribute, sobj.getOutlierFactor());
            counter++;
        }

        return eSet;
    }

    /**
     * Announces the real-valued outlier attribute that {@link #apply(ExampleSet)} adds.
     *
     * <p>
     * A local outlier factor is a ratio of densities: it is never negative but has no fixed upper
     * bound (outliers are exactly the objects whose factor lies noticeably above 1). The value
     * range is therefore announced as starting at 0 and being open towards positive infinity.
     * </p>
     *
     * @param metaData
     *            the meta data of the incoming example set
     * @return the given meta data object, extended by the outlier attribute
     */
    @Override
    protected MetaData modifyMetaData(ExampleSetMetaData metaData) {
        AttributeMetaData amd = new AttributeMetaData(Attributes.OUTLIER_NAME, Ontology.REAL, Attributes.OUTLIER_NAME);
        amd.setValueRange(new Range(0, Double.POSITIVE_INFINITY), SetRelation.EQUAL);
        metaData.addAttribute(amd);
        return metaData;
    }

    /**
     * Extends the parameters of the super class by the lower and upper bound for MinPts and by the
     * distance function selection. All three are non-expert parameters.
     *
     * @return the parameter list of the super class with the three parameters appended
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterType type = new ParameterTypeInt(PARAMETER_MINIMAL_POINTS_LOWER_BOUND,
                "The lower bound for MinPts for the Outlier test " + "(default value is 10)", 0, Integer.MAX_VALUE, 10);
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeInt(PARAMETER_MINIMAL_POINTS_UPPER_BOUND,
                "The upper bound for MinPts for the Outlier test " + "(default value is 20)", 0, Integer.MAX_VALUE, 20);
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeCategory(PARAMETER_DISTANCE_FUNCTION,
                "choose which distance function will be used for calculating " + "the distance between two objects",
                DISTANCE_FUNCTIONS, 0);
        type.setExpert(false);
        types.add(type);

        return types;
    }

    /**
     * Isn't called because super method of modifyMetaData is overridden.
     *
     * @return a set containing the two strings "true" and "false"
     */
    @Override
    protected Set<String> getOutlierValues() {
        Set<String> set = new HashSet<>();
        set.add("true");
        set.add("false");
        return set;
    }

    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(),
                LOFOutlierOperator.class, null);
    }
}