/*
* beymani: Outlier and anamoly detection
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.beymani.predictor;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.chombo.util.RichAttribute;
import org.chombo.util.Utility;
/**
* Outlier detection based weighted cumulative probability of all attributes
* @author pranab
*
*/
public class EsimatedAttrtibuteProbabilityBasedPredictor extends DistributionBasedPredictor {
private Map<Integer, Map<String, Integer>> attrDistr = new HashMap<Integer, Map<String, Integer>>();
private Map<Integer, Integer> attrDistrCounts = new HashMap<Integer, Integer>();
private double[] attrWeights;
private boolean requireMissingAttrValue;
private String fieldDelim;
/**
* Storm usage
* @param conf
*/
public EsimatedAttrtibuteProbabilityBasedPredictor(Map conf) {
super(conf);
//per attribute distribution
buildAttributeWiseDistr();
//attribute weights
String[] weightStrs = conf.get("attr.weight").toString().split(",");
attrWeights = new double[weightStrs.length];
for (int a = 0; a < weightStrs.length; ++a) {
attrWeights[a] = Double.parseDouble(weightStrs[a]);
}
requireMissingAttrValue = Boolean.parseBoolean(conf.get("require.missing.attr.value").toString());
realTimeDetection = true;
}
/**
* Hadoop MR usage
* @param config
* @param distrFilePath
* @throws IOException
*/
public EsimatedAttrtibuteProbabilityBasedPredictor(Configuration config, String distrFilePath, String attrWeightParam,
String scoreThresholdParam, String fieldDelimParam) throws IOException {
super(config, distrFilePath);
buildAttributeWiseDistr();
//attribute weights
fieldDelim = config.get(fieldDelimParam, ",");
attrWeights = Utility.doubleArrayFromString(config.get(attrWeightParam), fieldDelim);
scoreThreshold = Double.parseDouble( config.get( scoreThresholdParam));
}
/**
*
*/
private void buildAttributeWiseDistr() {
//per attribute distribution
int i = 0;
for (RichAttribute field : schema.getFields()) {
Integer ordinal = field.getOrdinal();
Map<String, Integer> distr = attrDistr.get(ordinal);
if (null == distr){
distr = new HashMap<String, Integer>();
attrDistr.put(ordinal, distr);
}
int totalCount = 0;
for (String bucket : distrModel.keySet()) {
String[] items = bucket.split(subFieldDelim);
String attrBucket = items[i];
int bucketCount = distrModel.get(bucket);
Integer count = distr.get(attrBucket);
if (null == count) {
distr.put(attrBucket, bucketCount);
} else {
distr.put(attrBucket, count + bucketCount);
}
totalCount += bucketCount;
}
attrDistrCounts.put(ordinal, totalCount);
++i;
}
}
@Override
public double execute(String entityID, String record) {
String bucketKey = getBucketKey(record);
String[] bucketElements = bucketKey.split(subFieldDelim);
int i = 0;
double score = 0;
int rareCount = 0;
for (RichAttribute field : schema.getFields()) {
Integer ordinal = field.getOrdinal();
String bucketElem = bucketElements[i];
Integer count = attrDistr.get(ordinal).get(bucketElem);
if (null == count){
++rareCount;
}
double pr = count != null ? ((double)count / attrDistrCounts.get(ordinal)) : 0;
score += attrWeights[i] * (1.0 - pr);
++i;
}
if (requireMissingAttrValue && rareCount == 0) {
score = 0;
}
scoreAboveThreshold = score > scoreThreshold;
if (realTimeDetection && scoreAboveThreshold) {
//write if above threshold
outQueue.send(entityID + " " + score);
}
return score;
}
}