/*
* beymani: Outlier and anomaly detection
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.beymani.predictor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.chombo.storm.Cache;
import org.chombo.storm.MessageQueue;
import org.chombo.util.RichAttribute;
import org.chombo.util.RichAttributeSchema;
import org.chombo.util.Utility;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
/**
 * Base class for predictors backed by a bucketed frequency distribution model.
 *
 * The model is a set of lines of the form <code>bucketKey,count</code>. It is
 * loaded either from a cache (Storm usage) or from a file (Hadoop MR usage),
 * together with a {@link RichAttributeSchema} that describes how a record is
 * mapped to a bucket key by {@link #getBucketKey(String)}.
 *
 * Instances reuse a single internal {@link StringBuilder} when building bucket
 * keys, so {@link #getBucketKey(String)} must not be called concurrently on
 * the same instance.
 *
 * @author pranab
 */
public abstract class DistributionBasedPredictor extends ModelBasedPredictor {
	/** Output queue, created only by the Storm constructor. */
	protected MessageQueue outQueue;

	/** Cache the model and schema are read from, created only by the Storm constructor. */
	protected Cache cache;

	/** Distribution model: bucket key mapped to its count. */
	protected Map<String, Integer> distrModel = new HashMap<String, Integer>();

	/** Sum of all counts loaded into {@link #distrModel}. */
	protected int totalCount;

	/** Schema of the fields that make up a bucket key. */
	protected RichAttributeSchema schema;

	/** Scratch buffer reused by {@link #getBucketKey(String)}. */
	protected StringBuilder stBld = new StringBuilder();

	/** Delimiter placed between the per field elements of a bucket key. */
	protected String subFieldDelim = ";";

	/**
	 * Storm usage. Reads the distribution model and the JSON schema from the cache,
	 * using the cache keys configured under <code>distribution.model.key</code> and
	 * <code>schema.key</code>, and the score threshold from <code>score.threshold</code>.
	 *
	 * @param conf configuration map
	 * @throws IllegalStateException if the model or schema is missing from the cache,
	 * or if the schema can not be parsed
	 * @throws IllegalArgumentException if a model line is malformed
	 */
	public DistributionBasedPredictor(Map conf) {
		super();
		outQueue = MessageQueue.createMessageQueue(conf, conf.get("output.queue").toString());
		cache = Cache.createCache(conf);

		//distribution
		String modelKey = conf.get("distribution.model.key").toString();
		String model = cache.get(modelKey);
		if (model == null) {
			throw new IllegalStateException("distribution model not found in cache for key " + modelKey);
		}
		totalCount = 0;
		Scanner scanner = new Scanner(model);
		try {
			while (scanner.hasNextLine()) {
				addDistrEntry(scanner.nextLine());
			}
		} finally {
			scanner.close();
		}

		//schema
		String schemaKey = conf.get("schema.key").toString();
		String schemaStr = cache.get(schemaKey);
		if (schemaStr == null) {
			throw new IllegalStateException("schema not found in cache for key " + schemaKey);
		}
		ObjectMapper mapper = new ObjectMapper();
		try {
			schema = mapper.readValue(schemaStr, RichAttributeSchema.class);
		} catch (JsonParseException e) {
			// keep the cause so the actual JSON problem is visible in the stack trace
			throw new IllegalStateException("invalid JSON schema", e);
		} catch (JsonMappingException e) {
			throw new IllegalStateException("invalid JSON schema", e);
		} catch (IOException e) {
			throw new IllegalStateException("failed to parse JSON schema", e);
		}

		scoreThreshold = Double.parseDouble(conf.get("score.threshold").toString());
	}

	/**
	 * Hadoop MR usage. Reads the distribution model from a file, the schema from
	 * the file configured under <code>dbp.distr.schema.file.path</code> and the
	 * score threshold from <code>dbp.score.threshold</code>.
	 *
	 * @param config Hadoop configuration
	 * @param distrFilePath path of the distribution model file
	 * @throws IOException if the model file can not be read
	 * @throws IllegalArgumentException if a model line is malformed
	 */
	public DistributionBasedPredictor(Configuration config, String distrFilePath) throws IOException {
		super();
		InputStream fs = Utility.getFileStream(config, distrFilePath);
		// explicit charset so that parsing does not depend on the platform default
		BufferedReader reader = new BufferedReader(new InputStreamReader(fs, "UTF-8"));
		try {
			String line = null;
			while ((line = reader.readLine()) != null) {
				addDistrEntry(line);
			}
		} finally {
			// closing the reader also closes the underlying file stream
			reader.close();
		}

		schema = Utility.getRichAttributeSchema(config, "dbp.distr.schema.file.path");
		scoreThreshold = Double.parseDouble(config.get("dbp.score.threshold"));
	}

	/**
	 * Parses one <code>bucketKey,count</code> model line, adds it to the model and
	 * adds the count to the running total. Blank lines are ignored.
	 *
	 * @param line one line of the distribution model
	 * @throws IllegalArgumentException if the line has no count or the count is not an integer
	 */
	private void addDistrEntry(String line) {
		if (line.trim().length() == 0) {
			return;
		}
		String[] items = line.split(",");
		if (items.length < 2) {
			throw new IllegalArgumentException("invalid distribution model line, expected key,count: " + line);
		}
		int count = Integer.parseInt(items[1]);
		totalCount += count;
		distrModel.put(items[0], count);
	}

	/**
	 * Builds the bucket key of a record. The record is split on comma and, for each
	 * schema field, the item at the field ordinal contributes one element:
	 * the raw value for a categorical field, or the value divided by the field
	 * bucket width for an integer or double field (a double value is truncated to
	 * int before the division). Elements are joined with {@link #subFieldDelim}.
	 *
	 * @param record comma separated record
	 * @return bucket key, or an empty string if the schema has no fields
	 */
	protected String getBucketKey(String record) {
		String[] items = record.split(",");
		stBld.setLength(0);

		// NOTE(review): as in the original code this is not reset per field, so a field
		// that is neither categorical, integer nor double reuses the element of the
		// previous field (or "null" when it is the first field). Confirm whether such
		// fields should be skipped or rejected instead.
		String bucketElement = null;
		boolean first = true;
		for (RichAttribute field : schema.getFields()) {
			String item = items[field.getOrdinal()];
			if (field.isCategorical()) {
				bucketElement = item;
			} else if (field.isInteger()) {
				bucketElement = String.valueOf(Integer.parseInt(item) / field.getBucketWidth());
			} else if (field.isDouble()) {
				bucketElement = String.valueOf(((int) Double.parseDouble(item)) / field.getBucketWidth());
			}

			// delimiter goes between elements, so nothing has to be trimmed afterwards;
			// this is safe for an empty schema and for a delimiter of any length
			if (!first) {
				stBld.append(subFieldDelim);
			}
			stBld.append(bucketElement);
			first = false;
		}
		return stBld.toString();
	}
}