/*
* chombo: Hadoop Map Reduce utility
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.chombo.mr;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.chombo.util.Attribute;
import org.chombo.util.GenericAttributeSchema;
import org.chombo.util.NumericalAttrStatsManager;
import org.chombo.util.ProcessorAttribute;
import org.chombo.util.ProcessorAttributeSchema;
import org.chombo.util.StatsParameters;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;
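/**
 * Map only MR job that normalizes the numerical attributes of delimited records with
 * either z-score or min-max scaling, passing other fields through unchanged. The
 * attribute schema is read from nan.schema.file.path and the per attribute stats
 * (mean, std dev, min, max) from nan.stats.file.path, e.g. as generated by an
 * upstream numerical stats job. The per attribute strategy comes either from a data
 * cleanser schema (nan.cleanser.schema.file.path) or from nan.attr.normalizer.list.
 * Optional: nan.attr.scale, nan.dec.precision and nan.outlier.truncation.level.
 * @author pranab
 *
 */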
public class NumericalAttrNormalizer extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
String jobName = "Numerical attribute normalizer MR";
job.setJobName(jobName);
job.setJarByClass(NumericalAttrNormalizer.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
Utility.setConfiguration(job.getConfiguration());
job.setMapperClass(NumericalAttrNormalizer.NormalizerMapper.class);
job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		//map only job; without this a needless identity reduce stage would run
		job.setNumReduceTasks(0);
		return job.waitForCompletion(true) ? 0 : 1;
}
	/**
	 * Mapper for numerical attribute normalization. Replaces each numerical field
	 * with its z-score or min-max normalized value, passes other fields through
	 * unchanged and drops any record containing a z-score outlier.
	 * @author pranab
	 *
	 */
public static class NormalizerMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
private Text outVal = new Text();
private String fieldDelimRegex;
private String fieldDelimOut;
private StringBuilder stBld = new StringBuilder();
private String itemValue;
private String[] items;
private GenericAttributeSchema schema;
private int scale;
private Map<Integer, StatsParameters> attrStats = new HashMap<Integer, StatsParameters>();
private int intFieldValue;
private double dblFieldValue;
private double normFieldValue;
private String decFormat;
private float outlierTruncationLevel;
private Map<Integer, String> normalizers = new HashMap<Integer, String>();
private StatsParameters stats;
private Attribute attr;
private ProcessorAttributeSchema cleanserSchema;
		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
Configuration config = context.getConfiguration();
fieldDelimRegex = config.get("field.delim.regex", ",");
fieldDelimOut = config.get("field.delim", ",");
//schema
InputStream is = Utility.getFileStream(config, "nan.schema.file.path");
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(is, GenericAttributeSchema.class);
//stats data
String statsFilePath = config.get("nan.stats.file.path");
if (null == statsFilePath) {
throw new IllegalArgumentException("stats file path missing");
}
NumericalAttrStatsManager statsManager = new NumericalAttrStatsManager(config, statsFilePath, ",");
//mean and std dev
for (int i = 0; i < schema.getAttributeCount(); ++i) {
Attribute attr = schema.findAttributeByOrdinal(i);
if (attr.isInteger() || attr.isDouble()) {
attrStats.put(i, statsManager.getStatsParameters(i));
}
}
//scaling data
scale = config.getInt("nan.attr.scale", -1);
//decimal formatting
int decPrecision = config.getInt("nan.dec.precision", 3);
decFormat = "%." + decPrecision + "f";
//outlier truncation level
outlierTruncationLevel = config.getFloat("nan.outlier.truncation.level", (float)-1.0);
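			//records whose z-score magnitude exceeds this level are dropped entirely; disabled when not positive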
//data cleanser schema
			String cleanserSchemaPath = config.get("nan.cleanser.schema.file.path");
			if (null != cleanserSchemaPath) {
				is = Utility.getFileStream(config, "nan.cleanser.schema.file.path");
mapper = new ObjectMapper();
cleanserSchema = mapper.readValue(is, ProcessorAttributeSchema.class);
for (int i : cleanserSchema.getAttributeOrdinals()) {
ProcessorAttribute attr = cleanserSchema.findAttributeByOrdinal(i);
if (attr.isInteger() || attr.isDouble()) {
normalizers.put(i, attr.getNormalizerStrategy());
}
}
} else {
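				//per attribute normalization strategy as "ordinal:strategy" pairs,
				//e.g. "1:zScore,3:minMax" (names other than zScore are illustrative; they all get min-max scaling)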
				String normalizerList = config.get("nan.attr.normalizer.list");
				if (null == normalizerList) {
					throw new IllegalArgumentException("normalizer list missing");
				}
				for (String item : normalizerList.split(",")) {
String[] parts = item.split(":");
normalizers.put(Integer.parseInt(parts[0]), parts[1]);
}
}
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
items = value.toString().split(fieldDelimRegex, -1);
stBld.delete(0, stBld.length());
boolean toInclude = true;
for (int i = 0; i < items.length; ++i) {
toInclude = true;
itemValue = items[i];
attr = schema.findAttributeByOrdinal(i);
stats = attrStats.get(i);
if (attr.isInteger()) {
intFieldValue = Integer.parseInt(itemValue);
					//null safe; any strategy other than zScore falls through to min-max scaling
					if ("zScore".equals(normalizers.get(i))) {
normFieldValue = (intFieldValue - stats.getMean()) / stats.getStdDev();
if (isOutlier()) {
toInclude = false;
break;
}
} else {
normFieldValue = (intFieldValue - stats.getMin()) / (stats.getMax() - stats.getMin());
}
if (scale > 0) {
stBld.append((int)(scale * normFieldValue)).append(fieldDelimOut);
} else {
stBld.append(String.format(decFormat, normFieldValue)).append(fieldDelimOut);
}
				} else if (attr.isDouble()) {
					dblFieldValue = Double.parseDouble(itemValue);
					//null safe; any strategy other than zScore falls through to min-max scaling
					if ("zScore".equals(normalizers.get(i))) {
normFieldValue = (dblFieldValue - stats.getMean()) / stats.getStdDev();
if (isOutlier()) {
toInclude = false;
break;
}
} else {
normFieldValue = (dblFieldValue - stats.getMin()) / (stats.getMax() - stats.getMin());
}
if (scale > 0) {
stBld.append(String.format(decFormat, scale * normFieldValue)).append(fieldDelimOut);
} else {
stBld.append(String.format(decFormat, normFieldValue)).append(fieldDelimOut);
}
} else {
stBld.append(itemValue).append(fieldDelimOut);
}
}
if (toInclude) {
				//drop the trailing field delimiter
				outVal.set(stBld.substring(0, stBld.length() - 1));
context.write(NullWritable.get(), outVal);
}
}
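		/*
		 * Illustrative walk through with made up numbers (all values hypothetical):
		 * given the record "john,35,75000", with field 1 zScore normalized against
		 * mean 40.0 and std dev 10.0 and field 2 min-max normalized against
		 * min 25000.0 and max 125000.0, the emitted record is "john,-0.500,0.500"
		 * at the default precision of 3 with no scale configured.
		 */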
		/**
		 * Checks the current normalized value against the outlier truncation level.
		 * @return true if truncation is enabled and the absolute normalized value exceeds it
		 */
private boolean isOutlier() {
boolean outlier = false;
if (outlierTruncationLevel > 0) {
if (Math.abs(normFieldValue) > outlierTruncationLevel) {
outlier = true;
}
}
return outlier;
}
}
	/**
	 * Runs the job through ToolRunner; e.g. (jar name and paths are placeholders)
	 * hadoop jar <chombo jar> org.chombo.mr.NumericalAttrNormalizer <input path> <output path>
	 * @param args first argument is the input path, second the output path
	 */
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new NumericalAttrNormalizer(), args);
System.exit(exitCode);
}
}