/*
 * chombo: Hadoop Map Reduce utility
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.chombo.mr;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.util.BasicUtils;
import org.chombo.util.Triplet;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;

/**
 * Does data normalization. Can use min-max, zscore, centering, or unit sum
 * normalization. With zscore normalization, outliers can additionally be removed.
 * @author pranab
 *
 */
public class Normalizer extends Configured implements Tool {
    private static final Logger LOG = Logger.getLogger(Normalizer.class);

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "Data normalizer MR";
        job.setJobName(jobName);

        job.setJarByClass(Normalizer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(Normalizer.NormalizerMapper.class);
        job.setReducerClass(Normalizer.NormalizerReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Tuple.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        Utility.setConfiguration(job.getConfiguration());
        int numReducer = job.getConfiguration().getInt("nor.num.reducer", -1);
        numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
        job.setNumReduceTasks(numReducer);

        int status = job.waitForCompletion(true) ? 0 : 1;
        return status;
    }
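
    /*
     * Illustrative record flow (hypothetical data; assumes a comma delimiter,
     * the record ID in field 0, and field 1 configured as numeric):
     *   input row:            R1001,23.5,red
     *   mapper stats output:  key = (0, "stats"), value = (1, count, min, max, sum, sqSum)
     *   mapper record output: key = (1, "R1001"), value = (23.5, red)
     * The (0, ...) stats keys sort ahead of the (1, ...) record keys, which is
     * why the reducer sees the aggregated stats before any data rows.
     */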
    /**
     * Mapper that accumulates per field partial stats and passes records through.
     * @author pranab
     *
     */
    public static class NormalizerMapper extends Mapper<LongWritable, Text, Tuple, Tuple> {
        private Tuple outKey = new Tuple();
        private Tuple outVal = new Tuple();
        private String fieldDelimRegex;
        private String[] items;
        private int[] numAttributes;
        private Map<Integer, Triplet<String, Integer, String>> attributeProperties =
                new HashMap<Integer, Triplet<String, Integer, String>>();
        private static final int ID_ORD = 0;
        private static final String STATS_KEY = "stats";
        private Map<Integer, Stats> fieldStats = new HashMap<Integer, Stats>();
        private int fieldOrd;
        private Stats stats;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            fieldDelimRegex = Utility.getFieldDelimiter(config, "nor.field.delim.regex", "field.delim.regex", ",");
            numAttributes = Utility.assertIntArrayConfigParam(config, "nor.num.attribute.ordinals", fieldDelimRegex,
                    "missing numerical attribute ordinals");
            getAttributeProperties(numAttributes, attributeProperties, config);
            for (int ord : attributeProperties.keySet()) {
                fieldStats.put(ord, new Stats());
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            //the reducer will see the stats first and then the data rows
            outKey.initialize();
            outKey.add(0, STATS_KEY);
            for (int ord : fieldStats.keySet()) {
                outVal.initialize();
                outVal.add(ord);
                fieldStats.get(ord).toTuple(outVal);
                context.write(outKey, outVal);
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            items = value.toString().split(fieldDelimRegex, -1);
            outKey.initialize();
            outKey.add(1, items[ID_ORD]);
            outVal.initialize();
            for (int i = 1; i < items.length; ++i) {
                fieldOrd = i;
                stats = fieldStats.get(fieldOrd);
                if (null != stats) {
                    //numeric
                    stats.add(Double.parseDouble(items[fieldOrd]));
                }
                outVal.add(items[fieldOrd]);
            }
            context.write(outKey, outVal);
        }
    }
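
    /*
     * Worked example of the stats merge (hypothetical values): if one mapper
     * sees {4, 6} it emits (count=2, sum=10, sqSum=52), and another seeing
     * {3, 4, 5} emits (count=3, sum=12, sqSum=50). After Stats.aggregate() the
     * reducer holds (count=5, sum=22, sqSum=102), so Stats.process() yields
     * mean = 22/5 = 4.4 and variance = 102/5 - 4.4 * 4.4 = 1.04.
     */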
    /**
     * Reducer that normalizes numeric field values using the globally aggregated stats.
     * @author pranab
     *
     */
    public static class NormalizerReducer extends Reducer<Tuple, Tuple, NullWritable, Text> {
        private Text outVal = new Text();
        private String fieldDelim;
        private int[] numAttributes;
        private Map<Integer, Triplet<String, Integer, String>> attributeProperties =
                new HashMap<Integer, Triplet<String, Integer, String>>();
        private String normalizingStrategy;
        private float outlierTruncationLevel;
        private Map<Integer, Stats> fieldStats = new HashMap<Integer, Stats>();
        private int fieldOrd;
        private Stats stats;
        private boolean excluded;
        private double normalizedValue;
        private int precision;
        private int ordinal;
        private StringBuilder stBld = new StringBuilder();
        private static final String NORM_MIN_MAX = "minmax";
        private static final String NORM_ZSCORE = "zscore";
        private static final String NORM_CENTER = "center";
        private static final String NORM_UNIT_SUM = "unitSum";

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            if (config.getBoolean("debug.on", false)) {
                LOG.setLevel(Level.DEBUG);
            }
            fieldDelim = config.get("field.delim.out", ",");

            //attribute properties
            numAttributes = Utility.assertIntArrayConfigParam(config, "nor.num.attribute.ordinals",
                    Utility.configDelim, "missing numerical attribute ordinals");
            getAttributeProperties(numAttributes, attributeProperties, config);

            precision = config.getInt("nor.floating.precision", 3);
            normalizingStrategy = config.get("nor.normalizing.strategy", NORM_MIN_MAX);
            outlierTruncationLevel = config.getFloat("nor.outlier.truncation.level", (float)-1.0);

            for (int ord : attributeProperties.keySet()) {
                Triplet<String, Integer, String> attrProp = attributeProperties.get(ord);
                stats = new Stats();
                stats.scale = attrProp.getCenter();
                stats.transformer = attrProp.getRight();
                fieldStats.put(ord, stats);
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
                throws IOException, InterruptedException {
            if (key.getInt(0) == 0) {
                //aggregate stats
                for (Tuple value : values) {
                    fieldOrd = value.getInt(0);
                    stats = new Stats();
                    stats.fromTuple(value);
                    fieldStats.get(fieldOrd).aggregate(stats);
                }

                //process stats
                for (int ord : fieldStats.keySet()) {
                    Stats stats = fieldStats.get(ord);
                    stats.process();
                }
            } else {
                //records
                for (Tuple value : values) {
                    stBld.delete(0, stBld.length());
                    stBld.append(key.getString(1));
                    excluded = false;
                    for (int i = 0; i < value.getSize(); ++i) {
                        excluded = false;
                        ordinal = i + 1;
                        stats = fieldStats.get(ordinal);
                        if (null != stats) {
                            //numeric
                            normalize(Double.parseDouble(value.getString(i)), stats);
                            if (excluded) {
                                break;
                            } else {
                                stBld.append(fieldDelim).append(formattedTypedValue(ordinal));
                            }
                        } else {
                            //other types
                            stBld.append(fieldDelim).append(value.getString(i));
                        }
                    }
                    if (!excluded) {
                        outVal.set(stBld.toString());
                        context.write(NullWritable.get(), outVal);
                    }
                }
            }
        }

        /**
         * Transforms and normalizes a field value, setting normalizedValue and
         * the excluded flag as side effects.
         * @param value field value
         * @param stats field stats
         */
        private void normalize(double value, Stats stats) {
            //transform
            value = stats.transform(value);

            //normalize
            normalizedValue = 0;
            if (normalizingStrategy.equals(NORM_MIN_MAX)) {
                normalizedValue = ((value - stats.min) * stats.scale) / stats.range;
            } else if (normalizingStrategy.equals(NORM_CENTER)) {
                normalizedValue = (value - stats.mean) * stats.scale;
            } else if (normalizingStrategy.equals(NORM_UNIT_SUM)) {
                normalizedValue = (value / stats.sum) * stats.scale;
            } else if (normalizingStrategy.equals(NORM_ZSCORE)) {
                if (stats.gotTransformer()) {
                    throw new IllegalStateException("can not apply zscore normalizer when data is transformed");
                }
                double temp = (value - stats.mean) / stats.stdDev;
                if (outlierTruncationLevel > 0) {
                    if (Math.abs(temp) > outlierTruncationLevel) {
                        excluded = true;
                    } else {
                        //keep bounded between -.5 * scale and .5 * scale
                        temp /= outlierTruncationLevel;
                    }
                }
                normalizedValue = temp * stats.scale / 2;
            } else {
                throw new IllegalStateException("invalid normalization strategy");
            }
        }

        /**
         * Formats the normalized value according to the field's data type.
         * @param ord field ordinal
         * @return formatted value
         */
        private String formattedTypedValue(int ord) {
            String value = null;
            Triplet<String, Integer, String> attrProp = attributeProperties.get(ord);
            String dataType = attrProp.getLeft();
            if (dataType.equals("int")) {
                int iValue = (int)normalizedValue;
                value = "" + iValue;
            } else if (dataType.equals("long")) {
                long lValue = (long)normalizedValue;
                value = "" + lValue;
            } else if (dataType.equals("double")) {
                value = BasicUtils.formatDouble(normalizedValue, precision);
            } else {
                throw new IllegalStateException("invalid numeric data type");
            }
            return value;
        }
    }
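
    /*
     * Example attribute property configuration (hypothetical values; the format
     * is dataType,scale[,transformer] as parsed by getAttributeProperties below,
     * with the transformer defaulting to "none" when omitted):
     *
     *   nor.num.attribute.ordinals=1,3
     *   nor.attribute.prop.1=double,1
     *   nor.attribute.prop.3=int,100,additiveInverse
     */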
    /**
     * Loads per attribute properties: data type, scale, and optional transformer.
     * @param numAttributes numeric attribute ordinals
     * @param attributeProperties map to populate
     * @param config
     */
    private static void getAttributeProperties(int[] numAttributes,
            Map<Integer, Triplet<String, Integer, String>> attributeProperties, Configuration config) {
        for (int i : numAttributes) {
            String key = "nor.attribute.prop." + i;
            String value = config.get(key);
            if (null == value) {
                throw new IllegalStateException("missing attribute properties");
            }
            String[] parts = value.split(Utility.configDelim);
            Triplet<String, Integer, String> attributeProp = null;
            if (parts.length == 2) {
                //data type, scale
                attributeProp = new Triplet<String, Integer, String>(parts[0], Integer.parseInt(parts[1]), "none");
            } else if (parts.length == 3) {
                //data type, scale, transformer
                attributeProp = new Triplet<String, Integer, String>(parts[0], Integer.parseInt(parts[1]), parts[2]);
            } else {
                throw new IllegalStateException("invalid attribute properties format");
            }
            attributeProperties.put(i, attributeProp);
        }
    }

    /**
     * Running stats for one numeric field.
     * @author pranab
     *
     */
    private static class Stats {
        private int count = 0;
        private double min = Double.MAX_VALUE;
        //start at the most negative double so any value, including a negative one, updates max
        private double max = -Double.MAX_VALUE;
        private double sum = 0;
        private double sqSum = 0;
        private double mean;
        private double range;
        private double stdDev;
        private int scale;
        private String transformer;

        /**
         * Accumulates one value.
         * @param val
         */
        private void add(double val) {
            ++count;
            if (val < min) {
                min = val;
            }
            if (val > max) {
                max = val;
            }
            sum += val;
            sqSum += val * val;
        }

        /**
         * Serializes partial stats into a tuple.
         * @param tuple
         */
        private void toTuple(Tuple tuple) {
            tuple.add(count, min, max, sum, sqSum);
        }

        /**
         * Deserializes partial stats; field 0 of the tuple holds the attribute ordinal.
         * @param tuple
         */
        private void fromTuple(Tuple tuple) {
            count = tuple.getInt(1);
            min = tuple.getDouble(2);
            max = tuple.getDouble(3);
            sum = tuple.getDouble(4);
            sqSum = tuple.getDouble(5);
        }

        /**
         * Merges another partial stats object into this one.
         * @param that
         */
        private void aggregate(Stats that) {
            count += that.count;
            if (that.min < min) {
                min = that.min;
            }
            if (that.max > max) {
                max = that.max;
            }
            sum += that.sum;
            sqSum += that.sqSum;
        }

        /**
         * Computes derived stats from the aggregated partials.
         * @return this
         */
        private Stats process() {
            mean = sum / count;
            range = max - min;
            double temp = sqSum / count - mean * mean;
            stdDev = Math.sqrt(temp);
            if (transformer.equals("multiplicativeInverse")) {
                //update min, max and range
                double tempMin = min;
                min = 1.0 / max;
                max = 1.0 / tempMin;
                range = max - min;
            }
            return this;
        }

        /**
         * Applies the configured transformer to a value.
         * @param value
         * @return transformed value
         */
        private double transform(double value) {
            double newValue = 0;
            if (transformer.equals("additiveInverse")) {
                newValue = max - value;
            } else if (transformer.equals("multiplicativeInverse")) {
                newValue = 1.0 / value;
            } else if (transformer.equals("none")) {
                newValue = value;
            } else {
                throw new IllegalStateException("invalid data transformer");
            }
            return newValue;
        }

        /**
         * @return true if a transformer other than none is configured
         */
        private boolean gotTransformer() {
            return !transformer.equals("none");
        }
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Normalizer(), args);
        System.exit(exitCode);
    }
}
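
/*
 * Illustrative invocation (a sketch: the jar name, paths, and inline -D
 * parameters are assumptions; the nor.* settings may instead come from a
 * properties file loaded through Utility.setConfiguration):
 *
 *   hadoop jar chombo.jar org.chombo.mr.Normalizer \
 *       -D nor.num.attribute.ordinals=1,3 \
 *       -D nor.normalizing.strategy=zscore \
 *       -D nor.outlier.truncation.level=3.0 \
 *       /input/dir /output/dir
 */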