/*
 * chombo: Hadoop Map Reduce utility
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.chombo.mr;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.redis.RedisCache;
import org.chombo.util.Pair;
import org.chombo.util.SecondarySort;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;

/**
 * Computes the weighted average of a set of numerical attributes and then sorts records by the
 * average value in ascending or descending order. Can optionally scale field values, fetching
 * max values from a cache.
 * @author pranab
 *
 */
public class WeightedAverage extends Configured implements Tool {

	@Override
	public int run(String[] args) throws Exception {
		Job job = new Job(getConf());
		String jobName = "Weighted average calculating MR";
		job.setJobName(jobName);

		job.setJarByClass(WeightedAverage.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		job.setMapperClass(WeightedAverage.AverageMapper.class);
		job.setReducerClass(WeightedAverage.AverageReducer.class);

		job.setMapOutputKeyClass(Tuple.class);
		job.setMapOutputValueClass(Tuple.class);

		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);

		Utility.setConfiguration(job.getConfiguration());
		if (job.getConfiguration().getInt("wea.group.by.field", -1) >= 0) {
			//group by
			job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
			job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);
		}

		int numReducer = job.getConfiguration().getInt("wea.num.reducer", -1);
		numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
		job.setNumReduceTasks(numReducer);

		int status = job.waitForCompletion(true) ? 0 : 1;
		return status;
	}
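	/*
	 * Illustrative configuration (a sketch; the parameter names are the ones read by this
	 * job, the values below are hypothetical):
	 *
	 *   wea.field.weights=2:3,3:1         field ordinal : weight pairs
	 *   wea.key.fields=0,1                fields copied into the output
	 *   wea.group.by.field=4              optional; enables secondary sort within a group
	 *   wea.sort.order.ascending=false
	 *   wea.field.scale=100
	 *   wea.inverted.fields=3             fields where a lower raw value is better
	 *   wea.scaling.needed=true
	 *   wea.attribute.max.values=...      per field max values, when not read from cache
	 *   wea.num.reducer=2
	 */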
	/**
	 * @author pranab
	 *
	 */
	public static class AverageMapper extends Mapper<LongWritable, Text, Tuple, Tuple> {
		private Tuple outKey = new Tuple();
		private Tuple outVal = new Tuple();
		private String fieldDelimRegex;
		private boolean sortOrderAscending;
		private int groupByField;
		private String[] items;
		private List<Pair<Integer, Integer>> fieldWeights;
		private double weightedValue;
		private double sum;
		private int totalWt = 0;
		private int[] invertedFields;
		private double fieldValue;
		private int scale;
		private RedisCache redisCache;
		private Map<Integer, Integer> fieldMaxValues = new HashMap<Integer, Integer>();
		private boolean singleTenant;
		private int fieldOrd;
		private int[] suppressingFields;
		private long secondaryKey;
		private int[] keyFields;
		private Integer maxValue;
		private boolean scalingNeeded;
		private boolean maxValueFromCache;
		private static final Logger LOG = Logger.getLogger(WeightedAverage.AverageMapper.class);

		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
		 */
		protected void setup(Context context) throws IOException, InterruptedException {
			Configuration config = context.getConfiguration();
			if (config.getBoolean("debug.on", false)) {
				LOG.setLevel(Level.DEBUG);
				System.out.println("turned debug on");
			}
			fieldDelimRegex = Utility.getFieldDelimiter(config, "wea.field.delim.regex", "field.delim.regex", ",");
			sortOrderAscending = config.getBoolean("wea.sort.order.ascending", true);
			scale = config.getInt("wea.field.scale", 100);
			keyFields = Utility.intArrayFromString(config.get("wea.key.fields"));
			groupByField = config.getInt("wea.group.by.field", -1);
			LOG.debug("keyFields:" + Arrays.toString(keyFields) + " groupByField:" + groupByField);

			//field weights
			String fieldWeightsStr = config.get("wea.field.weights");
			fieldWeights = Utility.getIntPairList(fieldWeightsStr, ",", ":");
			for (Pair<Integer, Integer> pair : fieldWeights) {
				totalWt += pair.getRight();
				LOG.debug("field:" + pair.getLeft() + " weight:" + pair.getRight());
			}

			//inverted fields
			String invertedFieldsStr = config.get("wea.inverted.fields");
			if (!Utility.isBlank(invertedFieldsStr)) {
				invertedFields = Utility.intArrayFromString(invertedFieldsStr);
			}

			//suppressing fields
			String suppressingFieldsStr = config.get("wea.suppressing.fields");
			if (!Utility.isBlank(suppressingFieldsStr)) {
				suppressingFields = Utility.intArrayFromString(suppressingFieldsStr);
			}

			//scaling
			scalingNeeded = config.getBoolean("wea.scaling.needed", false);
			if (scalingNeeded) {
				maxValueFromCache = config.getBoolean("wea.max.value.from.cache", false);
				if (maxValueFromCache) {
					//field max values from cache
					String fieldMaxValuesCacheKey = config.get("wea.field.max.values.cache.key");
					List<Pair<Integer, String>> fieldMaxValueKeys = Utility.getIntStringList(fieldMaxValuesCacheKey, ",", ":");
					String redisHost = config.get("wea.redis.server.host", "localhost");
					int redisPort = config.getInt("wea.redis.server.port", 6379);
					String defaultOrgId = config.get("wea.default.org.id");
					singleTenant = false;
					if (!StringUtils.isBlank(defaultOrgId)) {
						//default org
						singleTenant = true;
						String cacheName = "si-" + defaultOrgId;
						redisCache = new RedisCache(redisHost, redisPort, cacheName);
						for (Pair<Integer, String> pair : fieldMaxValueKeys) {
							int maxValue = redisCache.getIntMax(pair.getRight());
							fieldMaxValues.put(pair.getLeft(), maxValue);
							LOG.debug("field:" + pair.getLeft() + " max value:" + maxValue);
						}
					} else {
						//multi organization
					}
				} else {
					//from configuration
					singleTenant = true;
					fieldMaxValues = Utility.assertIntegerIntegerMapConfigParam(config, "wea.attribute.max.values",
							Utility.configDelim, Utility.configSubFieldDelim, "missing max values for scaling", false);
				}
			}
		}
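		/*
		 * The weighted average computed per record below is
		 * sum(weight_i * value_i) / sum(weight_i), where value_i is optionally scaled to
		 * (value_i * scale) / maxValue_i and optionally inverted to scale - value_i.
		 * A hypothetical worked example: fields 2 and 3 with weights 3 and 1 and (already
		 * scaled) values 80 and 40 give (3 * 80 + 1 * 40) / (3 + 1) = 70.
		 */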
		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			items = value.toString().split(fieldDelimRegex, -1);
			sum = 0;
			for (Pair<Integer, Integer> pair : fieldWeights) {
				fieldOrd = pair.getLeft();
				fieldValue = Double.parseDouble(items[fieldOrd]);

				//if suppressing field and value is 0 then skip the record
				if (null != suppressingFields && ArrayUtils.contains(suppressingFields, fieldOrd) &&
						items[fieldOrd].equals("0")) {
					context.getCounter("Record stat", "Suppressed").increment(1);
					return;
				}

				//scale field value if needed
				if (scalingNeeded) {
					if (singleTenant) {
						maxValue = fieldMaxValues.get(fieldOrd);
						if (null != maxValue) {
							fieldValue = (fieldValue * scale) / maxValue;
						}
					} else {
						//multi organization
					}
				}

				//invert if needed
				if (null != invertedFields && ArrayUtils.contains(invertedFields, fieldOrd)) {
					fieldValue = scale - fieldValue;
				}
				sum += fieldValue * pair.getRight();
			}
			weightedValue = sum / totalWt;
			weightedValue = weightedValue < 0 ? 0 : weightedValue;

			//key
			outKey.initialize();
			long wtVal = (long)(weightedValue * 1000);
			secondaryKey = sortOrderAscending ? wtVal : Long.MAX_VALUE - wtVal;
			if (groupByField >= 0) {
				//secondary sorting by weight
				outKey.add(items[groupByField], secondaryKey);
			} else {
				//primary sorting by weight
				outKey.add(secondaryKey);
			}

			//value
			outVal.initialize();
			if (null != keyFields) {
				outVal.addFromArray(items, keyFields);
			}
			outVal.add(weightedValue);

			context.write(outKey, outVal);
		}
	}

	/**
	 * @author pranab
	 *
	 */
	public static class AverageReducer extends Reducer<Tuple, Tuple, NullWritable, Text> {
		private Text outVal = new Text();
		private String fieldDelim;
		private boolean outputAsFloat;
		private int precision;
		private double weightedValue;
		private int keyFieldsLength;
		private int groupByField;
		private boolean outputGroupByField;
		private StringBuilder stBld = new StringBuilder();

		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		protected void setup(Context context) throws IOException, InterruptedException {
			Configuration config = context.getConfiguration();
			fieldDelim = config.get("field.delim.out", ",");
			//key fields are optional in the mapper; guard against a missing configuration here as well
			int[] keyFields = Utility.intArrayFromString(config.get("wea.key.fields"));
			keyFieldsLength = null != keyFields ? keyFields.length : 0;
			outputAsFloat = config.getBoolean("wea.output.as.float", true);
			if (outputAsFloat) {
				precision = config.getInt("wea.output.precision", 3);
			}
			groupByField = config.getInt("wea.group.by.field", -1);
			outputGroupByField = config.getBoolean("wea.output.group.by.field", false);
		}

		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
				throws IOException, InterruptedException {
			for (Tuple value : values) {
				stBld.delete(0, stBld.length());
				if (outputGroupByField && groupByField >= 0) {
					stBld.append(key.getString(0)).append(fieldDelim);
				}
				stBld.append(value.toString(0, keyFieldsLength));
				weightedValue = value.getDouble(keyFieldsLength);
				if (outputAsFloat) {
					stBld.append(fieldDelim).append(Utility.formatDouble(weightedValue, precision));
				} else {
					stBld.append(fieldDelim).append("" + (long)weightedValue);
				}
				outVal.set(stBld.toString());
				context.write(NullWritable.get(), outVal);
			}
		}
	}
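	/*
	 * A hypothetical output line, with wea.output.group.by.field=true, two key fields and
	 * wea.output.precision=3: "grpA,custId1,prodId1,70.000" (group by value, then the key
	 * fields, then the weighted average).
	 */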
	/**
	 * @param args
	 */
	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(new WeightedAverage(), args);
		System.exit(exitCode);
	}
}
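/*
 * Typical invocation (a sketch; the jar name and paths are illustrative):
 *
 *   hadoop jar chombo-1.0.jar org.chombo.mr.WeightedAverage \
 *       -conf weighted_average.properties /input/path /output/path
 *
 * args[0] is the input path and args[1] the output path; the wea.* parameters are read
 * from the job configuration.
 */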