/*
 * avenir: Predictive analytic based on Hadoop Map Reduce
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.avenir.bayesian;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.chombo.mr.FeatureField;
import org.chombo.util.FeatureSchema;
import org.chombo.util.Triplet;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Calculates all distributions for bayesian classifier
 * @author pranab
 *
 */
public class BayesianDistribution extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "prior and posterior distribution MR";
        job.setJobName(jobName);

        job.setJarByClass(BayesianDistribution.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        Utility.setConfiguration(job.getConfiguration(), "avenir");
        job.setMapperClass(BayesianDistribution.DistributionMapper.class);
        job.setReducerClass(BayesianDistribution.DistributionReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Tuple.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

        int status = job.waitForCompletion(true) ? 0 : 1;
        return status;
    }
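    /*
     * Map output layout (a summary of what DistributionMapper below emits):
     *   categorical or bucketed numerical feature : key = (classValue, featureOrdinal, bin), value = (1)
     *   unbucketed numerical feature              : key = (classValue, featureOrdinal),      value = (1, val, val * val)
     *   text input                                : key = (classValue, 1, token),            value = (1)
     */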
    /**
     * Mapper for feature distribution
     * @author pranab
     *
     */
    public static class DistributionMapper extends Mapper<LongWritable, Text, Tuple, Tuple> {
        private String[] items;
        private Tuple outKey = new Tuple();
        private Tuple outVal = new Tuple();
        private String fieldDelimRegex;
        private FeatureSchema schema;
        private List<FeatureField> fields;
        private FeatureField classAttrField;
        private String classAttrVal;
        private String featureAttrVal;
        private Integer featureAttrOrdinal;
        private String featureAttrBin;
        private int bin;
        private boolean tabularInput;
        private Analyzer analyzer;
        private long val;
        private long valSq;
        private final int ONE = 1;

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            fieldDelimRegex = config.get("field.delim.regex", ",");
            tabularInput = config.getBoolean("bad.tabular.input", true);

            if (tabularInput) {
                //tabular input
                InputStream fs = Utility.getFileStream(context.getConfiguration(), "bad.feature.schema.file.path");
                ObjectMapper mapper = new ObjectMapper();
                schema = mapper.readValue(fs, FeatureSchema.class);

                //class attribute field
                classAttrField = schema.findClassAttrField();
                fields = schema.getFields();
            } else {
                //text input
                analyzer = new StandardAnalyzer(Version.LUCENE_44);
                featureAttrOrdinal = 1;
                outVal.initialize();
                outVal.add(ONE);
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (tabularInput) {
                items = value.toString().split(fieldDelimRegex);
                classAttrVal = items[classAttrField.getOrdinal()];
                for (FeatureField field : fields) {
                    if (field.isFeature()) {
                        boolean binned = true;
                        featureAttrVal = items[field.getOrdinal()];
                        featureAttrOrdinal = field.getOrdinal();
                        if (field.isCategorical()) {
                            featureAttrBin = featureAttrVal;
                        } else {
                            if (field.isBucketWidthDefined()) {
                                bin = Integer.parseInt(featureAttrVal) / field.getBucketWidth();
                                featureAttrBin = "" + bin;
                            } else {
                                binned = false;
                                val = Integer.parseInt(featureAttrVal);
                                valSq = val * val;
                            }
                        }
                        outKey.initialize();
                        outVal.initialize();
                        if (binned) {
                            //1. class attribute value 2. feature attribute ordinal 3. feature attribute bin
                            outKey.add(classAttrVal, featureAttrOrdinal, featureAttrBin);
                            outVal.add(ONE);
                        } else {
                            //1. class attribute value 2. feature attribute ordinal
                            outKey.add(classAttrVal, featureAttrOrdinal);
                            outVal.add(ONE, val, valSq);
                        }
                        context.write(outKey, outVal);
                    }
                }
            } else {
                mapText(value, context);
            }
        }

        /**
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        private void mapText(Text value, Context context) throws IOException, InterruptedException {
            items = value.toString().split(fieldDelimRegex);
            classAttrVal = items[1];
            List<String> tokens = Utility.tokenize(items[0], analyzer);
            for (String token : tokens) {
                outKey.initialize();
                outKey.add(classAttrVal, featureAttrOrdinal, token);
                context.write(outKey, outVal);
            }
        }
    }
    /**
     * Reducer for feature distribution
     * @author pranab
     *
     */
    public static class DistributionReducer extends Reducer<Tuple, Tuple, NullWritable, Text> {
        private Text outVal = new Text();
        private String fieldDelim;
        private FeatureSchema schema;
        private List<FeatureField> fields;
        private int count;
        private StringBuilder stBld = new StringBuilder();
        private long valSum;
        private long valSqSum;
        private long featurePosteriorMean;
        private long featurePosteriorStdDev;
        private Map<Integer, Triplet<Integer, Long, Long>> featurePriorDistr = new HashMap<Integer, Triplet<Integer, Long, Long>>();
        private Integer featureOrd;
        private boolean tabularInput;
        private boolean binned;
        private String classAttrValue;
        private FeatureField field;

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            fieldDelim = context.getConfiguration().get("field.delim.out", ",");
            tabularInput = config.getBoolean("bad.tabular.input", true);

            //tabular input
            if (tabularInput) {
                InputStream fs = Utility.getFileStream(context.getConfiguration(), "bad.feature.schema.file.path");
                ObjectMapper mapper = new ObjectMapper();
                schema = mapper.readValue(fs, FeatureSchema.class);
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#cleanup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void cleanup(Context context) throws IOException, InterruptedException {
            //emit feature prior probability parameters for numerical continuous variables
            for (int featureOrd : featurePriorDistr.keySet()) {
                context.getCounter("Distribution Data", "Feature prior cont ").increment(1);
                Triplet<Integer, Long, Long> distr = featurePriorDistr.get(featureOrd);
                count = distr.getLeft();
                valSum = distr.getCenter();
                valSqSum = distr.getRight();
                long mean = valSum / count;
                double temp = valSqSum - count * mean * mean;
                long stdDev = (long)(Math.sqrt(temp / (count - 1)));

                stBld.delete(0, stBld.length());
                stBld.append(fieldDelim).append(featureOrd).append(fieldDelim).append(fieldDelim).append(mean).
                    append(fieldDelim).append(stdDev);
                outVal.set(stBld.toString());
                context.write(NullWritable.get(), outVal);
            }
        }
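        /*
         * Note: for unbinned numerical features, mean and standard deviation are derived from the
         * accumulated count, sum and sum of squares using the shortcut formula
         *   mean   = sum / count                                  (long arithmetic, as in the code)
         *   stdDev = sqrt((sumSq - count * mean^2) / (count - 1))
         * reduce() applies it per class attribute value (posterior); cleanup() applies it to the
         * totals pooled across all class values (prior).
         */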
        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
                throws IOException, InterruptedException {
            count = 0;
            classAttrValue = key.getString(0);
            featureOrd = key.getInt(1);
            //schema is loaded only for tabular input; guard against NPE for text input
            field = tabularInput ? schema.findFieldByOrdinal(featureOrd) : null;
            binned = !tabularInput || field.isCategorical() || field.isBucketWidthDefined();
            if (!binned) {
                valSum = valSqSum = 0;
            }
            for (Tuple val : values) {
                count += val.getInt(0);
                if (!binned) {
                    valSum += val.getLong(1);
                    valSqSum += val.getLong(2);
                }
            }
            if (!binned) {
                featurePosteriorMean = valSum / count;
                double temp = valSqSum - count * featurePosteriorMean * featurePosteriorMean;
                featurePosteriorStdDev = (long)(Math.sqrt(temp / (count - 1)));

                //collect feature prior values across all class attribute values
                Triplet<Integer, Long, Long> distr = featurePriorDistr.get(featureOrd);
                if (null == distr) {
                    distr = new Triplet<Integer, Long, Long>(count, valSum, valSqSum);
                    featurePriorDistr.put(featureOrd, distr);
                } else {
                    distr.setLeft(distr.getLeft() + count);
                    distr.setCenter(distr.getCenter() + valSum);
                    distr.setRight(distr.getRight() + valSqSum);
                }
            }

            //emit feature posterior
            stBld.delete(0, stBld.length());
            if (binned) {
                context.getCounter("Distribution Data", "Feature posterior binned ").increment(1);
                stBld.append(key.toString()).append(fieldDelim).append(count);
            } else {
                context.getCounter("Distribution Data", "Feature posterior cont ").increment(1);
                stBld.append(key.toString()).append(fieldDelim).append(fieldDelim).append(featurePosteriorMean).
                    append(fieldDelim).append(featurePosteriorStdDev);
            }
            outVal.set(stBld.toString());
            context.write(NullWritable.get(), outVal);

            //emit class prior
            context.getCounter("Distribution Data", "Class prior").increment(1);
            stBld.delete(0, stBld.length());
            stBld.append(key.getString(0)).append(fieldDelim).append(fieldDelim).append(fieldDelim).append(count);
            outVal.set(stBld.toString());
            context.write(NullWritable.get(), outVal);

            //feature prior
            if (binned) {
                context.getCounter("Distribution Data", "Feature prior binned ").increment(1);
                stBld.delete(0, stBld.length());
                stBld.append(fieldDelim).append(key.getInt(1)).append(fieldDelim).append(key.getString(2)).
                    append(fieldDelim).append(count);
                outVal.set(stBld.toString());
                context.write(NullWritable.get(), outVal);
            }
        }
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new BayesianDistribution(), args);
        System.exit(exitCode);
    }
}
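/*
 * Invocation sketch (jar name and paths are hypothetical; how the properties reach the job
 * configuration depends on org.chombo.util.Utility.setConfiguration, which lives outside this file):
 *
 *   hadoop jar avenir.jar org.avenir.bayesian.BayesianDistribution /input/bayes /output/bayes/distr
 *
 * Configuration keys read by this job: field.delim.regex, field.delim.out, bad.tabular.input,
 * bad.feature.schema.file.path, num.reducer
 */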