/*
 * avenir: Predictive analytic based on Hadoop Map Reduce
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.avenir.tree;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.avenir.util.AttributeSplitHandler;
import org.chombo.mr.FeatureField;
import org.chombo.util.FeatureSchema;
import org.chombo.util.SecondarySort;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Partitions data based on a split selected among some candidate splits
 * generated from the parent node and corresponding data
 * @author pranab
 *
 */
public class DataPartitioner extends Configured implements Tool {
	private static final Logger LOG = Logger.getLogger(DataPartitioner.class);
	private boolean debugOn;

	@Override
	public int run(String[] args) throws Exception {
		Job job = new Job(getConf());
		String jobName = "Partitions data by some split";
		job.setJobName(jobName);

		job.setJarByClass(DataPartitioner.class);

		Utility.setConfiguration(job.getConfiguration(), "avenir");
		debugOn = job.getConfiguration().getBoolean("debug.on", false);
		if (debugOn) {
			LOG.setLevel(Level.DEBUG);
		}

		job.setMapperClass(DataPartitioner.PartitionerMapper.class);
		job.setReducerClass(DataPartitioner.PartitionerReducer.class);

		//find best split and create output path
		String inPath = getNodePath(job);
		if (debugOn)
			System.out.println("inPath:" + inPath);
		Split split = findBestSplitKey(job, inPath);
		String outPath = inPath + "/" + "split=" + split.getIndex();
		if (debugOn)
			System.out.println("outPath:" + outPath);

		FileInputFormat.addInputPath(job, new Path(inPath));
		FileOutputFormat.setOutputPath(job, new Path(outPath));

		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);

		job.setPartitionerClass(SecondarySort.RawIntKeyTextPartitioner.class);

		//one reducer per split segment, so each reducer output file holds one segment
		int numReducers = split.getSegmentCount();
		if (debugOn)
			System.out.println("numReducers:" + numReducers);
		job.setNumReduceTasks(numReducers);

		int status = job.waitForCompletion(true) ? 0 : 1;

		//move output to segment directories
		if (status == 0) {
			moveOutputToSegmentDir(outPath, split.getSegmentCount(), job.getConfiguration());
		}
		return status;
	}
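	/*
	 * Resulting directory layout, as built by run(), getNodePath() and moveOutputToSegmentDir().
	 * The base path below is illustrative (hypothetical value of dap.project.base.path):
	 *
	 *   /projects/tree/split=root/data                                  node data (input)
	 *   /projects/tree/split=root/data/split=<k>                        job output for chosen split k
	 *   /projects/tree/split=root/data/split=<k>/segment=<i>/data/partition.txt   rows in segment i
	 *
	 * A segment data directory can then be referenced through dap.split.path in a
	 * subsequent invocation to partition the next level of the tree.
	 */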
	/**
	 * Moves reducer output files into per segment directories
	 * @param outPath job output path
	 * @param segmentCount number of split segments
	 * @param conf
	 * @throws IOException
	 */
	private void moveOutputToSegmentDir(String outPath, int segmentCount, Configuration conf) throws IOException {
		FileSystem fileSystem = FileSystem.get(conf);
		for (int i = 0; i < segmentCount; ++i) {
			//create segment dir
			String dir = outPath + "/segment=" + i + "/data";
			Path segmentPath = new Path(dir);
			fileSystem.mkdirs(segmentPath);

			//move reducer output for this segment (reducer i writes part-r-<i zero padded to 5 digits>)
			Path srcFile = new Path(String.format("%s/part-r-%05d", outPath, i));
			Path dstFile = new Path(outPath + "/segment=" + i + "/data/partition.txt");
			fileSystem.rename(srcFile, dstFile);
		}
		fileSystem.close();
	}

	/**
	 * Builds path of the data directory for the current node
	 * @param job
	 * @return node data path
	 */
	private String getNodePath(Job job) {
		String nodePath = null;
		Configuration conf = job.getConfiguration();
		String basePath = conf.get("dap.project.base.path");
		if (Utility.isBlank(basePath)) {
			throw new IllegalStateException("base path not defined");
		}
		String splitPath = conf.get("dap.split.path");
		if (debugOn)
			System.out.println("basePath:" + basePath + " splitPath:" + splitPath);
		nodePath = Utility.isBlank(splitPath) ? basePath + "/split=root/data" :
			basePath + "/split=root/data/" + splitPath;
		return nodePath;
	}

	/**
	 * Finds best split according to chosen strategy
	 * @param job
	 * @param inputPath node data path
	 * @return selected split
	 * @throws IOException
	 */
	private Split findBestSplitKey(Job job, String inputPath) throws IOException {
		String splitKey = null;
		Configuration conf = job.getConfiguration();
		String splitSelectionStrategy = conf.get("dap.split.selection.strategy", "best");
		String candidateSplitsPath = Utility.getSiblingPath(inputPath, "splits/part-r-00000");
		if (debugOn)
			System.out.println("candidateSplitsPath:" + candidateSplitsPath);
		conf.set("dap.candidate.splits.path", candidateSplitsPath);
		List<String> lines = Utility.getFileLines(conf, "dap.candidate.splits.path");

		//create split objects
		Split[] splits = new Split[lines.size()];
		int i = 0;
		for (String line : lines) {
			splits[i] = new Split(line, i);
			++i;
		}
		//sort splits
		Arrays.sort(splits);

		//find split index
		int splitIndex = 0;
		if (splitSelectionStrategy.equals("best")) {
			//splits are sorted in descending order of the split metric, so the best split is the first one
		} else if (splitSelectionStrategy.equals("randomFromTop")) {
			//pick randomly from among the top splits, guarding against fewer candidates than configured
			int numSplits = Math.min(conf.getInt("dap.num.top.splits", 5), splits.length);
			splitIndex = (int)(Math.random() * numSplits);
		}
		Split split = splits[splitIndex];

		//set split attribute ordinal and split key
		int splitAttribute = split.getAttributeOrdinal();
		conf.setInt("dap.split.attribute", splitAttribute);
		if (debugOn)
			System.out.println("splitAttribute:" + splitAttribute);
		splitKey = split.getSplitKey();
		if (debugOn)
			System.out.println("splitKey:" + splitKey);
		conf.set("dap.split.key", splitKey);
		return split;
	}

	/**
	 * Sortable split
	 * @author pranab
	 *
	 */
	private static class Split implements Comparable<Split> {
		private String line;
		private int index;
		private String[] items;

		public Split(String line, int index) {
			this.line = line;
			this.index = index;
			items = line.split(";");
		}

		@Override
		public int compareTo(Split that) {
			double thisVal = Double.parseDouble(items[2]);
			double thatVal = Double.parseDouble(that.items[2]);
			//descending order
			return thisVal > thatVal ? -1 : (thisVal < thatVal ? 1 : 0);
		}
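		/*
		 * Each candidate split line is expected to be semicolon delimited:
		 * attribute ordinal, split key and split metric value, with the split key
		 * holding one ':' separated token per segment. An illustrative (hypothetical)
		 * categorical example: "1;[a, b]:[c];0.25" describes a two segment split on
		 * attribute 1 with metric value 0.25.
		 */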
		/**
		 * Split key
		 * @return
		 */
		public String getSplitKey() {
			return items[1];
		}

		/**
		 * Split key normalized for use in file system paths
		 * @return
		 */
		public String getNormalizedSplitKey() {
			String key = items[1].replaceAll("\\s+", "");
			key = key.replaceAll("\\[", "");
			key = key.replaceAll("\\]", "");
			key = key.replaceAll(":", "-");
			return key;
		}

		/**
		 * Split attribute ordinal
		 * @return
		 */
		public int getAttributeOrdinal() {
			return Integer.parseInt(items[0]);
		}

		/**
		 * Number of segments in the split
		 * @return
		 */
		public int getSegmentCount() {
			String[] segments = items[1].split(":");
			return segments.length;
		}

		public String getLine() {
			return line;
		}

		public int getIndex() {
			return index;
		}
	}

	/**
	 * Maps each record to the index of the split segment its attribute value falls into
	 * @author pranab
	 *
	 */
	public static class PartitionerMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
		private String fieldDelimRegex;
		private String[] items;
		private IntWritable outKey = new IntWritable();
		private Text outVal = new Text();
		private FeatureSchema schema;
		private int splitAttrOrd;
		private FeatureField featureField;
		private AttributeSplitHandler.Split split;
		private int splitSegment;
		private String attrVal;
		private static final Logger LOG = Logger.getLogger(PartitionerMapper.class);

		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
		 */
		protected void setup(Context context) throws IOException, InterruptedException {
			Configuration conf = context.getConfiguration();
			if (conf.getBoolean("debug.on", false)) {
				LOG.setLevel(Level.DEBUG);
			}
			fieldDelimRegex = conf.get("field.delim.regex", ",");

			splitAttrOrd = conf.getInt("dap.split.attribute", -1);
			if (splitAttrOrd == -1) {
				throw new IOException("split attribute not found");
			}
			LOG.debug("splitAttrOrd:" + splitAttrOrd);
			String splitKey = conf.get("dap.split.key");
			LOG.debug("splitKey:" + splitKey);

			//load feature schema
			InputStream fs = Utility.getFileStream(context.getConfiguration(), "dap.feature.schema.file.path");
			ObjectMapper mapper = new ObjectMapper();
			schema = mapper.readValue(fs, FeatureSchema.class);

			//build split handler for the split attribute
			featureField = schema.findFieldByOrdinal(splitAttrOrd);
			if (featureField.isInteger()) {
				split = new AttributeSplitHandler.IntegerSplit(splitKey);
			} else if (featureField.isCategorical()) {
				split = new AttributeSplitHandler.CategoricalSplit(splitKey);
			} else {
				//only integer and categorical split attributes are handled
				throw new IOException("unsupported attribute type for split attribute with ordinal " + splitAttrOrd);
			}
			split.fromString();
		}

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			items = value.toString().split(fieldDelimRegex);

			//key is split segment
			attrVal = items[splitAttrOrd];
			splitSegment = split.getSegmentIndex(attrVal);
			LOG.debug("splitSegment:" + splitSegment);
			outKey.set(splitSegment);
			context.write(outKey, value);
		}
	}

	/**
	 * Passes records through, so each reducer output file holds the records of one segment
	 * @author pranab
	 *
	 */
	public static class PartitionerReducer extends Reducer<IntWritable, Text, NullWritable, Text> {

		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			for (Text value : values) {
				context.write(NullWritable.get(), value);
			}
		}
	}

	/**
	 * @param args
	 */
	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(new DataPartitioner(), args);
		System.exit(exitCode);
	}
}
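/*
 * Usage sketch (hypothetical jar name and paths; configuration keys as read above,
 * assuming they are supplied through -D generic options or through the avenir
 * properties file loaded by Utility.setConfiguration):
 *
 *   hadoop jar avenir.jar org.avenir.tree.DataPartitioner \
 *     -D dap.project.base.path=/projects/tree \
 *     -D dap.feature.schema.file.path=/projects/tree/meta/schema.json \
 *     -D dap.split.selection.strategy=best \
 *     -D dap.split.path=split=0/segment=1/data
 *
 * The candidate splits file is expected at the sibling path splits/part-r-00000
 * of the node data directory, as resolved in findBestSplitKey().
 */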