/*
* avenir: Predictive analytic based on Hadoop Map Reduce
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.avenir.tree;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.avenir.tree.DecisionPathList.DecisionPathPredicate;
import org.avenir.tree.SplitManager.AttributePredicate;
import org.avenir.util.AttributeSplitStat;
import org.avenir.util.InfoContentStat;
import org.chombo.mr.FeatureField;
import org.chombo.util.BasicUtils;
import org.chombo.util.FeatureSchema;
import org.chombo.util.Pair;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;
/**
* @author pranab
*
*/
public class DecisionTreeBuilder extends Configured implements Tool {
//synthetic path label for the tree root, used as the mapper key in the first iteration
public static final String ROOT_PATH = "$root";
//placeholder child path label used before any tree exists
private static final String CHILD_PATH = "$child";
//delimiter between predicates in a serialized decision path
public static final String PRED_DELIM = ";";
//delimiter between the split ID and the predicate within one path element
public static final String SPLIT_DELIM = ":";
private static final Logger LOG = Logger.getLogger(DecisionTreeBuilder.class);
@Override
public int run(String[] args) throws Exception {
    //configure and launch one iteration of the tree building job;
    //args[0] is the input dir, args[1] the output dir
    Job job = new Job(getConf());
    job.setJobName("Decision tree builder");
    job.setJarByClass(DecisionTreeBuilder.class);

    Utility.setConfiguration(job.getConfiguration(), "avenir");
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(BuilderMapper.class);
    job.setReducerClass(BuilderReducer.class);
    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    //job specific reducer count overrides the global setting
    int reducerCount = job.getConfiguration().getInt("dtb.num.reducer", -1);
    if (-1 == reducerCount) {
        reducerCount = job.getConfiguration().getInt("num.reducer", 1);
    }
    job.setNumReduceTasks(reducerCount);

    return job.waitForCompletion(true) ? 0 : 1;
}
/**
* Decision tree or random forest. For random forest, data is sampled in the first iteration
* @author pranab
*
*/
public static class BuilderMapper extends Mapper<LongWritable, Text, Tuple, Text> {
//regex used to split input records into fields
private String fieldDelimRegex;
//fields of the current record
private String[] items;
private Tuple outKey = new Tuple();
private Text outVal = new Text();
//feature metadata including class attribute
private FeatureSchema schema;
//candidate attribute ordinals for splitting, per selection strategy
private List<Integer> splitAttrs;
private FeatureField classField;
//generates candidate split predicates for attributes
private SplitManager splitManager;
private String attrSelectStrategy;
//k for random attribute selection strategies
private int randomSplitSetSize;
private String classVal;
//decision path of the record being processed (split IDs stripped)
private String currenttDecPath;
private String decPathDelim;
//decision tree from the previous iteration, if any
private DecisionPathList decPathList;
//cache of decision path -> still-valid status lookups
private Map<String, Boolean> validDecPaths = new HashMap<String, Boolean>();
private String subSamlingStrategy;
//true when a decision path list file was loaded, i.e. not the first iteration
private boolean treeAvailable;
//percent of records kept when sampling without replacement
private int samplingRate;
//buffer capacity for sampling with replacement
private int samplingBufferSize;
private String[] samplingBuffer;
//number of records currently held in samplingBuffer
private int count;
private boolean debugOn;
private static final String SUB_SAMPLING_WITH_REPLACE = "withReplace";
private static final String SUB_SAMPLING_WITHOUT_REPLACE = "withoutReplace";
private static final String SUB_SAMPLING_WITHOUT_NONE = "none";
private static final String ATTR_SEL_ALL = "all";
private static final String ATTR_SEL_NOT_USED_YET = "notUsedYet";
private static final String ATTR_SEL_RANDOM_ALL = "randomAll";
private static final String ATTR_SEL_RANDOM_NOT_USED_YET = "randomNotUsedYet";
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    debugOn = conf.getBoolean("debug.on", false);
    if (debugOn) {
        LOG.setLevel(Level.DEBUG);
    }
    fieldDelimRegex = conf.get("field.delim.regex", ",");

    //feature schema
    schema = Utility.getFeatureSchema(conf, "dtb.feature.schema.file.path");

    //decision path list file from the previous iteration; absent on the first iteration
    InputStream fs = Utility.getFileStream(context.getConfiguration(), "dtb.decision.file.path.in");
    if (null != fs) {
        ObjectMapper mapper = new ObjectMapper();
        decPathList = mapper.readValue(fs, DecisionPathList.class);
        treeAvailable = true;
    }

    //split manager
    decPathDelim = conf.get("dtb.dec.path.delim", ";");
    splitManager = new SplitManager(schema, decPathDelim);
    splitManager.setDebugOn(debugOn);
    String customBaseAttributeOrdinalsStr = conf.get("dtb.custom.base.attributes");
    //use limited set of candidate attributes instead of all
    if (null != customBaseAttributeOrdinalsStr) {
        int[] customBaseAttributeOrdinals = Utility.intArrayFromString(customBaseAttributeOrdinalsStr);
        splitManager.withCustomBaseAttributeOrdinals(customBaseAttributeOrdinals);
    }

    //attribute selection strategy
    attrSelectStrategy = conf.get("dtb.split.attribute.selection.strategy", "notUsedYet");
    randomSplitSetSize = conf.getInt("dtb.random.split.set.size", 3);

    //class attribute
    classField = schema.findClassAttrField();
    validDecPaths.clear();

    //sub sampling
    subSamlingStrategy = conf.get("dtb.sub.sampling.strategy", "withReplace");
    if (subSamlingStrategy.equals(SUB_SAMPLING_WITHOUT_REPLACE)) {
        samplingRate = Utility.assertIntConfigParam(conf, "dtb.sub.sampling.rate",
            "sampling rate should be provided for sampling without replacement");
    } else if (subSamlingStrategy.equals(SUB_SAMPLING_WITH_REPLACE)) {
        //BUG FIX: the original declared a local "int samplingBufferSize" here, shadowing the
        //field; the field stayed 0, so the buffering logic in map() silently dropped records
        samplingBufferSize = conf.getInt("dtb.sub.sampling.buffer.size", 10000);
        samplingBuffer = new String[samplingBufferSize];
    }
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    //flush records still held in the sampling buffer, drawing with replacement
    if (!treeAvailable && subSamlingStrategy.equals(SUB_SAMPLING_WITH_REPLACE)) {
        for (int emitted = 0; emitted < count; ++emitted) {
            int index = (int)(Math.random() * count);
            if (index == count) {
                index = count - 1;
            }
            rootMapHelper(samplingBuffer[index], context);
        }
    }
    super.cleanup(context);
}
@Override
protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
    if (!treeAvailable) {
        //first iteration: everything goes under the root predicate, optionally sub sampled
        //(random forest style)
        if (subSamlingStrategy.equals(SUB_SAMPLING_WITH_REPLACE)) {
            //sampling with replacement through a fixed size buffer
            //NOTE(review): the samplingBufferSize field is never assigned in setup (a local
            //shadows it there), so this field reads 0 and records are dropped — confirm and fix in setup
            if (count < samplingBufferSize) {
                //buffer not yet full; just accumulate
                samplingBuffer[count++] = value.toString();
            } else {
                //buffer full: emit samplingBufferSize records drawn with replacement
                for (int i = 0; i < samplingBufferSize; ++i) {
                    int sel = (int)(Math.random() * samplingBufferSize);
                    sel = sel == samplingBufferSize ? samplingBufferSize -1 : sel;
                    rootMapHelper(samplingBuffer[sel], context);
                }
                //start refilling buffer
                count = 0;
                samplingBuffer[count++] = value.toString();
            }
        } else if (subSamlingStrategy.equals(SUB_SAMPLING_WITHOUT_REPLACE)) {
            //sampling without replacement: keep each record with samplingRate percent probability
            int sel = (int)(Math.random() * 100);
            if (sel < samplingRate) {
                rootMapHelper(value.toString(), context);
            }
        } else {
            //no sampling
            rootMapHelper(value.toString(), context);
        }
    } else {
        //later iterations: route the record down the existing tree
        pathMapHelper(value.toString(), context);
    }
}
/**
 * Emits a record keyed by the synthetic root path, used during the first iteration.
 * @param record raw input record
 * @param context mapper context
 * @throws IOException
 * @throws InterruptedException
 */
private void rootMapHelper(String record, Context context)
    throws IOException, InterruptedException {
    outVal.set(record);
    outKey.initialize();
    outKey.add(ROOT_PATH);
    context.write(outKey, outVal);
}
/**
 * Routes a record down the existing tree: validates the record's current decision path
 * against the loaded decision path list, then evaluates every candidate split predicate
 * for the selected attributes and emits the record once per matching split segment.
 * @param record input record; when a tree exists, field 0 is the record's decision path
 * @param context mapper context
 * @throws IOException
 * @throws InterruptedException
 */
private void pathMapHelper(String record, Context context)
    throws IOException, InterruptedException {
    items = record.split(fieldDelimRegex, -1);
    //NOTE(review): when treeAvailable, items[0] is the decision path prefix, yet the class
    //ordinal is not offset by 1 the way attribute lookups below are — confirm intended
    classVal = items[classField.getOrdinal()];
    currenttDecPath = null;
    if (treeAvailable) {
        //strip split ID from each predicate of the incoming decision path
        currenttDecPath = items[0];
        String[] predicates = DecisionPathList.stripSplitId(currenttDecPath.split(decPathDelim));
        currenttDecPath = BasicUtils.join(predicates, decPathDelim);
        //find decision path status, caching lookups into the decision path list
        Boolean status = validDecPaths.get(currenttDecPath);
        if (null == status) {
            //from decision path list object
            DecisionPathList.DecisionPath decPathObj = decPathList.findDecisionPath(predicates) ;
            status = null == decPathObj ? false : true;
            //cache it
            validDecPaths.put(currenttDecPath, status);
        }
        //rejected decision path from earlier iteration rejected splits
        if (!status) {
            return;
        }
    }
    //candidate attributes per the configured selection strategy
    getSplitAttributes();
    //splitId is unique across all attributes and all splits of this record
    int splitId = 0;
    for (int attr : splitAttrs) {
        FeatureField field = schema.findFieldByOrdinal(attr);
        Object attrValue = null;
        //all candidate splits for this attribute, typed by the field
        List<List<AttributePredicate>> allSplitPredicates = null;
        if (field.isInteger()) {
            allSplitPredicates = splitManager.createIntAttrSplitPredicates(attr);
            //attr + 1 skips the decision path prefix field
            Integer iValue = Integer.parseInt(items[attr + 1]);
            attrValue = iValue;
        } else if (field.isDouble()) {
            allSplitPredicates = splitManager.createDoubleAttrSplitPredicates(attr);
            Double dValue = Double.parseDouble(items[attr + 1]);
            attrValue = dValue;
        } else if (field.isCategorical()) {
            allSplitPredicates = splitManager.createCategoricalAttrSplitPredicates(attr);
            attrValue = items[attr + 1];
        }
        //evaluate split predicates
        for (List<AttributePredicate> predicates : allSplitPredicates) {
            //unique split id for each partion in a split
            ++splitId;
            //exactly one predicate of a split is expected to match the attribute value
            boolean predicateMatched = false;
            for (AttributePredicate predicate : predicates) {
                if (predicate.evaluate(attrValue)) {
                    //data belongs to this split segment
                    predicateMatched = true;
                    outKey.initialize();
                    if (null == currenttDecPath) {
                        //first split level: key is just the new predicate
                        outKey.add(predicate.toString());
                        outVal.set(record);
                    } else {
                        //existing predicates form the key prefix
                        String[] curDecPathItems = items[0].split(decPathDelim);
                        for (String curDecPathItem : curDecPathItems) {
                            outKey.add(curDecPathItem);
                        }
                        //new predicate, tagged with its split ID
                        outKey.add("" + splitId + SPLIT_DELIM + predicate.toString());
                        //emit the record without its decision path prefix
                        int pos = record.indexOf(fieldDelimRegex);
                        outVal.set(record.substring(pos + fieldDelimRegex.length()));
                    }
                    context.write(outKey, outVal);
                }
            }
            if (!predicateMatched) {
                throw new IllegalStateException("no matching predicate for attribute: " + attr);
            }
        }
    }
}
/**
 * Selects the candidate attribute ordinals for splitting according to the configured
 * strategy and stores them in splitAttrs.
 * @throws IllegalArgumentException for an unknown strategy
 */
private void getSplitAttributes() {
    if (attrSelectStrategy.equals(ATTR_SEL_ALL)) {
        //all attributes
        splitAttrs = splitManager.getAllAttributes();
    } else if (attrSelectStrategy.equals(ATTR_SEL_NOT_USED_YET)) {
        //attributes that have not been used yet
        splitAttrs = splitManager.getRemainingAttributes(currenttDecPath);
    } else if (attrSelectStrategy.equals(ATTR_SEL_RANDOM_ALL)) {
        //randomly selected k attributes from all
        //BUG FIX: the original discarded the returned selection, leaving splitAttrs stale
        splitAttrs = splitManager.getRandomAllAttributes(randomSplitSetSize);
    } else if (attrSelectStrategy.equals(ATTR_SEL_RANDOM_NOT_USED_YET)) {
        //randomly selected k attributes from attributes not used yet
        //BUG FIX: same missing assignment as above
        splitAttrs = splitManager.getRandomRemainingAttributes(currenttDecPath, randomSplitSetSize);
    } else {
        throw new IllegalArgumentException("invalid splitting attribute selection strategy");
    }
}
}
/**
* @author pranab
*
*/
public static class BuilderReducer extends Reducer<Tuple, Text, NullWritable, Text> {
//feature metadata including class attribute
private FeatureSchema schema;
private String fieldDelim;
private Text outVal = new Text();
//info content algorithm: "entropy" or gini index
private String infoAlgorithm;
private boolean outputSplitProb;
//parent decision path -> child path element -> class distribution stats
private Map<String, Map<String, InfoContentStat>> decPaths = new HashMap<String, Map<String, InfoContentStat>>();
//parent decision path (split IDs stripped) -> split ID -> predicate -> class distribution stats
private Map<String, Map<String, Map<String, InfoContentStat>>> decPathsInfoContentBySplit =
    new HashMap<String, Map<String, Map<String, InfoContentStat>>>();
private int classAttrOrdinal;
private String classAttrValue;
private String parentDecPath;
private String decPath;
private String childPath;
private String decPathDelim;
//decides when a decision path stops growing (depth, info gain or population)
private DecisionPathStoppingStrategy pathStoppingStrategy;
//decision tree from the previous iteration, if any
private DecisionPathList decPathList;
private boolean decTreeAvailable;
//split selection strategy: best split or random among the top k
private String spltSelStrategy;
private int topSplitCount;
private boolean debugOn;
//constants; declared final (the original left these mutable)
private static final String SPLIT_SEL_BEST = "best";
private static final String SPLIT_SEL_RANDOM_TOP = "randomAmongTop";
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    debugOn = conf.getBoolean("debug.on", false);
    if (debugOn) {
        LOG.setLevel(Level.DEBUG);
        AttributeSplitStat.enableLog();
    }
    //schema
    schema = Utility.getFeatureSchema(conf, "dtb.feature.schema.file.path");
    //decision path list file from the previous iteration; absent on the first iteration
    InputStream fs = Utility.getFileStream(context.getConfiguration(), "dtb.decision.file.path.in");
    if (null != fs) {
        ObjectMapper mapper = new ObjectMapper();
        decPathList = mapper.readValue(fs, DecisionPathList.class);
        decTreeAvailable = true;
    }
    fieldDelim = conf.get("field.delim.out", ",");
    //info content algorithm, default gini index
    infoAlgorithm = conf.get("dtb.split.algorithm", "giniIndex");
    outputSplitProb = conf.getBoolean("dtb.output.split.prob", false);
    classAttrOrdinal = schema.findClassAttrField().getOrdinal();
    decPathDelim = conf.get("dtb.dec.path.delim", ";");
    //split selection strategy; top split count only needed for random-among-top
    spltSelStrategy = conf.get("dtb.split.select.strategy", SPLIT_SEL_BEST);
    if (spltSelStrategy.equals(SPLIT_SEL_RANDOM_TOP)) {
        topSplitCount = conf.getInt("dtb.top.split.count", 3);
    }
    //stopping strategy; only the limit matching the chosen strategy is mandatory,
    //the others stay at their -1 sentinel
    String stoppingStrategy = conf.get("dtb.path.stopping.strategy", DecisionPathStoppingStrategy.STOP_MIN_INFO_GAIN);
    int maxDepthLimit = -1;
    double minInfoGainLimit = -1;
    int minPopulationLimit = -1;
    if (stoppingStrategy.equals(DecisionPathStoppingStrategy.STOP_MAX_DEPTH)) {
        maxDepthLimit = Utility.assertIntConfigParam(conf, "dtb.max.depth.limit", "missing max depth limit for tree");
    } else if (stoppingStrategy.equals(DecisionPathStoppingStrategy.STOP_MIN_INFO_GAIN)) {
        minInfoGainLimit = Utility.assertDoubleConfigParam(conf, "dtb.min.info.gain.limit", "missing min info gain limit");
    } else if (stoppingStrategy.equals(DecisionPathStoppingStrategy.STOP_MIN_POPULATION)) {
        minPopulationLimit = Utility.assertIntConfigParam(conf, "dtb.min.population.limit", "missing min population limit");
    } else {
        throw new IllegalArgumentException("invalid stopping strategy " + stoppingStrategy);
    }
    pathStoppingStrategy = new DecisionPathStoppingStrategy(stoppingStrategy, maxDepthLimit,
        minInfoGainLimit,minPopulationLimit);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    //first iteration emits just the root node; subsequent iterations grow the tree
    if (!decTreeAvailable) {
        generateRoot(context);
    } else {
        expandTree(context);
    }
}
/**
 * Builds the initial decision path list containing only the root node stats
 * and writes it out for the next iteration.
 * @param context reducer context
 * @throws IOException
 */
private void generateRoot(Context context) throws IOException {
    boolean isAlgoEntropy = infoAlgorithm.equals("entropy");
    //class distribution accumulated under the synthetic root/child keys in reduce()
    Map<String, InfoContentStat> childStats = decPaths.get(ROOT_PATH);
    InfoContentStat childStat = childStats.get(CHILD_PATH);
    childStat.processStat(isAlgoEntropy);

    DecisionPathList newDecPathList = new DecisionPathList();
    //the original also created an unused DecisionPathPredicate here; removed
    DecisionPathList.DecisionPath decPath = new DecisionPathList.DecisionPath(childStat.getTotalCount(),
        childStat.getStat(), childStat.getClassValPr());
    newDecPathList.addDecisionPath(decPath);

    //save new decision path list
    writeDecisioList(newDecPathList, "dtb.decision.file.path.out", context.getConfiguration());
}
/**
 * Expands the current tree by one level. For every parent decision path, computes the
 * count weighted average info content of each candidate split, selects one split
 * (minimum info content, or random among the top k), and appends that split's
 * predicates to the parent path as new decision paths. The resulting list is written
 * out for the next iteration.
 * @param context reducer context
 * @throws IOException
 */
private void expandTree(Context context) throws IOException {
    //candidate splits keyed by average info content; TreeMap keeps ascending order
    //NOTE(review): keying by Double means two splits with identical average info
    //content overwrite each other — confirm that collision loss is acceptable
    Map<Double, Pair<String, Integer>> splits = new TreeMap<Double, Pair<String, Integer>>();
    //group by split
    infoContentBySplit();
    DecisionPathList newDecPathList = new DecisionPathList();
    boolean isAlgoEntropy = infoAlgorithm.equals("entropy");
    double parentStat = 0;
    List< DecisionPathList.DecisionPathPredicate> predicates = null;
    //parent paths
    Map<String, Map<String, InfoContentStat>> splitInfoContent;
    for (String parentPath : decPathsInfoContentBySplit.keySet()) {
        //parent decision path in existing tree
        DecisionPathList.DecisionPath parentDecPath = findParentDecisionPath(parentPath);
        if (null == parentDecPath) {
            throw new IllegalStateException("parent decision path not found: " + parentPath);
        }
        parentStat = parentDecPath.getInfoContent();
        //evaluate all splits under this parent
        double minInfoContent = 1000000;
        String selectedSplit = null;
        int selectedSplitAttr = -1;
        splits.clear();
        splitInfoContent = decPathsInfoContentBySplit.get(parentPath);
        for (String splitId : splitInfoContent.keySet()) {
            Map<String, InfoContentStat> predInfoContent = splitInfoContent.get(splitId);
            if (debugOn) {
                System.out.println("split: " + splitId);
            }
            //count weighted info content over the split's predicates
            double weightedInfoContent = 0;
            int totalCount = 0;
            int attr = -1;
            for (String predicate : predInfoContent.keySet()) {
                if (debugOn) {
                    System.out.println("predicate: " + predicate);
                }
                //predicate string starts with the attribute ordinal
                attr = Integer.parseInt(predicate.split("\\s+")[0]);
                InfoContentStat stat = predInfoContent.get(predicate);
                weightedInfoContent += stat.processStat(isAlgoEntropy) * stat.getTotalCount();
                totalCount += stat.getTotalCount();
            }
            //average info content across splits
            double avInfoContent = weightedInfoContent / totalCount;
            if (spltSelStrategy.equals(SPLIT_SEL_BEST)) {
                //pick split with minimum info content
                if (avInfoContent < minInfoContent) {
                    minInfoContent = avInfoContent;
                    selectedSplit = splitId;
                    selectedSplitAttr = attr;
                    if (debugOn) {
                        System.out.println("selectedSplit: " + selectedSplit + " selectedSplitAttr: " + selectedSplitAttr +
                            " minInfoContent: " + minInfoContent);
                    }
                }
            } else if (spltSelStrategy.equals(SPLIT_SEL_RANDOM_TOP)) {
                //collect all splits; selection happens after the loop
                splits.put(avInfoContent, new Pair<String, Integer>(splitId, attr));
            } else {
                throw new IllegalStateException("ivalid split slection strategy");
            }
        }
        //select randomly from top k splits
        if (spltSelStrategy.equals(SPLIT_SEL_RANDOM_TOP)) {
            Pair<String, Integer> split = selectRandomSplitFromTop(splits);
            selectedSplit = split.getLeft();
            selectedSplitAttr = split.getRight();
        }
        //expand based on selected split
        Map<String, InfoContentStat> predInfoContent = splitInfoContent.get(selectedSplit);
        if (debugOn) {
            System.out.println("selected split: " + selectedSplit + " selected attribute: " +selectedSplitAttr );
        }
        //parent predicates
        List<DecisionPathList.DecisionPathPredicate> parentPredicates =
            DecisionPathList.DecisionPathPredicate.createPredicates(parentPath, schema);
        //generate new path based on predicates of selected split
        FeatureField field = schema.findFieldByOrdinal(selectedSplitAttr);
        for (String predicateStr : predInfoContent.keySet()) {
            if (debugOn) {
                System.out.println("predicate in selected split: " + predicateStr );
            }
            //parse predicate according to the attribute's type
            DecisionPathList.DecisionPathPredicate predicate = null;
            if (field.isInteger()) {
                predicate = DecisionPathList.DecisionPathPredicate.createIntPredicate(predicateStr);
            } else if (field.isDouble()) {
                predicate = DecisionPathList.DecisionPathPredicate.createDoublePredicate(predicateStr);
            } else if (field.isCategorical()) {
                predicate = DecisionPathList.DecisionPathPredicate.createCategoricalPredicate(predicateStr);
            }
            //append new predicate to parent predicate list
            predicates = new ArrayList< DecisionPathList.DecisionPathPredicate>();
            predicates.addAll(parentPredicates);
            predicates.add(predicate);
            //create new decision path, marking it stopped if the stopping strategy says so
            InfoContentStat stat = predInfoContent.get(predicateStr);
            boolean toBeStopped = pathStoppingStrategy.shouldStop(stat, parentStat, parentPredicates.size() + 1);
            DecisionPathList.DecisionPath decPath = new DecisionPathList.DecisionPath(predicates, stat.getTotalCount(),
                stat.getStat(), toBeStopped, stat.getClassValPr());
            newDecPathList.addDecisionPath(decPath);
        }
    }
    //save new decision path list
    writeDecisioList(newDecPathList, "dtb.decision.file.path.out", context.getConfiguration() );
}
/**
 * Picks one split at random from the top splits with lowest average info content.
 * @param splits splits keyed by average info content, iterated in ascending order
 * @return the selected (split ID, attribute ordinal) pair
 */
private Pair<String, Integer> selectRandomSplitFromTop(Map<Double, Pair<String, Integer>> splits) {
    List<Pair<String, Integer>> best = new ArrayList<Pair<String, Integer>>();
    for (Map.Entry<Double, Pair<String, Integer>> entry : splits.entrySet()) {
        best.add(entry.getValue());
        if (best.size() == topSplitCount) {
            break;
        }
    }
    return BasicUtils.selectRandom(best);
}
/**
 * Regroups the accumulated class stats into a nested map:
 * parent path (split IDs stripped) -> split ID -> predicate -> stats.
 */
private void infoContentBySplit() {
    decPathsInfoContentBySplit.clear();
    for (Map.Entry<String, Map<String, InfoContentStat>> parentEntry : decPaths.entrySet()) {
        String parentPath = parentEntry.getKey();
        //parent path with the split IDs stripped off
        String cleanedParentPath = stripSplitIds(parentPath);
        Map<String, Map<String, InfoContentStat>> bySplit = decPathsInfoContentBySplit.get(cleanedParentPath);
        if (bySplit == null) {
            bySplit = new HashMap<String, Map<String, InfoContentStat>>();
            decPathsInfoContentBySplit.put(cleanedParentPath, bySplit);
        }
        //each child key has the form "splitId:predicate"
        for (Map.Entry<String, InfoContentStat> childEntry : parentEntry.getValue().entrySet()) {
            String[] parts = BasicUtils.splitOnFirstOccurence(childEntry.getKey(), SPLIT_DELIM , true);
            String splitId = parts[0];
            String predicate = parts[1];
            if (debugOn) {
                System.out.println("parentPath: " + parentPath + " splitId: " + splitId + " predicate: " + predicate);
            }
            Map<String, InfoContentStat> byPredicate = bySplit.get(splitId);
            if (byPredicate == null) {
                byPredicate = new HashMap<String, InfoContentStat>();
                bySplit.put(splitId, byPredicate);
            }
            byPredicate.put(predicate, childEntry.getValue());
        }
    }
}
/**
 * Removes the split ID prefix from every predicate of a decision path.
 * @param decPath decision path whose predicates carry "splitId:" prefixes
 * @return the same path with bare predicates
 */
private String stripSplitIds(String decPath) {
    String[] predicates = decPath.split(PRED_DELIM);
    String[] bare = new String[predicates.length];
    int index = 0;
    for (String predicate : predicates) {
        if (predicate.equals(ROOT_PATH)) {
            //the root label carries no split ID
            bare[index++] = predicate;
        } else {
            bare[index++] = BasicUtils.splitOnFirstOccurence(predicate, SPLIT_DELIM , true)[1];
        }
    }
    return BasicUtils.join(bare, PRED_DELIM);
}
/**
 * Finds the decision path object in the current tree matching a parent path.
 * @param parentPath serialized parent decision path
 * @return matching decision path, or null when no tree is loaded or nothing matches
 */
private DecisionPathList.DecisionPath findParentDecisionPath(String parentPath) {
    if (null == decPathList) {
        return null;
    }
    if (parentPath.equals(ROOT_PATH)) {
        //root is looked up by its label directly
        return decPathList.findDecisionPath(ROOT_PATH);
    }
    return decPathList.findDecisionPath(parentPath.split(decPathDelim));
}
/**
 * Serializes the decision path list as JSON to the file named by a config parameter.
 * @param newDecPathList decision path list to write
 * @param outFilePathParam config parameter holding the output file path
 * @param conf hadoop configuration
 * @throws IOException
 */
private void writeDecisioList(DecisionPathList newDecPathList, String outFilePathParam, Configuration conf )
    throws IOException {
    FSDataOutputStream ouStrm = FileSystem.get(conf).create(new Path(conf.get(outFilePathParam)));
    try {
        ObjectMapper mapper = new ObjectMapper();
        mapper.writeValue(ouStrm, newDecPathList);
        ouStrm.flush();
    } finally {
        //the original never closed the stream, leaking it; close() is safe even if
        //Jackson's auto-close already closed the target
        ouStrm.close();
    }
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
@Override
protected void reduce(Tuple key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
    int keySize = key.getSize();
    //use the shared predicate delimiter constant instead of a hard coded ";"
    key.setDelim(PRED_DELIM);
    decPath = key.toString();
    if (keySize > 1) {
        //tree exists: all but the last key element form the parent path
        parentDecPath = key.toString(0, keySize-1);
        childPath = key.getString(keySize-1);
    } else {
        //tree does not exist yet: single root element with a placeholder child
        parentDecPath = key.getString(0);
        childPath = CHILD_PATH;
    }
    //class stats container for all candidate children of this parent path
    Map<String, InfoContentStat> candidateChildrenPath = decPaths.get(parentDecPath);
    if (null == candidateChildrenPath) {
        candidateChildrenPath = new HashMap<String, InfoContentStat>();
        decPaths.put(parentDecPath, candidateChildrenPath);
    }
    //class stats for this child path
    InfoContentStat classStats = candidateChildrenPath.get(childPath);
    if (null == classStats) {
        classStats = new InfoContentStat();
        candidateChildrenPath.put(childPath, classStats);
    }
    for (Text value : values) {
        //accumulate class value counts
        //NOTE(review): fieldDelim is used as a regex here — fine for "," but verify for
        //delimiters containing regex meta characters
        classAttrValue = value.toString().split(fieldDelim)[classAttrOrdinal];
        classStats.incrClassValCount(classAttrValue);
        //emit the record prefixed with its decision path
        outVal.set(decPath + fieldDelim + value.toString());
        context.write(NullWritable.get(), outVal);
    }
}
}
/**
 * Entry point; delegates to ToolRunner and exits with the job status.
 * @param args command line arguments: input path, output path
 */
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new DecisionTreeBuilder(), args));
}
}