/*
* avenir: Predictive analytic based on Hadoop Map Reduce
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.avenir.tree;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.avenir.tree.DecisionPathList.DecisionPathPredicate;
import org.avenir.tree.SplitManager.AttributePredicate;
import org.avenir.util.AttributeSplitStat;
import org.avenir.util.InfoContentStat;
import org.chombo.mr.FeatureField;
import org.chombo.util.BasicUtils;
import org.chombo.util.FeatureSchema;
import org.chombo.util.Pair;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;
/**
* @author pranab
*
*/
public class DecisionTreeBuilder extends Configured implements Tool {
//synthetic path label for the tree root, used as the mapper key in the first iteration
public static final String ROOT_PATH = "$root";
//placeholder child path label used before any tree exists
private static final String CHILD_PATH = "$child";
//delimiter between predicates in a serialized decision path
public static final String PRED_DELIM = ";";
//delimiter between the split ID and the predicate within one path element
public static final String SPLIT_DELIM = ":";
private static final Logger LOG = Logger.getLogger(DecisionTreeBuilder.class);
@Override
public int run(String[] args) throws Exception {
    //configure and launch one iteration of the tree building job;
    //args[0] is the input dir, args[1] the output dir
    Job job = new Job(getConf());
    job.setJobName("Decision tree builder");
    job.setJarByClass(DecisionTreeBuilder.class);

    Utility.setConfiguration(job.getConfiguration(), "avenir");
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(BuilderMapper.class);
    job.setReducerClass(BuilderReducer.class);
    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    //job specific reducer count overrides the global setting
    int reducerCount = job.getConfiguration().getInt("dtb.num.reducer", -1);
    if (-1 == reducerCount) {
        reducerCount = job.getConfiguration().getInt("num.reducer", 1);
    }
    job.setNumReduceTasks(reducerCount);

    return job.waitForCompletion(true) ? 0 : 1;
}
/**
* Decision tree or random forest. For random forest, data is sampled in the first iteration
* @author pranab
*
*/
public static class BuilderMapper extends Mapper<LongWritable, Text, Tuple, Text> {
//regex used to split input records into fields
private String fieldDelimRegex;
//fields of the current record
private String[] items;
private Tuple outKey = new Tuple();
private Text outVal = new Text();
//feature metadata including class attribute
private FeatureSchema schema;
//candidate attribute ordinals for splitting, per selection strategy
private List<Integer> splitAttrs;
private FeatureField classField;
//generates candidate split predicates for attributes
private SplitManager splitManager;
private String attrSelectStrategy;
//k for random attribute selection strategies
private int randomSplitSetSize;
private String classVal;
//decision path of the record being processed (split IDs stripped)
private String currenttDecPath;
private String decPathDelim;
//decision tree from the previous iteration, if any
private DecisionPathList decPathList;
//cache of decision path -> still-valid status lookups
private Map<String, Boolean> validDecPaths = new HashMap<String, Boolean>();
private String subSamlingStrategy;
//true when a decision path list file was loaded, i.e. not the first iteration
private boolean treeAvailable;
//percent of records kept when sampling without replacement
private int samplingRate;
//buffer capacity for sampling with replacement
private int samplingBufferSize;
private String[] samplingBuffer;
//number of records currently held in samplingBuffer
private int count;
private boolean debugOn;
private static final String SUB_SAMPLING_WITH_REPLACE = "withReplace";
private static final String SUB_SAMPLING_WITHOUT_REPLACE = "withoutReplace";
private static final String SUB_SAMPLING_WITHOUT_NONE = "none";
private static final String ATTR_SEL_ALL = "all";
private static final String ATTR_SEL_NOT_USED_YET = "notUsedYet";
private static final String ATTR_SEL_RANDOM_ALL = "randomAll";
private static final String ATTR_SEL_RANDOM_NOT_USED_YET = "randomNotUsedYet";
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    debugOn = conf.getBoolean("debug.on", false);
    if (debugOn) {
        LOG.setLevel(Level.DEBUG);
    }
    fieldDelimRegex = conf.get("field.delim.regex", ",");

    //feature schema
    schema = Utility.getFeatureSchema(conf, "dtb.feature.schema.file.path");

    //decision path list file from the previous iteration; absent on the first iteration
    InputStream fs = Utility.getFileStream(context.getConfiguration(), "dtb.decision.file.path.in");
    if (null != fs) {
        ObjectMapper mapper = new ObjectMapper();
        decPathList = mapper.readValue(fs, DecisionPathList.class);
        treeAvailable = true;
    }

    //split manager
    decPathDelim = conf.get("dtb.dec.path.delim", ";");
    splitManager = new SplitManager(schema, decPathDelim);
    splitManager.setDebugOn(debugOn);
    String customBaseAttributeOrdinalsStr = conf.get("dtb.custom.base.attributes");
    //use limited set of candidate attributes instead of all
    if (null != customBaseAttributeOrdinalsStr) {
        int[] customBaseAttributeOrdinals = Utility.intArrayFromString(customBaseAttributeOrdinalsStr);
        splitManager.withCustomBaseAttributeOrdinals(customBaseAttributeOrdinals);
    }

    //attribute selection strategy
    attrSelectStrategy = conf.get("dtb.split.attribute.selection.strategy", "notUsedYet");
    randomSplitSetSize = conf.getInt("dtb.random.split.set.size", 3);

    //class attribute
    classField = schema.findClassAttrField();
    validDecPaths.clear();

    //sub sampling
    subSamlingStrategy = conf.get("dtb.sub.sampling.strategy", "withReplace");
    if (subSamlingStrategy.equals(SUB_SAMPLING_WITHOUT_REPLACE)) {
        samplingRate = Utility.assertIntConfigParam(conf, "dtb.sub.sampling.rate",
            "sampling rate should be provided for sampling without replacement");
    } else if (subSamlingStrategy.equals(SUB_SAMPLING_WITH_REPLACE)) {
        //BUG FIX: the original declared a local "int samplingBufferSize" here, shadowing the
        //field; the field stayed 0, so the buffering logic in map() silently dropped records
        samplingBufferSize = conf.getInt("dtb.sub.sampling.buffer.size", 10000);
        samplingBuffer = new String[samplingBufferSize];
    }
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    //flush records still held in the sampling buffer, drawing with replacement
    if (!treeAvailable && subSamlingStrategy.equals(SUB_SAMPLING_WITH_REPLACE)) {
        for (int emitted = 0; emitted < count; ++emitted) {
            int index = (int)(Math.random() * count);
            if (index == count) {
                index = count - 1;
            }
            rootMapHelper(samplingBuffer[index], context);
        }
    }
    super.cleanup(context);
}
@Override
protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
    if (!treeAvailable) {
        //first iteration: everything goes under the root predicate, optionally sub sampled
        //(random forest style)
        if (subSamlingStrategy.equals(SUB_SAMPLING_WITH_REPLACE)) {
            //sampling with replacement through a fixed size buffer
            //NOTE(review): the samplingBufferSize field is never assigned in setup (a local
            //shadows it there), so this field reads 0 and records are dropped — confirm and fix in setup
            if (count < samplingBufferSize) {
                //buffer not yet full; just accumulate
                samplingBuffer[count++] = value.toString();
            } else {
                //buffer full: emit samplingBufferSize records drawn with replacement
                for (int i = 0; i < samplingBufferSize; ++i) {
                    int sel = (int)(Math.random() * samplingBufferSize);
                    sel = sel == samplingBufferSize ? samplingBufferSize -1 : sel;
                    rootMapHelper(samplingBuffer[sel], context);
                }
                //start refilling buffer
                count = 0;
                samplingBuffer[count++] = value.toString();
            }
        } else if (subSamlingStrategy.equals(SUB_SAMPLING_WITHOUT_REPLACE)) {
            //sampling without replacement: keep each record with samplingRate percent probability
            int sel = (int)(Math.random() * 100);
            if (sel < samplingRate) {
                rootMapHelper(value.toString(), context);
            }
        } else {
            //no sampling
            rootMapHelper(value.toString(), context);
        }
    } else {
        //later iterations: route the record down the existing tree
        pathMapHelper(value.toString(), context);
    }
}
/**
 * Emits a record keyed by the synthetic root path, used during the first iteration.
 * @param record raw input record
 * @param context mapper context
 * @throws IOException
 * @throws InterruptedException
 */
private void rootMapHelper(String record, Context context)
    throws IOException, InterruptedException {
    outVal.set(record);
    outKey.initialize();
    outKey.add(ROOT_PATH);
    context.write(outKey, outVal);
}
/**
 * Routes a record down the existing tree: validates the record's current decision path
 * against the loaded decision path list, then evaluates every candidate split predicate
 * for the selected attributes and emits the record once per matching split segment.
 * @param record input record; when a tree exists, field 0 is the record's decision path
 * @param context mapper context
 * @throws IOException
 * @throws InterruptedException
 */
private void pathMapHelper(String record, Context context)
    throws IOException, InterruptedException {
    items = record.split(fieldDelimRegex, -1);
    //NOTE(review): when treeAvailable, items[0] is the decision path prefix, yet the class
    //ordinal is not offset by 1 the way attribute lookups below are — confirm intended
    classVal = items[classField.getOrdinal()];
    currenttDecPath = null;
    if (treeAvailable) {
        //strip split ID from each predicate of the incoming decision path
        currenttDecPath = items[0];
        String[] predicates = DecisionPathList.stripSplitId(currenttDecPath.split(decPathDelim));
        currenttDecPath = BasicUtils.join(predicates, decPathDelim);
        //find decision path status, caching lookups into the decision path list
        Boolean status = validDecPaths.get(currenttDecPath);
        if (null == status) {
            //from decision path list object
            DecisionPathList.DecisionPath decPathObj = decPathList.findDecisionPath(predicates) ;
            status = null == decPathObj ? false : true;
            //cache it
            validDecPaths.put(currenttDecPath, status);
        }
        //rejected decision path from earlier iteration rejected splits
        if (!status) {
            return;
        }
    }
    //candidate attributes per the configured selection strategy
    getSplitAttributes();
    //splitId is unique across all attributes and all splits of this record
    int splitId = 0;
    for (int attr : splitAttrs) {
        FeatureField field = schema.findFieldByOrdinal(attr);
        Object attrValue = null;
        //all candidate splits for this attribute, typed by the field
        List<List<AttributePredicate>> allSplitPredicates = null;
        if (field.isInteger()) {
            allSplitPredicates = splitManager.createIntAttrSplitPredicates(attr);
            //attr + 1 skips the decision path prefix field
            Integer iValue = Integer.parseInt(items[attr + 1]);
            attrValue = iValue;
        } else if (field.isDouble()) {
            allSplitPredicates = splitManager.createDoubleAttrSplitPredicates(attr);
            Double dValue = Double.parseDouble(items[attr + 1]);
            attrValue = dValue;
        } else if (field.isCategorical()) {
            allSplitPredicates = splitManager.createCategoricalAttrSplitPredicates(attr);
            attrValue = items[attr + 1];
        }
        //evaluate split predicates
        for (List<AttributePredicate> predicates : allSplitPredicates) {
            //unique split id for each partion in a split
            ++splitId;
            //exactly one predicate of a split is expected to match the attribute value
            boolean predicateMatched = false;
            for (AttributePredicate predicate : predicates) {
                if (predicate.evaluate(attrValue)) {
                    //data belongs to this split segment
                    predicateMatched = true;
                    outKey.initialize();
                    if (null == currenttDecPath) {
                        //first split level: key is just the new predicate
                        outKey.add(predicate.toString());
                        outVal.set(record);
                    } else {
                        //existing predicates form the key prefix
                        String[] curDecPathItems = items[0].split(decPathDelim);
                        for (String curDecPathItem : curDecPathItems) {
                            outKey.add(curDecPathItem);
                        }
                        //new predicate, tagged with its split ID
                        outKey.add("" + splitId + SPLIT_DELIM + predicate.toString());
                        //emit the record without its decision path prefix
                        int pos = record.indexOf(fieldDelimRegex);
                        outVal.set(record.substring(pos + fieldDelimRegex.length()));
                    }
                    context.write(outKey, outVal);
                }
            }
            if (!predicateMatched) {
                throw new IllegalStateException("no matching predicate for attribute: " + attr);
            }
        }
    }
}
/**
 * Selects the candidate attribute ordinals for splitting according to the configured
 * strategy and stores them in splitAttrs.
 * @throws IllegalArgumentException for an unknown strategy
 */
private void getSplitAttributes() {
    if (attrSelectStrategy.equals(ATTR_SEL_ALL)) {
        //all attributes
        splitAttrs = splitManager.getAllAttributes();
    } else if (attrSelectStrategy.equals(ATTR_SEL_NOT_USED_YET)) {
        //attributes that have not been used yet
        splitAttrs = splitManager.getRemainingAttributes(currenttDecPath);
    } else if (attrSelectStrategy.equals(ATTR_SEL_RANDOM_ALL)) {
        //randomly selected k attributes from all
        //BUG FIX: the original discarded the returned selection, leaving splitAttrs stale
        splitAttrs = splitManager.getRandomAllAttributes(randomSplitSetSize);
    } else if (attrSelectStrategy.equals(ATTR_SEL_RANDOM_NOT_USED_YET)) {
        //randomly selected k attributes from attributes not used yet
        //BUG FIX: same missing assignment as above
        splitAttrs = splitManager.getRandomRemainingAttributes(currenttDecPath, randomSplitSetSize);
    } else {
        throw new IllegalArgumentException("invalid splitting attribute selection strategy");
    }
}
}
/**
* @author pranab
*
*/
public static class BuilderReducer extends Reducer<Tuple, Text, NullWritable, Text> {
//feature metadata including class attribute
private FeatureSchema schema;
private String fieldDelim;
private Text outVal = new Text();
//info content algorithm: "entropy" or gini index
private String infoAlgorithm;
private boolean outputSplitProb;
//parent decision path -> child path element -> class distribution stats
private Map<String, Map<String, InfoContentStat>> decPaths = new HashMap<String, Map<String, InfoContentStat>>();
//parent decision path (split IDs stripped) -> split ID -> predicate -> class distribution stats
private Map<String, Map<String, Map<String, InfoContentStat>>> decPathsInfoContentBySplit =
    new HashMap<String, Map<String, Map<String, InfoContentStat>>>();
private int classAttrOrdinal;
private String classAttrValue;
private String parentDecPath;
private String decPath;
private String childPath;
private String decPathDelim;
//decides when a decision path stops growing (depth, info gain or population)
private DecisionPathStoppingStrategy pathStoppingStrategy;
//decision tree from the previous iteration, if any
private DecisionPathList decPathList;
private boolean decTreeAvailable;
//split selection strategy: best split or random among the top k
private String spltSelStrategy;
private int topSplitCount;
private boolean debugOn;
//constants; declared final (the original left these mutable)
private static final String SPLIT_SEL_BEST = "best";
private static final String SPLIT_SEL_RANDOM_TOP = "randomAmongTop";
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    debugOn = conf.getBoolean("debug.on", false);
    if (debugOn) {
        LOG.setLevel(Level.DEBUG);
        AttributeSplitStat.enableLog();
    }
    //schema
    schema = Utility.getFeatureSchema(conf, "dtb.feature.schema.file.path");
    //decision path list file from the previous iteration; absent on the first iteration
    InputStream fs = Utility.getFileStream(context.getConfiguration(), "dtb.decision.file.path.in");
    if (null != fs) {
        ObjectMapper mapper = new ObjectMapper();
        decPathList = mapper.readValue(fs, DecisionPathList.class);
        decTreeAvailable = true;
    }
    fieldDelim = conf.get("field.delim.out", ",");
    //info content algorithm, default gini index
    infoAlgorithm = conf.get("dtb.split.algorithm", "giniIndex");
    outputSplitProb = conf.getBoolean("dtb.output.split.prob", false);
    classAttrOrdinal = schema.findClassAttrField().getOrdinal();
    decPathDelim = conf.get("dtb.dec.path.delim", ";");
    //split selection strategy; top split count only needed for random-among-top
    spltSelStrategy = conf.get("dtb.split.select.strategy", SPLIT_SEL_BEST);
    if (spltSelStrategy.equals(SPLIT_SEL_RANDOM_TOP)) {
        topSplitCount = conf.getInt("dtb.top.split.count", 3);
    }
    //stopping strategy; only the limit matching the chosen strategy is mandatory,
    //the others stay at their -1 sentinel
    String stoppingStrategy = conf.get("dtb.path.stopping.strategy", DecisionPathStoppingStrategy.STOP_MIN_INFO_GAIN);
    int maxDepthLimit = -1;
    double minInfoGainLimit = -1;
    int minPopulationLimit = -1;
    if (stoppingStrategy.equals(DecisionPathStoppingStrategy.STOP_MAX_DEPTH)) {
        maxDepthLimit = Utility.assertIntConfigParam(conf, "dtb.max.depth.limit", "missing max depth limit for tree");
    } else if (stoppingStrategy.equals(DecisionPathStoppingStrategy.STOP_MIN_INFO_GAIN)) {
        minInfoGainLimit = Utility.assertDoubleConfigParam(conf, "dtb.min.info.gain.limit", "missing min info gain limit");
    } else if (stoppingStrategy.equals(DecisionPathStoppingStrategy.STOP_MIN_POPULATION)) {
        minPopulationLimit = Utility.assertIntConfigParam(conf, "dtb.min.population.limit", "missing min population limit");
    } else {
        throw new IllegalArgumentException("invalid stopping strategy " + stoppingStrategy);
    }
    pathStoppingStrategy = new DecisionPathStoppingStrategy(stoppingStrategy, maxDepthLimit,
        minInfoGainLimit,minPopulationLimit);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    //first iteration emits just the root node; subsequent iterations grow the tree
    if (!decTreeAvailable) {
        generateRoot(context);
    } else {
        expandTree(context);
    }
}
/**
 * Builds the initial decision path list containing only the root node stats
 * and writes it out for the next iteration.
 * @param context reducer context
 * @throws IOException
 */
private void generateRoot(Context context) throws IOException {
    boolean isAlgoEntropy = infoAlgorithm.equals("entropy");
    //class distribution accumulated under the synthetic root/child keys in reduce()
    Map<String, InfoContentStat> childStats = decPaths.get(ROOT_PATH);
    InfoContentStat childStat = childStats.get(CHILD_PATH);
    childStat.processStat(isAlgoEntropy);

    DecisionPathList newDecPathList = new DecisionPathList();
    //the original also created an unused DecisionPathPredicate here; removed
    DecisionPathList.DecisionPath decPath = new DecisionPathList.DecisionPath(childStat.getTotalCount(),
        childStat.getStat(), childStat.getClassValPr());
    newDecPathList.addDecisionPath(decPath);

    //save new decision path list
    writeDecisioList(newDecPathList, "dtb.decision.file.path.out", context.getConfiguration());
}
/**
 * Expands the current tree by one level. For every parent decision path, computes the
 * count weighted average info content of each candidate split, selects one split
 * (minimum info content, or random among the top k), and appends that split's
 * predicates to the parent path as new decision paths. The resulting list is written
 * out for the next iteration.
 * @param context reducer context
 * @throws IOException
 */
private void expandTree(Context context) throws IOException {
    //candidate splits keyed by average info content; TreeMap keeps ascending order
    //NOTE(review): keying by Double means two splits with identical average info
    //content overwrite each other — confirm that collision loss is acceptable
    Map<Double, Pair<String, Integer>> splits = new TreeMap<Double, Pair<String, Integer>>();
    //group by split
    infoContentBySplit();
    DecisionPathList newDecPathList = new DecisionPathList();
    boolean isAlgoEntropy = infoAlgorithm.equals("entropy");
    double parentStat = 0;
    List< DecisionPathList.DecisionPathPredicate> predicates = null;
    //parent paths
    Map<String, Map<String, InfoContentStat>> splitInfoContent;
    for (String parentPath : decPathsInfoContentBySplit.keySet()) {
        //parent decision path in existing tree
        DecisionPathList.DecisionPath parentDecPath = findParentDecisionPath(parentPath);
        if (null == parentDecPath) {
            throw new IllegalStateException("parent decision path not found: " + parentPath);
        }
        parentStat = parentDecPath.getInfoContent();
        //evaluate all splits under this parent
        double minInfoContent = 1000000;
        String selectedSplit = null;
        int selectedSplitAttr = -1;
        splits.clear();
        splitInfoContent = decPathsInfoContentBySplit.get(parentPath);
        for (String splitId : splitInfoContent.keySet()) {
            Map<String, InfoContentStat> predInfoContent = splitInfoContent.get(splitId);
            if (debugOn) {
                System.out.println("split: " + splitId);
            }
            //count weighted info content over the split's predicates
            double weightedInfoContent = 0;
            int totalCount = 0;
            int attr = -1;
            for (String predicate : predInfoContent.keySet()) {
                if (debugOn) {
                    System.out.println("predicate: " + predicate);
                }
                //predicate string starts with the attribute ordinal
                attr = Integer.parseInt(predicate.split("\\s+")[0]);
                InfoContentStat stat = predInfoContent.get(predicate);
                weightedInfoContent += stat.processStat(isAlgoEntropy) * stat.getTotalCount();
                totalCount += stat.getTotalCount();
            }
            //average info content across splits
            double avInfoContent = weightedInfoContent / totalCount;
            if (spltSelStrategy.equals(SPLIT_SEL_BEST)) {
                //pick split with minimum info content
                if (avInfoContent < minInfoContent) {
                    minInfoContent = avInfoContent;
                    selectedSplit = splitId;
                    selectedSplitAttr = attr;
                    if (debugOn) {
                        System.out.println("selectedSplit: " + selectedSplit + " selectedSplitAttr: " + selectedSplitAttr +
                            " minInfoContent: " + minInfoContent);
                    }
                }
            } else if (spltSelStrategy.equals(SPLIT_SEL_RANDOM_TOP)) {
                //collect all splits; selection happens after the loop
                splits.put(avInfoContent, new Pair<String, Integer>(splitId, attr));
            } else {
                throw new IllegalStateException("ivalid split slection strategy");
            }
        }
        //select randomly from top k splits
        if (spltSelStrategy.equals(SPLIT_SEL_RANDOM_TOP)) {
            Pair<String, Integer> split = selectRandomSplitFromTop(splits);
            selectedSplit = split.getLeft();
            selectedSplitAttr = split.getRight();
        }
        //expand based on selected split
        Map<String, InfoContentStat> predInfoContent = splitInfoContent.get(selectedSplit);
        if (debugOn) {
            System.out.println("selected split: " + selectedSplit + " selected attribute: " +selectedSplitAttr );
        }
        //parent predicates
        List<DecisionPathList.DecisionPathPredicate> parentPredicates =
            DecisionPathList.DecisionPathPredicate.createPredicates(parentPath, schema);
        //generate new path based on predicates of selected split
        FeatureField field = schema.findFieldByOrdinal(selectedSplitAttr);
        for (String predicateStr : predInfoContent.keySet()) {
            if (debugOn) {
                System.out.println("predicate in selected split: " + predicateStr );
            }
            //parse predicate according to the attribute's type
            DecisionPathList.DecisionPathPredicate predicate = null;
            if (field.isInteger()) {
                predicate = DecisionPathList.DecisionPathPredicate.createIntPredicate(predicateStr);
            } else if (field.isDouble()) {
                predicate = DecisionPathList.DecisionPathPredicate.createDoublePredicate(predicateStr);
            } else if (field.isCategorical()) {
                predicate = DecisionPathList.DecisionPathPredicate.createCategoricalPredicate(predicateStr);
            }
            //append new predicate to parent predicate list
            predicates = new ArrayList< DecisionPathList.DecisionPathPredicate>();
            predicates.addAll(parentPredicates);
            predicates.add(predicate);
            //create new decision path, marking it stopped if the stopping strategy says so
            InfoContentStat stat = predInfoContent.get(predicateStr);
            boolean toBeStopped = pathStoppingStrategy.shouldStop(stat, parentStat, parentPredicates.size() + 1);
            DecisionPathList.DecisionPath decPath = new DecisionPathList.DecisionPath(predicates, stat.getTotalCount(),
                stat.getStat(), toBeStopped, stat.getClassValPr());
            newDecPathList.addDecisionPath(decPath);
        }
    }
    //save new decision path list
    writeDecisioList(newDecPathList, "dtb.decision.file.path.out", context.getConfiguration() );
}
/**
 * Picks one split at random from the top splits with lowest average info content.
 * @param splits splits keyed by average info content, iterated in ascending order
 * @return the selected (split ID, attribute ordinal) pair
 */
private Pair<String, Integer> selectRandomSplitFromTop(Map<Double, Pair<String, Integer>> splits) {
    List<Pair<String, Integer>> best = new ArrayList<Pair<String, Integer>>();
    for (Map.Entry<Double, Pair<String, Integer>> entry : splits.entrySet()) {
        best.add(entry.getValue());
        if (best.size() == topSplitCount) {
            break;
        }
    }
    return BasicUtils.selectRandom(best);
}
/**
 * Regroups the accumulated class stats into a nested map:
 * parent path (split IDs stripped) -> split ID -> predicate -> stats.
 */
private void infoContentBySplit() {
    decPathsInfoContentBySplit.clear();
    for (Map.Entry<String, Map<String, InfoContentStat>> parentEntry : decPaths.entrySet()) {
        String parentPath = parentEntry.getKey();
        //parent path with the split IDs stripped off
        String cleanedParentPath = stripSplitIds(parentPath);
        Map<String, Map<String, InfoContentStat>> bySplit = decPathsInfoContentBySplit.get(cleanedParentPath);
        if (bySplit == null) {
            bySplit = new HashMap<String, Map<String, InfoContentStat>>();
            decPathsInfoContentBySplit.put(cleanedParentPath, bySplit);
        }
        //each child key has the form "splitId:predicate"
        for (Map.Entry<String, InfoContentStat> childEntry : parentEntry.getValue().entrySet()) {
            String[] parts = BasicUtils.splitOnFirstOccurence(childEntry.getKey(), SPLIT_DELIM , true);
            String splitId = parts[0];
            String predicate = parts[1];
            if (debugOn) {
                System.out.println("parentPath: " + parentPath + " splitId: " + splitId + " predicate: " + predicate);
            }
            Map<String, InfoContentStat> byPredicate = bySplit.get(splitId);
            if (byPredicate == null) {
                byPredicate = new HashMap<String, InfoContentStat>();
                bySplit.put(splitId, byPredicate);
            }
            byPredicate.put(predicate, childEntry.getValue());
        }
    }
}
/**
 * Removes the split ID prefix from every predicate of a decision path.
 * @param decPath decision path whose predicates carry "splitId:" prefixes
 * @return the same path with bare predicates
 */
private String stripSplitIds(String decPath) {
    String[] predicates = decPath.split(PRED_DELIM);
    String[] bare = new String[predicates.length];
    int index = 0;
    for (String predicate : predicates) {
        if (predicate.equals(ROOT_PATH)) {
            //the root label carries no split ID
            bare[index++] = predicate;
        } else {
            bare[index++] = BasicUtils.splitOnFirstOccurence(predicate, SPLIT_DELIM , true)[1];
        }
    }
    return BasicUtils.join(bare, PRED_DELIM);
}
/**
 * Finds the decision path object in the current tree matching a parent path.
 * @param parentPath serialized parent decision path
 * @return matching decision path, or null when no tree is loaded or nothing matches
 */
private DecisionPathList.DecisionPath findParentDecisionPath(String parentPath) {
    if (null == decPathList) {
        return null;
    }
    if (parentPath.equals(ROOT_PATH)) {
        //root is looked up by its label directly
        return decPathList.findDecisionPath(ROOT_PATH);
    }
    return decPathList.findDecisionPath(parentPath.split(decPathDelim));
}
/**
 * Serializes the decision path list as JSON to the file named by a config parameter.
 * @param newDecPathList decision path list to write
 * @param outFilePathParam config parameter holding the output file path
 * @param conf hadoop configuration
 * @throws IOException
 */
private void writeDecisioList(DecisionPathList newDecPathList, String outFilePathParam, Configuration conf )
    throws IOException {
    FSDataOutputStream ouStrm = FileSystem.get(conf).create(new Path(conf.get(outFilePathParam)));
    try {
        ObjectMapper mapper = new ObjectMapper();
        mapper.writeValue(ouStrm, newDecPathList);
        ouStrm.flush();
    } finally {
        //the original never closed the stream, leaking it; close() is safe even if
        //Jackson's auto-close already closed the target
        ouStrm.close();
    }
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
@Override
protected void reduce(Tuple key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
    int keySize = key.getSize();
    //use the shared predicate delimiter constant instead of a hard coded ";"
    key.setDelim(PRED_DELIM);
    decPath = key.toString();
    if (keySize > 1) {
        //tree exists: all but the last key element form the parent path
        parentDecPath = key.toString(0, keySize-1);
        childPath = key.getString(keySize-1);
    } else {
        //tree does not exist yet: single root element with a placeholder child
        parentDecPath = key.getString(0);
        childPath = CHILD_PATH;
    }
    //class stats container for all candidate children of this parent path
    Map<String, InfoContentStat> candidateChildrenPath = decPaths.get(parentDecPath);
    if (null == candidateChildrenPath) {
        candidateChildrenPath = new HashMap<String, InfoContentStat>();
        decPaths.put(parentDecPath, candidateChildrenPath);
    }
    //class stats for this child path
    InfoContentStat classStats = candidateChildrenPath.get(childPath);
    if (null == classStats) {
        classStats = new InfoContentStat();
        candidateChildrenPath.put(childPath, classStats);
    }
    for (Text value : values) {
        //accumulate class value counts
        //NOTE(review): fieldDelim is used as a regex here — fine for "," but verify for
        //delimiters containing regex meta characters
        classAttrValue = value.toString().split(fieldDelim)[classAttrOrdinal];
        classStats.incrClassValCount(classAttrValue);
        //emit the record prefixed with its decision path
        outVal.set(decPath + fieldDelim + value.toString());
        context.write(NullWritable.get(), outVal);
    }
}
}
/**
 * Entry point; delegates to ToolRunner and exits with the job status.
 * @param args command line arguments: input path, output path
 */
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new DecisionTreeBuilder(), args));
}
}