Projection.java example

Explorer

chombo-master
- src
  - main
    - java
      - org
        chombo
        distance
        AttributeDistance.java
        AttributeDistanceAggregator.java
        AttributeDistanceSchema.java
        CharacterPairSimilarity.java
        CosineSimilarity.java
        DiceSimilarity.java
        DynamicVectorSimilarity.java
        InterRecordDistance.java
        JaccardSimilarity.java
        mr
        CatAttrDistByValueDiffMetric.java
        CategoricalAttrClassDistr.java
        CategoricalAttrDistrStats.java
        Combinator.java
        DuplicateRemover.java
        FeatureField.java
        FeatureFieldCollection.java
        FlatRecordExtractor.java
        FlatRecordExtractorFromJson.java
        FunctionalDependencyAnalyzer.java
        Histogram.java
        Joiner.java
        MissingValueCounter.java
        MultiJoiner.java
        MultiVarHistogram.java
        Normalizer.java
        NumericSorter.java
        NumericalAttrDistrStats.java
        NumericalAttrMedian.java
        NumericalAttrNormalizer.java
        NumericalAttrStats.java
        OutlierBasedDataValidation.java
        PatternBasedFieldExtractor.java
        PatternMatchedValueCounter.java
        Projection.java
        RankAggregator.java
        RecordSetBulkMutator.java
        RecordSetModifier.java
        RecordSimilarity.java
        RunningAggregator.java
        SeasonalCycleFinder.java
        SeasonalDetector.java
        SetMerger.java
        SimpleValidationChecker.java
        StringAttrStats.java
        TemporalAggregator.java
        TemporalFilter.java
        TextSorter.java
        TimeGapSequenceGenerator.java
        TimeSequenceFilter.java
        TokenCounter.java
        TopMatches.java
        Transformer.java
        UniqueCounter.java
        UniqueKeyAnalyzer.java
        ValidationChecker.java
        ValueCounter.java
        WeightedAverage.java
        redis
        RedisCache.java
        storm
        Cache.java
        GenericBolt.java
        GenericComponent.java
        GenericSpout.java
        MessageHolder.java
        MessageQueue.java
        PubSub.java
        RedisCache.java
        RedisPubSub.java
        RedisQueue.java
        transformer
        AttributeTransformer.java
        ContextAwareTransformer.java
        CustomTransformer.java
        CustomTransformerFactory.java
        DateTransformer.java
        JsonFieldExtractor.java
        MultiLineFlattener.java
        MultiLineJsonFlattener.java
        NumericTransformer.java
        RawAttribute.java
        RawAttributeSchema.java
        ScalarGeneratingTransformer.java
        StringTransformer.java
        TransformerFactory.java
        UnstructuredFieldExtractor.java
        util
        Attribute.java
        AttributeFilter.java
        AttributePredicate.java
        AttributeSchema.java
        AttributeZscoreFilter.java
        AverageValue.java
        BaseAttribute.java
        BasicUtils.java
        BigTupleList.java
        BinCount.java
        BinaryCategoryCreator.java
        CategoricalHistogramStat.java
        CategoricalSampler.java
        CategoricalSet.java
        ClassImbalancedSampler.java
        ConfigUtility.java
        ConfigurationLoader.java
        Domain.java
        DoubleAttributePredicate.java
        DoubleRange.java
        DoubleTable.java
        DuplicateRemover.java
        DynamicBean.java
        Entity.java
        ExponentialDistribution.java
        FeatureCount.java
        FeatureSchema.java
        GenericAttributeSchema.java
        HistogramStat.java
        IncrementalSimpleStat.java
        IntAttributePredicate.java
        IntPair.java
        IntRange.java
        LongRunningStats.java
        MathUtility.java
        Matrix.java
        MedianStatsManager.java
        NamedObject.java
        NamedObjectTree.java
        NumericalAttrStatsManager.java
        NumericalSampler.java
        Pair.java
        PoisonDistr.java
        ProbabilityDistribution.java
        ProcessorAttribute.java
        ProcessorAttributeSchema.java
        RandomSampler.java
        RealtimeUtil.java
        RecencyWeightedHistogramStat.java
        RecencyWeightedStat.java
        RichAttribute.java
        RichAttributeSchema.java
        RowColumnFilter.java
        SeasonalAnalyzer.java
        SecondarySort.java
        SimpleStat.java
        SpatioTemporalPoint.java
        StandardNormalDistribution.java
        StatsParameters.java
        StringAttributePredicate.java
        TabularData.java
        TextInt.java
        TextLong.java
        TextPair.java
        TextTransformer.java
        TextTuple.java
        Triplet.java
        Tuple.java
        Utility.java
        validator
        CategoricalValidator.java
        CustomValidatorFactory.java
        GenericValidator.java
        InvalidData.java
        NumericalValidator.java
        StringValidator.java
        Validator.java
        ValidatorFactory.java

/*
 * chombo: Hadoop Map Reduce utility
 * Author: Pranab Ghosh
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.chombo.mr;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.chombo.redis.RedisCache;
import org.chombo.util.AttributeFilter;
import org.chombo.util.BasicUtils;
import org.chombo.util.RowColumnFilter;
import org.chombo.util.SecondarySort;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;

/**
 * Simple query with projection. Can do group by and order by.  If grouped by can do count or 
 * unique count. sum and avearge
 * @author pranab
 *
 */
public class Projection extends Configured implements Tool {

	@Override
	public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "Projection  and grouping  MR";
        job.setJobName(jobName);
        
        job.setJarByClass(Projection.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        Utility.setConfiguration(job.getConfiguration());
        String operation = job.getConfiguration().get("projection.operation",  "project");
        
        if (operation.startsWith("grouping")) {
        	//group by
            job.setMapperClass(Projection.ProjectionMapper.class);
            job.setReducerClass(Projection.ProjectionReducer.class);

            job.setMapOutputKeyClass(Tuple.class);
            job.setMapOutputValueClass(Text.class);

            int numReducer = job.getConfiguration().getInt("pro.num.reducer", -1);
            numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
            job.setNumReduceTasks(numReducer);
            
            //order by
        	boolean doOrderBy = job.getConfiguration().getInt("pro.orderBy.field", -1) >= 0;
        	if (doOrderBy) {
                job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
                job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        	}

        } else {
        	//simple projection
            job.setMapperClass(Projection.SimpleProjectionMapper.class);
        }
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        
        
        int status =  job.waitForCompletion(true) ? 0 : 1;
        return status;
	}
	
	/**
	 * @author pranab
	 *
	 */
	public static class SimpleProjectionMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
		private Text outVal = new Text();
		private int  keyField;
		private int[]  projectionFields;
        private String fieldDelimRegex;
        private String fieldDelimOut;
        private AttributeFilter attrFilter;
        private RowColumnFilter rowColFilter = new RowColumnFilter();
        private boolean idIncluded;

        protected void setup(Context context) throws IOException, InterruptedException {
        	Configuration config = context.getConfiguration();
        	keyField = config.getInt("pro.key.field", 0);
        	fieldDelimRegex = config.get("field.delim.regex", ",");
        	fieldDelimOut = config.get("field.delim", ",");
        	String fileterFieldDelimRegex = config.get("pro.filter.field.delim.regex", ",");
        	
        	//projection
        	projectionFields = Utility.intArrayFromString(config.get("pro.projection.field"),fieldDelimRegex );
        	if (null == projectionFields) {
        		//projected field from the output of another MR
        		projectionFields = findIncludedColumns(config, rowColFilter);
        	}
        	idIncluded = config.getBoolean("pro.id.incuded.in.projection", true);
        	
        	
        	//selection
        	String selectFilter = config.get("pro.select.filter");
        	if (null != selectFilter) {
        		String notInSetName = config.get("pro.operator.notin.set.name");
        		if (null == notInSetName) {
        			attrFilter = new AttributeFilter(selectFilter);
        		} else {
               		//bulk data for in or notin operator
        			attrFilter =  new AttributeFilter();
            		createExcludedRowsContext( config,  rowColFilter,attrFilter, selectFilter);
            	}
        	} 
       }
        
        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
            String[] items  =  value.toString().split(fieldDelimRegex, -1);
            if (null == attrFilter || attrFilter.evaluate(items)) {
            	if (idIncluded) {
            		outVal.set(Utility.extractFields(items , projectionFields, fieldDelimOut, false));
            	} else {
            		outVal.set(items[keyField] + fieldDelimOut +  Utility.extractFields(items , projectionFields, 
            				fieldDelimOut, false));
            	}
            	context.write(NullWritable.get(), outVal);
            }
        }
	}
	
	/**
	 * @author pranab
	 *
	 */
	public static class ProjectionMapper extends Mapper<LongWritable, Text, Tuple, Text> {
		private Tuple outKey = new Tuple();
		private Text outVal = new Text();
		private int  keyField;
		private int[]  projectionFields;
        private String fieldDelimRegex;
        private String fieldDelimOut;
        private int orderByField;
        private boolean groupBy;
        private boolean isOrderByFieldNumeric;
        private AttributeFilter attrFilter;
        private RowColumnFilter rowColFilter = new RowColumnFilter();
        
        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
        	Configuration config = context.getConfiguration();
        	String operation = config.get("pro.projection.operation",  "project");
        	groupBy = operation.startsWith("grouping");
        	
        	keyField = config.getInt("pro.key.field", 0);
        	fieldDelimRegex = config.get("field.delim.regex", ",");
        	fieldDelimOut = config.get("field.delim.out", ",");
        	projectionFields = Utility.intArrayFromString(config.get("pro.projection.field"),fieldDelimRegex );
        	if (null == projectionFields) {
        		//projected field from the output of another MR
        		projectionFields = findIncludedColumns(config, rowColFilter);
        	}
        	
        	//order by
        	orderByField = config.getInt("pro.orderBy.field", -1);
        	isOrderByFieldNumeric = config.getBoolean("pro.orderBy.filed.numeric", false);
        	
        	//selection
        	String selectFilter = config.get("pro.select.filter");
        	if (null != selectFilter) {
        		String notInSetName = config.get("pro.operator.notin.set.name");
        		if (null == notInSetName) {
        			attrFilter = new AttributeFilter(selectFilter);
        		} else {
               		//bulk data for in or notin operator
        			attrFilter =  new AttributeFilter();
            		createExcludedRowsContext( config,  rowColFilter,attrFilter, selectFilter);
            	}
        	}
       }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
            String[] items  =  value.toString().split(fieldDelimRegex, -1);
            if (null == attrFilter || attrFilter.evaluate(items)) {
	        	outKey.initialize();
	            if (orderByField >= 0) {
	            	//group by and order by
	            	if (isOrderByFieldNumeric) {
	               		outKey.add(items[keyField],Double.parseDouble( items[orderByField]));
	            	} else {
	            		outKey.add(items[keyField], items[orderByField]);
	            	}
	            } else {
	            	//group by
	            	outKey.add(items[keyField]);
	            }
	        	outVal.set( Utility.extractFields(items , projectionFields, fieldDelimOut, false));
	        	context.write(outKey, outVal);
            }
        }
	}
	
    /**
     * @author pranab
     *
     */
    public static class ProjectionReducer extends Reducer<Tuple, Text, NullWritable, Text> {
		private Text outVal = new Text();
		private StringBuilder stBld =  new StringBuilder();;
		private String fieldDelim;
		private RedisCache redisCache;
		private String aggregateValueKeyPrefix;
		private String[] aggrFunctions;
		private int[] aggrFunctionValues;
		private int[] aggrFunctionValuesMax;
		private List<String> strValues = new ArrayList<String>();
		private List<Integer> intValues = new ArrayList<Integer>();
		private Set<String> strValuesSet = new HashSet<String>();
		private int sum;
		private int sqSum;
		private  boolean sortOrderAscending;
		private List<String> sortedValues = new ArrayList<String>();
		private int limitTo;
		private boolean formatCompact;
		private int averageFunctionIndex;
		private double  stdDev;
		private boolean useRank;
		
		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		protected void setup(Context context) throws IOException, InterruptedException {
        	Configuration config = context.getConfiguration();
        	fieldDelim = config.get("field.delim.out", "[]");
        	if (!StringUtils.isBlank(config.get("pro.agrregate.fumctions"))) {
        		aggrFunctions = config.get("pro.agrregate.fumctions").split(fieldDelim);
        		aggrFunctionValues = new int[aggrFunctions.length];
        		aggrFunctionValuesMax = new int[aggrFunctions.length];
        		for (int i = 0; i < aggrFunctionValuesMax.length;  ++i) {
        			aggrFunctionValuesMax[i] = Integer.MIN_VALUE;
        		}
				aggregateValueKeyPrefix = config.get("pro.aggregate.value.key.prefix");
	        	redisCache = RedisCache.createRedisCache(config, "ch");
        	}
        	sortOrderAscending = config.getBoolean("pro.sort.order.ascending", true);
        	limitTo = config.getInt("pro.limit.to", -1);
        	formatCompact = config.getBoolean("pro.format.compact", true);
        	useRank = config.getBoolean("pro.use.rank", false);
       }

		/* (non-Javadoc)
		 * @see org.apache.hadoop.mapreduce.Reducer#cleanup(org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		protected void cleanup(Context context) throws IOException, InterruptedException {
			if (null != aggrFunctions) {
				for (int i = 0; i < aggrFunctions.length; ++i) {
					redisCache.put(aggregateValueKeyPrefix + "." +aggrFunctions[i] ,  "" + aggrFunctionValuesMax[i], true);
				}
			}
		}
		
    	/* (non-Javadoc)
    	 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
    	 */
    	protected void reduce(Tuple key, Iterable<Text> values, Context context)
        	throws IOException, InterruptedException {
    		if (null != aggrFunctions) {
    			//aggregate functions
        		stBld.delete(0, stBld.length());
        		stBld.append(key.getString(0));

        		strValues.clear();
    			intValues.clear();
    			strValuesSet.clear();
    			sum = 0;
    			sqSum = 0;
    			averageFunctionIndex = -1;
	        	for (Text value : values){
	        		strValues.add(value.toString());
	        	}    			
	        	
	        	//all aggregate functions
	        	for (int i = 0; i <  aggrFunctions.length; ++i) {
	        		if (aggrFunctions[i].equals("count")) {
	        			//count
	        			aggrFunctionValues[i] = strValues.size(); 
	        		} else if (aggrFunctions[i].equals("uniqueCount")) {
	        			//unique count
	        			for (String stVal : strValues) {
	        				strValuesSet.add(stVal);
	        			}
	        			aggrFunctionValues[i] = strValuesSet.size(); 
	        		} else if (aggrFunctions[i].equals("sum")) {
	        			//sum
	        			doSum();
	        			aggrFunctionValues[i] = sum; 
	        		} else if (aggrFunctions[i].equals("average")) {
	        			//average
	        			if (sum == 0) {
		        			doSum();
	        			}
	        			aggrFunctionValues[i] = sum / intValues.size() ; 
	        		} else if (aggrFunctions[i].equals("stdDev")) {
	        			//standard deviation
	        			if (averageFunctionIndex < 0) {
	        				throw new IllegalStateException("average aggregate function must be included for std deviation");
	        			}
		        		doSqSum();
	        			stdDev = (double)sqSum / intValues.size() -  (double)aggrFunctionValues[averageFunctionIndex] * 
	        					aggrFunctionValues[averageFunctionIndex];
	        			stdDev = Math.sqrt(stdDev);
	        			aggrFunctionValues[i] = (int)stdDev ; 
	        		}
	        	}
	        	for (int i = 0; i < aggrFunctionValues.length; ++i) {
	        		if (aggrFunctionValues[i] > aggrFunctionValuesMax[i]) {
	        			aggrFunctionValuesMax[i] =  aggrFunctionValues[i]; 
	        		}
	    	   		stBld.append(fieldDelim).append(aggrFunctionValues[i]);
	        	}
	        	outVal.set(stBld.toString());
				context.write(NullWritable.get(), outVal);
    		}  else {
       			//actual values
    			 if (formatCompact) {
    				emitCompactFormat( key,  values, context);
    			} else {
    				emitLongFormat( key,  values, context);
    			}
    		}
    	}

    	/**
    	 * emits actual values in compact format
    	 * @param key
    	 * @param values
    	 * @param context
    	 * @throws InterruptedException 
    	 * @throws IOException 
    	 */
    	private void emitCompactFormat(Tuple key, Iterable<Text> values, Context context) 
    		throws IOException, InterruptedException {
			//actual values
    		stBld.delete(0, stBld.length());
    		stBld.append(key.getString(0));
			if (sortOrderAscending) {
				int i = 0;
	        	for (Text value : values){
	        		if (i == limitTo) {
	        			break;
	        		}
	    	   		stBld.append(fieldDelim).append(value);
	    	   		++i;
	        	}    		
			} else {
				sortedValues.clear();
	        	for (Text value : values){
	        		sortedValues.add(value.toString());
	        	}    	
	        	
	        	//reverse order
				int i = 0;
	        	for (int j = sortedValues.size() -1; j >= 0; --j) {
	        		if (i == limitTo) {
	        			break;
	        		}
	    	   		stBld.append(fieldDelim).append(sortedValues.get(j));
	    	   		++i;
	        	}
			}
	       	outVal.set(stBld.toString());
			context.write(NullWritable.get(), outVal);
    	}
    	
    	/**
    	 * emits actual values in long format
    	 * @param key
    	 * @param values
    	 * @param context
    	 * @throws InterruptedException 
    	 * @throws IOException 
    	 */
    	private void emitLongFormat(Tuple key, Iterable<Text> values, Context context) 
    		throws IOException, InterruptedException {
			//actual values
 			if (sortOrderAscending) {
				int i = 0;
	        	for (Text value : values){
	        		if (i == limitTo) {
	        			break;
	        		}
	        		stBld.delete(0, stBld.length());
	        		stBld.append(key.getString(0));
	        		if (useRank) {
	        			//rank
	        			stBld.append(fieldDelim).append(i+1);
	        		} else {
	        			//actual value
	        			stBld.append(fieldDelim).append(value);
	        		}
	    	       	outVal.set(stBld.toString());
	    			context.write(NullWritable.get(), outVal);
	    	   		++i;
	        	}    		
			} else {
				sortedValues.clear();
	        	for (Text value : values){
	        		sortedValues.add(value.toString());
	        	}    	
	        	
	        	//reverse order
				int i = 0;
	        	for (int j = sortedValues.size() -1; j >= 0; --j) {
	        		if (i == limitTo) {
	        			break;
	        		}
	        		stBld.delete(0, stBld.length());
	        		stBld.append(key.getString(0));
	        		if (useRank) {
	        			//rank
	        			stBld.append(fieldDelim).append(i+1);
	        		} else {
	        			//actual value
	        			stBld.append(fieldDelim).append(sortedValues.get(j));
	        		}
	    	       	outVal.set(stBld.toString());
	    			context.write(NullWritable.get(), outVal);
	    	   		++i;
	        	}
			}
    	}

    	/**
    	 * 
    	 */
    	private void doSum() {
			if (intValues.isEmpty()) {
				initializeIntValues();
			}
			for (int intVal : intValues) {
				sum += intVal;
			}
    	}

    	/**
    	 * 
    	 */
    	private void doSqSum() {
			if (intValues.isEmpty()) {
				initializeIntValues();
			}
			for (int intVal : intValues) {
				sqSum += intVal * intVal;
			}
    	}

    	/**
    	 * 
    	 */
    	private void initializeIntValues() {
			for (String stVal : strValues) {
				intValues.add(Integer.parseInt(stVal));
			}
    	}
    }
 
    /**
     * @param config
     * @param rowColFilter
     * @return
     * @throws IOException
     */
    public static int[] findIncludedColumns(Configuration config, RowColumnFilter rowColFilter) throws IOException {
		//projected field from the output of another MR
    	String fileterFieldDelimRegex = config.get("pro.filter.field.delim.regex", ",");
		InputStream colStream = Utility.getFileStream(config, "pro.exclude.columns.file");
		if (null == colStream) {
			throw new IllegalStateException("error aceesing excluded column file");
		}
		rowColFilter.processColumns(colStream, fileterFieldDelimRegex);
		int numCols = Utility.assertIntConfigParam(config, "pro.num.fields", "missing configuration for number of fields");
		return rowColFilter.getIncludedColOrdinals(numCols);
    }
    
    /**
     * @param config
     * @param rowColFilter
     * @param attrFilter
     * @throws IOException
     */
    public static void createExcludedRowsContext(Configuration config, RowColumnFilter rowColFilter, 
    		AttributeFilter attrFilter, String selectFilter) throws IOException {
    	String fileterFieldDelimRegex = config.get("pro.filter.field.delim.regex", ",");
		String notInSetName = config.get("pro.operator.notin.set.name");
		
		//notin operator with out of band set values
		InputStream rowStream = Utility.getFileStream(config, "pro.exclude.rows.file");
		if (null == rowStream) {
			throw new IllegalStateException("error aceesing excluded row file");
		}
		rowColFilter.processRows(rowStream, fileterFieldDelimRegex);
		String[] exclRowKeys = rowColFilter.getExcludedRowKeys();
		Map<String, Object> setOpContext = new HashMap<String, Object>();
		setOpContext.put(notInSetName, BasicUtils.generateSetFromArray(exclRowKeys));
		attrFilter.withContext(setOpContext).build(selectFilter);;
    }    
    
	/**
	 * @param args
	 */
	public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Projection(), args);
        System.exit(exitCode);
	}

}