CardCountBySuitFilteredByPipAndColor.java example

Explorer

code-master
- hadoop
  - getting_started
    - src
      - main
        java
        demo
        cards
        drivers
        App.java
        RowCount.java
        mappers
        RecordMapper.java
        reducers
        NoKeyRecordCountReducer.java
  - groupbypip
    - src
      - main
        java
        cards
        groupbypip
        App.java
      - test
        java
        cards
        groupbypip
        AppTest.java
  - nyse
    - src
      - main
        java
        StockCompanyDistCacheJoinMapper
        StockCompanyDistCacheJoinMapper.java
        analytics
        nyse
        hive
        udfs
        DateTranslate.java
        cards
        CardCountBySuit.java
        filesystem
        CopyMerge.java
        GetFiles.java
        nyse
        RecordCount.java
        avgstockvolpermonth
        AvgStockVolPerMonthCombiner.java
        AvgStockVolPerMonthMapper.java
        AvgStockVolPerMonthReducer.java
        AvgStockVolumePerMonthDriver.java
        comparators
        FirstFieldLongPairGroupingComparator.java
        LongPairPrimitiveGroupingComparator.java
        LongPairPrimitiveSortingComparator.java
        counters
        NoTradeDays.java
        keyvalues
        LongPair.java
        LongPairPrimitive.java
        NYSEWritable.java
        TextPair.java
        parsers
        CompanyParser.java
        NYSEParser.java
        partitioners
        FirstKeyLongPairPartitioner.java
        FirstKeyTextPairPartitioner.java
        SecondKeyTextPairPartitioner.java
        stockcompanyjoin
        distcache
        StockCompanyJoinDistCacheDriver.java
        StockCompanyJoinDistCacheMapper.java
        topthreestocksbyvolume
        TopThreeStocksByVolumePerDayCompressDriver.java
        TopThreeStocksByVolumePerDayDriver.java
        TopThreeStocksByVolumePerDayMapper.java
        TopThreeStocksByVolumePerDayReducer.java
        totalvolume
        TotalVolumePerYearPerStock.java
      - test
        java
        analytics
        nyse
        AppTest.java
  - performance_tuning
    - src
      - main
        java
        demo
        cards
        drivers
        App.java
        CardCountByPip.java
        CardCountBySuitFilteredByPipAndColor.java
        CompressionCopy.java
        DistributeCardsByPip.java
        RowCount.java
        RowCountCombinedFileInputFormat.java
        mappers
        RecordMapper.java
        parsers
        CardParser.java
        reducers
        NoKeyRecordCountReducer.java
- nosql
  - dynamo
    - src
  - hbase
    - src
      - main
        java
        hbase
        GettingStarted.java
        NyseLoad.java
        NyseParser.java

package demo.cards.drivers;

/**
 * Created with IntelliJ IDEA.
 * User: cloudera
 * Date: 9/22/13
 * Time: 1:27 PM
 * To change this template use File | Settings | File Templates.
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import demo.cards.parsers.CardParser;

import java.io.IOException;

/* Demonstrates how to design and develop an efficient MapReduce program for the query
 *   select suit, count(1) from deck where pip='J' and color='RED' group by suit;
 * You will learn how to use the fs package to eliminate unnecessary directories, the
 * significance of the setup method in the MapReduce life cycle, and how to pass
 * parameters from the command line to the map/reduce functions.
 * 
 * 1) Understand requirements
 * 2) Design directories and files (partition by pip)
 * 3) Eliminate unnecessary directories as part of job configuration using org.apache.hadoop.fs
 * 4) Pass on color as parameter to map function and filter out all non RED colored cards
 * 5) Review counters
 * 
 * Usage: hadoop jar performance_tuning.jar 
 * demo.cards.drivers.CardCountBySuitFilteredByPipAndColor 
 * /user/hduser/cards /user/hduser/output.cards 
 * J RED
 */

public class CardCountBySuitFilteredByPipAndColor extends Configured implements
		Tool {

	public static class Map extends
			Mapper<LongWritable, Text, Text, IntWritable> {
		private final static IntWritable one = new IntWritable(1);
		private CardParser parser = new CardParser();
		private String suit;

		private Configuration jobconf = null;

		public void setup(Context context) throws IOException,
				InterruptedException {
			this.jobconf = context.getConfiguration();
		}

		public void map(LongWritable key, Text value, Context output)
				throws IOException, InterruptedException {
			parser.parse(value.toString());

			String param = jobconf.get("param.color");

			if (parser.getColor().equals(param)) {
				suit = parser.getSuit();
				output.write(new Text(suit), one);
			}
		}
	}

	public static class Reduce extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		public void reduce(Text key, Iterable<IntWritable> values,
				Context output) throws IOException, InterruptedException {
			int sum = 0;
			while (values.iterator().hasNext()) {
				sum += values.iterator().next().get();
			}
			output.write(key, new IntWritable(sum));
		}
	}

	public int run(String[] args) throws IOException, InterruptedException,
			ClassNotFoundException {
		Configuration conf = getConf();

		conf.set("param.color", args[3]);

		Job job = Job.getInstance(conf, "select suit, count(1) from deck "
				+ "where pip='J' and color='RED' group by suit");

		job.setJarByClass(getClass());

		String pip = args[2];
		Path inputPath = new Path(args[0] + "/" + "pip=" + pip);

		FileInputFormat.setInputPaths(job, inputPath);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		job.setMapperClass(Map.class);
		// job.setGroupingComparatorClass(TextPair.Comparator.class);
		// job.setSortComparatorClass();
		// job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);

		job.setNumReduceTasks(2);

		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(
				new CardCountBySuitFilteredByPipAndColor(), args);
		System.exit(exitCode);

	}
}