/*
* Copyright © 2014-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.fileset;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.regex.Pattern;
/**
 * A simple word counter implemented as a MapReduce program.
 *
 * <p>By default it reads its input from the "lines" FileSet and writes its output to the
 * "counts" FileSet. The dataset names can be overridden with runtime arguments:
 * <ul>
 *   <li>"input" names the dataset to read from (defaults to "lines").</li>
 *   <li>"output" names the dataset to write to (defaults to "counts").</li>
 * </ul>
 *
 * <p>For the default datasets, the input and output path can be configured as runtime arguments:
 * <ul>
 *   <li>"dataset.lines.input.paths" for the input. Multiple paths can be given, separated by commas.</li>
 *   <li>"dataset.counts.output.path" for the output.</li>
 * </ul>
 */
public class WordCount extends AbstractMapReduce {

  /**
   * Declares the resources requested for the map and the reduce tasks.
   */
  @Override
  public void configure() {
    setMapperResources(new Resources(1024));
    setReducerResources(new Resources(1024));
  }

  /**
   * Wires the mapper and reducer into the Hadoop job and registers the input and output datasets.
   *
   * @param context the context of this MapReduce run; supplies the Hadoop job and runtime arguments
   * @throws Exception if the job cannot be configured
   */
  @Override
  public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(Tokenizer.class);
    job.setReducerClass(Counter.class);
    // A single reduce task is configured for this job.
    job.setNumReduceTasks(1);

    String inputDataset = argumentOrDefault(context, "input", "lines");
    String outputDataset = argumentOrDefault(context, "output", "counts");
    context.addInput(Input.ofDataset(inputDataset));
    context.addOutput(Output.ofDataset(outputDataset));
  }

  /**
   * Looks up a runtime argument, falling back to a default when the argument is not set.
   *
   * @param context the context holding the runtime arguments
   * @param name the name of the runtime argument
   * @param fallback the value to return if the argument is absent
   * @return the argument's value, or {@code fallback} if it is {@code null}
   */
  private static String argumentOrDefault(MapReduceContext context, String name, String fallback) {
    String value = context.getRuntimeArguments().get(name);
    return value != null ? value : fallback;
  }

  /**
   * A mapper that tokenizes each input line on whitespace and emits each non-empty token
   * with a value of 1.
   */
  public static class Tokenizer extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);

    // Compiled once; splitting on any run of whitespace treats tabs and repeated spaces
    // as a single separator instead of producing empty or tab-containing tokens.
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    // The same Text instance is reused for every emitted token.
    private final Text word = new Text();

    /**
     * Emits {@code (token, 1)} for every non-empty whitespace-separated token of the line.
     *
     * @param key the key of the input record; not used
     * @param data the line of text to tokenize
     * @param context the context used to emit the output pairs
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(LongWritable key, Text data, Context context)
      throws IOException, InterruptedException {
      for (String token : WHITESPACE.split(data.toString())) {
        // Blank lines and leading whitespace yield an empty first element; it is not a word.
        if (token.isEmpty()) {
          continue;
        }
        word.set(token);
        context.write(word, ONE);
      }
    }
  }

  /**
   * A reducer that sums up the counts for each key.
   */
  public static class Counter extends Reducer<Text, IntWritable, String, Long> {

    /**
     * Adds up all values received for a key and emits the key (as a String) with the total.
     *
     * @param key the word being counted
     * @param values the partial counts for the word
     * @param context the context used to emit the total
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
      // Accumulate in a long so that many int counts cannot overflow an int total.
      long sum = 0L;
      for (IntWritable value : values) {
        sum += value.get();
      }
      context.write(key.toString(), sum);
    }
  }
}