/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.clicksandviews;
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* A MapReduce program that reads from a CLICKS stream and a VIEWS stream. It performs a join across these two streams,
* based upon the viewId of the records. It then keeps track of how many clicks a particular view resulted in.
*/
public class ClicksAndViewsMapReduce extends AbstractMapReduce {
static final String NAME = "ClicksAndViewsMapReduce";
private static final Joiner TAB_JOINER = Joiner.on("\t");
@Override
public void configure() {
setName(NAME);
setMapperResources(new Resources(1024));
setReducerResources(new Resources(1024));
}
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
context.addInput(Input.ofStream(ClicksAndViews.VIEWS));
PartitionedFileSet joinedPFS = context.getDataset(ClicksAndViews.JOINED);
PartitionKey outputPartitionKey =
PartitionedFileSetArguments.getOutputPartitionKey(context.getRuntimeArguments(), joinedPFS.getPartitioning());
if (outputPartitionKey == null) {
outputPartitionKey = PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime()).build();
}
Map<String, String> outputArgs = new HashMap<>();
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputPartitionKey);
context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, outputArgs));
Job job = context.getHadoopJob();
job.setMapperClass(ImpressionKeyingMapper.class);
job.setReducerClass(JoiningReducer.class);
}
/**
* A Mapper which tags all of the records with the source that it is coming from.
* It also keys all of the output records with the viewId, so that a single reducer gets all of the records
* for a particular viewId.
*/
public static class ImpressionKeyingMapper extends Mapper<LongWritable, Text, LongWritable, Text>
implements ProgramLifecycle<MapReduceTaskContext<LongWritable, Text>> {
private String inputName;
@Override
public void initialize(MapReduceTaskContext<LongWritable, Text> context) throws Exception {
inputName = context.getInputName();
Preconditions.checkNotNull(inputName);
Preconditions.checkArgument(ClicksAndViews.CLICKS.equals(inputName) || ClicksAndViews.VIEWS.equals(inputName));
}
@Override
public void destroy() {
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] parts = value.toString().split("\t");
// tag each record with the input name, so the reducer is simpler
context.write(new LongWritable(Long.valueOf(parts[0])), new Text(TAB_JOINER.join(inputName, value.toString())));
}
}
/**
* Reducer class which looks at all the records for the viewId.
* It outputs the view record, while appending the count of how many click records there are for it.
*/
public static class JoiningReducer extends Reducer<LongWritable, Text, NullWritable, String> {
@Override
public void reduce(LongWritable key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
String viewData = null;
int totalClicks = 0;
for (Text value : values) {
String[] parts = value.toString().split("\t", 2);
String source = parts[0];
if (ClicksAndViews.CLICKS.equals(source)) {
totalClicks += 1;
} else if (ClicksAndViews.VIEWS.equals(source)) {
viewData = parts[1];
}
}
Preconditions.checkNotNull(viewData);
// the viewId (which we key on from the mapper output) is already in the viewData
context.write(NullWritable.get(), TAB_JOINER.join(viewData, totalClicks));
}
}
}