/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
 */

package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.Resources;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.stream.StreamBatchReadable;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.workflow.Value;
import co.cask.cdap.api.workflow.WorkflowToken;
import com.google.common.annotations.VisibleForTesting;
import com.google.gson.Gson;
import com.google.gson.annotations.SerializedName;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.UUID;

/**
* MapReduce program that dumps events from a stream to a dataset.
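 * <p>
 * The application can add this program more than once under different names, for example
 * (illustrative): <pre>addMapReduce(new StreamToDataset(WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME));</pre>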
*/
public class StreamToDataset extends AbstractMapReduce {
private static final Logger LOG = LoggerFactory.getLogger(StreamToDataset.class);
private final String name;
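  /**
   * @param name the program name; parameterized so that the application can add this MapReduce
   *             multiple times, under different names, for different input streams.
   */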
public StreamToDataset(String name) {
this.name = name;
  }

  @Override
public void configure() {
setName(name);
setDescription("A MapReduce program that dumps events from a stream to a dataset.");
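    // Give each map task 512 MB of memory.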
setMapperResources(new Resources(512));
}
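  /**
   * Selects the mapper, input stream, and output dataset at submission time. When this program
   * runs inside a workflow, the selection is driven by the result that the
   * {@link WikipediaPipelineApp#LIKES_TO_DATASET_MR_NAME} node recorded in the {@link WorkflowToken}.
   */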
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
Job job = context.getHadoopJob();
job.setNumReduceTasks(0);
WorkflowToken workflowToken = context.getWorkflowToken();
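    // Default to dumping the page-titles stream into the page-titles dataset.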
Class<? extends Mapper> mapper = PageTitleToDatasetMapper.class;
String inputStream = WikipediaPipelineApp.PAGE_TITLES_STREAM;
String outputDataset = WikipediaPipelineApp.PAGE_TITLES_DATASET;
if (workflowToken != null) {
Value likesToDatasetResult = workflowToken.get("result", WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
if (likesToDatasetResult != null && likesToDatasetResult.getAsBoolean()) {
// The "likes" stream to the dataset has already run and has been successful in this run so far.
// Now run raw wikipedia stream to dataset.
mapper = RawWikiDataToDatasetMapper.class;
inputStream = WikipediaPipelineApp.RAW_WIKIPEDIA_STREAM;
outputDataset = WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET;
}
}
LOG.info("Using '{}' as the input stream and '{}' as the output dataset.", inputStream, outputDataset);
job.setMapperClass(mapper);
StreamBatchReadable.useStreamInput(context, inputStream);
context.addOutput(outputDataset);
}
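  /**
   * Records whether this run succeeded in the {@link WorkflowToken}, so that subsequent workflow
   * nodes (such as condition predicates) can act on it.
   */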
@Override
public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
WorkflowToken workflowToken = context.getWorkflowToken();
if (workflowToken != null) {
workflowToken.put("result", Value.of(succeeded));
}
  }

  /**
* Mapper that dumps stream events to a {@link KeyValueTable}.
*/
  public static final class PageTitleToDatasetMapper extends Mapper<LongWritable, StreamEvent, byte[], byte[]> {
    private final Gson gson = new Gson();

    @Override
protected void map(LongWritable timestamp, StreamEvent streamEvent,
Context context) throws IOException, InterruptedException {
String contents = Bytes.toString(streamEvent.getBody());
Page page = gson.fromJson(contents, Page.class);
context.write(Bytes.toBytes(page.getId()), Bytes.toBytes(page.getName()));
      // Increment the same counter in every MapReduce program in the pipeline so that tests can
      // verify it through the WorkflowToken, and condition nodes can use it in their predicates
      // where applicable.
context.getCounter("custom", "num.records").increment(1);
}
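    /**
     * Gson model of a page-title event. The mapper expects each stream event body to be JSON of
     * the form (illustrative values): <pre>{"name": "Some Page", "id": "1234", "created_time": "1431418000"}</pre>
     */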
@VisibleForTesting
static class Page {
private final String name;
private final String id;
@SuppressWarnings("unused")
@SerializedName("created_time")
private final String createdTime;
Page(String name, String id, String createdTime) {
this.name = name;
this.id = id;
this.createdTime = createdTime;
}
public String getName() {
return name;
}
public String getId() {
return id;
}
}
  }

  /**
* Mapper that dumps raw Wikipedia data from a stream to a {@link KeyValueTable}.
*/
  public static final class RawWikiDataToDatasetMapper extends Mapper<LongWritable, StreamEvent, byte[], byte[]> {
@Override
protected void map(LongWritable key, StreamEvent streamEvent,
Context context) throws IOException, InterruptedException {
String contents = Bytes.toString(streamEvent.getBody());
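      // Raw Wikipedia events have no natural key, so generate a random UUID as the row key.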
context.write(Bytes.toBytes(UUID.randomUUID()), Bytes.toBytes(contents));
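      // Same verification counter as incremented by PageTitleToDatasetMapper.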
context.getCounter("custom", "num.records").increment(1);
}
}
}