/*
 * Copyright © 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.test.app;

import co.cask.cdap.api.annotation.Output;
import co.cask.cdap.api.annotation.ProcessInput;
import co.cask.cdap.api.annotation.UseDataSet;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.stream.Stream;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.flow.AbstractFlow;
import co.cask.cdap.api.flow.flowlet.AbstractFlowlet;
import co.cask.cdap.api.flow.flowlet.Callback;
import co.cask.cdap.api.flow.flowlet.FailurePolicy;
import co.cask.cdap.api.flow.flowlet.FailureReason;
import co.cask.cdap.api.flow.flowlet.InputContext;
import co.cask.cdap.api.flow.flowlet.OutputEmitter;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.service.BasicService;
import co.cask.cdap.api.service.http.AbstractHttpServiceHandler;
import co.cask.cdap.api.service.http.HttpServiceRequest;
import co.cask.cdap.api.service.http.HttpServiceResponder;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.Map;
import java.util.StringTokenizer;
import javax.annotation.Nullable;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;

/**
 * Application that ingests text from a stream, counts words in a flow, exposes
 * the counts through an HTTP service, and computes totals with two MapReduce jobs.
 */
public class WordCountApp extends AbstractApplication {

  @Override
  public void configure() {
    setName("WordCountApp");
    addStream(new Stream("text"));
    addDatasetModule("my-kv", MyKeyValueTableDefinition.Module.class);
    createDataset("mydataset", "myKeyValueTable", DatasetProperties.EMPTY);
    createDataset("totals", "myKeyValueTable", DatasetProperties.EMPTY);
    addFlow(new WordCountFlow());
    addService(new BasicService("WordFrequency", new WordFrequencyHandler()));
    addMapReduce(new CountTotal());
    addMapReduce(new CountFromStream());
  }

  /**
   * Output object of the stream source.
   */
  public static final class MyRecord {
    private final String title;
    private final String text;
    private final boolean expired;

    public MyRecord(String title, String text, boolean expired) {
      this.title = title;
      this.text = text;
      this.expired = expired;
    }

    public String getTitle() {
      return title;
    }

    public String getText() {
      return text;
    }

    public boolean isExpired() {
      return expired;
    }

    @Override
    public String toString() {
      return "MyRecord{" +
        "title='" + title + '\'' +
        ", text='" + text + '\'' +
        ", expired=" + expired +
        '}';
    }
  }
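
  // Illustrative walk-through of the data path (the sample words and the
  // "Greeting" header value below are assumptions for illustration, not
  // fixtures used by the app): sending an event with body "Hello world" and
  // header title=Greeting to the "text" stream makes StreamSource emit a
  // MyRecord, Tokenizer split both fields on the delimiters [ .-], and
  // CountByField increment the keys "title:Greeting", "text:Hello", and
  // "text:world" in the "mydataset" table.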

  /**
   * Flow that counts words coming from the stream source.
   */
  public static class WordCountFlow extends AbstractFlow {
    @Override
    protected void configureFlow() {
      setName("WordCountFlow");
      setDescription("Flow for counting words");
      addFlowlet(new StreamSource());
      addFlowlet(new Tokenizer());
      addFlowlet(new CountByField());
      connectStream("text", "StreamSource");
      connect("StreamSource", "Tokenizer");
      connect("Tokenizer", "CountByField");
    }
  }

  /**
   * Stream source flowlet for the word count flow.
   */
  public static final class StreamSource extends AbstractFlowlet {
    private OutputEmitter<MyRecord> output;
    private Metrics metrics;

    @ProcessInput
    public void process(StreamEvent event, InputContext context) throws CharacterCodingException {
      // Only process events that originate from the "text" stream.
      if (!"text".equals(context.getOrigin())) {
        return;
      }
      metrics.count("stream.event", 1);

      ByteBuffer buf = event.getBody();
      output.emit(new MyRecord(
        event.getHeaders().get("title"),
        buf == null ? null : Charsets.UTF_8.newDecoder().decode(buf).toString(),
        false));
    }
  }

  /**
   * Tokenizer flowlet for the word count flow.
   */
  public static class Tokenizer extends AbstractFlowlet {
    @Output("field")
    private OutputEmitter<Map<String, String>> outputMap;

    // Deliberately fail on the first record to exercise the flow's retry handling.
    private boolean error = true;

    @ProcessInput
    public void process(MyRecord data) {
      tokenize(data.getTitle(), "title");
      tokenize(data.getText(), "text");
      if (error) {
        error = false;
        throw new IllegalStateException(data.toString());
      }
    }

    private void tokenize(String str, String field) {
      if (str == null) {
        return;
      }
      final String delimiters = "[ .-]";
      for (String token : str.split(delimiters)) {
        outputMap.emit(ImmutableMap.of("field", field, "word", token));
      }
    }
  }

  /**
   * Flowlet that counts words and stores the counts in a table.
   */
  public static class CountByField extends AbstractFlowlet implements Callback {
    @UseDataSet("mydataset")
    private MyKeyValueTableDefinition.KeyValueTable counters;

    @ProcessInput("field")
    public void process(Map<String, String> fieldToken) {
      String token = fieldToken.get("word");
      if (token == null) {
        return;
      }
      // Prefix the word with its field name, e.g. "text:hello".
      String field = fieldToken.get("field");
      if (field != null) {
        token = field + ":" + token;
      }

      long current = Long.parseLong(counters.get(token, "0"));
      counters.put(token, String.valueOf(current + 1));
    }

    @Override
    public void onSuccess(@Nullable Object input, @Nullable InputContext inputContext) {
      // No-op.
    }

    @Override
    public FailurePolicy onFailure(@Nullable Object input, @Nullable InputContext inputContext,
                                   FailureReason reason) {
      return FailurePolicy.RETRY;
    }
  }

  /**
   * Service handler to query word counts.
   */
  public static class WordFrequencyHandler extends AbstractHttpServiceHandler {
    @UseDataSet("mydataset")
    private MyKeyValueTableDefinition.KeyValueTable counters;

    @UseDataSet("totals")
    private MyKeyValueTableDefinition.KeyValueTable totals;

    @GET
    @Path("wordfreq/{word}")
    public void wordfreq(HttpServiceRequest request, HttpServiceResponder responder,
                         @PathParam("word") String word) throws IOException {
      Map<String, Long> result = ImmutableMap.of(word, Long.valueOf(counters.get(word, "0")));
      responder.sendJson(result);
    }

    @GET
    @Path("total")
    public void total(HttpServiceRequest request, HttpServiceResponder responder) throws IOException {
      // Default to "0" so the endpoint does not fail before the MapReduce job has run.
      long result = Long.parseLong(totals.get("total_words_count", "0"));
      responder.sendJson(result);
    }

    @GET
    @Path("stream_total")
    public void streamTotal(HttpServiceRequest request, HttpServiceResponder responder) throws IOException {
      long result = Long.parseLong(totals.get("stream_total_words_count", "0"));
      responder.sendJson(result);
    }
  }
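
  // Illustrative queries against the WordFrequency service. The URL shape is an
  // assumption based on CDAP's v3 REST conventions and may differ by version;
  // the word "hello" and the counts shown are made-up examples:
  //
  //   GET /v3/namespaces/default/apps/WordCountApp/services/WordFrequency/methods/wordfreq/hello
  //     -> {"hello": 3}
  //   GET /v3/namespaces/default/apps/WordCountApp/services/WordFrequency/methods/total
  //     -> 42
  //
  // The "total" and "stream_total" endpoints only return non-zero values after
  // the countTotal / countFromStream MapReduce jobs have written to "totals".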

  /**
   * MapReduce job that sums all word counts in "mydataset" and writes the
   * result to the "totals" dataset.
   */
  public static class CountTotal extends AbstractMapReduce {
    @Override
    public void configure() {
      setName("countTotal");
      setInputDataset("mydataset");
      setOutputDataset("totals");
    }

    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
      Job job = context.getHadoopJob();
      job.setMapperClass(MyMapper.class);
      job.setReducerClass(MyReducer.class);
    }

    /**
     * Mapper that emits every stored count under the single key "total".
     */
    public static class MyMapper extends Mapper<String, String, BytesWritable, LongWritable> {
      @Override
      protected void map(String key, String value, Context context)
        throws IOException, InterruptedException {
        context.write(new BytesWritable(Bytes.toBytes("total")), new LongWritable(Long.parseLong(value)));
      }
    }

    /**
     * Reducer that sums all counts and stores the result under "total_words_count".
     */
    public static class MyReducer extends Reducer<BytesWritable, LongWritable, String, String> {
      @Override
      protected void reduce(BytesWritable key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
        long total = 0;
        for (LongWritable longWritable : values) {
          total += longWritable.get();
        }
        context.write("total_words_count", String.valueOf(total));
      }
    }
  }

  /**
   * MapReduce job that performs a word count directly on the stream data.
   */
  public static final class CountFromStream extends AbstractMapReduce {

    @Override
    public void configure() {
      setName("countFromStream");
      useStreamInput("text");
      setOutputDataset("totals");
    }

    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
      Job job = context.getHadoopJob();
      job.setMapperClass(StreamMapper.class);
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(LongWritable.class);
      job.setReducerClass(StreamReducer.class);
    }

    /**
     * Mapper for the count from stream. The input key is the event timestamp
     * and the value is the event body.
     */
    public static final class StreamMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
      private static final Text TOTAL = new Text("total");

      @Override
      protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
        // Count the whitespace-separated tokens in the event body.
        StringTokenizer itr = new StringTokenizer(value.toString());
        long total = 0;
        while (itr.hasMoreTokens()) {
          total++;
          itr.nextToken();
        }
        context.write(TOTAL, new LongWritable(total));
      }
    }

    /**
     * Reducer that sums the per-event counts and stores the result under
     * "stream_total_words_count".
     */
    public static final class StreamReducer extends Reducer<Text, LongWritable, String, String> {
      @Override
      protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable val : values) {
          sum += val.get();
        }
        context.write("stream_total_words_count", String.valueOf(sum));
      }
    }
  }
}
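
// A minimal sketch of exercising this app from a CDAP unit test, assuming the
// cdap-unit-test TestBase API. Method names such as getFlowManager and
// getStreamManager vary across CDAP versions, so treat this as an outline
// rather than exact API:
//
//   ApplicationManager app = deployApplication(WordCountApp.class);
//   app.getFlowManager("WordCountFlow").start();
//   getStreamManager("text").send(ImmutableMap.of("title", "Greeting"), "Hello world");
//   app.getServiceManager("WordFrequency").start();
//   // ...then GET wordfreq/Hello from the service URL and assert on the count.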