/*
 * Copyright © 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.streamconversion;

import co.cask.cdap.api.Resources;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import com.google.common.collect.Maps;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * MapReduce job that reads events from a stream over a given time interval and writes the events out to a FileSet
 * in Avro format.
 */
public class StreamConversionMapReduce extends AbstractMapReduce {

  private static final Logger LOG = LoggerFactory.getLogger(StreamConversionMapReduce.class);
  private static final Schema SCHEMA = new Schema.Parser().parse(StreamConversionApp.SCHEMA_STRING);

  private final Map<String, String> dsArguments = Maps.newHashMap();

  @Override
  public void configure() {
    setDescription("Job to read a chunk of stream events and write them to a FileSet");
    setMapperResources(new Resources(512));
  }

  @Override
  public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(StreamConversionMapper.class);

    // map-only job: the mapper emits AvroKey<GenericRecord> records directly to the output
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, SCHEMA);

    // read 5 minutes of events from the stream, ending at the logical start time of this run
    long logicalTime = context.getLogicalStartTime();
    context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));

    // each run writes its output to a partition with the logical start time
    TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
    context.addOutput(Output.ofDataset("converted", dsArguments));

    TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
    LOG.info("Output location for new partition is: {}",
             partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
  }

  /**
   * Mapper that reads events from a stream and writes them out as Avro.
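   * <p>
   * Each stream event is emitted as one {@link GenericRecord}, with the event timestamp in its
   * {@code time} field and the event body, as a string, in its {@code body} field.
   * </p>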
   */
  public static class StreamConversionMapper extends
    Mapper<LongWritable, StreamEvent, AvroKey<GenericRecord>, NullWritable> {

    @Override
    public void map(LongWritable timestamp, StreamEvent streamEvent, Context context)
      throws IOException, InterruptedException {
      // build one Avro record per event, carrying the event's timestamp and its body as a string
      GenericRecordBuilder recordBuilder = new GenericRecordBuilder(SCHEMA)
        .set("time", streamEvent.getTimestamp())
        .set("body", Bytes.toString(streamEvent.getBody()));
      GenericRecord record = recordBuilder.build();
      context.write(new AvroKey<>(record), NullWritable.get());
    }
  }
}
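// Note: SCHEMA is parsed from StreamConversionApp.SCHEMA_STRING, which is not shown in this file.
// Based on the fields the mapper sets, it is assumed to be a record schema of roughly this shape
// (the record name "streamEvent" is illustrative; only the "time" and "body" fields and their
// types are confirmed by the mapper code above):
//
//   {
//     "type": "record",
//     "name": "streamEvent",
//     "fields": [
//       { "name": "time", "type": "long" },
//       { "name": "body", "type": "string" }
//     ]
//   }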