/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data.stream;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.stream.GenericStreamEventData;
import co.cask.cdap.api.stream.StreamEventData;
import co.cask.cdap.api.stream.StreamEventDecoder;
import co.cask.cdap.data.stream.decoder.BytesStreamEventDecoder;
import co.cask.cdap.data.stream.decoder.IdentityStreamEventDecoder;
import co.cask.cdap.data.stream.decoder.StringStreamEventDecoder;
import co.cask.cdap.data.stream.decoder.TextStreamEventDecoder;
import co.cask.cdap.format.TextRecordFormat;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContextImpl;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * Tests for {@link StreamInputFormat}: split computation, record readers, and event decoders.
 */
public class StreamInputFormatTest {
@ClassRule
public static TemporaryFolder tmpFolder = new TemporaryFolder();
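// Fixed "current time" returned by TestStreamInputFormat#getCurrentTime() so that
// TTL-based expiry in these tests is deterministic rather than wall-clock dependent.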
private static final long CURRENT_TIME = 2000;
@Test
public void testTTL() throws Exception {
File inputDir = tmpFolder.newFolder();
File outputDir = tmpFolder.newFolder();
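// TemporaryFolder creates the directory, but TextOutputFormat requires that the
// output path not exist yet, so remove it before running the job.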
outputDir.delete();
final long currentTime = CURRENT_TIME;
final long ttl = 1500;
// Write 500 events in one bucket under one partition, with timestamps 0..499 by 1.
// All of these events are older than the TTL, so none of them should appear in the output.
generateEvents(inputDir, 500, 0, 1, new GenerateEvent() {
@Override
public String generate(int index, long timestamp) {
return "expiredEvent " + timestamp;
}
});
// Write 1000 events with timestamps 0..999 by 1. Only those within the TTL
// (timestamps 500..999) should appear in the output.
generateEvents(inputDir, 1000, 0, 1, new GenerateEvent() {
@Override
public String generate(int index, long timestamp) {
if (timestamp + ttl < currentTime) {
return "expiredEvent " + timestamp;
} else {
return "nonExpiredEvent " + timestamp;
}
}
});
// Write 1000 events in one bucket under a different partition, with timestamps 1000..1999 by 1
generateEvents(inputDir, 1000, 1000, 1, new GenerateEvent() {
@Override
public String generate(int index, long timestamp) {
return "nonExpiredEvent " + timestamp;
}
});
// Run MapReduce over the full time range with max split size 2000 and TTL = 1500;
// TestStreamInputFormat pins the current time to CURRENT_TIME (2000).
runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 2000, ttl);
// Verify the result. It should have 1500 "nonExpiredEvent {timestamp}" entries, one per timestamp 500..1999.
Map<String, Integer> output = loadMRResult(outputDir);
Assert.assertEquals(ttl + 1, output.size());
Assert.assertNull(output.get("expiredEvent"));
Assert.assertEquals(ttl, output.get("nonExpiredEvent").intValue());
for (long i = (currentTime - ttl); i < currentTime; i++) {
Assert.assertEquals(1, output.get(Long.toString(i)).intValue());
}
}
@Test
public void testTTLMultipleEventsWithSameTimestamp() throws Exception {
File inputDir = tmpFolder.newFolder();
File outputDir = tmpFolder.newFolder();
outputDir.delete();
final long currentTime = CURRENT_TIME;
final long ttl = 1;
// Write 1000 events in one bucket under one partition, all with timestamp currentTime - ttl - 1 (just expired)
generateEvents(inputDir, 1000, currentTime - ttl - 1, 0, new GenerateEvent() {
@Override
public String generate(int index, long timestamp) {
return "expiredEvent " + timestamp;
}
});
// Write 1000 events in one bucket under a different partition, all with timestamp currentTime
generateEvents(inputDir, 1000, currentTime, 0, new GenerateEvent() {
@Override
public String generate(int index, long timestamp) {
return "nonExpiredEvent " + timestamp;
}
});
runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 2000, ttl);
// Verify the result. It should have 1000 "nonExpiredEvent {currentTime}".
Map<String, Integer> output = loadMRResult(outputDir);
Assert.assertEquals(2, output.size());
Assert.assertNull(output.get("expiredEvent"));
Assert.assertEquals(1000, output.get("nonExpiredEvent").intValue());
Assert.assertEquals(1000, output.get(Long.toString(currentTime)).intValue());
}
@Test
public void testAllEvents() throws Exception {
// Write 1000 events in one bucket under one partition.
File inputDir = tmpFolder.newFolder();
File outputDir = tmpFolder.newFolder();
outputDir.delete();
generateEvents(inputDir);
runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 1000, Long.MAX_VALUE);
// Verify the result. It should have 1000 "Testing", and 100 for each integer 0..9.
Map<String, Integer> output = loadMRResult(outputDir);
Assert.assertEquals(11, output.size());
Assert.assertEquals(1000, output.get("Testing").intValue());
for (int i = 0; i < 10; i++) {
Assert.assertEquals(100, output.get(Integer.toString(i)).intValue());
}
}
@Test
public void testTimeRange() throws Exception {
// Write 1000 events in one bucket under one partition.
File inputDir = tmpFolder.newFolder();
File outputDir = tmpFolder.newFolder();
outputDir.delete();
generateEvents(inputDir);
// Run a MapReduce on 1 timestamp only.
runMR(inputDir, outputDir, 1401, 1402, 1000, Long.MAX_VALUE);
// Verify the result. It should have 1 "Testing", and 1 "1".
Map<String, Integer> output = loadMRResult(outputDir);
Assert.assertEquals(2, output.size());
Assert.assertEquals(1, output.get("Testing").intValue());
Assert.assertEquals(1, output.get("1").intValue());
}
@Test
public void testLiveStream() throws Exception {
File inputDir = tmpFolder.newFolder();
File outputDir = tmpFolder.newFolder();
outputDir.delete();
// Write 2 events, and keep the writer open
File partition = new File(inputDir, "0.1000");
File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
partition.mkdirs();
StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
Files.newOutputStreamSupplier(indexFile),
100L);
writer.append(StreamFileTestUtils.createEvent(0, "Testing 0"));
writer.append(StreamFileTestUtils.createEvent(1, "Testing 1"));
writer.flush();
// Run MapReduce to process all data.
runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 1000, Long.MAX_VALUE);
Map<String, Integer> output = loadMRResult(outputDir);
Assert.assertEquals(3, output.size());
Assert.assertEquals(2, output.get("Testing").intValue());
Assert.assertEquals(1, output.get("0").intValue());
Assert.assertEquals(1, output.get("1").intValue());
}
@Test
public void testIdentityStreamEventDecoder() {
ImmutableMap.Builder<String, String> headers = ImmutableMap.builder();
headers.put("key1", "value1");
headers.put("key2", "value2");
ByteBuffer buffer = Charsets.UTF_8.encode("testdata");
StreamEvent event = new StreamEvent(headers.build(), buffer, System.currentTimeMillis());
StreamEventDecoder<LongWritable, StreamEvent> decoder = new IdentityStreamEventDecoder();
StreamEventDecoder.DecodeResult<LongWritable, StreamEvent> result
= new StreamEventDecoder.DecodeResult<>();
result = decoder.decode(event, result);
Assert.assertEquals(new LongWritable(event.getTimestamp()), result.getKey());
Assert.assertEquals(event, result.getValue());
}
@Test
public void testStringStreamEventDecoder() {
String body = "Testing";
StreamEvent event = new StreamEvent(ImmutableMap.<String, String>of(), Charsets.UTF_8.encode(body));
StreamEventDecoder<LongWritable, String> decoder = new StringStreamEventDecoder();
StreamEventDecoder.DecodeResult<LongWritable, String> result
= new StreamEventDecoder.DecodeResult<>();
result = decoder.decode(event, result);
Assert.assertEquals(event.getTimestamp(), result.getKey().get());
Assert.assertEquals(body, result.getValue());
}
@Test
public void testStreamDecoderInference() {
Configuration conf = new Configuration();
StreamInputFormat.inferDecoderClass(conf, BytesWritable.class);
Assert.assertEquals(BytesStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
StreamInputFormat.inferDecoderClass(conf, Text.class);
Assert.assertEquals(TextStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
StreamInputFormat.inferDecoderClass(conf, String.class);
Assert.assertEquals(StringStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
StreamInputFormat.inferDecoderClass(conf, StreamEvent.class);
Assert.assertEquals(IdentityStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
StreamInputFormat.inferDecoderClass(conf, StreamEventData.class);
Assert.assertEquals(IdentityStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
}
@Test
public void testStreamRecordReader() throws Exception {
File inputDir = tmpFolder.newFolder();
File partition = new File(inputDir, "1.1000");
partition.mkdirs();
File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
// write 1 event
StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
Files.newOutputStreamSupplier(indexFile),
100L);
writer.append(StreamFileTestUtils.createEvent(1000, "test"));
writer.flush();
// get splits from the input format. Expect to get 2 splits,
// one from 0 - some offset and one from offset - Long.MAX_VALUE.
Configuration conf = new Configuration();
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
StreamInputFormat.setStreamPath(conf, inputDir.toURI());
StreamInputFormat format = new StreamInputFormat();
List<InputSplit> splits = format.getSplits(new JobContextImpl(new JobConf(conf), new JobID()));
Assert.assertEquals(2, splits.size());
// write another event so that the 2nd split has something to read
writer.append(StreamFileTestUtils.createEvent(1001, "test"));
writer.close();
// create a record reader for the 2nd split
StreamRecordReader<LongWritable, StreamEvent> recordReader =
new StreamRecordReader<>(new IdentityStreamEventDecoder());
recordReader.initialize(splits.get(1), context);
// check that we read the 2nd stream event
Assert.assertTrue(recordReader.nextKeyValue());
StreamEvent output = recordReader.getCurrentValue();
Assert.assertEquals(1001, output.getTimestamp());
Assert.assertEquals("test", Bytes.toString(output.getBody()));
// check that there is nothing more to read
Assert.assertFalse(recordReader.nextKeyValue());
}
@Test
public void testFormatStreamRecordReader() throws IOException, InterruptedException {
File inputDir = tmpFolder.newFolder();
File partition = new File(inputDir, "1.1000");
partition.mkdirs();
File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
// write 1 event
StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
Files.newOutputStreamSupplier(indexFile),
100L);
StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"),
Charsets.UTF_8.encode("hello world"),
1000);
writer.append(streamEvent);
writer.close();
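// Decode each event body as a single-field text record instead of a raw StreamEvent:
// TextRecordFormat turns the body into a StructuredRecord with one "body" string field.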
FormatSpecification formatSpec =
new FormatSpecification(TextRecordFormat.class.getName(),
Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))),
Collections.<String, String>emptyMap());
Configuration conf = new Configuration();
StreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
StreamInputFormat.setStreamPath(conf, inputDir.toURI());
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
StreamInputFormat format = new StreamInputFormat();
// read all splits and store the results in the list
List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
List<InputSplit> inputSplits = format.getSplits(context);
for (InputSplit split : inputSplits) {
RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader =
format.createRecordReader(split, context);
recordReader.initialize(split, context);
while (recordReader.nextKeyValue()) {
recordsRead.add(recordReader.getCurrentValue());
}
}
// should only have read 1 record
Assert.assertEquals(1, recordsRead.size());
GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
Assert.assertEquals("hello world", eventData.getBody().get("body"));
}
private void generateEvents(File inputDir, int numEvents, long startTime, long timeIncrement,
GenerateEvent generator) throws IOException {
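// Lay events out the way the stream input format expects: a partition directory
// (named "<startTime/1000>.1000" here) containing one bucket's paired event and index files.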
File partition = new File(inputDir, Long.toString(startTime / 1000) + ".1000");
File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
partition.mkdirs();
StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
Files.newOutputStreamSupplier(indexFile),
100L);
// Write numEvents events, advancing the timestamp by timeIncrement each time
for (int i = 0; i < numEvents; i++) {
long timestamp = startTime + i * timeIncrement;
writer.append(StreamFileTestUtils.createEvent(timestamp, generator.generate(i, timestamp)));
}
writer.close();
}
private void generateEvents(File inputDir) throws IOException {
generateEvents(inputDir, 1000, 1000, 1, new GenerateEvent() {
@Override
public String generate(int index, long timestamp) {
return "Testing " + (index % 10);
}
});
}
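// Runs a word-count MapReduce over the stream files in inputDir, reading only events whose
// timestamps fall in [startTime, endTime) and are no more than ttl older than the current
// time (pinned to CURRENT_TIME by TestStreamInputFormat), writing per-token counts to outputDir.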
private void runMR(File inputDir, File outputDir, long startTime, long endTime,
long splitSize, long ttl) throws Exception {
Job job = Job.getInstance();
Configuration conf = job.getConfiguration();
StreamInputFormat.setTTL(conf, ttl);
StreamInputFormat.setStreamPath(conf, inputDir.toURI());
StreamInputFormat.setTimeRange(conf, startTime, endTime);
StreamInputFormat.setMaxSplitSize(conf, splitSize);
job.setInputFormatClass(TestStreamInputFormat.class);
TextOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
job.setOutputFormatClass(TextOutputFormat.class);
job.setJarByClass(StreamInputFormatTest.class);
job.setMapperClass(TokenizeMapper.class);
job.setReducerClass(AggregateReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
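// The mapper emits IntWritable counts while the reducer writes LongWritable sums,
// so the map output value class must be set explicitly.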
job.setMapOutputValueClass(IntWritable.class);
job.waitForCompletion(true);
}
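// Parses the lone reducer's output file. TextOutputFormat writes one tab-separated
// "key<TAB>count" line per record, e.g. "Testing\t1000", so split each line at the first tab.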
private Map<String, Integer> loadMRResult(File outputDir) throws IOException {
Map<String, Integer> output = Maps.newTreeMap();
BufferedReader reader = Files.newReader(new File(outputDir, "part-r-00000"), Charsets.UTF_8);
try {
String line = reader.readLine();
while (line != null) {
int idx = line.indexOf('\t');
output.put(line.substring(0, idx), Integer.parseInt(line.substring(idx + 1)));
line = reader.readLine();
}
} finally {
reader.close();
}
return output;
}
private interface GenerateEvent {
String generate(int index, long timestamp);
}
/**
 * StreamInputFormat for testing. It decodes event bodies as {@link Text} and pins the
 * current time to {@code CURRENT_TIME} so that TTL filtering is deterministic.
 */
private static final class TestStreamInputFormat extends StreamInputFormat<LongWritable, Text> {
@Override
protected StreamEventDecoder<LongWritable, Text> createStreamEventDecoder(Configuration conf) {
return new TextStreamEventDecoder();
}
@Override
protected long getCurrentTime() {
return CURRENT_TIME;
}
}
/**
 * Mapper for testing. Tokenizes each event body on whitespace and emits a (token, 1) pair per token.
 */
public static final class TokenizeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final IntWritable ONE = new IntWritable(1);
private final Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, ONE);
}
}
}
/**
 * Reducer for testing. Sums the per-token counts emitted by {@link TokenizeMapper}.
 */
public static final class AggregateReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
private final LongWritable result = new LongWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
}