/*
* Copyright © 2014-2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data.stream;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.common.io.BinaryDecoder;
import co.cask.cdap.common.io.ByteBuffers;
import co.cask.cdap.common.io.Decoder;
import co.cask.cdap.common.io.SeekableInputStream;
import co.cask.cdap.common.stream.StreamEventDataCodec;
import co.cask.cdap.data.file.FileReader;
import co.cask.cdap.data.file.ReadFilter;
import co.cask.cdap.internal.io.SchemaTypeAdapter;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.ByteStreams;
import com.google.common.io.InputSupplier;
import com.google.gson.JsonSyntaxException;
import com.google.gson.stream.JsonReader;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
import javax.annotation.concurrent.NotThreadSafe;
/**
* Class for reading data file written by {@link StreamDataFileWriter}.
*
* @see StreamDataFileWriter
*/
@NotThreadSafe
public final class StreamDataFileReader implements FileReader<PositionStreamEvent, Long> {
private final InputSupplier<? extends SeekableInputStream> eventInputSupplier;
private final InputSupplier<? extends InputStream> indexInputSupplier;
private final long startTime;
private final long offset;
private final byte[] timestampBuffer;
private final StreamEventBuffer streamEventBuffer;
private StreamDataFileIndex index;
private SeekableInputStream eventInput;
private long position;
private long timestamp;
private boolean closed;
private boolean eof;
private Decoder decoder;
private StreamEvent eventTemplate;
/**
* Opens a new {@link StreamDataFileReader} with the given inputs.
*
* @param eventInputSupplier An {@link InputSupplier} for providing the stream to read events.
* @return A new instance of {@link StreamDataFileReader}.
*/
public static StreamDataFileReader create(InputSupplier<? extends SeekableInputStream> eventInputSupplier) {
return new StreamDataFileReader(eventInputSupplier, null, 0L, 0L);
}
/**
* Opens a new {@link StreamDataFileReader} with the given inputs that starts reading events that are
* written at or after the given timestamp.
*
* @param eventInputSupplier An {@link InputSupplier} for providing the stream to read events.
* @param indexInputSupplier An {@link InputSupplier} for providing the stream to read event index.
* @param startTime Timestamp in milliseconds for the event time to start reading with.
* @return A new instance of {@link StreamDataFileReader}.
*/
public static StreamDataFileReader createByStartTime(
InputSupplier<? extends SeekableInputStream> eventInputSupplier,
@Nullable InputSupplier<? extends InputStream> indexInputSupplier, long startTime) {
return new StreamDataFileReader(eventInputSupplier, indexInputSupplier, startTime, 0L);
}
/**
* Opens a new {@link StreamDataFileReader} with the given inputs, which starts reading events at a the smallest
* event position that is larger than or equal to the given offset.
*
* @param eventInputSupplier An {@link InputSupplier} for providing the stream to read events.
* @param indexInputSupplier An {@link InputSupplier} for providing the stream to read event index.
* @param offset An arbitrary event file offset.
* @return A new instance of {@link StreamDataFileReader}.
*/
public static StreamDataFileReader createWithOffset(InputSupplier<? extends SeekableInputStream> eventInputSupplier,
@Nullable InputSupplier<? extends InputStream> indexInputSupplier,
long offset) {
return new StreamDataFileReader(eventInputSupplier, indexInputSupplier, 0L, offset);
}
private StreamDataFileReader(InputSupplier<? extends SeekableInputStream> eventInputSupplier,
@Nullable InputSupplier<? extends InputStream> indexInputSupplier,
long startTime, long offset) {
this.eventInputSupplier = eventInputSupplier;
this.indexInputSupplier = indexInputSupplier;
this.streamEventBuffer = new StreamEventBuffer();
this.startTime = startTime;
this.offset = offset;
this.timestampBuffer = new byte[8];
this.timestamp = -1L;
}
@Override
public Long getPosition() {
return position;
}
/**
* Opens this reader to prepare for consumption. Calling this method is optional as the
* {@link #read(java.util.Collection, int, long, java.util.concurrent.TimeUnit, co.cask.cdap.data.file.ReadFilter)}
* method would do the initialization if this method hasn't been called.
*
* @throws IOException If there is error initializing.
*/
@Override
public void initialize() throws IOException {
try {
if (eventInput == null) {
doOpen();
}
} catch (IOException e) {
if (!(e instanceof EOFException || e instanceof FileNotFoundException)) {
throw e;
}
// It's ok if the file doesn't exists or EOF. As that's the tailing behavior.
}
}
@Override
public void close() throws IOException {
if (closed) {
return;
}
try {
if (eventInput != null) {
eventInput.close();
}
} finally {
closed = true;
}
}
@Override
public int read(Collection<? super PositionStreamEvent> events, int maxEvents,
long timeout, TimeUnit unit) throws IOException, InterruptedException {
return read(events, maxEvents, timeout, unit, ReadFilter.ALWAYS_ACCEPT);
}
@Override
public int read(Collection<? super PositionStreamEvent> events, int maxEvents,
long timeout, TimeUnit unit, ReadFilter readFilter) throws IOException, InterruptedException {
if (closed) {
throw new IOException("Reader already closed.");
}
int eventCount = 0;
long sleepNano = computeSleepNano(timeout, unit);
try {
Stopwatch stopwatch = new Stopwatch();
stopwatch.start();
// Keep reading events until max events.
while (!eof && eventCount < maxEvents) {
try {
if (eventInput == null) {
doOpen();
}
PositionStreamEvent event = nextStreamEvent(readFilter);
if (event != null) {
events.add(event);
eventCount++;
} else if (eof) {
break;
}
} catch (IOException e) {
if (eventInput != null) {
eventInput.close();
eventInput = null;
}
if (!(e instanceof EOFException || e instanceof FileNotFoundException)) {
throw e;
}
// If end of stream file or no timeout is allowed, break the loop.
if (eof || timeout <= 0) {
break;
}
if (stopwatch.elapsedTime(unit) >= timeout) {
break;
}
TimeUnit.NANOSECONDS.sleep(sleepNano);
if (stopwatch.elapsedTime(unit) >= timeout) {
break;
}
}
}
return (eventCount == 0 && eof) ? -1 : eventCount;
} catch (IOException e) {
close();
throw e;
}
}
/**
* Returns the index for the stream data or {@code null} if index is absent.
*/
private StreamDataFileIndex getIndex() {
if (index == null && indexInputSupplier != null) {
index = new StreamDataFileIndex(indexInputSupplier);
}
return index;
}
/**
* Opens and initialize this reader.
*/
private void doOpen() throws IOException {
try {
eventInput = eventInputSupplier.getInput();
decoder = new BinaryDecoder(eventInput);
// If position is <= 0, the reader is not being used yet, hence needs to initialize.
if (position <= 0) {
init();
} else {
// If position > 0, the reader has already been initialized.
// We just need to seek to beginning of a data-block, depending on whether there is event in the buffer
if (streamEventBuffer.hasEvent()) {
// If there is event in the buffer, we seek to the data block that come after the buffered events
// to prepare for the reading of the data block after the current buffered events are fully consumed.
eventInput.seek(streamEventBuffer.getEndPosition());
} else {
// Otherwise, we seek to the current position, which should be pointing to the beginning of a data block
eventInput.seek(position);
}
}
} catch (IOException e) {
if (eventInput != null) {
eventInput.close();
eventInput = null;
}
throw e;
}
}
private long computeSleepNano(long timeout, TimeUnit unit) {
long sleepNano = TimeUnit.NANOSECONDS.convert(timeout, unit) / 10;
return sleepNano <= 0 ? 1 : sleepNano;
}
private void init() throws IOException {
readHeader();
// If it is constructed with an arbitrary offset, need to find an event position
if (offset > 0) {
initByOffset(offset);
} else if (startTime > 0) {
initByTime(startTime);
}
}
private void readHeader() throws IOException {
// Read the header of the event file
// First 2 bytes should be 'E' '1'
byte[] magic = new byte[StreamDataFileConstants.MAGIC_HEADER_SIZE];
ByteStreams.readFully(eventInput, magic);
int fileVersion = decodeFileVersion(magic);
// Read the properties map.
Map<String, String> properties = StreamUtils.decodeMap(new BinaryDecoder(eventInput));
verifySchema(properties);
// Create event template
if (fileVersion >= 2) {
eventTemplate = createEventTemplate(properties);
} else {
eventTemplate = new StreamEvent(ImmutableMap.<String, String>of(), ByteBuffers.EMPTY_BUFFER, -1L);
}
position = eventInput.getPos();
}
/**
* Decodes the file version from the magic header.
*
* @return the file version
* @throws IOException if failed to decode file version from the magic header
*/
private int decodeFileVersion(byte[] magic) throws IOException {
if (Arrays.equals(magic, StreamDataFileConstants.MAGIC_HEADER_V1)) {
return 1;
}
if (Arrays.equals(magic, StreamDataFileConstants.MAGIC_HEADER_V2)) {
return 2;
}
throw new IOException(
String.format("Unsupported stream file format. First two bytes must be %s or %s",
Bytes.toStringBinary(StreamDataFileConstants.MAGIC_HEADER_V1),
Bytes.toStringBinary(StreamDataFileConstants.MAGIC_HEADER_V2))
);
}
/**
* Creates a {@link StreamEvent} that will be used as a template for all events consumable from this reader.
*/
private StreamEvent createEventTemplate(Map<String, String> properties) throws IOException {
long timestamp = -1L;
// See if all events in the file are of the same timestamp
String uniTimestamp = properties.get(StreamDataFileConstants.Property.Key.UNI_TIMESTAMP);
if (StreamDataFileConstants.Property.Value.CLOSE_TIMESTAMP.equals(uniTimestamp)) {
// Seek to the end - 8 of the stream to read the close timestamp
long pos = eventInput.getPos();
eventInput.seek(eventInput.size() - 8);
timestamp = Math.abs(readTimestamp());
eventInput.seek(pos);
} else if (uniTimestamp != null) {
timestamp = Long.parseLong(uniTimestamp);
}
// Grab the set of default headers for all events
ImmutableMap.Builder<String, String> headers = ImmutableMap.builder();
String prefix = StreamDataFileConstants.Property.Key.EVENT_HEADER_PREFIX;
for (Map.Entry<String, String> entry : properties.entrySet()) {
if (entry.getKey().startsWith(prefix)) {
headers.put(entry.getKey().substring(prefix.length()), entry.getValue());
}
}
return new StreamEvent(headers.build(), ByteBuffers.EMPTY_BUFFER, timestamp);
}
private void initByOffset(final long offset) throws IOException {
// If index is provided, lookup the position smaller but closest to the offset.
StreamDataFileIndex index = getIndex();
long pos = index == null ? 0 : index.floorPosition(offset);
if (pos > 0) {
eventInput.seek(pos);
}
skipUntil(new SkipCondition() {
@Override
public boolean apply(long position, long timestamp) {
return position >= offset;
}
});
}
private void initByTime(final long time) throws IOException {
// If index is provided, lookup the index find the offset closest to start time.
// If no offset is found, starts from the beginning of the events
StreamDataFileIndex index = getIndex();
long offset = index == null ? 0 : index.floorPositionByTime(time);
if (offset > 0) {
eventInput.seek(offset);
}
skipUntil(new SkipCondition() {
@Override
public boolean apply(long position, long timestamp) {
return timestamp >= time;
}
});
}
/**
* Skips events until the given condition is true.
*/
private void skipUntil(SkipCondition condition) throws IOException {
long positionBound = position = eventInput.getPos();
try {
while (!eof) {
// Read timestamp
long timestamp = readTimestamp();
// If EOF or condition match, upper bound found. Break the loop.
eof = timestamp < 0;
if (eof || condition.apply(positionBound, timestamp)) {
break;
}
int len = readLength();
position = positionBound;
// Jump to next timestamp
eventInput.seek(eventInput.getPos() + len);
positionBound = eventInput.getPos();
// need to check this here before we loop around again because it's possible the condition was
// satisfied by moving up the position even though the timestamp has not changed yet.
if (condition.apply(positionBound, timestamp)) {
break;
}
}
if (eof) {
position = positionBound;
return;
}
// search for the exact StreamData position within the bound.
eventInput.seek(position);
readDataBlock(ReadFilter.ALWAYS_ACCEPT);
while (position < positionBound) {
if (condition.apply(streamEventBuffer.getPosition(), timestamp)) {
break;
}
nextStreamEvent(ReadFilter.ALWAYS_REJECT_OFFSET);
}
} catch (IOException e) {
// It's ok if hitting EOF, meaning it's could be a live stream file or closed by a dead stream handler.
if (!(e instanceof EOFException)) {
throw e;
}
}
}
private void verifySchema(Map<String, String> properties) throws IOException {
String schemaKey = StreamDataFileConstants.Property.Key.SCHEMA;
String schemaStr = properties.get(schemaKey);
if (schemaStr == null) {
throw new IOException("Missing '" + schemaKey + "' property.");
}
try {
Schema schema = new SchemaTypeAdapter().read(new JsonReader(new StringReader(schemaStr)));
if (!StreamEventDataCodec.STREAM_DATA_SCHEMA.equals(schema)) {
throw new IOException("Unsupported schema " + schemaStr);
}
} catch (JsonSyntaxException e) {
throw new IOException("Invalid schema.", e);
}
}
private long readTimestamp() throws IOException {
ByteStreams.readFully(eventInput, timestampBuffer);
return Bytes.toLong(timestampBuffer);
}
private int readLength() throws IOException {
return decoder.readInt();
}
private void readDataBlock(ReadFilter filter) throws IOException {
// Data block is <timestamp> <length> <stream_data>+
position = eventInput.getPos();
long timestamp = readTimestamp();
if (timestamp < 0) {
eof = true;
return;
}
// Use the template timestamp if available
timestamp = eventTemplate.getTimestamp() >= 0 ? eventTemplate.getTimestamp() : timestamp;
if (acceptTimestamp(filter, timestamp)) {
streamEventBuffer.fillBuffer(eventInput, readLength());
this.timestamp = timestamp;
return;
}
// If timestamp is not accepted and the timestamp comes from event template, then the whole file can be skipped
if (eventTemplate.getTimestamp() >= 0) {
eof = true;
return;
}
long nextTimestamp = filter.getNextTimestampHint();
if (nextTimestamp > timestamp) {
eventInput.seek(position);
initByTime(nextTimestamp);
return;
}
int length = readLength();
long bytesSkipped = eventInput.skip(length);
if (bytesSkipped != length) {
throw new EOFException("Expected to skip " + length + " but only " + bytesSkipped + " was skipped.");
}
position = eventInput.getPos();
}
/**
* Reads or skips a {@link StreamEvent}.
*
* @param filter to determine to accept or skip a stream event by offset
* and accept or skip a stream event block by timestamp.
* @return The next StreamEvent or {@code null} if the event is rejected by the filter or reached EOF.
*/
private PositionStreamEvent nextStreamEvent(ReadFilter filter) throws IOException {
while (!eof && !(streamEventBuffer.hasEvent() && acceptTimestamp(filter, timestamp))) {
readDataBlock(filter);
}
if (eof) {
return null;
}
PositionStreamEvent event = streamEventBuffer.nextEvent(timestamp, eventTemplate.getHeaders(), filter);
position = streamEventBuffer.getPosition();
return event;
}
private boolean acceptTimestamp(ReadFilter filter, long timestamp) {
filter.reset();
return filter.acceptTimestamp(timestamp);
}
private interface SkipCondition {
boolean apply(long position, long timestamp);
}
}