/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.text.compliant;
import io.netty.buffer.DrillBuf;
import io.netty.util.internal.PlatformDependent;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import org.apache.drill.exec.memory.BoundsChecking;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.CompressionInputStream;
import com.google.common.base.Preconditions;
import com.univocity.parsers.common.Format;
/**
 * Class that fronts an InputStream to provide a byte consumption interface.
 * Also manages only reading lines to and from each split: reading starts at the split's start
 * position and stops after the first complete line separator found at or beyond the split's
 * end position.
 */
final class TextInput {

  /** The byte sequence that terminates a line in the input. */
  private final byte[] lineSeparator;
  /** The single byte handed to callers of {@link #nextChar()} in place of a full line separator. */
  private final byte normalizedLineSeparator;
  private final TextParsingSettings settings;

  private long lineCount;
  private long charCount;

  /**
   * The starting position in the file.
   */
  private final long startPos;
  /**
   * The position in the file at which the read allowance of this split ends.
   */
  private final long endPos;

  private int bufferMark;
  private long streamMark;

  /**
   * Stream position of the first byte held in the buffer.
   */
  private long streamPos;

  private final Seekable seekable;
  private final FSDataInputStream inputFS;
  private final InputStream input;

  private final DrillBuf buffer;
  private final ByteBuffer underlyingBuffer;
  /** Memory address of the first byte of {@link #buffer}. */
  private final long bStart;
  /** {@code bStart - 1}, so that the one-based {@link #bufferPtr} can be added directly. */
  private final long bStartMinus1;

  /** Whether the wrapped stream can read straight into a {@link ByteBuffer}. */
  private final boolean bufferReadable;

  /**
   * Index (into {@link #lineSeparator}) of the last byte of a partial line separator that was
   * found at the very end of the previous read and held back, or -1 when nothing is held back.
   * The held back bytes {@code lineSeparator[0..remByte]} are placed at the front of the next read.
   */
  private int remByte = -1;

  /**
   * The current position in the buffer. One-based: the next byte handed out is the one at
   * zero-based buffer index {@code bufferPtr - 1}.
   */
  public int bufferPtr;

  /**
   * The quantity of valid data in the buffer, or -1 when no more data is available.
   */
  public int length = -1;

  /** Set once the buffer holds the last bytes this input is allowed to hand out. */
  private boolean endFound = false;

  /**
   * Creates a new instance that reads the given range of the stream through the given buffer.
   * The line separator sequence and the normalized line separator are taken from the settings.
   *
   * @param settings parsing settings; supplies the new line delimiter, the normalized new line
   *                 byte and the skip-first-line flag
   * @param input stream to read from; must implement {@link Seekable}
   * @param readBuffer buffer the stream content is read into
   * @param startPos position in the stream at which reading starts
   * @param endPos position in the stream at which the read allowance ends; replaced by
   *               {@link Long#MAX_VALUE} for a compressed stream when positive
   * @throws IllegalArgumentException if the stream is not seekable, or if it is compressed and
   *                                  {@code startPos} is not zero
   */
  public TextInput(TextParsingSettings settings, InputStream input, DrillBuf readBuffer, long startPos, long endPos) {
    this.lineSeparator = settings.getNewLineDelimiter();
    byte normalizedLineSeparator = settings.getNormalizedNewLine();
    Preconditions.checkArgument(input instanceof Seekable, "Text input only supports an InputStream that supports Seekable.");
    boolean isCompressed = input instanceof CompressionInputStream;
    Preconditions.checkArgument(!isCompressed || startPos == 0, "Cannot use split on compressed stream.");

    // splits aren't allowed with compressed data. The split length will be the compressed size
    // which means we'll normally end prematurely.
    if (isCompressed && endPos > 0) {
      endPos = Long.MAX_VALUE;
    }

    this.input = input;
    this.seekable = (Seekable) input;
    this.settings = settings;

    if (input instanceof FSDataInputStream) {
      this.inputFS = (FSDataInputStream) input;
      this.bufferReadable = inputFS.getWrappedStream() instanceof ByteBufferReadable;
    } else {
      this.inputFS = null;
      this.bufferReadable = false;
    }

    this.startPos = startPos;
    this.endPos = endPos;

    this.normalizedLineSeparator = normalizedLineSeparator;

    this.buffer = readBuffer;
    this.bStart = buffer.memoryAddress();
    this.bStartMinus1 = bStart - 1;
    this.underlyingBuffer = buffer.nioBuffer(0, buffer.capacity());
  }

  /**
   * Test the input to position for read start. If the input is a non-zero split or
   * skipping the first line is enabled, input will move to the next complete line.
   *
   * @throws IOException if seeking or reading the underlying stream fails
   */
  final void start() throws IOException {
    lineCount = 0;
    if (startPos > 0) {
      seekable.seek(startPos);
    }

    updateBuffer();
    if (length > 0 && (startPos > 0 || settings.isSkipFirstLine())) {
      // move to next full record.
      skipLines(1);
    }
  }

  /**
   * Helper method meant to return the most recent characters consumed since the last record
   * started. Stream rewind is not supported, so for now this is a placeholder that always
   * returns a string holding a single space.
   *
   * @return placeholder string
   * @throws IOException declared for callers; not thrown by the current implementation
   */
  public String getStringSinceMarkForError() throws IOException {
    return " ";
  }

  /**
   * @return the stream position of the current buffer plus the current buffer position
   */
  long getPos() {
    return streamPos + bufferPtr;
  }

  /**
   * Remembers the current stream and buffer positions.
   */
  public void mark() {
    streamMark = streamPos;
    bufferMark = bufferPtr;
  }

  /**
   * Read some more bytes from the stream. Uses the zero copy interface if available. Otherwise,
   * does byte copy. Any bytes held back from the previous read (see {@link #remByte}) are placed
   * in front of the newly read data and are counted in {@link #length}.
   *
   * @throws IOException if the underlying stream fails
   */
  private void read() throws IOException {
    // number of separator bytes that were held back from the previous buffer
    final int carried = remByte + 1;
    remByte = -1;

    final int bytesRead;
    if (bufferReadable) {
      underlyingBuffer.put(lineSeparator, 0, carried);
      bytesRead = inputFS.read(underlyingBuffer);
    } else {
      final byte[] b = new byte[underlyingBuffer.capacity()];
      System.arraycopy(lineSeparator, 0, b, 0, carried);
      bytesRead = input.read(b, carried, b.length - carried);
      underlyingBuffer.put(b);
    }

    if (bytesRead >= 0) {
      length = carried + bytesRead;
    } else if (carried > 0) {
      // The stream is exhausted but the held back bytes were real input: hand them out as the
      // final buffer instead of silently dropping them.
      length = carried;
      endFound = true;
    } else {
      length = -1;
    }
  }

  /**
   * Read more data into the buffer. Will also manage split end conditions.
   *
   * @throws IOException if the underlying stream fails
   */
  private void updateBuffer() throws IOException {
    if (endFound) {
      streamPos = seekable.getPos();
      underlyingBuffer.clear();
      length = -1;
      return;
    }

    do {
      // Held back bytes sit in front of whatever the stream delivers next, so the buffer
      // starts that many bytes before the current stream position.
      streamPos = seekable.getPos() - (remByte + 1);
      underlyingBuffer.clear();

      read();

      // check our data read allowance.
      if (!endFound && length > 0 && streamPos + length >= this.endPos) {
        updateLengthBasedOnConstraint();
      }
      // When the whole buffer was held back as a partial separator there is nothing to hand
      // out yet, so fetch more data right away.
    } while (length == 0 && remByte != -1);

    charCount += bufferPtr;
    bufferPtr = 1;

    buffer.writerIndex(underlyingBuffer.limit());
    buffer.readerIndex(underlyingBuffer.position());
  }

  /**
   * Checks to see if we can go over the end of our bytes constraint on the data. If so,
   * adjusts so that we can only read to the last character of the first line that crosses
   * the split boundary. When the buffer ends in the middle of a possible line separator, those
   * trailing bytes are held back so that the decision can be made on the next read.
   */
  private void updateLengthBasedOnConstraint() {
    if (length <= 0) {
      return;
    }

    final int separatorLength = lineSeparator.length;
    final long max = bStart + length;
    // The split end may already lie before this buffer; never scan memory in front of it.
    final long scanStart = bStart + Math.max(0L, endPos - streamPos);

    for (long m = scanStart; m < max; m++) {
      int matched = 0;
      while (matched < separatorLength
          && m + matched < max
          && lineSeparator[matched] == PlatformDependent.getByte(m + matched)) {
        matched++;
      }

      if (matched == separatorLength) {
        // we found a complete line separator: the buffer ends right after it.
        length = (int) (m - bStart) + separatorLength;
        endFound = true;
        return;
      }

      if (matched > 0 && m + matched == max) {
        // the last bytes of the read are the beginning of a line separator. We'll hold off on
        // dealing with these bytes until the next read.
        remByte = matched - 1;
        length = length - matched;
        return;
      }
    }
  }

  /**
   * Get next byte from stream. Also maintains the current line count. Will throw a
   * StreamFinishedPseudoException when the stream has run out of bytes. A complete line
   * separator sequence is consumed as a whole and reported as the normalized line separator.
   *
   * @return next byte from stream.
   * @throws IOException if the underlying stream fails
   */
  public final byte nextChar() throws IOException {
    byte byteChar = nextCharNoNewLineCheck();
    if (byteChar != lineSeparator[0]) {
      return byteChar;
    }

    // compare the remaining separator bytes against the bytes that follow in the buffer
    int peekIndex = bufferPtr - 1;
    for (int i = 1; i < lineSeparator.length; i++, peekIndex++) {
      if (lineSeparator[i] != buffer.getByte(peekIndex)) {
        return byteChar;
      }
    }

    lineCount++;
    byteChar = normalizedLineSeparator;

    // we don't need to update buffer position if line separator is one byte long
    if (lineSeparator.length > 1) {
      bufferPtr += (lineSeparator.length - 1);
      // bufferPtr == length still leaves the last byte of the buffer unread, so only refill
      // once the position has moved past the valid data.
      if (bufferPtr > length) {
        if (length != -1) {
          updateBuffer();
        } else {
          throw StreamFinishedPseudoException.INSTANCE;
        }
      }
    }

    return byteChar;
  }

  /**
   * Get next byte from stream. Does not maintain any line count. Will throw a
   * StreamFinishedPseudoException when the stream has run out of bytes.
   *
   * @return next byte from stream.
   * @throws IOException if the underlying stream fails
   */
  public final byte nextCharNoNewLineCheck() throws IOException {
    if (length == -1) {
      throw StreamFinishedPseudoException.INSTANCE;
    }

    if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
      buffer.checkBytes(bufferPtr - 1, bufferPtr);
    }

    byte byteChar = PlatformDependent.getByte(bStartMinus1 + bufferPtr);

    if (bufferPtr >= length) {
      // that was the last valid byte: load the next buffer before returning it
      if (length != -1) {
        updateBuffer();
        bufferPtr--;
      } else {
        throw StreamFinishedPseudoException.INSTANCE;
      }
    }

    bufferPtr++;

    return byteChar;
  }

  /**
   * Number of lines read since the start of this split.
   *
   * @return the number of line separators consumed so far
   */
  public final long lineCount() {
    return lineCount;
  }

  /**
   * Skip forward the number of line delimiters. If you are in the middle of a line,
   * a value of 1 will skip to the start of the next record.
   *
   * @param lines Number of lines to skip. Values below 1 are ignored.
   * @throws IOException if the underlying stream fails
   * @throws IllegalArgumentException if an {@link EOFException} is raised before the requested
   *                                  number of lines was skipped
   */
  public final void skipLines(int lines) throws IOException {
    if (lines < 1) {
      return;
    }
    final long expectedLineCount = this.lineCount + lines;

    try {
      do {
        nextChar();
      } while (lineCount < expectedLineCount);
    } catch (EOFException ex) {
      throw new IllegalArgumentException(
          "Unable to skip " + lines + " lines from line " + (expectedLineCount - lines) + ". End of input reached", ex);
    }
  }

  /**
   * @return the accumulated character count of the previous buffers plus the current buffer position
   */
  public final long charCount() {
    return charCount + bufferPtr;
  }

  /**
   * @return the number of line separators consumed so far
   */
  public long getLineCount() {
    return lineCount;
  }

  /**
   * Closes the underlying input stream.
   *
   * @throws IOException if closing the stream fails
   */
  public void close() throws IOException {
    input.close();
  }
}