/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.drill.exec.store.easy.text.compliant; import io.netty.buffer.DrillBuf; import io.netty.util.internal.PlatformDependent; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import org.apache.drill.exec.memory.BoundsChecking; import org.apache.hadoop.fs.ByteBufferReadable; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Seekable; import org.apache.hadoop.io.compress.CompressionInputStream; import com.google.common.base.Preconditions; import com.univocity.parsers.common.Format; /** * Class that fronts an InputStream to provide a byte consumption interface. * Also manages only reading lines to and from each split. */ final class TextInput { private final byte[] lineSeparator; private final byte normalizedLineSeparator; private final TextParsingSettings settings; private long lineCount; private long charCount; /** * The starting position in the file. */ private final long startPos; private final long endPos; private int bufferMark; private long streamMark; private long streamPos; private final Seekable seekable; private final FSDataInputStream inputFS; private final InputStream input; private final DrillBuf buffer; private final ByteBuffer underlyingBuffer; private final long bStart; private final long bStartMinus1; private final boolean bufferReadable; /** * Whether there was a possible partial line separator on the previous * read so we dropped it and it should be appended to next read. */ private int remByte = -1; /** * The current position in the buffer. */ public int bufferPtr; /** * The quantity of valid data in the buffer. */ public int length = -1; private boolean endFound = false; /** * Creates a new instance with the mandatory characters for handling newlines transparently. * lineSeparator the sequence of characters that represent a newline, as defined in {@link Format#getLineSeparator()} * normalizedLineSeparator the normalized newline character (as defined in {@link Format#getNormalizedNewline()}) that is used to replace any lineSeparator sequence found in the input. */ public TextInput(TextParsingSettings settings, InputStream input, DrillBuf readBuffer, long startPos, long endPos) { this.lineSeparator = settings.getNewLineDelimiter(); byte normalizedLineSeparator = settings.getNormalizedNewLine(); Preconditions.checkArgument(input instanceof Seekable, "Text input only supports an InputStream that supports Seekable."); boolean isCompressed = input instanceof CompressionInputStream ; Preconditions.checkArgument(!isCompressed || startPos == 0, "Cannot use split on compressed stream."); // splits aren't allowed with compressed data. The split length will be the compressed size which means we'll normally end prematurely. if(isCompressed && endPos > 0){ endPos = Long.MAX_VALUE; } this.input = input; this.seekable = (Seekable) input; this.settings = settings; if(input instanceof FSDataInputStream){ this.inputFS = (FSDataInputStream) input; this.bufferReadable = inputFS.getWrappedStream() instanceof ByteBufferReadable; }else{ this.inputFS = null; this.bufferReadable = false; } this.startPos = startPos; this.endPos = endPos; this.normalizedLineSeparator = normalizedLineSeparator; this.buffer = readBuffer; this.bStart = buffer.memoryAddress(); this.bStartMinus1 = bStart -1; this.underlyingBuffer = buffer.nioBuffer(0, buffer.capacity()); } /** * Test the input to position for read start. If the input is a non-zero split or * splitFirstLine is enabled, input will move to appropriate complete line. * @throws IOException */ final void start() throws IOException { lineCount = 0; if(startPos > 0){ seekable.seek(startPos); } updateBuffer(); if (length > 0) { if(startPos > 0 || settings.isSkipFirstLine()){ // move to next full record. skipLines(1); } } } /** * Helper method to get the most recent characters consumed since the last record started. * May get an incomplete string since we don't support stream rewind. Returns empty string for now. * @return String of last few bytes. * @throws IOException */ public String getStringSinceMarkForError() throws IOException { return " "; } long getPos(){ return streamPos + bufferPtr; } public void mark(){ streamMark = streamPos; bufferMark = bufferPtr; } /** * read some more bytes from the stream. Uses the zero copy interface if available. Otherwise, does byte copy. * @throws IOException */ private void read() throws IOException { if(bufferReadable){ if(remByte != -1){ for (int i = 0; i <= remByte; i++) { underlyingBuffer.put(lineSeparator[i]); } remByte = -1; } length = inputFS.read(underlyingBuffer); }else{ byte[] b = new byte[underlyingBuffer.capacity()]; if(remByte != -1){ int remBytesNum = remByte + 1; System.arraycopy(lineSeparator, 0, b, 0, remBytesNum); length = input.read(b, remBytesNum, b.length - remBytesNum); remByte = -1; }else{ length = input.read(b); } underlyingBuffer.put(b); } } /** * Read more data into the buffer. Will also manage split end conditions. * @throws IOException */ private void updateBuffer() throws IOException { streamPos = seekable.getPos(); underlyingBuffer.clear(); if(endFound){ length = -1; return; } read(); // check our data read allowance. if(streamPos + length >= this.endPos){ updateLengthBasedOnConstraint(); } charCount += bufferPtr; bufferPtr = 1; buffer.writerIndex(underlyingBuffer.limit()); buffer.readerIndex(underlyingBuffer.position()); } /** * Checks to see if we can go over the end of our bytes constraint on the data. If so, * adjusts so that we can only read to the last character of the first line that crosses * the split boundary. */ private void updateLengthBasedOnConstraint() { final long max = bStart + length; for(long m = bStart + (endPos - streamPos); m < max; m++) { for (int i = 0; i < lineSeparator.length; i++) { long mPlus = m + i; if (mPlus < max) { // we found a line separator and don't need to consult the next byte. if (lineSeparator[i] == PlatformDependent.getByte(mPlus) && i == lineSeparator.length - 1) { length = (int) (mPlus - bStart) + 1; endFound = true; return; } } else { // the last N characters of the read were remnant bytes. We'll hold off on dealing with these bytes until the next read. remByte = i; length = length - i; return; } } } } /** * Get next byte from stream. Also maintains the current line count. Will throw a StreamFinishedPseudoException * when the stream has run out of bytes. * @return next byte from stream. * @throws IOException */ public final byte nextChar() throws IOException { byte byteChar = nextCharNoNewLineCheck(); int bufferPtrTemp = bufferPtr - 1; if (byteChar == lineSeparator[0]) { for (int i = 1; i < lineSeparator.length; i++, bufferPtrTemp++) { if (lineSeparator[i] != buffer.getByte(bufferPtrTemp)) { return byteChar; } } lineCount++; byteChar = normalizedLineSeparator; // we don't need to update buffer position if line separator is one byte long if (lineSeparator.length > 1) { bufferPtr += (lineSeparator.length - 1); if (bufferPtr >= length) { if (length != -1) { updateBuffer(); } else { throw StreamFinishedPseudoException.INSTANCE; } } } } return byteChar; } /** * Get next byte from stream. Do no maintain any line count Will throw a StreamFinishedPseudoException * when the stream has run out of bytes. * @return next byte from stream. * @throws IOException */ public final byte nextCharNoNewLineCheck() throws IOException { if (length == -1) { throw StreamFinishedPseudoException.INSTANCE; } if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { buffer.checkBytes(bufferPtr - 1, bufferPtr); } byte byteChar = PlatformDependent.getByte(bStartMinus1 + bufferPtr); if (bufferPtr >= length) { if (length != -1) { updateBuffer(); bufferPtr--; } else { throw StreamFinishedPseudoException.INSTANCE; } } bufferPtr++; return byteChar; } /** * Number of lines read since the start of this split. * @return */ public final long lineCount() { return lineCount; } /** * Skip forward the number of line delimiters. If you are in the middle of a line, * a value of 1 will skip to the start of the next record. * @param lines Number of lines to skip. * @throws IOException */ public final void skipLines(int lines) throws IOException { if (lines < 1) { return; } long expectedLineCount = this.lineCount + lines; try { do { nextChar(); } while (lineCount < expectedLineCount); if (lineCount < lines) { throw new IllegalArgumentException("Unable to skip " + lines + " lines from line " + (expectedLineCount - lines) + ". End of input reached"); } } catch (EOFException ex) { throw new IllegalArgumentException("Unable to skip " + lines + " lines from line " + (expectedLineCount - lines) + ". End of input reached"); } } public final long charCount() { return charCount + bufferPtr; } public long getLineCount() { return lineCount; } public void close() throws IOException{ input.close(); } }