/** * Copyright 2011-2017 Asakusa Framework Team. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.asakusafw.runtime.io.text.directio; import java.io.IOException; import java.io.InputStream; /** * Trims {@link InputStream} to provide only contents in the current split for line separated text. * This requires that each {@code 0x0d} byte in input always represents line feed (LF). * @since 0.9.1 * @see InputSplitters */ public final class LineFeedDelimitedInputStream extends InputStream { /* * NOTE: splitting text files * 0. [input] is already skipped until [offset] * 1. scan the first record end from [offset] and drop the range if offset > 0: it is part of the previous split * 2. scan the record end from [offset]+[length] and take the range * * # offset == 0: * ---------------- (original text = input) -------------- ... ->EOF * <------(length)-------> * <----(guaranteed)----->---(scan)-->$ * <=========(CURRENT SPLIT)==========><---(next split)--- ... -> * * # offset > 0: * ## guaranteed > 0 * ... --------------------------------- (original text) -------------------------------- ... ->EOF * ... --(skipped)-->------------------------(input)------------------------------------- ... ->EOF * ... --(offset)---><-------------(length)-------------> * ---(scan)-->$<----(guaranteed)----->---(scan)-->$ * ... ------(prev split)--------><=========(CURRENT SPLIT)==========><---(next split)--- ... -> * * ## guaranteed < 0 -> current split becomes empty * ... --------------------------------- (original text) -------------------------------- ... ->EOF * ... --(skipped)-->------------------------(input)------------------------------------- ... ->EOF * ... --(offset)---><-------------(length)-------------> * -------------(scan)----------------->$ * ... ------(prev split)---------------------------------><---------(next split)-------- ... -> * * ## guaranteed = 0 -> current split becomes NOT empty, and the next split will skip the cascaded range * ... --------------------------------- (original text) -------------------------------- ... ->EOF * ... --(skipped)-->------------------------(input)------------------------------------- ... ->EOF * ... --(offset)---><-------------(length)-------------> * -------------(scan)--------------->$-----(scan)---->$ * ... ------(prev split)-------------------------------><===(CURRENT)===><----(next)---- ... -> */ private final InputStream source; private final byte[] buffer; private int bufferPosition; private int bufferLimit; private final StateMachine stateMachine; private boolean skipUntilPrevSplitEnd; private long guaranteedSplitRest; /** * Creates a new instance. * @param source the source input stream * @param offset the current stream position from the original head, in bytes * @param length the split length from the current stream position; * the split may become smaller if the stream does not have enough size, * and is continue until the last record end was appeared over this length */ public LineFeedDelimitedInputStream(InputStream source, long offset, long length) { this(source, offset, length, new StateMachine()); } private LineFeedDelimitedInputStream(InputStream source, long offset, long length, StateMachine stateMachine) { this.source = source; this.buffer = new byte[1024]; this.stateMachine = stateMachine; this.skipUntilPrevSplitEnd = offset > 0; this.guaranteedSplitRest = length; } @Override public int read() throws IOException { if (prepare()) { return buffer[bufferPosition++]; } else { return -1; } } @Override public int read(byte[] b, int off, int len) throws IOException { if (prepare()) { int pos = bufferPosition; int remaining = bufferLimit - pos; assert remaining > 0; int read = Math.min(remaining, len); System.arraycopy(buffer, pos, b, off, read); bufferPosition = pos + read; return read; } return -1; } private boolean prepare() throws IOException { // has remaining if (bufferPosition < bufferLimit) { return true; } // skip until the last end of split if (skipUntilPrevSplitEnd) { skipUntilPrevSplitEnd = false; doSkipUntilPrevSplitEnd(); return prepare(); } // already saw end-of-split if (stateMachine.isFinished()) { return false; } // read from the upstream int read = source.read(buffer); assert read != 0; if (read < 0) { // no more contents forceEof(); return false; } computeBufferRange(0, read); assert bufferPosition < bufferLimit; return true; } private void doSkipUntilPrevSplitEnd() throws IOException { StateMachine sm = stateMachine; assert sm.isFinished() == false; assert bufferPosition == 0; assert bufferLimit == 0; long rest = guaranteedSplitRest; while (true) { int read = source.read(buffer); if (read < 0) { // previous split end is EOF: the current split has no more records forceEof(); break; } int position = findEndOfSplit(0, read); rest -= position < 0 ? read : position; if (rest < 0) { // previous split end exceeds the current split range: the current split has no more records forceEof(); break; } if (position >= 0) { // found the previous end // reset state machine to compute end of the current split sm.reset(); guaranteedSplitRest = rest; computeBufferRange(position, read); break; } } } private void computeBufferRange(int start, int end) { bufferPosition = start; bufferLimit = end; int length = end - start; if (length <= guaranteedSplitRest) { // the rest buffer content is in the current split range guaranteedSplitRest -= length; } else { // buffer range was exceeded from the current split range: we drop after the end of split int found = findEndOfSplit(start + (int) guaranteedSplitRest, end); if (found >= 0) { assert stateMachine.isFinished(); bufferLimit = found; } guaranteedSplitRest = 0L; } } private int findEndOfSplit(int start, int limit) { return stateMachine.findEndOfSplit(buffer, start, limit); } private void forceEof() { bufferPosition = 0; bufferLimit = 0; stateMachine.finish(); } @Override public int available() throws IOException { return bufferLimit - bufferPosition; } @Override public void close() throws IOException { source.close(); } private static final class StateMachine { static final byte LF = '\n'; private boolean sawEndOfLine; StateMachine() { reset(); } void reset() { this.sawEndOfLine = false; } void finish() { this.sawEndOfLine = true; } boolean isFinished() { return sawEndOfLine; } int findEndOfSplit(byte[] contents, int start, int end) { assert isFinished() == false; for (int i = start; i < end; i++) { if (contents[i] == LF) { sawEndOfLine = true; return i + 1; } } return -1; } } }