/* * MIT License * * Copyright (c) 2016 mikes * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ package com.thinkbiganalytics.inputformat.hadoop.mapred; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import java.io.IOException; import java.io.InputStream; /** * A class that provides an escaped line reader from an input stream. */ public class EscapedLineReader { private static final byte DEFAULT_ESCAPE_CHARACTER = '\\'; private static final int DEFAULT_BUFFER_SIZE = 64 * 1024; private static final byte CR = '\r'; private static final byte LF = '\n'; private int bufferSize = DEFAULT_BUFFER_SIZE; private InputStream in; private byte[] buffer; // the number of bytes in the real buffer private int bufferLength; // the current position of the buffer private int bufferPos; private byte escapeChar; /** * Create a multi-line reader that reads from the given stream using the * given buffer-size. * * @param in The input stream * @param bufferSize Size of the read buffer */ public EscapedLineReader(InputStream in, int bufferSize, byte escapeChar) { this.escapeChar = escapeChar; this.in = in; this.bufferSize = bufferSize; this.buffer = new byte[this.bufferSize]; } /** * Create a multi-line reader that reads from the given stream using the * default buffer-size (64K). * * @param in The input stream */ public EscapedLineReader(InputStream in, byte escapeChar) { this(in, DEFAULT_BUFFER_SIZE, escapeChar); } public EscapedLineReader(InputStream in) { this(in, DEFAULT_BUFFER_SIZE, DEFAULT_ESCAPE_CHARACTER); } /** * Create a multi-line reader that reads from the given stream using the * <code>io.file.buffer.size</code> specified in the given * <code>Configuration</code>. * * @param in input stream * @param conf configuration */ public EscapedLineReader(InputStream in, Configuration conf) throws IOException { this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE), DEFAULT_ESCAPE_CHARACTER); } public EscapedLineReader(InputStream in, Configuration conf, byte escapeChar) throws IOException { this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE), escapeChar); } /** * Close the underlying stream. */ public void close() throws IOException { in.close(); } /** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF), '\r' (CR), * or '\r\n' (CR+LF). Will ignore any of these termination characters * if they are proceeded by a designated escape character. EOF also * terminates an otherwise unterminated line. * * @param str the object to store the given line (without the newline) * @param maxLineLength the maximum number of bytes to store into str; the rest will be silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume in this call. This is only a hint, because if the line crosses this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * @return the number of bytes read including the (longest) newline found */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; // tracks str.getLength() as an optimization int newLineLength = 0; // length of the terminating newline boolean prevCharCR = false; // true if prev char was \r long bytesConsumed = 0; do { int startPos = bufferPos; // starting from where we left off if (bufferPos >= bufferLength) { startPos = bufferPos = 0; if (prevCharCR) { ++bytesConsumed; // account for CR from previous read } bufferLength = in.read(buffer); if (bufferLength <= 0) { break; // EOF } } for (; bufferPos < bufferLength; ++bufferPos) { boolean escaped = false; if (prevCharCR && bufferPos > 1) { escaped = (buffer[bufferPos - 2] == escapeChar); } if (!prevCharCR && bufferPos > 0) { escaped = (buffer[bufferPos - 1] == escapeChar); } if (buffer[bufferPos] == LF && !escaped) { newLineLength = prevCharCR ? 2 : 1; ++bufferPos; // at next loop proceed from following byte break; } if (prevCharCR && !escaped) { // CR + notLF, we are at notLF newLineLength = 1; break; } prevCharCR = (buffer[bufferPos] == CR); //prevCharCR = (buffer[bufferPos] == CR && !escaped); } int readLength = bufferPos - startPos; if (prevCharCR && newLineLength == 0) { --readLength; } bytesConsumed += readLength; int appendLength = readLength - newLineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPos, appendLength); txtLength += appendLength; } } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); } return (int) bytesConsumed; } /** * Read from the InputStream into the given Text. * * @param str the object to store the given line * @param maxLineLength the maximum number of bytes to store into str * @return the number of bytes read including newline * @throws IOException if the underlying stream throws */ public int readLine(Text str, int maxLineLength) throws IOException { return readLine(str, maxLineLength, Integer.MAX_VALUE); } /** * Read from the InputStream into the given Text. * * @param str the object to store the given line * @return the number of bytes read including newline * @throws IOException if the underlying stream throws */ public int readLine(Text str) throws IOException { return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE); } }