/** * Copyright (c) 2002-2014 "Neo Technology," * Network Engine for Objects in Lund AB [http://neotechnology.com] * * This file is part of Neo4j. * * Neo4j is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.ldbc.driver.csv.charseeker; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.nio.CharBuffer; import static java.lang.Math.max; /** * Much like a {@link BufferedReader} for a {@link Reader}. */ public class BufferedCharSeeker implements CharSeeker { private static final int KB = 1024, MB = KB * KB; public static final int DEFAULT_BUFFER_SIZE = 2 * MB; public static final char DEFAULT_QUOTE_CHAR = '"'; private static final char EOL_CHAR = '\n'; private static final char EOL_CHAR_2 = '\r'; private static final char EOF_CHAR = (char) -1; private static final char BACK_SLASH = '\\'; private final CharReadable reader; private final char[] buffer; // Wraps the char[] buffer and is only used during reading more data, using f.ex. compact() // so that we don't have to duplicate that functionality. private final CharBuffer charBuffer; private int bufferPos; private long lineStartPos; private int seekStartPos; private int lineNumber = 1; private boolean eof; private final char quoteChar; public BufferedCharSeeker(CharReadable reader) { this(reader, DEFAULT_BUFFER_SIZE, DEFAULT_QUOTE_CHAR); } public BufferedCharSeeker(CharReadable reader, int bufferSize) { this(reader, bufferSize, DEFAULT_QUOTE_CHAR); } public BufferedCharSeeker(CharReadable reader, int bufferSize, char quoteChar) { this.reader = reader; this.buffer = new char[bufferSize]; this.charBuffer = CharBuffer.wrap(buffer); this.bufferPos = bufferSize; this.quoteChar = quoteChar; } @Override public boolean seek(Mark mark, int[] untilOneOfChars) throws IOException { if (eof) { // We're at the end return eof(mark); } // Keep a start position in case we need to further fill the buffer in nextChar, a value can at maximum be the // whole buffer, so max one fill per value is supported. seekStartPos = bufferPos; // seekStartPos updated in nextChar if buffer flips over, that's why it's a member int ch; int endOffset = 1; int skippedChars = 0; int quoteDepth = 0; while (!eof) { ch = nextChar(skippedChars); if (quoteDepth == 0) { // In normal mode, i.e. not within quotes if (ch == quoteChar && seekStartPos == bufferPos - 1/* -1 since we just advanced one */) { // We found a quote, which was the first of the value, skip it and switch mode quoteDepth++; seekStartPos++; continue; } else if (isNewLine(ch)) { // Encountered newline, done for now break; } else { for (int i = 0; i < untilOneOfChars.length; i++) { if (ch == untilOneOfChars[i]) { // We found a delimiter, set marker and return true mark.set(lineNumber, seekStartPos, bufferPos - endOffset - skippedChars, ch); return true; } } } } else { // In quoted mode, i.e. within quotes if (ch == quoteChar) { // Found a quote within a quote, peek at next char int nextCh = peekChar(); if (nextCh == quoteChar) { // Found a double quote, skip it and we're going down one more quote depth (quote-in-quote) repositionChar(bufferPos++, ++skippedChars); quoteDepth = quoteDepth == 1 ? 2 : 1; // toggle between quote and quote-in-quote } else { // Found an ending quote, skip it and switch mode endOffset++; quoteDepth--; } } else if ((ch == EOL_CHAR || ch == EOL_CHAR_2)) { // Found a new line, just keep going nextChar(skippedChars); } else if (ch == BACK_SLASH) { // Legacy concern, support java style quote encoding int nextCh = peekChar(); if (nextCh == quoteChar) { // Found a slash encoded quote repositionChar(bufferPos++, ++skippedChars); } } } } int valueLength = bufferPos - seekStartPos - 1; if (eof && valueLength == 0 && seekStartPos == lineStartPos) { // We didn't find any of the characters sought for return eof(mark); } // We found the last value of the line or stream skippedChars += skipEolChars(); mark.set(lineNumber, seekStartPos, bufferPos - endOffset - skippedChars, Mark.END_OF_LINE_CHARACTER); lineNumber++; lineStartPos = bufferPos; return true; } private void repositionChar(int offset, int stepsBack) { // We reposition characters because we might have skipped some along the way, double-quotes and what not. // We want to take an as little hit as possible for that, so we reposition each character as long as // we're still reading the same value. All other values will not have to take any hit of skipped chars // for this particular value. buffer[offset - stepsBack] = buffer[offset]; } private boolean isNewLine(int ch) { return ch == EOL_CHAR || ch == EOL_CHAR_2; } private int peekChar() throws IOException { fillBufferIfWeHaveExhaustedIt(); return buffer[bufferPos]; } private boolean eof(Mark mark) { mark.set(lineNumber, -1, -1, Mark.END_OF_LINE_CHARACTER); return false; } @Override public <EXTRACTOR extends Extractor<?>> EXTRACTOR extract(Mark mark, EXTRACTOR extractor) { long from = mark.startPosition(); long to = mark.position(); extractor.extract(buffer, (int) (from), (int) (to - from)); return extractor; } private int skipEolChars() throws IOException { int skipped = 0; while (isNewLine(nextChar(0/*doesn't matter since we ignore the chars anyway*/))) { // Just loop through, skipping them skipped++; } bufferPos--; // since nextChar advances one step return skipped; } private int nextChar(int skippedChars) throws IOException { fillBufferIfWeHaveExhaustedIt(); int ch = buffer[bufferPos++]; if (skippedChars > 0) { repositionChar(bufferPos - 1, skippedChars); } if (ch == EOF_CHAR) { eof = true; } return ch; } private void fillBufferIfWeHaveExhaustedIt() throws IOException { if (bufferPos >= buffer.length) { if (seekStartPos == 0) { throw new IllegalStateException("Tried to read in a value larger than buffer size " + buffer.length); } charBuffer.position(seekStartPos); charBuffer.compact(); int remaining = charBuffer.remaining(); int read = reader.read(buffer, charBuffer.position(), remaining); if (read < remaining) { buffer[charBuffer.position() + max(read, 0)] = EOF_CHAR; } bufferPos = charBuffer.position(); seekStartPos = 0; } } @Override public void close() throws IOException { reader.close(); } @Override public String toString() { return getClass().getSimpleName() + "[buffer:" + charBuffer + ", seekPos:" + seekStartPos + ", line:" + lineNumber + "]"; } }