package org.simpleflatmapper.csv.parser; import java.io.IOException; /** * Consume the charBuffer. */ public final class CharConsumer { public static final int ROW_DATA = 64; public static final int COMMENTED = 32; public static final int CELL_DATA = 16; public static final int ESCAPED = 8; public static final int LAST_CHAR_WAS_SEPARATOR = 4; public static final int LAST_CHAR_WAS_CR = 2; public static final int ESCAPED_AREA = 1; public static final int NONE = 0; private static final int TURN_OFF_LAST_CHAR_MASK = ~(LAST_CHAR_WAS_CR|LAST_CHAR_WAS_SEPARATOR); private static final int TURN_OFF_ESCAPED_AREA = ~(ESCAPED_AREA); private static final char LF = '\n'; private static final char CR = '\r'; private static final char SPACE = ' '; private static final char COMMENT = '#'; private final CharBuffer csvBuffer; private final TextFormat textFormat; private final CellPreProcessor cellPreProcessor; private int _currentIndex = 0; private int _currentState = NONE; public CharConsumer(CharBuffer csvBuffer, TextFormat textFormat, CellPreProcessor cellPreProcessor) { this.csvBuffer = csvBuffer; this.cellPreProcessor = cellPreProcessor; this.textFormat = textFormat; } public final void consumeAllBuffer(final CellConsumer cellConsumer) { final boolean notIgnoreLeadingSpace = !cellPreProcessor.ignoreLeadingSpace(); final boolean yamlComment = textFormat.yamlComment; final char escapeChar = textFormat.escapeChar; final char separatorChar = textFormat.separatorChar; int currentState = _currentState; int currentIndex = _currentIndex; final char[] chars = csvBuffer.buffer; final int bufferSize = csvBuffer.bufferSize; while(currentIndex < bufferSize) { // unescaped loop if ((currentState & ESCAPED_AREA) == 0) { if ((currentState & COMMENTED) == 0) { while (currentIndex < bufferSize) { final char character = chars[currentIndex]; final int cellEnd = currentIndex; currentIndex++; if (character == separatorChar) { // separator cellPreProcessor.newCell(chars, csvBuffer.mark, cellEnd, cellConsumer, currentState); csvBuffer.mark = currentIndex; currentState = LAST_CHAR_WAS_SEPARATOR | ROW_DATA; continue; } else if (character == LF) { // \n if ((currentState & LAST_CHAR_WAS_CR) == 0) { cellPreProcessor.newCell(chars, csvBuffer.mark, cellEnd, cellConsumer, currentState); cellConsumer.endOfRow(); } csvBuffer.mark = currentIndex; currentState = NONE; continue; } else if (character == CR) { // \r cellPreProcessor.newCell(chars, csvBuffer.mark, cellEnd, cellConsumer, currentState); csvBuffer.mark = currentIndex; currentState = LAST_CHAR_WAS_CR; cellConsumer.endOfRow(); continue; } else if (((currentState ^ CELL_DATA) & (ESCAPED | CELL_DATA)) != 0 && character == escapeChar) { currentState = ESCAPED_AREA | ESCAPED; break; } else if (yamlComment && (currentState & (CELL_DATA | ROW_DATA)) == 0 && character == COMMENT) { currentState |= COMMENTED; break; } currentState &= TURN_OFF_LAST_CHAR_MASK; if (notIgnoreLeadingSpace || character != SPACE) { currentState |= CELL_DATA; } } } else { // comment int nextEndOfLineChar = findNexEndOfLineChar(chars, currentIndex, bufferSize); if (nextEndOfLineChar != -1) { cellPreProcessor.newCell(chars, csvBuffer.mark, nextEndOfLineChar, cellConsumer, currentState); cellConsumer.endOfRow(); currentIndex = nextEndOfLineChar + 1; csvBuffer.mark = currentIndex; currentState = chars[nextEndOfLineChar] == CR ? LAST_CHAR_WAS_CR : NONE; } else { currentIndex = bufferSize; } } } else { // escaped area int nextEscapeChar = findNexChar(chars, currentIndex, bufferSize, escapeChar); if (nextEscapeChar != -1) { currentIndex = nextEscapeChar + 1; currentState &= TURN_OFF_ESCAPED_AREA; } else { currentIndex = bufferSize; } } } _currentState = currentState; _currentIndex = currentIndex; } public final boolean consumeToNextRow(CellConsumer cellConsumer) { final boolean notIgnoreLeadingSpace = !cellPreProcessor.ignoreLeadingSpace(); final char escapeChar = textFormat.escapeChar; final char separatorChar = textFormat.separatorChar; final boolean yamlComment = textFormat.yamlComment; int currentState = _currentState; int currentIndex = _currentIndex; final char[] chars = csvBuffer.buffer; final int bufferSize = csvBuffer.bufferSize; while(currentIndex < bufferSize) { // unescaped loop if ((currentState & ESCAPED_AREA) == 0) { if ((currentState & COMMENTED) == 0) { while(currentIndex < bufferSize) { final char character = chars[currentIndex]; final int cellEnd = currentIndex; currentIndex ++; if (character == separatorChar) { // separator cellPreProcessor.newCell(chars, csvBuffer.mark, cellEnd, cellConsumer, currentState); csvBuffer.mark = currentIndex; currentState = LAST_CHAR_WAS_SEPARATOR | ROW_DATA; continue; } else if (character == LF) { // \n if ((currentState & LAST_CHAR_WAS_CR) == 0) { cellPreProcessor.newCell(chars, csvBuffer.mark, cellEnd, cellConsumer, currentState); if (cellConsumer.endOfRow()) { csvBuffer.mark = currentIndex; _currentState = NONE; _currentIndex = currentIndex; return true; } } csvBuffer.mark = currentIndex; currentState = NONE; continue; } else if (character == CR) { // \r cellPreProcessor.newCell(chars, csvBuffer.mark, cellEnd, cellConsumer, currentState); csvBuffer.mark = currentIndex; currentState = LAST_CHAR_WAS_CR; if (cellConsumer.endOfRow()) { _currentState = currentState; _currentIndex = currentIndex; return true; } continue; } else if (((currentState ^ CELL_DATA) & (ESCAPED | CELL_DATA)) != 0 && character == escapeChar) { currentState = ESCAPED_AREA | ESCAPED; break; } else if (yamlComment && (currentState & (CELL_DATA | ROW_DATA)) == 0 && character == COMMENT) { currentState |= COMMENTED; break; } currentState &= TURN_OFF_LAST_CHAR_MASK; if (notIgnoreLeadingSpace || character != SPACE) { currentState |= CELL_DATA; } } } else { int nextEndOfLineChar = findNexEndOfLineChar(chars, currentIndex, bufferSize); if (nextEndOfLineChar != -1) { currentIndex = nextEndOfLineChar + 1; cellPreProcessor.newCell(chars, csvBuffer.mark, nextEndOfLineChar, cellConsumer, currentState); csvBuffer.mark = currentIndex; currentState = chars[nextEndOfLineChar] == CR ? LAST_CHAR_WAS_CR : NONE; if (cellConsumer.endOfRow()) { _currentState = currentState; _currentIndex = currentIndex; return true; } } else { currentIndex = bufferSize; } } } else { int nextEscapeChar = findNexChar(chars, currentIndex, bufferSize, escapeChar); if (nextEscapeChar != -1) { currentIndex = nextEscapeChar + 1; currentState &= TURN_OFF_ESCAPED_AREA; } else { currentIndex = bufferSize; } } } _currentState = currentState; _currentIndex = currentIndex; return false; } private int findNexChar(char[] chars, int start, int end, char c) { for(int i = start; i < end; i++) { if (chars[i] == c) return i; } return -1; } private int findNexEndOfLineChar(char[] chars, int start, int end) { for(int i = start; i < end; i++) { char c = chars[i]; if (c == CR || c == LF) return i; } return -1; } public final void finish(CellConsumer cellConsumer) { if ( hasUnconsumedData() || (_currentState & LAST_CHAR_WAS_SEPARATOR) != 0) { cellPreProcessor.newCell(csvBuffer.buffer, csvBuffer.mark, _currentIndex, cellConsumer, _currentState); csvBuffer.mark = _currentIndex + 1; _currentState = NONE; } cellConsumer.end(); } private boolean hasUnconsumedData() { return _currentIndex > csvBuffer.mark; } public boolean next() throws IOException { int mark = csvBuffer.mark; boolean b = csvBuffer.next(); _currentIndex -= mark - csvBuffer.mark; return b; } }