/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import java.io.IOException;

import static org.apache.commons.csv.Token.Type.*;

/**
 * Lexer that reads characters from the underlying reader and fills
 * {@link Token} objects with terms, record ends, comments and the
 * end-of-file marker.
 */
class CSVLexer extends Lexer {

    // ctor needs to be public so can be called dynamically by PerformanceTest class
    public CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
        super(format, in);
    }

    /**
     * Returns the next token.
     * <p>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     *
     * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
     * @return the next token found (the same instance as {@code tkn})
     * @throws java.io.IOException on stream access error
     */
    @Override
    Token nextToken(Token tkn) throws IOException {

        // get the last read char (required for empty line detection)
        int lastChar = in.readAgain();

        // read the next char and set eol
        int c = in.read();

        /*
         * Note:
         * The following call will swallow LF if c == CR.
         * But we don't need to know if the last char
         * was CR or LF - they are equivalent here.
         */
        boolean eol = isEndOfLine(c);

        // empty line detection: eol AND (last char was EOL or beginning)
        if (emptyLinesIgnored) {
            while (eol && isStartOfLine(lastChar)) {
                // go one char ahead ...
                lastChar = c;
                c = in.read();
                eol = isEndOfLine(c);
                // reached end of file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    tkn.type = EOF;
                    // don't set tkn.isReady here because no content
                    return tkn;
                }
            }
        }

        // did we reach eof during the last iteration already ? EOF
        if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) {
            tkn.type = EOF;
            // don't set tkn.isReady here because no content
            return tkn;
        }

        // a comment marker at the start of a line turns the rest of the line into a COMMENT token
        if (isStartOfLine(lastChar) && isCommentStart(c)) {
            String line = in.readLine();
            if (line == null) {
                // The comment marker was the very last character of the input:
                // readLine() signals end of stream with null, so report EOF
                // instead of failing with a NullPointerException.
                tkn.type = EOF;
                // don't set tkn.isReady here because no content
                return tkn;
            }
            tkn.content.append(line.trim());
            tkn.type = COMMENT;
            return tkn;
        }

        // important: make sure a new char gets consumed in each iteration
        while (tkn.type == INVALID) {
            // ignore whitespaces at beginning of a token
            if (surroundingSpacesIgnored) {
                while (isWhitespace(c) && !eol) {
                    c = in.read();
                    eol = isEndOfLine(c);
                }
            }

            // ok, start of token reached: encapsulated, or token
            if (isDelimiter(c)) {
                // empty token: TOKEN with empty content
                tkn.type = TOKEN;
            } else if (eol) {
                // empty token: EORECORD with empty content
                tkn.type = EORECORD;
            } else if (isEncapsulator(c)) {
                // consume encapsulated token
                encapsulatedTokenLexer(tkn);
            } else if (isEndOfFile(c)) {
                // end of file: EOF with empty content
                tkn.type = EOF;
                tkn.isReady = true; // there is data at EOF
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                simpleTokenLexer(tkn, c);
            }
        }
        return tkn;
    }

    /**
     * A simple token lexer
     * <p>
     * Simple token are tokens which are not surrounded by encapsulators.
     * A simple token might contain escaped delimiters (as \, or \;). The
     * token is finished when one of the following conditions become true:
     * <ul>
     *   <li>end of line has been reached (EORECORD)</li>
     *   <li>end of stream has been reached (EOF)</li>
     *   <li>an unescaped delimiter has been reached (TOKEN)</li>
     * </ul>
     *
     * @param tkn the current token
     * @param c   the current character
     * @return the filled token
     * @throws IOException on stream access error
     */
    private Token simpleTokenLexer(Token tkn, int c) throws IOException {
        // Faster to use while(true)+break than while(tkn.type == INVALID)
        while (true) {
            if (isEndOfLine(c)) {
                tkn.type = EORECORD;
                break;
            } else if (isEndOfFile(c)) {
                tkn.type = EOF;
                tkn.isReady = true; // There is data at EOF
                break;
            } else if (isDelimiter(c)) {
                tkn.type = TOKEN;
                break;
            } else if (isEscape(c)) {
                // append the character produced by the escape sequence, then continue
                tkn.content.append((char) readEscape());
                c = in.read();
            } else {
                // ordinary character: append it, then continue
                tkn.content.append((char) c);
                c = in.read();
            }
        }

        if (surroundingSpacesIgnored) {
            trimTrailingSpaces(tkn.content);
        }

        return tkn;
    }

    /**
     * An encapsulated token lexer
     * <p>
     * Encapsulated tokens are surrounded by the given encapsulating-string.
     * The encapsulator itself might be included in the token using a
     * doubling syntax (as "", '') or using escaping (as in \", \').
     * Whitespaces before and after an encapsulated token are ignored.
     * The token is finished when one of the following conditions become true:
     * <ul>
     *   <li>an unescaped encapsulator has been reached, and is followed by optional whitespace then:
     *     <ul>
     *       <li>delimiter (TOKEN)</li>
     *       <li>end of line (EORECORD)</li>
     *     </ul>
     *   </li>
     *   <li>end of stream has been reached (EOF)</li>
     * </ul>
     *
     * @param tkn the current token
     * @return a valid token object
     * @throws IOException on invalid state:
     *                     EOF before closing encapsulator or invalid character before delimiter or EOL
     */
    private Token encapsulatedTokenLexer(Token tkn) throws IOException {
        // save current line number in case needed for IOE
        int startLineNumber = getLineNumber();
        int c;
        while (true) {
            c = in.read();

            if (isEscape(c)) {
                tkn.content.append((char) readEscape());
            } else if (isEncapsulator(c)) {
                if (isEncapsulator(in.lookAhead())) {
                    // double or escaped encapsulator -> add single encapsulator to token
                    c = in.read();
                    tkn.content.append((char) c);
                } else {
                    // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                    while (true) {
                        c = in.read();
                        if (isDelimiter(c)) {
                            tkn.type = TOKEN;
                            return tkn;
                        } else if (isEndOfFile(c)) {
                            tkn.type = EOF;
                            tkn.isReady = true; // There is data at EOF
                            return tkn;
                        } else if (isEndOfLine(c)) {
                            tkn.type = EORECORD;
                            return tkn;
                        } else if (!isWhitespace(c)) {
                            // error invalid char between token and next delimiter
                            throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
                        }
                    }
                }
            } else if (isEndOfFile(c)) {
                // error condition (end of file before end of token)
                throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
            } else {
                // consume character
                tkn.content.append((char) c);
            }
        }
    }
}