package org.supercsv.io; import java.io.IOException; import java.io.LineNumberReader; import java.io.Reader; import java.util.List; import org.supercsv.exception.SuperCSVException; import org.supercsv.prefs.CsvPreference; import org.supercsv.util.CSVContext; /** * The tokenizer is an internal mechanism to the csv parser * * @author Kasper B. Graversen */ public class Tokenizer implements ITokenizer { CsvPreference preferences; LineNumberReader lnr; StringBuilder sb = null; public Tokenizer(final Reader stream, final CsvPreference preference) { this.preferences = preference; lnr = new LineNumberReader(stream); sb = new StringBuilder(500); } private static void addSpaces(final StringBuilder sb, final int spaces) { for( int i = 0; i < spaces; i++ ) { sb.append(" "); } } /** * {@inheritDoc} */ public void close() throws IOException { lnr.close(); } /** * {@inheritDoc} */ public int getLineNumber() { return lnr.getLineNumber(); } /** * {@inheritDoc} */ public boolean readStringList(final List<String> result) throws IOException { result.clear(); PARSERSTATE state = PARSERSTATE.NORMAL; // start out in normal mode // fast access to preferences final int quote = preferences.getQuoteChar(); final int delim = preferences.getDelimiterChar(); String line; // read non-empty lines only do { line = lnr.readLine(); if( line == null ) { return false; // EOF } } while( line.length() == 0 ); // skip zero len lines // start parsing line += "\n"; // add a newline to determine end of line (making // parsing easier) sb.delete(0, sb.length()); // reset the stringbuilder // proccess the line (and maybe more lines of the file) int p = 0; // the pos of the cursor on the line int linenoQuoteState = -1; // the line number of the file where a potential multiline cell starts int potentialSpaces = 0; // spaces between words or after the last word. // when in non-quote mode, count the spaces and add them only if a non-delimiter or non-end-of-line is met // (otherwise its just empty stuff at the end of the cell which is ignored such as 'foo ,' which is read as // 'foo' whereas 'foo a' -> 'foo a' while( true ) { // relies on p being incremented at least at the end of the while final char c = line.charAt(p); // react to char c depending on the state we are in switch( state ) { case NORMAL: // if(log.isDebugEnabled()) log.debug("normal " + p); if( c == delim ) { result.add(sb.toString()); // save token sb.delete(0, sb.length()); // reset the stringbuilder potentialSpaces = 0; break; // read more } else if( c == ' ' ) { // trim starting spaces (trailing spaces // are removed using the String.trim() if( sb.length() > 0 ) { // first on the line potentialSpaces++; } break; // read more } else if( c == '\n' ) { // save token result.add(sb.toString()); // done at start of method: sb.delete(0, sb.length()); // reset the stringbuilder // done at start of method: potentialSpaces = 0; return true; // we've read a line } else if( c == quote ) { if( sb.length() == 0 ) { // quote first on line cannot be escaped state = PARSERSTATE.QUOTESCOPE; // update variable in order to do debug statements linenoQuoteState = getLineNumber(); break; // read more } else if( line.charAt(p + 1) == quote && sb.length() > 0 ) { // an escaped quote - can not happen as first character, hence the "sb.length > 0" addSpaces(sb, potentialSpaces); potentialSpaces = 0; sb.append(c); // add and skip the first quote // (end of switch will skip the next quote) p++; break; // read more } else if( line.charAt(p + 1) != quote ) { // a single quote, change state and don't // append state = PARSERSTATE.QUOTESCOPE; // update variable in order to do debug statements linenoQuoteState = getLineNumber(); addSpaces(sb, potentialSpaces); potentialSpaces = 0; break; // read more } } else { // if just a normal character addSpaces(sb, potentialSpaces); potentialSpaces = 0; sb.append(c); // add the char } break; // for each situation above, repeat now in the quote scope case QUOTESCOPE: // System.out.println("quote: '" + p + "'"); if( c == '\n' ) { // newline does not count as newline in // quote scope sb.append('\n'); // parse the next line of the file p = -1; // reset delta to point to start of new line (set to // -1 as it will be incremented to 0 at the end of // the switch) line = lnr.readLine(); if( line == null ) { throw new SuperCSVException( "File ended unexpectedly while reading a quoted cell starting on line: " + linenoQuoteState, new CSVContext(linenoQuoteState, 0)); } line += '\n'; // add \n to make parsing easy break; // read more } else if( c == quote ) { if( line.charAt(p + 1) == quote ) { // an escaped quote, sb.append(c); // add and skip the first quote (end of // switch will skip the next quote p++; break; // read more } else { // if(line.charAt(p + 1) != quote) { // a single quote, only change state state = PARSERSTATE.NORMAL; break; // read more } } else { // if just a normal character or delimiter (they don't count in this mode) sb.append(c); // add the char // System.out.println("Adding char '" + c + "'"); } break; default: throw new RuntimeException("this can never happen!"); } // switch p++; // read next char of the line } } }