// **********************************************************************
//
// <copyright>
//
// BBN Technologies
// 10 Moulton Street
// Cambridge, MA 02138
// (617) 873-8000
//
// Copyright (C) BBNT Solutions LLC. All rights reserved.
//
// </copyright>
// **********************************************************************
//
// $Source: /cvs/distapps/openmap/src/openmap/com/bbn/openmap/util/CSVTokenizer.java,v $
// $RCSfile: CSVTokenizer.java,v $
// $Revision: 1.6 $
// $Date: 2008/03/03 16:44:13 $
// $Author: dietrick $
//
// **********************************************************************
package com.bbn.openmap.util;
/**
 * Tokenizer for comma separated values files, at least as generated by Excel.
 * <p>
 * token() returns the next token, which can be either:
 * <ul>
 * <li>the EMPTY object, indicating an empty field.
 * <li>a Double, indicating a numeric field (only when numbers are not being
 * read as strings).
 * <li>a String, indicating an alphanumeric field.
 * <li>the NEWLINE object, indicating the end of a record.
 * <li>the EOF object, test with isEOF(), indicating the end of file.
 * </ul>
 */
public class CSVTokenizer extends Tokenizer {

    /** A flag that makes the tokenizer read numbers as strings. */
    boolean numberReadAsString = false;

    /**
     * The token most recently returned by {@link #token()}. It starts out as
     * NEWLINE so that a comma at the very beginning of the input is
     * recognized as following an empty first field.
     */
    protected Object lastTokened = NEWLINE;

    /**
     * Create a tokenizer that converts numeric looking fields to Doubles.
     * 
     * @param in input Reader
     */
    public CSVTokenizer(java.io.Reader in) {
        super(in);
    }

    /**
     * If numberReadAsString is true, then any number will be maintained as a
     * String.
     * 
     * @param in input Reader
     * @param numberReadAsString true if numbers should be kept as Strings
     */
    public CSVTokenizer(java.io.Reader in, boolean numberReadAsString) {
        super(in);
        this.numberReadAsString = numberReadAsString;
    }

    /**
     * Read the next token and remember it in {@link #lastTokened}.
     * 
     * @return the next object read from the stream: EMPTY, NEWLINE, EOF, or
     *         the value of a field.
     */
    public Object token() {
        int c = next();
        Object ret;
        if (c == ',') {
            if (lastTokened == NEWLINE) {
                // A comma right after a record boundary means the first
                // field of the record is empty. The comma is put back so
                // the next call sees it as the separator before field two.
                putback(c);
                ret = EMPTY;
            } else {
                ret = tokenAfterComma();
            }
        } else if (c == '\n') {
            ret = NEWLINE;
        } else {
            ret = tokenFieldStartingWith(c);
        }
        lastTokened = ret;
        return ret;
    }

    /**
     * Return the next object read from the stream, called if a comma is found
     * first in order to catch empty fields accurately. A comma or newline
     * directly after the separator is put back and reported as EMPTY. If the
     * end of input directly follows the separator, EOF is returned and no
     * EMPTY token is produced for that trailing field.
     * 
     * @return EMPTY, EOF, or the value of the field after the comma.
     */
    protected Object tokenAfterComma() {
        int c = next();
        if (c == ',' || c == '\n') {
            putback(c);
            return EMPTY;
        }
        return tokenFieldStartingWith(c);
    }

    /**
     * Dispatch on the first character of a field (or the end-of-input
     * marker). Shared by token() and tokenAfterComma() so both classify
     * fields identically.
     * 
     * @param c the first character of the field, already consumed, or -1.
     * @return EOF, or the value of the field.
     */
    private Object tokenFieldStartingWith(int c) {
        if (c == '"') {
            // Skip the opening quote; the string scan starts after it.
            return tokenString(next());
        } else if (c == '\\') {
            // Hand over the escape character itself so tokenString()
            // applies its escape handling to the following character.
            return tokenString(c);
        } else if ((c == '-' || c == '.' || isDigit(c)) && !numberReadAsString) {
            return tokenNumber(c);
        } else if (c == -1) {
            return EOF;
        } else {
            return tokenAny(c);
        }
    }

    /**
     * Collect the contents of a string field, starting with character c.
     * <ul>
     * <li>A backslash forces the character after it into the string,
     * whatever it is.
     * <li>A quote ends the string only when the character after it is a
     * delimiter (see {@link #isDelimiter(int)}). Otherwise the quote is
     * dropped and scanning continues with the character that followed it, so
     * unescaped quotes never end up in the result.
     * <li>Any other character accepted by isAny() is added to the string.
     * <li>Anything else ends the string.
     * </ul>
     * 
     * @param c the first character to examine.
     * @return the collected contents of the token buffer.
     */
    Object tokenString(int c) {
        while (true) {
            if (c == '\\') {
                int escaped = next();
                if (escaped == -1) {
                    // Dangling escape at the end of the input: return what
                    // has been collected rather than pushing the
                    // end-of-input marker into the token.
                    return bclear();
                }
                bpush(escaped);
                c = next();
            } else if (c == '"') {
                c = next();
                if (isDelimiter(c)) {
                    return bclear();
                }
                // Not a closing quote; c is re-examined on the next pass.
            } else if (isAny(c)) {
                bpush(c);
                c = next();
            } else {
                return bclear();
            }
        }
    }

    /**
     * This checks for the delimiter at the end of a token. A delimiter is a
     * ',' separating the next field, a '\n' ending the record, or -1 for the
     * end of input.
     * <P>
     * All delimiters are handled equally: the delimiter is put back so that
     * the next token() call sees it. Commas and the end of input used to be
     * consumed here, but putting them back lets token() detect empty fields,
     * especially the elusive comma-newline combination.
     * 
     * @param c the character to test.
     * @return true if c is a delimiter (and has been put back), false
     *         otherwise.
     */
    boolean isDelimiter(int c) {
        if (c == ',' || c == -1 || c == '\n') {
            putback(c); // Wait for next token().
            return true;
        }
        return false;
    }

    /**
     * Return a number or a string. The field is read up to the next
     * delimiter and parsed with {@link Double#valueOf(String)}; if it does
     * not parse, the text itself is returned. Note that the parse follows
     * the Double.valueOf grammar, so text such as "1e3" or "12d" is also
     * treated as numeric.
     * 
     * @param c the first character of the field.
     * @return a Double if the field parses as one, otherwise the field text.
     */
    Object tokenNumber(int c) {
        Object result = tokenAny(c);
        try {
            return Double.valueOf((String) result);
        } catch (NumberFormatException e) {
            return result;
        }
    }

    /**
     * Return anything up to the next delimiter as a string. The delimiter
     * itself is left for the next token() call.
     * 
     * @param c the first character of the field.
     * @return the collected contents of the token buffer.
     */
    Object tokenAny(int c) {
        while (!isDelimiter(c)) {
            bpush(c);
            c = next();
        }
        return bclear();
    }

    /**
     * Print every token of a CSV file, one per line, until EOF.
     * 
     * @param args args[0] is the path of the file to tokenize.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java com.bbn.openmap.util.CSVTokenizer <csv-file>");
            return;
        }

        CSVTokenizer csv = null;
        try {
            csv = new CSVTokenizer(new java.io.BufferedReader(new java.io.FileReader(args[0])));
            Object token = csv.token();
            while (!csv.isEOF(token)) {
                System.out.println(token);
                token = csv.token();
            }
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            // Close on every exit path, not only when EOF is reached.
            if (csv != null) {
                try {
                    csv.close();
                } catch (Exception e) {
                    System.out.println(e);
                }
            }
        }
    }
}