/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Aug 13, 2007
*/
package com.bigdata.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.Format;
import java.text.NumberFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TreeMap;
import org.apache.log4j.Logger;
/**
* A helper class to read CSV (comma separated value) and similar kinds of
* delimited data. Files may use commas or tabs to delimit columns. If you have
* to parse other kinds of delimited data then you should override
* {@link #split(String)}.
* <p>
* Note: The default parsing of column values will provide {@link Long} integers
* and {@link Double} precision floating point values rather than
* {@link Integer} or {@link Float}. If you want to change this you need to
* customize the {@link Header} class since that is responsible for interpreting
* column values.
* <p>
* Note: If no headers are defined (by the caller) or read from the file (by the
* caller), then default headers named by the origin ONE column indices will be
* used.
*
* @todo replace with <a href="http://flatpack.sourceforge.net/">flatpack</a>?
* It uses an Apache 2 license.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class CSVReader implements Iterator<Map<String, Object>> {

    private static final Logger log = Logger.getLogger(CSVReader.class);

    /**
     * The #of characters to buffer in the reader.
     */
    protected static final int BUF_SIZE = Bytes.kilobyte32 * 20;

    /**
     * A header for a column that examines its values and interprets them as
     * floating point numbers, integers, dates, or times when possible and
     * as uninterpreted character data otherwise.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     */
    public static class Header {

        private final String name;

        /**
         * The name of the column.
         */
        public String getName() {

            return name;

        }

        /**
         * An array of formats to be tested against the column values. The order
         * in the array is the order in which the formats are tested.
         * <P>
         * Note that formats DO NOT require a complete match on the source text.
         * For this reason, they are arranged based on the expected ability for
         * the format to be diagnostic. For example, a date such as 4/6/2002
         * would be interpreted as an integer if you tested the integer format
         * first.
         * <p>
         * Note: These instances are shared by all {@link Header}s and the
         * <code>java.text</code> formats are not safe for concurrent use.
         * {@link #parseValue(String)} therefore synchronizes on each format
         * while it is using it. Code which uses these instances directly
         * must do the same.
         *
         * @todo consider using regular expressions to select which formats to
         *       apply (or which formats to ignore).
         *
         * @todo verify that the entire text was matched by the format in order
         *       to allow the format to be the approved interpretation of the
         *       text.
         */
        protected static final Format[] formats;

        static {

            formats = new Format[] {

                    // handles 2 digits years by adjusting for the current century
                    new SimpleDateFormat("MM/dd/yy"),

                    // handles explicit years (treats 2 digit years as yyAD).
                    new SimpleDateFormat("MM/dd/yyyy"),

                    DateFormat.getDateInstance(DateFormat.SHORT),
                    DateFormat.getDateInstance(DateFormat.MEDIUM),
                    DateFormat.getDateInstance(DateFormat.LONG),
                    DateFormat.getDateInstance(DateFormat.FULL),

                    DateFormat.getTimeInstance(DateFormat.SHORT),
                    DateFormat.getTimeInstance(DateFormat.MEDIUM),
                    DateFormat.getTimeInstance(DateFormat.LONG),
                    DateFormat.getTimeInstance(DateFormat.FULL),

                    DateFormat.getDateTimeInstance(DateFormat.SHORT,DateFormat.SHORT),
                    DateFormat.getDateTimeInstance(DateFormat.MEDIUM,DateFormat.MEDIUM),
                    DateFormat.getDateTimeInstance(DateFormat.LONG,DateFormat.LONG),
                    DateFormat.getDateTimeInstance(DateFormat.FULL,DateFormat.FULL),

                    NumberFormat.getCurrencyInstance(),
                    NumberFormat.getPercentInstance(),
                    NumberFormat.getNumberInstance(),

                    /*
                     * Note: There are no factory methods for formats that
                     * handle exponents. I've put a few in here, but there may
                     * very well be other examples that will still not be parsed
                     * correctly.
                     */

                    // scientific
                    new DecimalFormat("0.###E0"),

                    // engineering
                    new DecimalFormat("##0.#####E0"),

                    NumberFormat.getIntegerInstance(),

            };

        }

        /**
         * @param name
         *            The name of the column.
         *
         * @throws IllegalArgumentException
         *             if the name is <code>null</code> or if it is empty once
         *             leading and trailing whitespace is ignored.
         */
        public Header(final String name) {

            if (name == null)
                throw new IllegalArgumentException("name is null");

            if (name.trim().length() == 0)
                throw new IllegalArgumentException("name is empty");

            this.name = name;

        }

        /**
         * Attempts to interpret the character data as a date/time,
         * currency, percentage, floating point value, or integer. If the
         * text can not be interpreted as any of those formats then it
         * returns the uninterpreted character data.
         * <p>
         * The {@link #formats} are tried in order and the first one which
         * accepts the text wins. A format is NOT required to consume the
         * entire text (see {@link #formats}).
         *
         * @param text
         *            The character data.
         *
         * @return The parsed value.
         */
        public Object parseValue(final String text) {

            for (int i = 0; i < formats.length; i++) {

                final Format f = formats[i];

                try {

                    /*
                     * The format instances are static and shared, and the
                     * java.text formats must be synchronized externally when
                     * they can be reached by more than one thread.
                     */
                    synchronized (f) {

                        if (f instanceof DateFormat) {

                            return ((DateFormat) f).parse(text);

                        } else if (f instanceof NumberFormat) {

                            return ((NumberFormat) f).parse(text);

                        } else {

                            // Only date and number formats are declared.
                            throw new AssertionError();

                        }

                    }

                } catch (NumberFormatException ignored) {

                    // This format does not apply - try the next one.

                } catch (ParseException ignored) {

                    // This format does not apply - try the next one.

                }

            }

            // No format accepted the text.
            return text;

        }

        /**
         * Equal if the headers have the same name.
         */
        @Override
        public boolean equals(final Object o) {

            if (this == o)
                return true;

            if (!(o instanceof Header)) {

                return false;

            }

            return name.equals(((Header) o).name);

        }

        /**
         * Based on the header name.
         */
        @Override
        public int hashCode() {

            return name.hashCode();

        }

        /**
         * The header name.
         */
        @Override
        public String toString() {

            return name;

        }

    }

    /**
     * The source.
     */
    protected final BufferedReader r;

    /**
     * The current line. Set to null initially and whenever the line is
     * consumed by {@link #next()} or {@link #readHeaders()}. Set to non-null
     * by {@link #hasNext()} once it has read a line which is not skipped.
     */
    private String line = null;

    /**
     * The current line # (origin one). This counts every line read from the
     * source, including blank lines and comment lines which are skipped.
     */
    private int lineNo = 0;

    /**
     * Set to true when {@link BufferedReader#readLine()} returns
     * <code>null</code>, indicating that no more data may be read from the
     * source, or when the reading thread is interrupted.
     */
    private boolean exhausted = false;

    /** When true, lines whose first character is <code>#</code> are skipped. */
    private boolean skipCommentLines = true;

    /** When true, lines which are empty once trimmed are skipped. */
    private boolean skipBlankLines = true;

    /** When true, {@link #trim(String[])} trims the column values. */
    private boolean trimWhitespace = true;

    /** @see #getTailDelayMillis() */
    private long tailDelayMillis = 0L;

    /**
     * The header definitions (initially null).
     *
     * @see #readHeaders()
     * @see #setHeaders(Header[])
     */
    protected Header[] headers;

    /**
     * @param is
     *            The source.
     * @param charSet
     *            The name of the character set used to decode the source.
     *
     * @throws IllegalArgumentException
     *             if either argument is <code>null</code>.
     * @throws IOException
     *             if the character set is not supported.
     */
    public CSVReader(final InputStream is, final String charSet)
            throws IOException {

        if (is == null)
            throw new IllegalArgumentException();

        if (charSet == null)
            throw new IllegalArgumentException();

        r = new BufferedReader(new InputStreamReader(is, charSet),
                BUF_SIZE);

    }

    /**
     * @param r
     *            The source.
     *
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public CSVReader(final Reader r) throws IOException {

        if (r == null)
            throw new IllegalArgumentException();

        this.r = new BufferedReader(r, BUF_SIZE);

    }

    /**
     * The current line number (origin one).
     */
    public int lineNo() {

        return lineNo;

    }

    /**
     * Set whether lines whose first character is <code>#</code> are skipped.
     *
     * @return The old value.
     */
    public boolean setSkipCommentLines(final boolean skipCommentLines) {

        final boolean tmp = this.skipCommentLines;

        this.skipCommentLines = skipCommentLines;

        return tmp;

    }

    public boolean getSkipCommentLines() {

        return skipCommentLines;

    }

    /**
     * Set whether lines which are empty once trimmed are skipped.
     *
     * @return The old value.
     */
    public boolean setSkipBlankLines(final boolean skipBlankLines) {

        final boolean tmp = this.skipBlankLines;

        this.skipBlankLines = skipBlankLines;

        return tmp;

    }

    public boolean getSkipBlankLines() {

        return skipBlankLines;

    }

    /**
     * Set whether {@link #trim(String[])} trims the column values.
     *
     * @return The old value.
     */
    public boolean setTrimWhitespace(final boolean trimWhitespace) {

        final boolean tmp = this.trimWhitespace;

        this.trimWhitespace = trimWhitespace;

        return tmp;

    }

    public boolean getTrimWhitespace() {

        return trimWhitespace;

    }

    /**
     * The #of milliseconds that the {@link CSVReader} should wait before
     * attempting to read another line from the source (when reading from
     * a pipe) -or- 0L if the {@link CSVReader} should NOT continue reading
     * once it has reached the end of the input (default 0L).
     */
    public long getTailDelayMillis() {

        return tailDelayMillis;

    }

    /**
     * Set the tail delay.
     *
     * @param tailDelayMillis
     *            The new value (non-negative).
     *
     * @return The old value.
     *
     * @throws IllegalArgumentException
     *             if the argument is negative.
     *
     * @see #getTailDelayMillis()
     */
    public long setTailDelayMillis(final long tailDelayMillis) {

        if (tailDelayMillis < 0)
            throw new IllegalArgumentException();

        final long tmp = this.tailDelayMillis;

        this.tailDelayMillis = tailDelayMillis;

        return tmp;

    }

    /**
     * Reads ahead until a line is found which is not skipped (see
     * {@link #setSkipBlankLines(boolean)} and
     * {@link #setSkipCommentLines(boolean)}) and buffers that line for
     * {@link #next()} or {@link #readHeaders()}.
     * <p>
     * If the calling thread is interrupted, then the reader is marked as
     * exhausted, the interrupt status of the thread is restored, and
     * <code>false</code> is returned.
     *
     * @return <code>true</code> iff another line is available.
     *
     * @throws RuntimeException
     *             wrapping any {@link IOException} thrown by the source.
     */
    @Override
    public boolean hasNext() {

        if (exhausted)
            return false;

        if (line != null) {

            // A line is already buffered.
            return true;

        }

        try {

            while (true) {

                if (Thread.interrupted()) {

                    if (log.isInfoEnabled())
                        log.info("Interrupted");

                    return stopOnInterrupt();

                }

                while (tailDelayMillis != 0L && !r.ready()) {

                    /*
                     * Wait until more data is available.
                     *
                     * @todo may have to wait until a newline is available, or
                     * just incrementally buffer until we have a full line of
                     * text.
                     */

                    try {

                        Thread.sleep(tailDelayMillis);

                    } catch (InterruptedException e) {

                        // Interrupted - stop processing.
                        log.warn(e.getMessage());

                        return stopOnInterrupt();

                    }

                }

                /*
                 * Note: A local variable is used so that a line which is
                 * skipped is never left behind as the "current" line.
                 */
                final String s = r.readLine();

                if (s == null) {

                    exhausted = true;

                    return false;

                }

                lineNo++;

                if (skipBlankLines && s.trim().length() == 0)
                    continue;

                if (skipCommentLines && s.length() > 0 && s.charAt(0) == '#')
                    continue;

                // Buffer the accepted line.
                line = s;

                return true;

            }

        } catch (IOException ex) {

            throw new RuntimeException(ex);

        }

    }

    /**
     * Ends the iteration because the thread was interrupted. The interrupt
     * status is restored so that the caller can observe the interrupt.
     *
     * @return <code>false</code> (always).
     */
    private boolean stopOnInterrupt() {

        exhausted = true;

        Thread.currentThread().interrupt();

        return false;

    }

    /**
     * Parses and returns the next line.
     *
     * @return A map from the column name to the parsed column value.
     *
     * @throws NoSuchElementException
     *             if there are no more lines.
     */
    @Override
    public Map<String, Object> next() {

        if (!hasNext())
            throw new NoSuchElementException();

        // parse the line.
        final Map<String, Object> map = parse(trim(split(line)));

        // the line has been consumed.
        this.line = null;

        // return the parsed data.
        return map;

    }

    /**
     * Split the line into columns based on tabs or commas.
     * <p>
     * Note: Per {@link String#split(String)}, trailing empty columns are NOT
     * included in the result.
     *
     * @param line
     *            The line.
     *
     * @return The columns. There will be one value for each column
     *         identified in the line.
     *
     * @todo allow quoted values that contain commas.
     */
    protected String[] split(final String line) {

        final String[] cols = line.split("[,\t]");

        return cols;

    }

    /**
     * Trim whitespace and optional quotes from each value iff
     * {@link #getTrimWhitespace()} is true. The values are replaced within
     * the given array.
     *
     * @param cols
     *            The column values.
     *
     * @return The column values (the same array).
     */
    protected String[] trim(final String[] cols) {

        if (!trimWhitespace)
            return cols;

        for (int i = 0; i < cols.length; i++) {

            String col = cols[i];

            if (col != null) {

                col = col.trim();

                /*
                 * Note: The length test guards against a value which is a
                 * single quote character. It both starts and ends with a
                 * quote, but there is nothing to strip.
                 */
                if (col.length() >= 2 && col.startsWith("\"")
                        && col.endsWith("\"")) {

                    col = col.substring(1, col.length() - 1);

                }

                cols[i] = col;

            }

        }

        return cols;

    }

    /**
     * Parse the column values of a line. If no headers have been defined then
     * default headers are automatically defined using
     * {@link #setDefaultHeaders(int)}.
     * <p>
     * When there are fewer values than headers, only the leading headers
     * receive an entry in the map.
     *
     * @param values
     *            The column values.
     *
     * @return A map containing the parsed data.
     *
     * @throws RuntimeException
     *             if there are more values than headers.
     */
    protected Map<String, Object> parse(final String[] values) {

        final Map<String, Object> map = new TreeMap<String, Object>();

        if (headers == null) {

            log.warn("No headers - using defaults.");

            setDefaultHeaders(values.length);

        }

        if (headers.length < values.length) {

            throw new RuntimeException("Too many values: line=" + lineNo);

        }

        for (int i = 0; i < values.length; i++) {

            final Header h = headers[i];

            final String text = values[i];

            map.put(h.getName(), h.parseValue(text));

        }

        return map;

    }

    /**
     * Creates default headers named by the origin ONE column indices
     * {1,2,3,4,...}.
     *
     * @param ncols
     *            The #of columns.
     */
    protected void setDefaultHeaders(final int ncols) {

        final Header[] headers = new Header[ncols];

        for (int i = 0; i < ncols; i++) {

            headers[i] = new Header("" + (i + 1));

        }

        this.headers = headers;

    }

    /**
     * Parse a line containing headers.
     *
     * @param line
     *            The line.
     *
     * @return The header definitions.
     *
     * @throws IllegalArgumentException
     *             if a column in the line is not a legal header name (see
     *             {@link Header#Header(String)}).
     */
    protected Header[] parseHeaders(final String line) {

        final String[] cols = trim(split(line));

        final Header[] headers = new Header[cols.length];

        for (int i = 0; i < cols.length; i++) {

            headers[i] = new Header(cols[i]);

        }

        return headers;

    }

    /**
     * Interpret the next row as containing headers.
     *
     * @throws IOException
     *             if there are no more rows.
     */
    public void readHeaders() throws IOException {

        if (!hasNext())
            throw new IOException("No more rows");

        // parse the line to extract the headers.
        headers = parseHeaders(line);

        // the line has been consumed.
        line = null;

    }

    /**
     * Return a copy of the current headers. Changes to the returned array do
     * not affect this reader. Use {@link #setHeader(int, Header)} or
     * {@link #setHeaders(Header[])} to change the headers.
     *
     * @return A copy of the headers -or- <code>null</code> if no headers
     *         have been defined yet.
     */
    public Header[] getHeaders() {

        final Header[] tmp = headers;

        return tmp == null ? null : tmp.clone();

    }

    /**
     * Explicitly set the headers. The array is used by reference (it is NOT
     * copied).
     *
     * @param headers
     *            The headers.
     *
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public void setHeaders(final Header[] headers) {

        if (headers == null)
            throw new IllegalArgumentException();

        this.headers = headers;

    }

    /**
     * Re-define the {@link Header} at the specified index.
     *
     * @param index
     *            The index in [0:#headers-1].
     * @param header
     *            The new {@link Header} definition.
     *
     * @throws IllegalStateException
     *             if no headers have been defined yet.
     * @throws IndexOutOfBoundsException
     *             if the index is not in [0:#headers-1].
     * @throws IllegalArgumentException
     *             if the header is <code>null</code>.
     */
    public void setHeader(final int index, final Header header) {

        if (headers == null)
            throw new IllegalStateException("No headers defined");

        if (index < 0 || index >= headers.length)
            throw new IndexOutOfBoundsException("index=" + index
                    + ", #headers=" + headers.length);

        if (header == null)
            throw new IllegalArgumentException();

        headers[index] = header;

    }

    /**
     * Unsupported operation.
     */
    @Override
    public void remove() {

        throw new UnsupportedOperationException();

    }

}