package nl.helixsoft.recordstream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import nl.helixsoft.stats.DataFrame;
import nl.helixsoft.stats.DataFrameOperation;
import nl.helixsoft.stats.DefaultDataFrame;
import nl.helixsoft.util.HStringUtils;
//TODO: rename to FileRecordStream
/**
* Turn a stream of delimited values into a record stream.
* Uses the <a href="http://www.informit.com/articles/article.aspx?p=1216151&seqNum=2">Builder Pattern</a> for configuration.
* <p>
* Usage examples:<br>
* <p>
* Open a tab-delimited text file:<br>
* <code>
* TsvRecordStream.open(file).get();
* </code>
* <p>
* Open a tab-delimited text file with #-comments, specifying the header:<br>
* <code>
* TsvRecordStream.open(file).filterComments().setHeader(new String[] {"ice-cream consumption", "drowning deaths"}).get();
* </code>
* <p>
* Open a comma-separated values file that has quotes around string values:<br>
* <code>
* TsvRecordStream.open(file).commaSeparated().removeOptionalQuotes().get();
* </code>
* <p>
* NOTE: I am considering to rename this FileRecordStream, because this class can deal with various kinds of files, not just TSV and CSV but arbitrary separated files.
*/
public class TsvRecordStream extends AbstractRecordStream
{
    /** Internal configuration switches, set through the {@link Builder} and shared with the stream. */
    private enum Flags {
        FILTER_COMMENTS,
        NO_HEADER,
        REMOVING_OPTIONAL_QUOTES,
        COMMA_DELIMITED
    };

    private EnumSet<Flags> flags = EnumSet.noneOf(Flags.class);
    private final BufferedReader reader;
    private final RecordMetaData rmd;
    private String delimiter = "\t";

    /**
     * Turn a {@link Reader} object into a {@link RecordStream}.
     * @return a builder object, on which configuration settings can be chained.
     */
    public static Builder open (Reader _reader)
    {
        return new Builder(_reader);
    }

    /**
     * Turn a {@link InputStream} object into a {@link RecordStream}.
     * @return a builder object, on which configuration settings can be chained.
     */
    public static Builder open (InputStream _is)
    {
        return new Builder(_is);
    }

    /**
     * Open a {@link File} and create a {@link RecordStream} from it.
     * @return a builder object, on which configuration settings can be chained.
     * @throws FileNotFoundException if the file does not exist or cannot be opened for reading.
     */
    public static Builder open (File _file) throws FileNotFoundException
    {
        return new Builder(_file);
    }

    /**
     * Builder for configuration options.
     * <p>
     * All the various configuration options are available through this builder. Once the configuration is complete,
     * call the get() method to obtain the actual RecordStream.
     * <p>
     * Each configuration method returns <code>this</code>, so the configuration can be chained.
     */
    public static class Builder
    {
        private final Reader reader;
        private String delimiter = "\t";
        private EnumSet<Flags> flags = EnumSet.noneOf(Flags.class);
        private String[] header = null;

        Builder(Reader _reader)
        {
            this.reader = _reader;
        }

        Builder(File f) throws FileNotFoundException
        {
            // NOTE(review): FileReader uses the platform default charset -- confirm this is intended.
            this.reader = new FileReader (f);
        }

        Builder(InputStream is)
        {
            // NOTE(review): InputStreamReader uses the platform default charset -- confirm this is intended.
            this.reader = new InputStreamReader (is);
        }

        /**
         * Configure a tab-separated stream (this is also the default).
         */
        public Builder tabSeparated()
        {
            delimiter = "\t";
            return this;
        }

        /**
         * Configure a comma-separated stream instead of the default (tab-delimited)
         */
        public Builder commaSeparated()
        {
            delimiter = ",";
            return this;
        }

        /**
         * Use a regular expression as a custom separator.
         * For example, the following treats any run of consecutive whitespace as a delimiter.
         * <code>
         * customSeparator("\\s+")
         * </code>
         */
        public Builder customSeparator(String regex)
        {
            delimiter = regex;
            return this;
        }

        /**
         * If a field is surrounded by '"' quotes, remove them. Particularly useful for CSV files.
         * <p>
         * When combined with commaSeparated(), splitting is delegated to HStringUtils.quotedCommaSplit,
         * which deals correctly with commas inside quotes.
         * //TODO: currently doesn't handle newlines within quotes, as per the semi-official specification: https://en.wikipedia.org/wiki/Comma-separated_values
         */
        public Builder removeOptionalQuotes()
        {
            flags.add(Flags.REMOVING_OPTIONAL_QUOTES);
            return this;
        }

        /**
         * The first line is a header line (this is also the default).
         */
        public Builder firstLineIsHeader()
        {
            flags.remove(Flags.NO_HEADER);
            return this;
        }

        /**
         * Instead of using the first line as a header line, set the values of the header you would like to see.
         * <p>
         * Implies that there is no header line in the data.
         */
        public Builder setHeader(String[] header)
        {
            this.header = header;
            flags.add(Flags.NO_HEADER);
            return this;
        }

        /**
         * Instead of using the first line as a header line, set the values of the header you would like to see.
         * <p>
         * Implies that there is no header line in the data.
         */
        public Builder setHeader(List<String> header)
        {
            this.header = header.toArray(new String[header.size()]);
            flags.add(Flags.NO_HEADER);
            return this;
        }

        /**
         * Filter out any lines that start with a '#' comment marker.
         */
        public Builder filterComments()
        {
            flags.add(Flags.FILTER_COMMENTS);
            return this;
        }

        /**
         * Finish configuration and obtain the stream. Synonym for {@link #asRecordStream()}.
         */
        public TsvRecordStream get() throws StreamException
        {
            return asRecordStream();
        }

        /** create a RecordStream, with only a few rows in memory at a time */
        public TsvRecordStream asRecordStream() throws StreamException
        {
            if (header == null)
            {
                // the header will be read from the first non-comment line of the data
                return new TsvRecordStream (reader, delimiter, flags);
            }
            else
            {
                return new TsvRecordStream (reader, delimiter, header, flags);
            }
        }

        /** create a DataFrame, loading the entire table in memory */
        public DataFrame asDataFrame() throws StreamException
        {
            return DefaultDataFrame.createFromRecordStream(get());
        }
    }

    /**
     * Split a data line into fields using the configured delimiter,
     * optionally stripping surrounding quotes from each field.
     */
    private String[] splitLine(String line)
    {
        String[] result;
        if (flags.contains(Flags.REMOVING_OPTIONAL_QUOTES) && (",".equals(delimiter)))
        {
            // quote-aware split: commas inside quoted fields do not separate fields
            result = HStringUtils.quotedCommaSplit(line).toArray(new String[] {});
        }
        else
        {
            // limit of -1 preserves trailing empty fields
            result = line.split(delimiter, -1);
            if (flags.contains(Flags.REMOVING_OPTIONAL_QUOTES))
            {
                for (int i = 0; i < result.length; ++i)
                {
                    result[i] = HStringUtils.removeOptionalQuotes(result[i]);
                }
            }
        }
        return result;
    }

    /**
     * Don't use, use open() instead.
     * Variant with an explicitly supplied header; nothing is read from the stream yet.
     */
    private TsvRecordStream (Reader _reader, String _delimiter, String[] _header, EnumSet<Flags> flags) throws StreamException
    {
        this.flags = flags;
        if (flags.contains(Flags.COMMA_DELIMITED))
        {
            delimiter = ",";
        }
        else
        {
            delimiter = _delimiter;
        }
        this.reader = new BufferedReader(_reader);
        rmd = new DefaultRecordMetaData (_header);
    }

    // TODO: this constructor has some redundancy with TsvRecordStream(Reader, String, String[], EnumSet)
    /**
     * Don't use, use open() instead.
     * Variant that reads the header from the first non-comment line of the stream.
     */
    private TsvRecordStream (Reader _reader, String _delimiter, EnumSet<Flags> flags) throws StreamException
    {
        this.flags = flags;
        if (flags.contains(Flags.COMMA_DELIMITED))
        {
            delimiter = ",";
        }
        else
        {
            delimiter = _delimiter;
        }
        try
        {
            this.reader = new BufferedReader(_reader);
            String headerLine = getNextNonCommentLine();
            List<String> header = new ArrayList<String>();
            if (headerLine != null) // empty file has no header
            {
                for (String h : splitLine(headerLine))
                {
                    header.add (h);
                }
            }
            rmd = new DefaultRecordMetaData(header);
        }
        catch (IOException e)
        {
            throw new StreamException(e);
        }
    }

    /**
     * Return the next Record, part of the RecordStream interface
     * {@inheritDoc}
     */
    @Override
    public Record getNext() throws StreamException
    {
        try
        {
            // fetch next line that doesn't start with "#" (when comment filtering is enabled)
            String line = getNextNonCommentLine();
            if (line == null) { return null; }

            String[] split = splitLine(line);
            String[] fields;
            if (split.length == rmd.getNumCols())
            {
                fields = split;
            }
            else
            {
                // ensure that array of fields is the expected length;
                // if the line is short, trailing fields remain null.
                fields = new String[rmd.getNumCols()];
                int col = 0;
                for (String field : split)
                {
                    fields[col] = field;
                    col++;
                    if (col == rmd.getNumCols())
                    {
                        // there are extra columns at the end. Check if they are empty or if they contain data.
                        for (int i = col; i < split.length; ++i)
                        {
                            // BUGFIX: was split[col], which only ever inspected the first
                            // extra column; data in later extra columns went unreported.
                            if (!split[i].equals(""))
                            {
                                System.err.println ("Warning: found extra non-empty columns in TSV file");
                                break; // ignoring extra column
                            }
                        }
                        break;
                    }
                }
            }
            return new DefaultRecord(rmd, fields);
        }
        catch (IOException e)
        {
            throw new StreamException(e);
        }
    }

    /**
     * Read the next line of input. When FILTER_COMMENTS is enabled, lines
     * starting with '#' are skipped. Returns null at end of input.
     */
    private String getNextNonCommentLine() throws IOException
    {
        if (!flags.contains(Flags.FILTER_COMMENTS)) return reader.readLine();

        String line;
        do {
            line = reader.readLine();
            if (line == null) return null;
        }
        while (line.startsWith("#"));
        return line;
    }

    /**
     * get the associated RecordMetaData, part of the RecordStream interface
     * {@inheritDoc}
     */
    @Override
    public RecordMetaData getMetaData()
    {
        return rmd;
    }

    @Override
    public void close()
    {
        try {
            reader.close();
        } catch (IOException e) {
            // best-effort close: the RecordStream interface declares no checked
            // exception here, so the failure is reported rather than propagated.
            e.printStackTrace();
        }
    }
}