package edu.stanford.nlp.io;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import java.io.*;
/**
* For reading files or input streams which are structured as records and fields
* (rows and columns). Each time you call <code>next()</code>, you get back the
* next record as a list of strings. You can specify the field delimiter (as a
* regular expression), how many fields to expect, and whether to filter lines
* containing the wrong number of fields.
*
* The iterator may be empty, if the file is empty. If there is an
* <code>IOException</code> when <code>next()</code> is called, it is
* caught silently, and <code>null</code> is returned (!).
*
* @author Bill MacCartney
*/
public class RecordIterator implements Iterator<List<String>> {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(RecordIterator.class);
private static String WHITESPACE = "\\s+";
private BufferedReader reader;
private int fields; // -1 means infer from first line of input
private boolean filter;
private String delim = WHITESPACE;
private List<String> nextResult;
// factory methods -------------------------------------------------------
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified <code>Reader</code>.
*
* @param reader the reader to read from
* @param fields how many fields to expect in each record
* @param filter whether to filter lines containing wrong number of fields
* @param delim a regexp on which to split lines into fields (default whitespace)
*/
public RecordIterator(Reader reader, int fields, boolean filter, String delim) {
this.reader = new BufferedReader(reader);
this.fields = fields;
this.filter = filter;
this.delim = delim;
if (delim == null) this.delim = WHITESPACE;
advance();
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified file.
*
* @param filename the file to read from
* @param fields how many fields to expect in each record
* @param filter whether to filter lines containing wrong number of fields
* @param delim a regexp on which to split lines into fields (default whitespace)
*/
public RecordIterator(String filename, int fields, boolean filter, String delim)
throws FileNotFoundException {
this(new FileReader(filename), fields, filter, delim);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified <code>InputStream</code>.
*
* @param in the <code>InputStream</code> to read from
* @param fields how many fields to expect in each record
* @param filter whether to filter lines containing wrong number of fields
* @param delim a regexp on which to split lines into fields (default whitespace)
*/
public RecordIterator(InputStream in, int fields, boolean filter, String delim) {
this(new InputStreamReader(in), fields, filter, delim);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified file. The default whitespace
* delimiter is used.
*
* @param filename the file to read from
* @param fields how many fields to expect in each record
* @param filter whether to filter lines containing wrong number of fields
*/
public RecordIterator(String filename, int fields, boolean filter)
throws FileNotFoundException {
this(filename, fields, filter, WHITESPACE);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified file. The default whitespace
* delimiter is used. The first line is used to determine how many
* fields per record to expect.
*
* @param filename the file to read from
* @param filter whether to filter lines containing wrong number of fields
*/
public RecordIterator(String filename, boolean filter)
throws FileNotFoundException {
this(filename, -1, filter, WHITESPACE);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified file. The default whitespace
* delimiter is used. Lines which contain other than <code>fields</code>
* fields are filtered.
*
* @param filename the file to read from
* @param fields how many fields to expect in each record
*/
public RecordIterator(String filename, int fields)
throws FileNotFoundException {
this(filename, fields, true, WHITESPACE);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified file. No lines are filtered.
*
* @param filename the file to read from
* @param delim a regexp on which to split lines into fields (default whitespace)
*/
public RecordIterator(String filename, String delim)
throws FileNotFoundException {
this(filename, 0, false, delim);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified file. The default whitespace
* delimiter is used. No lines are filtered.
*
* @param filename the file to read from
*/
public RecordIterator(String filename)
throws FileNotFoundException {
this(filename, 0, false, WHITESPACE);
}
/**
* Returns an <code>Iterator</code> over records (lists of strings)
* corresponding to lines in the specified <code>InputStream</code>. The
* default whitespace delimiter is used. No lines are filtered.
*
* @param in the stream to read from
*/
public RecordIterator(InputStream in) {
this(in, 0, false, WHITESPACE);
}
// iterator methods ------------------------------------------------------
public boolean hasNext() {
return (nextResult != null);
}
public List<String> next() {
List<String> result = nextResult;
advance();
return result;
}
public void remove() {
throw new UnsupportedOperationException();
}
// convenience methods ---------------------------------------------------
/**
* A static convenience method that returns the first line of the
* specified file as list of strings, using the specified regexp as
* delimiter.
*
* @param filename the file to read from
* @param delim a regexp on which to split lines into fields (default whitespace)
*/
public static List<String> firstRecord(String filename, String delim)
throws FileNotFoundException {
RecordIterator it = new RecordIterator(filename, delim);
if (!it.hasNext()) return null;
return it.next();
}
/**
* A static convenience method that returns the first line of the
* specified file as list of strings, using the default whitespace
* delimiter.
*
* @param filename the file to read from
*/
public static List<String> firstRecord(String filename) throws FileNotFoundException {
return firstRecord(filename, WHITESPACE);
}
/**
* A static convenience method that tells you how many fields are in the
* first line of the specified file, using the specified regexp as
* delimiter.
*
* @param filename the file to read from
* @param delim a regexp on which to split lines into fields (default whitespace)
*/
public static int determineNumFields(String filename, String delim) throws FileNotFoundException {
List<String> fields = firstRecord(filename, delim);
if (fields == null) return -1;
else return fields.size();
}
/**
* A static convenience method that tells you how many fields are in the
* first line of the specified file, using the default whitespace
* delimiter.
*
* @param filename the file to read from
*/
public static int determineNumFields(String filename) throws FileNotFoundException {
return determineNumFields(filename, WHITESPACE);
}
// private methods -------------------------------------------------------
private void advance() {
nextResult = null;
while (true) { // 2 exits in body of loop
String line = null;
try {
line = reader.readLine(); // could block if reader is not ready
} catch (IOException e) {
// swallow it, yikes!
}
if (line == null) return; // end of input: nextResult remains null
String[] tokens = line.split(delim);
if (fields < 0) fields = tokens.length; // remember number of fields in first line
if (filter &&
(tokens.length != fields || // wrong number of fields
(tokens.length == 1 && tokens[0].equals("")))) // it's a blank line
continue; // skip this line
nextResult = new ArrayList<>();
for (String token : tokens) nextResult.add(token);
return; // this line will be our next result
}
}
// -----------------------------------------------------------------------
/**
* Just for testing. Reads from the file named on the command line, or from
* stdin, and echoes the records it reads to stdout.
*/
public static void main(String[] args) throws FileNotFoundException {
RecordIterator it = null;
if (args.length > 0) {
it = new RecordIterator(args[0]);
} else {
it = new RecordIterator(System.in);
log.info("[Reading from stdin...]");
}
while (it != null && it.hasNext()) {
List<String> record = it.next();
for (String field : record) {
System.out.printf("[%-10s]", field);
}
System.out.println();
}
}
}