package prefuse.data.io; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import prefuse.data.Table; import prefuse.data.parser.DataParseException; import prefuse.data.parser.DataParser; import prefuse.data.parser.ParserFactory; import prefuse.data.parser.TypeInferencer; import prefuse.util.collections.ByteArrayList; import prefuse.util.io.IOLib; /** * Abstract base class for TableReader instances that read in a table * from a textual data file. * * @author <a href="http://jheer.org">jeffrey heer</a> */ public abstract class AbstractTextTableReader extends AbstractTableReader { private ParserFactory m_pfactory; private boolean m_hasHeader; /** * Create a new AbstractTextTableReader using a default ParserFactory. */ public AbstractTextTableReader() { this(ParserFactory.getDefaultFactory()); } /** * Create a new AbstractTextTableReader. * @param parserFactory the ParserFactory to use for parsing text strings * into table values. */ public AbstractTextTableReader(ParserFactory parserFactory) { m_pfactory = parserFactory; m_hasHeader = true; } /** * Set whether or not the table data file includes a header row. * @param hasHeaderRow true if the the data file includes a header row, * false otherwise. */ public void setHasHeader(boolean hasHeaderRow) { m_hasHeader = hasHeaderRow; } /** * @see prefuse.data.io.AbstractTableReader#readTable(java.io.InputStream) */ public Table readTable(InputStream is) throws DataIOException { // determine input stream capabilities // if we can't reset the stream, we read in all the bytes // and make our own local stream ByteArrayList buf = null; if ( is.markSupported() ) { // mark the stream to our reset point is.mark(Integer.MAX_VALUE); } else { // load in the entirety of the input stream try { buf = IOLib.readAsBytes(is); } catch ( IOException ioe ) { throw new DataIOException(ioe); } // create our own input stream is = buf.getAsInputStream(); } final TypeInferencer di = new TypeInferencer(m_pfactory); final ArrayList headers = getColumnNames(); final int[] dim = new int[] { 0, 0 }; TableReadListener scanner = new TableReadListener() { int prevLine = -1; public void readValue(int line, int col, String value) throws DataParseException { // sample value to determine data type if ( line > 1 || !m_hasHeader ) { di.sample(col-1, value); // update num rows if ( line != prevLine ) { prevLine = line; dim[0]++; } } else if ( line == 1 && m_hasHeader ) { headers.add(value); } // update num cols if ( col > dim[1] ) dim[1] = col; } }; // do a scan of the stream, collecting length and type data try { read(is, scanner); } catch ( IOException ioe ) { throw new DataIOException(ioe); } catch ( DataParseException de ) { // can't happen } // create the table int nrows = dim[0]; int ncols = dim[1]; final Table table = new Table(nrows, ncols); // create the table columns for ( int i=0; i < ncols; ++i ) { String header; if ( m_hasHeader || i < headers.size() ) { header = (String)headers.get(i); } else { header = getDefaultHeader(i); } table.addColumn(header, di.getType(i)); table.getColumn(i).setParser(di.getParser(i)); } // reset dim array, will hold row/col indices dim[0] = dim[1] = -1; TableReadListener parser = new TableReadListener() { int prevLine = -1; public void readValue(int line, int col, String value) throws DataParseException { // early exit on header value if ( line == 1 && m_hasHeader ) return; if ( line != prevLine ) { prevLine = line; ++dim[0]; } dim[1] = col-1; // XXX NOTE-2005.08.29-jheer // For now we use generic routines for filling column values. // This results in the autoboxing of primitive types, slowing // performance a bit and possibly triggering avoidable garbage // collections. If this proves to be a problem down the road, // we can add more nuance later. DataParser dp = di.getParser(dim[1]); table.set(dim[0], dim[1], dp.parse(value)); } }; // read the data into the table try { // prepare the input stream if ( is.markSupported() ) { is.reset(); } else { is = buf.getAsInputStream(); } // read the data read(is, parser); } catch ( IOException ioe ) { throw new DataIOException(ioe); } catch ( DataParseException de ) { throw new DataIOException("Parse exception for column " + '\"' + dim[1] + '\"' + " at row: " + dim[0], de); } return table; } /** * Subclasses can override this to provide column names through * a custom mechanism. * @return an ArrayList of String instances indicating the column names */ protected ArrayList getColumnNames() { return new ArrayList(); } /** * Returns default column header names of the type "A", "B", ..., * "Z", "AA", "AB", etc. * @param idx the index of the column header * @return a default column header name for the given index. */ public static String getDefaultHeader(int idx) { if ( idx == 0 ) return "A"; int len = ((int)(Math.log(idx) / Math.log(26))) + 1; char[] h = new char[len]; int p = len; h[--p] = (char)('A'+(idx%26)); idx = idx / 26; while ( idx > 26 ) { h[--p] = (char)('A'+(idx%26)); idx = idx/26; } if ( idx > 0 ) { h[--p] = (char)('A'+((idx-1)%26)); } return new String(h, p, len); } /** * Scans the input stream, making call backs for each encountered entry * on the provided TextReadListener. * @param is the InputStream to read * @param trl the TextReadListener that will receive callbacks * @throws IOException * @throws DataParseException */ protected abstract void read(InputStream is, TableReadListener trl) throws IOException, DataParseException; } // end of abstract class AbstractTextTableReader