/* * Copyright 2006, United States Government as represented by the Administrator * for the National Aeronautics and Space Administration. No copyright is * claimed in the United States under Title 17, U.S. Code. All Other Rights * Reserved. * * Created on Apr 23, 2004 */ package gov.nasa.ial.mde.io; import gov.nasa.ial.mde.solver.symbolic.AnalyzedData; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.text.NumberFormat; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * The <code>TextDataFileParser</code> will parse a text data file with an * optional text header describing each column of data and with the data * arranged rows and columns. The result of parsing the text data file is a * <code>List</code> of <code>AnalyzedData</code> items where each entry in * the <code>List</code> corresponds to one segment of data and where each * segment corresponds to the chunk of data between a gaps in the values in the * first column of data. The <code>TextDataFileParser</code> will analyze the * text data file to determine what character is consistently used as a * delimiter between columns of data. This allows the * <code>TextDataFileParser</code> to parse most any text data file provided * that the delimiter is consistently used throughout the file. * <p> * The <code>TextDataFileParser</code> can also read text data files in the * Comma Separated Values (CSV) format, which is compatible with the CSV file * format used by Microsoft Excel. * <p> * Here is how the <code>TextDataFileParser</code> works: * <ol> * <li>The text data file must have at least two columns.</li> * <li>If a text header is exists, the number of columns in the text headers * must match the number of columns for the data.</li> * <li>It will skip blank lines in the header and data fields.</li> * <li>If a data cell is empty (for a given row and column, nothing is * specified but there are valid delimiters) it will use a value of 0.0 for it.</li> * <li>The first column of data must be in ascending order.</li> * <li>Duplicate entries in the first column of data are allowed.</li> * <li>If the gap between first column data values is greater than 2 times the * average difference between the values then a new segment of data is created.</li> * </ol> * * @author Dan Dexter * @version 1.0 * @since 1.0 */ public class TextDataFileParser implements FileParser { /** A referene to the file */ File file; /** The number of columns of data. */ int columnCnt; /** The number of rows of test header. */ int headerRowCnt; /** The number of rows of data. */ int dataRowCnt; /** The type of the delimiter. */ int delimType; /** The text headers. */ String[] headers; private double[][] data; private NumberFormat numberFormat = NumberFormat.getInstance(); private static final boolean ENABLE_DATA_SEGMENTATION = true; /** * Default constructor not allowed. */ @SuppressWarnings("unused") private TextDataFileParser() { throw new RuntimeException("Default constructor not allowed."); } /** * Creates an instance of <code>TextDataFileParser</code> that will parse * the specified <code>filename</code> for the data. * * @param filename * the specified name of the text file to parse the data from */ public TextDataFileParser(String filename) { this(new File(filename)); } /** * Creates an instance of <code>TextDataFileParser</code> that will parse * the specified <code>File</code> object for the data. * * @param file * the specified <code>File</code> object of the text file to * parse the data from */ public TextDataFileParser(File file) { if (file == null) { throw new NullPointerException("Null file."); } if (!file.exists() || !file.isFile()) { throw new IllegalArgumentException( "The specified file does not exist or is an invalid file."); } this.file = file; clear(); } /** * Parse the file and return a <code>List</code> of * <code>AnalyzedData</code> item's, where each entry in the * <code>List</code> corresponds to one segment of data. * * @return a <code>List</code> of <code>AnalyzedData</code> item's * @throws IOException * is thrown for file Input/Output errors * @throws ParseException * is thrown for parse errors * @see gov.nasa.ial.mde.solver.symbolic.AnalyzedData */ public List<AnalyzedData> parse() throws IOException, ParseException { analyzeFile(); if (columnCnt < 2) { throw new IllegalArgumentException("Data file must have at least two columns of data."); } // Parse the file for the column header information. parseFileForHeader(); // Use a default header name for the columns that do not have a header. checkHeaders(); // Parse the file for the data. parseFileForData(); // Make sure we have some data. if ((data == null) || (data.length <= 0) || (data[0].length <= 0)) { throw new IllegalArgumentException("Data file does not contain any data."); } // For now, we only support the first column values being sorted in // ascending order. if (!isSortedInAscendingOrder(data[0])) { throw new IllegalArgumentException( "Data values in the first column must be sorted in ascending order."); } return getAnalyzedDataList(); } /* (non-Javadoc) * @see gov.nasa.ial.mde.io.FileParser#clear() */ public void clear() { this.columnCnt = 0; this.headerRowCnt = 0; this.dataRowCnt = 0; this.delimType = UNKNOWN_DELIM; this.headers = null; if (data != null) { int len = data.length; for (int i = 0; i < len; i++) { data[i] = null; } this.data = null; } } /* (non-Javadoc) * @see gov.nasa.ial.mde.io.FileParser#dispose() */ public void dispose() { clear(); this.numberFormat = null; this.file = null; } /** * Analyze the file to determine how many columns and rows of header and * data there are. It also verifies that there is a consistant number of * columns of data. * * @throws IOException thrown if the file could not be read. */ protected void analyzeFile() throws IOException { String line; String[] columnValues; BufferedReader in = null; boolean findColCnt = true; boolean findHeader = true; int row = 0; columnCnt = 0; headerRowCnt = 0; dataRowCnt = 0; delimType = UNKNOWN_DELIM; char delimValue = ','; try { in = new BufferedReader(new FileReader(file)); while ((line = in.readLine()) != null) { line = line.trim(); // Skip blank lines. if (line.length() <= 0) { continue; } // Determine what deliminator is being used. if (delimType == UNKNOWN_DELIM) { delimType = lineUsesWhichDelim(line); delimValue = valueOf(delimType); } columnValues = splitLine(line, delimValue); // Set the column count if we are looking for it. if (findColCnt) { findColCnt = false; columnCnt = columnValues.length; } // Do a consistancy check against the expected number of // columns. if ((columnValues == null) || (columnValues.length != columnCnt)) { throw new IllegalArgumentException( "Inconsistent number of columns in the data file."); } // We have reached the end of the header once we find a row of // all numbers. if (findHeader) { if (isAllNumbers(columnValues)) { findHeader = false; headerRowCnt = row; dataRowCnt++; } } else { dataRowCnt++; } row++; } // just in case it's all headers if (findHeader) { headerRowCnt = row; } } finally { if (in != null) { try { in.close(); } catch (IOException ioe) { } } } } private void parseFileForHeader() throws IOException { BufferedReader in = null; int row, col; String line; String[] columnValues; char delimValue = valueOf(delimType); headers = new String[columnCnt]; for (int i = 0; i < columnCnt; i++) { headers[i] = ""; } try { in = new BufferedReader(new FileReader(file)); row = 0; while ((row < headerRowCnt) && ((line = in.readLine()) != null)) { line = line.trim(); // Skip blank lines. if (line.length() <= 0) { continue; } columnValues = splitLine(line, delimValue); // Do a consistancy check against the expected number of // columns. if ((columnValues == null) || (columnValues.length != columnCnt)) { throw new IllegalArgumentException( "Inconsistent number of columns in the data file."); } for (col = 0; col < columnCnt; col++) { if (columnValues[col].length() > 0) { if ((headers[col] == null) || (headers[col].length() <= 0)) { headers[col] = columnValues[col]; } else { // Just append the column value to the header text. headers[col] += " " + columnValues[col]; } } } row++; } } finally { if (in != null) { try { in.close(); } catch (IOException ioe) { } } } } private void parseFileForData() throws IOException, ParseException { BufferedReader in = null; int row, col; String line; String[] columnValues; char delimValue = valueOf(delimType); data = new double[columnCnt][dataRowCnt]; try { in = new BufferedReader(new FileReader(file)); // Burn through the header to get to the line with the data on it. row = 0; while ((row < headerRowCnt) && ((line = in.readLine()) != null)) { line = line.trim(); // Count only lines that are not empty/blank. if (line.length() > 0) { row++; } } row = 0; while ((line = in.readLine()) != null) { line = line.trim(); // Skip blank lines. if (line.length() <= 0) { continue; } columnValues = splitLine(line, delimValue); // Do a consistancy check against the expected number of // columns. if ((columnValues == null) || (columnValues.length != columnCnt)) { throw new IllegalArgumentException( "Inconsistent number of columns in the data file."); } for (col = 0; col < columnCnt; col++) { // Use 0.0 for an empty/blank cell. data[col][row] = (columnValues[col].length() > 0) ? numberFormat.parse( columnValues[col]).doubleValue() : 0.0; } row++; } } finally { if (in != null) { try { in.close(); } catch (IOException ioe) { } } } } private List<AnalyzedData> getAnalyzedDataList() { if (columnCnt < 2) { throw new IllegalArgumentException("Data file must have at least two columns of data."); } AnalyzedData analyzedData; int[] segmentIndexes = calcSegmentIndexes(); int segCount = (segmentIndexes != null) ? segmentIndexes.length : 0; int initialCapacity = Math.max(1, (segCount * (columnCnt - 1))); ArrayList<AnalyzedData> analyzedDataList = new ArrayList<AnalyzedData>(initialCapacity); if (segCount <= 1) { // One data segment means that there were no breaks/holes in the // data. for (int col = 1; col < columnCnt; col++) { analyzedData = new AnalyzedData(headers[0], headers[col], data[0], data[col]); analyzedDataList.add(analyzedData); } } else { int s, segStartIndex, segLength; double[] dataSeg1, dataSeg2; // Create an analyzed data object for each segment and column of the // data // excluding the first column (independent variable). for (int col = 1; col < columnCnt; col++) { for (s = 0; s < segCount; s++) { segStartIndex = (s > 0) ? (segmentIndexes[s - 1] + 1) : 0; segLength = (segmentIndexes[s] - segStartIndex) + 1; if (segLength > 0) { dataSeg1 = new double[segLength]; dataSeg2 = new double[segLength]; System.arraycopy(data[0], segStartIndex, dataSeg1, 0, segLength); System.arraycopy(data[col], segStartIndex, dataSeg2, 0, segLength); analyzedData = new AnalyzedData(headers[0], headers[col], dataSeg1, dataSeg2); analyzedDataList.add(analyzedData); } } } } return analyzedDataList; } // Determine where the breaks in the data are if the data is segmented. private int[] calcSegmentIndexes() { int[] segmentIndexes = null; if (ENABLE_DATA_SEGMENTATION) { double[] dataArray = data[0]; int len = (dataArray != null) ? dataArray.length : 0; // There is no data to analyzed so return a zero length array. if (len == 0) { return new int[0]; } // Double the average distance between x-values double doubleAvgStepSize = 2.0 * Math.abs(dataArray[len - 1] - dataArray[0]) / (len - 1.0); // We declare a segment anytime we have a point that steps more // than two times the average step size away from the previous // point. ArrayList<Integer> segList = new ArrayList<Integer>(10); for (int i = 1; i < len; i++) { if (Math.abs(dataArray[i] - dataArray[i - 1]) > doubleAvgStepSize) { segList.add(new Integer(i - 1)); } } // If the segment list is empty then just return the index to the // last point since we only have one segment. if (segList.isEmpty()) { return new int[] { (len - 1) }; } // Make sure we include the index to the last data point. Integer lastSeg = segList.get(segList.size() - 1); if (lastSeg.intValue() != (len - 1)) { segList.add(new Integer(len - 1)); } // Create the integer array of the segment indexes. segmentIndexes = new int[segList.size()]; for (int i = 0; i < segmentIndexes.length; i++) { segmentIndexes[i] = segList.get(i).intValue(); } // Done with the list. segList.clear(); } else { // If data segmentation is disabled then just return the index // to the last item in the data array. if (data[0] != null) { segmentIndexes = new int[] { (data[0].length - 1) }; } } return (segmentIndexes != null) ? segmentIndexes : (new int[0]); } private boolean isSortedInAscendingOrder(double[] d) { if ((d == null) || (d.length <= 1)) { return true; } double[] tmp = new double[d.length]; System.arraycopy(d, 0, tmp, 0, d.length); Arrays.sort(tmp); return Arrays.equals(d, tmp); } /** * Use a default header name for the columns that did not have a header. */ protected void checkHeaders() { if (headers == null) { headers = new String[columnCnt]; } for (int i = 0; i < columnCnt; i++) { if ((headers[i] == null) || (headers[i].length() <= 0)) { headers[i] = "Column-" + (i + 1); } } } /** * Splits the line based on the delimiter. * * @param line the input line. * @param delim the delimiter for splitting. * @return the split up line. */ protected String[] splitLine(String line, char delim) { int len = line.length(); int pos = 0; char ch; boolean processingQuotes; StringBuffer strBuff = new StringBuffer(32); ArrayList<String> results = new ArrayList<String>(10); // Handle the special case of a comma delim and the line starting with // an // empty cell, which means the line started with a comma. if ((delim == ',') && line.startsWith(",")) { results.add(""); } while (pos < len) { // The start of a column. strBuff.setLength(0); if (delim == ',') { // Move past the delim character. if ((pos < len) && (line.charAt(pos) == delim)) { pos++; } } else { // Burn any leading delims while we are at the start of a // column. while ((pos < len) && (line.charAt(pos) == delim)) { pos++; } } // Burn any leading whitespace in the column that is not our delim. while ((pos < len) && ((ch = line.charAt(pos)) != delim) && Character.isWhitespace(ch)) { pos++; } // Extract the characters in the column until we reach the next // delim character or the end of the line. if (pos < len) { ch = line.charAt(pos); // If we find a " character at the start of this columns' data // then we // need to do special processing of the quotes including escaped // quotes. if (ch == '"') { processingQuotes = true; pos++; // Point to the next char after the first " // character while ((pos < len) && (processingQuotes || (line.charAt(pos) != delim))) { ch = line.charAt(pos); if (ch == '"') { // If the next character is a " then it was escaped. // (i.e. "") if ((pos + 1 < len) && (line.charAt(pos + 1) == '"')) { strBuff.append('"'); // escaped quote pos++; } else { // This is a quote by it's self, so we are // either done // processing quotes, or we need to start // processing again. processingQuotes = !processingQuotes; } } else { strBuff.append(ch); } pos++; } } else { // Column data is all the characters until the next delim or // end of line. while ((pos < len) && ((ch = line.charAt(pos)) != delim)) { strBuff.append(ch); pos++; } } results.add(strBuff.toString().trim()); } } // Handle the special case of a comma delim and the line ending with an // empty cell, which means the line ended with a comma. if ((delim == ',') && line.endsWith(",")) { results.add(""); } String[] returnVal = results.toArray(new String[results.size()]); results.clear(); return returnVal; } /** * Returns the character for the specified delimiter type. * * @param delimiterType the delimiter type. * @return the character for the delimiter type. */ protected char valueOf(int delimiterType) { switch (delimiterType) { case COMMA_DELIM: return ','; case TAB_DELIM: return '\t'; case NEWLINE_DELIM: return '\n'; case VERTICAL_TAB_DELIM: return VERTICAL_TAB; case FORM_FEED_DELIM: return '\f'; case CARRIAGE_RETURN_DELIM: return '\r'; case SPACE_DELIM: return ' '; } throw new IllegalArgumentException("Unknown Data Delimiter."); } private int lineUsesWhichDelim(String line) { if (characterCountFor(line, ',') > 0) { return COMMA_DELIM; } if (characterCountFor(line, '\t') > 0) { return TAB_DELIM; } if (characterCountFor(line, '\n') > 0) { return NEWLINE_DELIM; } if (characterCountFor(line, VERTICAL_TAB) > 0) { return VERTICAL_TAB_DELIM; } if (characterCountFor(line, '\f') > 0) { return FORM_FEED_DELIM; } if (characterCountFor(line, '\r') > 0) { return CARRIAGE_RETURN_DELIM; } if (characterCountFor(line, ' ') > 0) { return SPACE_DELIM; } return UNKNOWN_DELIM; } private int characterCountFor(String line, char ch) { int count = 0; int len = line.length(); for (int i = 0; i < len; i++) { if (line.charAt(i) == ch) { count++; } } return count; } // @return true if all the strings are valid numbers, false otherwise. private boolean isAllNumbers(String[] values) { int len = values.length; try { for (int i = 0; i < len; i++) { if (values[i].length() > 0) { numberFormat.parse(values[i]).doubleValue(); } } } catch (Exception e) { return false; } return true; } }