TextDataFileParser.java example

Explorer
MDE-Web-Service-Front-End-master
- src
/* 
 * Copyright 2006, United States Government as represented by the Administrator
 * for the National Aeronautics and Space Administration. No copyright is
 * claimed in the United States under Title 17, U.S. Code. All Other Rights
 * Reserved. 
 * 
 * Created on Apr 23, 2004
 */
package gov.nasa.ial.mde.io;

import gov.nasa.ial.mde.solver.symbolic.AnalyzedData;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * The <code>TextDataFileParser</code> will parse a text data file with an
 * optional text header describing each column of data and with the data
 * arranged in rows and columns. The result of parsing the text data file is a
 * <code>List</code> of <code>AnalyzedData</code> items where each entry in
 * the <code>List</code> corresponds to one segment of data and where each
 * segment corresponds to the chunk of data between gaps in the values in the
 * first column of data. The <code>TextDataFileParser</code> will analyze the
 * text data file to determine what character is consistently used as a
 * delimiter between columns of data. This allows the
 * <code>TextDataFileParser</code> to parse most any text data file provided
 * that the delimiter is consistently used throughout the file.
 * <p>
 * The <code>TextDataFileParser</code> can also read text data files in the
 * Comma Separated Values (CSV) format, which is compatible with the CSV file
 * format used by Microsoft Excel.
 * <p>
 * Here is how the <code>TextDataFileParser</code> works:
 * <ol>
 * <li>The text data file must have at least two columns.</li>
 * <li>If a text header exists, the number of columns in the text headers
 * must match the number of columns for the data.</li>
 * <li>It will skip blank lines in the header and data fields.</li>
 * <li>If a data cell is empty (for a given row and column, nothing is
 * specified but there are valid delimiters) it will use a value of 0.0 for it.</li>
 * <li>The first column of data must be in ascending order.</li>
 * <li>Duplicate entries in the first column of data are allowed.</li>
 * <li>If the gap between first column data values is greater than 2 times the
 * average difference between the values then a new segment of data is created.</li>
 * </ol>
 * 
 * @author Dan Dexter
 * @version 1.0
 * @since 1.0
 */
public class TextDataFileParser implements FileParser {

	/** Message used whenever a row's column count differs from the expected count. */
	private static final String INCONSISTENT_COLUMNS_MSG =
			"Inconsistent number of columns in the data file.";

	/** When true, the data is split into segments at large gaps in the first column. */
	private static final boolean ENABLE_DATA_SEGMENTATION = true;

	/** A reference to the file; set to <code>null</code> by {@link #dispose()}. */
	File file;

	/** The number of columns of data. */
	int columnCnt;

	/** The number of (non-blank) rows of text header. */
	int headerRowCnt;

	/** The number of (non-blank) rows of data. */
	int dataRowCnt;

	/** The type of the delimiter (one of the <code>*_DELIM</code> constants). */
	int delimType;

	/** The text headers, one per column. */
	String[] headers;

	/** The parsed values, indexed as <code>data[column][row]</code>. */
	private double[][] data;

	/** Number parser for the default locale; set to <code>null</code> by {@link #dispose()}. */
	private NumberFormat numberFormat = NumberFormat.getInstance();

	/**
	 * Default constructor not allowed.
	 */
	@SuppressWarnings("unused")
	private TextDataFileParser() {
		throw new RuntimeException("Default constructor not allowed.");
	}

	/**
	 * Creates an instance of <code>TextDataFileParser</code> that will parse
	 * the specified <code>filename</code> for the data.
	 * 
	 * @param filename
	 *            the specified name of the text file to parse the data from
	 */
	public TextDataFileParser(String filename) {
		this(new File(filename));
	}

	/**
	 * Creates an instance of <code>TextDataFileParser</code> that will parse
	 * the specified <code>File</code> object for the data.
	 * 
	 * @param file
	 *            the specified <code>File</code> object of the text file to
	 *            parse the data from
	 * @throws NullPointerException
	 *             if <code>file</code> is <code>null</code>
	 * @throws IllegalArgumentException
	 *             if <code>file</code> does not exist or is not a normal file
	 */
	public TextDataFileParser(File file) {
		if (file == null) {
			throw new NullPointerException("Null file.");
		}
		if (!file.exists() || !file.isFile()) {
			throw new IllegalArgumentException(
					"The specified file does not exist or is an invalid file.");
		}
		this.file = file;
		clear();
	}

	/**
	 * Parse the file and return a <code>List</code> of
	 * <code>AnalyzedData</code> items, where each entry in the
	 * <code>List</code> corresponds to one segment of data.
	 * 
	 * @return a <code>List</code> of <code>AnalyzedData</code> items
	 * @throws IOException
	 *             is thrown for file Input/Output errors
	 * @throws ParseException
	 *             is thrown for parse errors
	 * @throws IllegalArgumentException
	 *             if the file has fewer than two columns, has rows with
	 *             differing column counts, contains no data, or its first
	 *             column is not in ascending order
	 * @throws IllegalStateException
	 *             if this parser has already been disposed
	 * @see gov.nasa.ial.mde.solver.symbolic.AnalyzedData
	 */
	public List<AnalyzedData> parse() throws IOException, ParseException {
		// dispose() drops both of these; fail with a clear message instead
		// of a NullPointerException from deep inside the file readers.
		if ((file == null) || (numberFormat == null)) {
			throw new IllegalStateException("Parser has been disposed.");
		}

		analyzeFile();

		if (columnCnt < 2) {
			throw new IllegalArgumentException("Data file must have at least two columns of data.");
		}

		// Parse the file for the column header information.
		parseFileForHeader();

		// Use a default header name for the columns that do not have a header.
		checkHeaders();

		// Parse the file for the data.
		parseFileForData();

		// Make sure we have some data.
		if ((data == null) || (data.length <= 0) || (data[0].length <= 0)) {
			throw new IllegalArgumentException("Data file does not contain any data.");
		}

		// For now, we only support the first column values being sorted in
		// ascending order.
		if (!isSortedInAscendingOrder(data[0])) {
			throw new IllegalArgumentException(
					"Data values in the first column must be sorted in ascending order.");
		}

		return getAnalyzedDataList();
	}

	/* (non-Javadoc)
	 * @see gov.nasa.ial.mde.io.FileParser#clear()
	 */
	public void clear() {
		this.columnCnt = 0;
		this.headerRowCnt = 0;
		this.dataRowCnt = 0;
		this.delimType = UNKNOWN_DELIM;
		this.headers = null;
		this.data = null;
	}

	/* (non-Javadoc)
	 * @see gov.nasa.ial.mde.io.FileParser#dispose()
	 */
	public void dispose() {
		clear();
		this.numberFormat = null;
		this.file = null;
	}

	/**
	 * Analyze the file to determine how many columns and rows of header and
	 * data there are. It also verifies that there is a consistent number of
	 * columns of data. Blank lines are ignored and are not counted as rows.
	 * <p>
	 * The delimiter is detected from the first non-blank line. The header ends
	 * at the first row whose non-empty cells all parse as numbers; every
	 * non-blank row from there on is counted as a data row.
	 * 
	 * @throws IOException thrown if the file could not be read.
	 * @throws IllegalArgumentException if no delimiter could be determined or
	 *             a row has a different number of columns than the first row.
	 */
	protected void analyzeFile() throws IOException {
		boolean findColCnt = true;
		boolean findHeader = true;
		int row = 0;
		char delimValue = ',';

		columnCnt = 0;
		headerRowCnt = 0;
		dataRowCnt = 0;
		delimType = UNKNOWN_DELIM;

		BufferedReader in = null;
		try {
			in = new BufferedReader(new FileReader(file));

			String line;
			while ((line = nextNonBlankLine(in)) != null) {
				// Determine what delimiter is being used.
				if (delimType == UNKNOWN_DELIM) {
					delimType = lineUsesWhichDelim(line);
					delimValue = valueOf(delimType);
				}

				String[] columnValues = splitLine(line, delimValue);

				// splitLine is overridable, so guard against null before
				// touching the array.
				if (columnValues == null) {
					throw new IllegalArgumentException(INCONSISTENT_COLUMNS_MSG);
				}

				// The first non-blank row defines the expected column count.
				if (findColCnt) {
					findColCnt = false;
					columnCnt = columnValues.length;
				}

				// Do a consistency check against the expected number of
				// columns.
				if (columnValues.length != columnCnt) {
					throw new IllegalArgumentException(INCONSISTENT_COLUMNS_MSG);
				}

				// We have reached the end of the header once we find a row of
				// all numbers.
				if (findHeader) {
					if (isAllNumbers(columnValues)) {
						findHeader = false;
						headerRowCnt = row;
						dataRowCnt++;
					}
				} else {
					dataRowCnt++;
				}
				row++;
			}

			// just in case it's all headers
			if (findHeader) {
				headerRowCnt = row;
			}
		} finally {
			closeQuietly(in);
		}
	}

	/**
	 * Reads the first <code>headerRowCnt</code> non-blank rows and builds one
	 * header string per column. When the header spans several rows, the cells
	 * of a column are joined with a single space.
	 * 
	 * @throws IOException thrown if the file could not be read.
	 */
	private void parseFileForHeader() throws IOException {
		char delimValue = valueOf(delimType);

		headers = new String[columnCnt];
		for (int i = 0; i < columnCnt; i++) {
			headers[i] = "";
		}

		BufferedReader in = null;
		try {
			in = new BufferedReader(new FileReader(file));

			String line;
			int row = 0;
			while ((row < headerRowCnt) && ((line = nextNonBlankLine(in)) != null)) {
				String[] columnValues = splitAndCheckColumns(line, delimValue);

				for (int col = 0; col < columnCnt; col++) {
					String cell = columnValues[col];
					if (cell.length() > 0) {
						if ((headers[col] == null) || (headers[col].length() <= 0)) {
							headers[col] = cell;
						} else {
							// Just append the column value to the header text.
							headers[col] += " " + cell;
						}
					}
				}
				row++;
			}
		} finally {
			closeQuietly(in);
		}
	}

	/**
	 * Skips the header rows and parses every remaining non-blank row into
	 * <code>data[column][row]</code>. An empty cell is stored as 0.0.
	 * 
	 * @throws IOException thrown if the file could not be read.
	 * @throws ParseException thrown if a non-empty cell is not a number.
	 */
	private void parseFileForData() throws IOException, ParseException {
		char delimValue = valueOf(delimType);

		data = new double[columnCnt][dataRowCnt];

		BufferedReader in = null;
		try {
			in = new BufferedReader(new FileReader(file));

			// Burn through the header to get to the line with the data on it.
			// Only non-blank lines count as header rows.
			int skipped = 0;
			while ((skipped < headerRowCnt) && (nextNonBlankLine(in) != null)) {
				skipped++;
			}

			String line;
			int row = 0;
			while ((line = nextNonBlankLine(in)) != null) {
				String[] columnValues = splitAndCheckColumns(line, delimValue);

				for (int col = 0; col < columnCnt; col++) {
					String cell = columnValues[col];
					// Use 0.0 for an empty/blank cell.
					data[col][row] = (cell.length() > 0)
							? numberFormat.parse(cell).doubleValue()
							: 0.0;
				}
				row++;
			}
		} finally {
			closeQuietly(in);
		}
	}

	/**
	 * Builds the result list: one <code>AnalyzedData</code> per dependent
	 * column (every column but the first) and per data segment. The first
	 * column supplies the independent values for every entry.
	 * 
	 * @return the list of analyzed data, ordered by column and then segment.
	 */
	private List<AnalyzedData> getAnalyzedDataList() {
		if (columnCnt < 2) {
			throw new IllegalArgumentException("Data file must have at least two columns of data.");
		}

		int[] segmentIndexes = calcSegmentIndexes();
		int segCount = (segmentIndexes != null) ? segmentIndexes.length : 0;
		int initialCapacity = Math.max(1, (segCount * (columnCnt - 1)));
		List<AnalyzedData> analyzedDataList = new ArrayList<AnalyzedData>(initialCapacity);

		if (segCount <= 1) {
			// One data segment means that there were no breaks/holes in the
			// data, so the full columns are handed over as they are.
			for (int col = 1; col < columnCnt; col++) {
				analyzedDataList.add(new AnalyzedData(headers[0], headers[col], data[0],
						data[col]));
			}
			return analyzedDataList;
		}

		// Create an analyzed data object for each segment and column of the
		// data excluding the first column (independent variable).
		for (int col = 1; col < columnCnt; col++) {
			for (int s = 0; s < segCount; s++) {
				// Each entry of segmentIndexes is the index of the LAST row of
				// a segment, so a segment starts right after the previous one.
				int segStartIndex = (s > 0) ? (segmentIndexes[s - 1] + 1) : 0;
				int segLength = (segmentIndexes[s] - segStartIndex) + 1;

				if (segLength > 0) {
					double[] dataSeg1 = new double[segLength];
					double[] dataSeg2 = new double[segLength];

					System.arraycopy(data[0], segStartIndex, dataSeg1, 0, segLength);
					System.arraycopy(data[col], segStartIndex, dataSeg2, 0, segLength);

					analyzedDataList.add(new AnalyzedData(headers[0], headers[col], dataSeg1,
							dataSeg2));
				}
			}
		}
		return analyzedDataList;
	}

	/**
	 * Determines where the breaks in the data are if the data is segmented.
	 * 
	 * @return the index of the last row of each segment, in ascending order;
	 *         a zero length array if there is no data.
	 */
	private int[] calcSegmentIndexes() {
		double[] dataArray = data[0];
		int len = (dataArray != null) ? dataArray.length : 0;

		// There is no data to analyze so return a zero length array.
		if (len == 0) {
			return new int[0];
		}

		// If data segmentation is disabled then just return the index to the
		// last item in the data array.
		if (!ENABLE_DATA_SEGMENTATION) {
			return new int[] { (len - 1) };
		}

		// Double the average distance between x-values. For a single row this
		// is NaN, which is harmless because the loop below does not execute.
		double doubleAvgStepSize = 2.0 * Math.abs(dataArray[len - 1] - dataArray[0]) / (len - 1.0);

		// We declare a segment anytime we have a point that steps more
		// than two times the average step size away from the previous
		// point.
		List<Integer> segList = new ArrayList<Integer>(10);
		for (int i = 1; i < len; i++) {
			if (Math.abs(dataArray[i] - dataArray[i - 1]) > doubleAvgStepSize) {
				segList.add(Integer.valueOf(i - 1));
			}
		}

		// The final segment always ends at the last data point. The loop above
		// records at most index (len - 2), so this never adds a duplicate.
		segList.add(Integer.valueOf(len - 1));

		// Create the integer array of the segment indexes.
		int[] segmentIndexes = new int[segList.size()];
		for (int i = 0; i < segmentIndexes.length; i++) {
			segmentIndexes[i] = segList.get(i).intValue();
		}
		return segmentIndexes;
	}

	/**
	 * Checks that the values never decrease from one element to the next.
	 * The comparison uses {@link Double#compare(double, double)}, so -0.0 is
	 * ordered before 0.0 and NaN after every other value.
	 * 
	 * @param d the values to check, may be <code>null</code>.
	 * @return true if <code>d</code> is null, has fewer than two elements, or
	 *         is in ascending (non-decreasing) order.
	 */
	private boolean isSortedInAscendingOrder(double[] d) {
		if ((d == null) || (d.length <= 1)) {
			return true;
		}
		for (int i = 1; i < d.length; i++) {
			if (Double.compare(d[i - 1], d[i]) > 0) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Use a default header name for the columns that did not have a header.
	 * The default name is <code>Column-N</code>, where N is the 1-based column
	 * number.
	 */
	protected void checkHeaders() {
		if (headers == null) {
			headers = new String[columnCnt];
		}
		for (int i = 0; i < columnCnt; i++) {
			if ((headers[i] == null) || (headers[i].length() <= 0)) {
				headers[i] = "Column-" + (i + 1);
			}
		}
	}

	/**
	 * Splits the line based on the delimiter.
	 * <p>
	 * With a comma delimiter every comma separates two cells, so leading,
	 * trailing and consecutive commas yield empty cells. With any other
	 * delimiter a run of consecutive delimiters is treated as one separator.
	 * A cell that starts with a double quote may contain the delimiter; inside
	 * it, two double quotes in a row stand for one literal double quote. Each
	 * returned cell is trimmed.
	 * 
	 * @param line the input line.
	 * @param delim the delimiter for splitting.
	 * @return the split up line.
	 */
	protected String[] splitLine(String line, char delim) {
		final int len = line.length();
		final boolean commaDelim = (delim == ',');
		int pos = 0;
		char ch;
		StringBuilder cell = new StringBuilder(32);
		List<String> results = new ArrayList<String>(10);

		// Handle the special case of a comma delim and the line starting with
		// an empty cell, which means the line started with a comma.
		if (commaDelim && line.startsWith(",")) {
			results.add("");
		}

		while (pos < len) {
			// The start of a column.
			cell.setLength(0);

			if (commaDelim) {
				// Move past exactly one delim character so that consecutive
				// commas produce empty cells.
				if (line.charAt(pos) == delim) {
					pos++;
				}
			} else {
				// Burn any leading delims while we are at the start of a
				// column.
				while ((pos < len) && (line.charAt(pos) == delim)) {
					pos++;
				}
			}

			// Burn any leading whitespace in the column that is not our delim.
			while ((pos < len) && ((ch = line.charAt(pos)) != delim) && Character.isWhitespace(ch)) {
				pos++;
			}

			// Extract the characters in the column until we reach the next
			// delim character or the end of the line.
			if (pos < len) {
				if (line.charAt(pos) == '"') {
					// A " character at the start of this column's data needs
					// special processing of the quotes including escaped
					// quotes.
					boolean processingQuotes = true;
					pos++; // Point to the next char after the first " character

					while ((pos < len) && (processingQuotes || (line.charAt(pos) != delim))) {
						ch = line.charAt(pos);

						if (ch == '"') {
							if ((pos + 1 < len) && (line.charAt(pos + 1) == '"')) {
								// The next character is a " so it was escaped.
								// (i.e. "")
								cell.append('"');
								pos++;
							} else {
								// This is a quote by itself, so we are either
								// done processing quotes, or we need to start
								// processing again.
								processingQuotes = !processingQuotes;
							}
						} else {
							cell.append(ch);
						}
						pos++;
					}
				} else {
					// Column data is all the characters until the next delim or
					// end of line.
					while ((pos < len) && ((ch = line.charAt(pos)) != delim)) {
						cell.append(ch);
						pos++;
					}
				}
				results.add(cell.toString().trim());
			}
		}

		// Handle the special case of a comma delim and the line ending with an
		// empty cell, which means the line ended with a comma.
		if (commaDelim && line.endsWith(",")) {
			results.add("");
		}

		return results.toArray(new String[results.size()]);
	}

	/**
	 * Returns the character for the specified delimiter type.
	 * 
	 * @param delimiterType the delimiter type.
	 * @return the character for the delimiter type.
	 * @throws IllegalArgumentException if the delimiter type is not one of the
	 *             known types.
	 */
	protected char valueOf(int delimiterType) {
		switch (delimiterType) {
		case COMMA_DELIM:
			return ',';
		case TAB_DELIM:
			return '\t';
		case NEWLINE_DELIM:
			return '\n';
		case VERTICAL_TAB_DELIM:
			return VERTICAL_TAB;
		case FORM_FEED_DELIM:
			return '\f';
		case CARRIAGE_RETURN_DELIM:
			return '\r';
		case SPACE_DELIM:
			return ' ';
		default:
			throw new IllegalArgumentException("Unknown Data Delimiter.");
		}
	}

	/**
	 * Detects the delimiter used by a line. The candidates are tested in a
	 * fixed priority order and the first one that occurs anywhere in the line
	 * wins, so a line containing both a comma and a space is comma delimited.
	 * 
	 * @param line the line to inspect.
	 * @return the delimiter type, or <code>UNKNOWN_DELIM</code> if the line
	 *         contains none of the candidate characters.
	 */
	private int lineUsesWhichDelim(String line) {
		if (line.indexOf(',') >= 0) {
			return COMMA_DELIM;
		}
		if (line.indexOf('\t') >= 0) {
			return TAB_DELIM;
		}
		if (line.indexOf('\n') >= 0) {
			return NEWLINE_DELIM;
		}
		if (line.indexOf(VERTICAL_TAB) >= 0) {
			return VERTICAL_TAB_DELIM;
		}
		if (line.indexOf('\f') >= 0) {
			return FORM_FEED_DELIM;
		}
		if (line.indexOf('\r') >= 0) {
			return CARRIAGE_RETURN_DELIM;
		}
		if (line.indexOf(' ') >= 0) {
			return SPACE_DELIM;
		}
		return UNKNOWN_DELIM;
	}

	/**
	 * Tests whether every non-empty string parses as a number. Empty strings
	 * are accepted because empty cells are later read as 0.0.
	 * <p>
	 * NOTE(review): <code>NumberFormat.parse(String)</code> is documented as
	 * not necessarily consuming the whole string, so text with a numeric
	 * prefix (for example "2nd") presumably counts as a number here. Confirm
	 * whether header rows like that need stricter handling.
	 * 
	 * @param values the cell values of one row.
	 * @return true if all the strings are valid numbers, false otherwise.
	 */
	private boolean isAllNumbers(String[] values) {
		try {
			for (int i = 0; i < values.length; i++) {
				if (values[i].length() > 0) {
					numberFormat.parse(values[i]);
				}
			}
		} catch (ParseException e) {
			return false;
		}
		return true;
	}

	/**
	 * Splits a line and verifies that it has exactly <code>columnCnt</code>
	 * cells.
	 * 
	 * @param line the trimmed, non-blank input line.
	 * @param delimValue the delimiter character.
	 * @return the cells of the line.
	 * @throws IllegalArgumentException if the column count does not match.
	 */
	private String[] splitAndCheckColumns(String line, char delimValue) {
		String[] columnValues = splitLine(line, delimValue);
		if ((columnValues == null) || (columnValues.length != columnCnt)) {
			throw new IllegalArgumentException(INCONSISTENT_COLUMNS_MSG);
		}
		return columnValues;
	}

	/**
	 * Reads lines until one is found that is not blank after trimming.
	 * 
	 * @param in the reader to read from.
	 * @return the trimmed line, or <code>null</code> at the end of the file.
	 * @throws IOException thrown if the file could not be read.
	 */
	private static String nextNonBlankLine(BufferedReader in) throws IOException {
		String line;
		while ((line = in.readLine()) != null) {
			line = line.trim();
			if (line.length() > 0) {
				return line;
			}
		}
		return null;
	}

	/**
	 * Closes the reader, ignoring any failure to do so.
	 * 
	 * @param in the reader to close, may be <code>null</code>.
	 */
	private static void closeQuietly(BufferedReader in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException ignored) {
			// Deliberately ignored: the reader was only used for reading, and
			// throwing from the finally block would mask the exception (if
			// any) that ended the read.
		}
	}

}