package org.tgdb.fileimport;
import java.io.*;
import java.util.*;
/**
* AbstractFileParser parses a file and stores its contents in a string
* array. The constructor takes a file name which is used to read input
* data. When the <I>Parse()</I> method is called, the actual parsing takes
* place. However, before the file is parsed, the parser validates that
* the file has valid contents. This is done by comparing the object- and
* format type name of the file with the file type definitions passed to
* the <I>Parse()</I> method. If the names found in the file matches at
* least one of the file type definitions in the passed vector, parsing
* will take place. If no matching definition is found, parsing will be
* cancelled.
* <P>
* During the parsing process, all rows of the file are read into a string
* array. Each row of the file is represented by one string. As the file is
* parsed, the number of actual data rows within the file are counted. A
* data row is a row which contain real data (blank lines, comments,
* headers etc are not counted). In order to verify if a row is a data row
* or not, the method <I>isDataRow</I> is used. The method does some basic
* verification of the row and tries to find out if it is a data
* row. Additional checks of the row might be added by overriding the
* method in subclasses. The AbstractFileParser object also knows the total
* number of rows in the file.
*
* <P> The header of the file is also parsed and the header data are stored
* within the object. This makes it possible to find out the object type,
* the format type name, the format version and the delimiter used in the
* input file. The parser expects a header with the following layout
*
* <P><CODE>
* objecttype name/formattype name/version/delimiter<BR>
* </CODE>
*
* <P>Objecttype name: The objecttype that is stored in the file (string)<BR>
* Formattype: The formattype that the data is structured in (string)<BR>
* Version: The version of the format for the objecttype (int) <BR>
* Delimiter: The character used to separate field in the file (char)<BR>
*
* <P>The input file is allowed to have comment rows. A comment row begin
* with the character defined in the private member mCommentChar.
*
* <P>When using the data read from the file, one has to be able to
* determine the index of the file row that the data was read
* from. Without this feature, it is impossible to report errors in the
* data in a correct way. To support this, the class implements a mapping
* table which maps data row numbers to file row numbers. If one encounter
* an error on a certain data row, one can call the <I>dataRow2FileRow</I>
* method with a data row number as parameter. The method returns the file
* row number that the data row is located at.
*
* <P>Finally, the class includes an abstract method called test() which
* can be used to test the parser. For example, the method can be
* implemented to print all the contents of the parser in order to verify
* it has read data correctly.
*
* <P>
* @author frob
*/
public abstract class AbstractFileParser {

    /**
     * The name of the file this object reads data from.
     */
    private String mFileName;

    /**
     * The contents of the file represented as a string array. Each line of
     * the file represents one row in the array. The array is
     * <CODE>null</CODE> until the file has been read successfully.
     */
    private String[] mFileData;

    //////////////////////////////////////////////////////////////////////
    //
    // Constructors
    //
    //////////////////////////////////////////////////////////////////////

    /**
     * Creates a new AbstractFileParser instance. The file is not opened
     * here; it is read when <I>Parse()</I> is called.
     *
     * @param fileName the file name to open for parsing
     * @exception AssertionException If no filename is given
     */
    public AbstractFileParser(String fileName)
        throws AssertionException {
        setFileName(fileName);
    }

    //////////////////////////////////////////////////////////////////////
    //
    // Public section
    //
    //////////////////////////////////////////////////////////////////////

    /**
     * Tests the object
     *
     * @exception FileParserException If error when accessing FileParser
     *            object
     */
    public abstract void test()
        throws FileParserException;

    /**
     * Parses the input file linked to this object by delegating to
     * <I>parseInputFile()</I>. After a successful call, the rows of the
     * file are available through <I>getFileRows()</I> and
     * <I>getFileData(int)</I>.
     *
     * @exception InputDataFileException If anything wrong with the input
     *            file.
     * @exception FileParserException Declared as part of the method's
     *            signature; the code in this method does not raise it
     *            itself.
     */
    public void Parse()
        throws InputDataFileException, FileParserException {
        parseInputFile();
    }

    //////////////////////////////////////////////////////////////////////
    //
    // Protected section
    //
    //////////////////////////////////////////////////////////////////////

    /**
     * This method parses the file and builds an array of string objects
     * containing all the data in the file.
     *
     * @throws org.tgdb.fileimport.InputDataFileException If file parsing
     *         fails. Any other exception raised while reading is wrapped
     *         in an InputDataFileException with the original exception
     *         attached as its cause.
     */
    protected void parseInputFile()
        throws InputDataFileException {
        try {
            // Read the inputfile into the string array
            readInputFile();
        } catch (InputDataFileException e) {
            // Already the right type: pass it on untouched so that its
            // cause and stack trace are not lost by re-wrapping.
            throw e;
        } catch (Exception e) {
            throw newInputDataFileException(e.getMessage(), e);
        }
    }

    /**
     * Returns the name of the file this object reads data from
     *
     * @return The name of the file
     */
    protected String getFileName() {
        return mFileName;
    }

    /**
     * Returns the total number of rows in the file. Must not be called
     * before the file has been parsed, as the file array does not exist
     * until then (a NullPointerException is the result).
     *
     * @return Number of rows in the file
     */
    protected int getFileRows() {
        return mFileData.length;
    }

    /**
     * Returns one row of the file. Must not be called before the file has
     * been parsed.
     *
     * @param row the index of the row in the file array (the first row
     *        has index 0)
     * @return a string of the row.
     */
    protected String getFileData(int row) {
        return mFileData[row];
    }

    //////////////////////////////////////////////////////////////////////
    //
    // Private section
    //
    //////////////////////////////////////////////////////////////////////

    /**
     * Sets the name of the file this object reads data from
     *
     * @param fileName the file name
     * @throws org.tgdb.fileimport.AssertionException If the file name is
     *         null or empty.
     */
    private void setFileName(String fileName)
        throws AssertionException {
        Assertion.assertMsg(fileName != null && fileName.length() > 0,
                            "No filename given, could not create the file parser");
        mFileName = fileName;
    }

    /**
     * Builds an InputDataFileException with the given message and attaches
     * the given exception as its cause, so the original failure is still
     * visible in stack traces.
     *
     * @param message The message of the new exception
     * @param cause The exception that triggered the failure
     * @return The new exception, ready to be thrown
     */
    private static InputDataFileException newInputDataFileException(
        String message, Throwable cause) {
        InputDataFileException wrapped = new InputDataFileException(message);
        wrapped.initCause(cause);
        return wrapped;
    }

    /**
     * If the last character on the given row is a CR, a string where the
     * CR is removed is returned. If there is no CR at the end (or the row
     * is empty), the original string is returned.
     *
     * @param fileRow A string from which a trailing CR should be removed.
     * @return The string with the trailing CR removed.
     */
    private String removeCR(String fileRow) {
        int lastIndex = fileRow.length() - 1;
        // Guard against an empty row; charAt(-1) would throw
        if (lastIndex >= 0 && fileRow.charAt(lastIndex) == '\r') {
            return fileRow.substring(0, lastIndex);
        }
        return fileRow;
    }

    /**
     * Reads from file!
     *
     * Opens the input file, reads it through
     * <I>readInputFile(FileInputStream, int)</I> and stores each line in
     * the file as a string in the file array. The stream is always closed
     * before the method returns, also when reading fails.
     *
     * @exception InputDataFileException if the file can not be opened, is
     *            too large to be held in memory, or can not be read
     */
    private void readInputFile()
        throws InputDataFileException {
        File inputFile = null;
        FileInputStream inputStream = null;
        try {
            inputFile = new File(getFileName());
            inputStream = new FileInputStream(inputFile);
        } catch (Exception e) {
            throw newInputDataFileException("Failed to read file", e);
        }

        try {
            // The whole file is read into one byte array, so its size must
            // fit in an int. Casting a larger value would silently
            // truncate it.
            long fileLength = inputFile.length();
            if (fileLength > Integer.MAX_VALUE) {
                throw new InputDataFileException("Input file (" +
                    getFileName() + ") is too large to be read into memory");
            }
            readInputFile(inputStream, (int) fileLength);
        } finally {
            try {
                inputStream.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails; the data
                // has either been read already or a more relevant
                // exception is on its way to the caller.
            }
        }
    }

    /**
     * Read from stream!
     *
     * Reads the input stream and stores each line in the "file" as a
     * string in the file array. Each line added to the array is 'cleaned'
     * from LF/CR characters. The bytes are decoded with the platform
     * default character set. The stream is not closed by this method.
     *
     * @param inputStream The stream to read from
     * @param length The number of bytes to read
     * @exception InputDataFileException if the input is empty or an I/O
     *            error occurs
     */
    private void readInputFile(FileInputStream inputStream, int length)
        throws InputDataFileException {
        try {
            // Create a byte array with the same size as the file and read
            // the file into the array. If the size of the file is 0, an
            // exception will be raised.
            byte[] inputAsByte = new byte[length];
            Assertion.assertMsg(inputAsByte.length > 0, "Given input file (" +
                                getFileName() + ") is empty");

            // A single read() call is allowed to deliver fewer bytes than
            // requested, so keep reading until the array is full or the
            // end of the stream is reached.
            int bytesRead = 0;
            while (bytesRead < inputAsByte.length) {
                int count = inputStream.read(inputAsByte, bytesRead,
                                             inputAsByte.length - bytesRead);
                if (count < 0) {
                    break;
                }
                bytesRead += count;
            }

            // Build a string from the bytes actually read (so that no
            // unfilled, zero-valued bytes end up in the data) and convert
            // it to Dos format
            String inputAsStr = new String(inputAsByte, 0, bytesRead);
            inputAsStr = convertToDOS(inputAsStr);

            // The contents of the file is now stored in inputAsStr. The
            // input should now be split into rows and each row should be
            // added to the file array (mFileData). To do this, we use a
            // tokenizer. Each token will represent one row in the file. A
            // row in the file ends with two chars, CR (13, \r) and LF (10,
            // \n). However, we can't use that pattern as separator in the
            // tokenizer, as this will remove any empty lines (which only
            // contains CR and LF). Hence we use just the LF as the
            // separator in the tokenizer. The CR remains on the line and is
            // removed before the row is added to the file array
            StringTokenizer tokenizer =
                new StringTokenizer(inputAsStr, "\n", false);

            // Build the rows in a local array with the same size as there
            // are tokens (eg rows in the file), removing the CR at the end
            // of each row. The file array is only replaced once all rows
            // have been collected.
            String[] rows = new String[tokenizer.countTokens()];
            for (int rowIndex = 0; rowIndex < rows.length; rowIndex++) {
                rows[rowIndex] = removeCR(tokenizer.nextToken());
            }
            setFileData(rows);
        } catch (FileNotFoundException e) {
            throw newInputDataFileException(
                "File not found: " + getFileName(), e);
        } catch (IOException e) {
            throw newInputDataFileException(
                "I/O exception when reading file " + getFileName(), e);
        } catch (AssertionException e) {
            throw newInputDataFileException(e.getMessage(), e);
        }
    }

    /**
     * Initialises the file array with the given array
     *
     * @param data The array to initialise the file array with
     */
    private void setFileData(String[] data) {
        mFileData = data;
    }

    /**
     * Converts all line breaks of a string to look like DOS line breaks.
     * If the string contains at least one DOS line break it is assumed to
     * be in DOS format already and is returned unchanged.
     *
     * @param originalString The string to convert.
     * @return The converted string with all line breaks converted to DOS
     *         line breaks.
     */
    private String convertToDOS(String originalString) {
        final String DOS_LINE_BREAK = "\r\n";
        final char UNIX_LINE_BREAK = '\n';  // newline, 10
        final char MAC_LINE_BREAK = '\r';   // carriage return, 13

        // Look for a Dos line break. If found we assume the string is in
        // Dos format already. Return the string as it is
        if (originalString.indexOf(DOS_LINE_BREAK) > -1) {
            return originalString;
        }

        // Replace all Mac line breaks with Unix line breaks
        String unixString =
            originalString.replace(MAC_LINE_BREAK, UNIX_LINE_BREAK);

        // No line breaks at all, nothing more to convert
        if (unixString.indexOf(UNIX_LINE_BREAK) < 0) {
            return unixString;
        }

        // A Dos line break contains two chars, carriage return (\r, 13)
        // and newline (\n, 10). To convert a Unix line break to Dos, we
        // add a carriage return before each newline.
        StringBuffer dosString = new StringBuffer(unixString.length() + 16);
        for (int i = 0; i < unixString.length(); i++) {
            char current = unixString.charAt(i);
            if (current == UNIX_LINE_BREAK) {
                dosString.append(MAC_LINE_BREAK);
            }
            dosString.append(current);
        }
        return dosString.toString();
    }
}