AbstractFileParser.java example

Explorer
crezoo-master
/*
  Copyright (C) 2000 by Prevas AB. All rights reserved.

  $Log$
  Revision 1.4  2004/02/04 08:37:17  heto
  Test av blobar..

  Revision 1.3  2003/05/02 07:58:45  heto
  Changed the package structure from se.prevas.arexis.XYZ to se.arexis.agdb.XYZ
  Modified configuration and source files according to package change.

  Revision 1.2  2002/10/18 11:41:26  heto
  Replaced Assertion.assert with Assertion.assertMsg

  Java 1.4 have a keyword "assert".

  Revision 1.1.1.1  2002/10/16 18:14:07  heto
  Import of aGDB 1.5 L3 from Prevas CVS-tree.
  This version of aGDB is migrated to Tomcat from JServ by Tobias Hermansson


  Revision 1.2  2001/05/31 05:39:41  frob
  Now accepts a delimiter with more than one char.

  Revision 1.1  2001/04/24 09:34:18  frob
  Moved file import classes to new package se.prevas.arexis.util.FileImport,
  caused updates in several files.

  Revision 1.2  2001/04/24 06:31:44  frob
  Checkin after merging frob_fileparser branch.

  Revision 1.1.2.3  2001/04/19 09:59:59  frob
  Major changes: Parse() now requires a vector with file type definitions.
                 New static method for scanning file header: scanFileHeader()
                 Enhanced handling of rows with convertToDOS(), which converts
                 Unix and Mac rows to Dos format.
  Minor changes: Some methods renamed, made protected/private, etc.

  Revision 1.1.2.2  2001/04/03 10:35:14  frob
  Removed some old comments.

  Revision 1.1.2.1  2001/04/03 10:30:20  frob
  Class created as part of the new hierachy for parser classes.


*/

package se.arexis.agdb.util.FileImport;

import java.io.*;
import java.util.*; 
import se.arexis.agdb.util.*;


/**
 * AbstractFileParser parses a file and stores its contents in a string
 * array. The constructor takes a file name which is used to read input
 * data. When the <I>Parse()</I> method is called, the actual parsing takes
 * place. However, before the file is parsed, the parser validates that
 * the file has valid contents. This is done by comparing the object- and
 * format type name of the file with the file type definitions passed to
 * the <I>Parse()</I> method. If the names found in the file match at
 * least one of the file type definitions in the passed vector, parsing
 * will take place. If no matching definition is found, parsing will be
 * cancelled.
 * <P>
 * During the parsing process, all  rows of the file are read into a string
 * array. Each row of the file is represented by one string. As the file is
 * parsed, the number of actual data rows within the file are counted. A
 * data row is a row which contain real data (blank lines, comments,
 * headers etc are not counted). In order to verify if a row is a data row
 * or not, the method <I>isDataRow</I> is used. The method does some basic
 * verification of the row and tries to find out if it is a data
 * row. Additional checks of the row might be added by overriding the
 * method in subclasses. The AbstractFileParser object also knows the total
 * number of rows in the file. 
 *
 * <P> The header of the file is also parsed and the header data are stored 
 * within the object. This makes it possible to find out the object type,
 * the format type name, the format version and the delimiter used in the
 * input file. The parser expects a header with the following layout 
 *
 * <P><CODE>
 *    objecttype name/formattype name/version/delimiter<BR>
 * </CODE>
 *
 * <P>Objecttype name: The objecttype that is stored in the file (string)<BR>
 * Formattype: The formattype that the data is structured in (string)<BR>
 * Version: The version of the format for the objecttype (int)<BR>
 * Delimiter: The character used to separate fields in the file (char)<BR>
 *
 * <P>The input file is allowed to have comment rows. A comment row begins
 * with the character defined in the private member mCommentChar.
 *
 * <P>When using the data read from the file, one has to be able to
 * determine the index of the file row that the data was read
 * from. Without this feature, it is impossible to report errors in the
 * data in a correct way. To support this, the class implements a mapping
 * table which maps data row numbers to file row numbers. If one encounters
 * an error on a certain data row, one can call the <I>dataRow2FileRow</I>
 * method with a data row number as parameter. The method returns the file
 * row number that the data row is located at.
 *
 * <P>Finally, the class includes an abstract method called test() which
 * can be used to test the parser. For example, the method can be
 * implemented to print all the contents of the parser in order to verify
 * it has read data correctly.
 *
 * <P>
 * @author frob
 */
public abstract class AbstractFileParser
{
   /**
    * The name of the file this object reads data from. Set through the
    * private <I>fileName(String)</I> setter, which is called by the
    * constructor taking a file name.
    */
   private String mFileName;

   /**
    * The name of the objecttype the data belongs to. Read from the
    * header row of the data file.
    */
   private String mObjectTypeName;

   /**
    * The name of the formattype of the data. Read from the header row of
    * the data file.
    */
   private String mFormatTypeName;

   /**
    * The version of the file format. Read from the header row of the
    * data file.
    */
   private int mFormatVersion;
   
   /**
    * The delimiter used to separate the columns in the data file. Read
    * from the header row of the file.
    */
   private Character mFieldDelimiter;

   /**
    * The char used to comment lines in the data file. A row whose first
    * character equals this char is not treated as a data row.
    */
   private static final char mCommentChar = '#';

   /**
    * The contents of the file represented as a string array. Each line of
    * the file is stored as one element of the array.
    */
   private String[] mFileData;

   /**
    * The number of rows in the input file that actually contain
    * data. This is <I>not</I> the same as the total number of rows in the
    * file.
    */
   private int mDataRows = 0;
   

   /**
    * The mapping table that maps data rows to file rows. The index is
    * the zero-based data row number and the stored value is the
    * zero-based file row number.
    */
   private int[] mDataRow2FileRow;
   

   //////////////////////////////////////////////////////////////////////
   //
   // Constructors
   //
   //////////////////////////////////////////////////////////////////////

   
   /**
    * Empty constructor, should never be used directly
    *
    */
   public AbstractFileParser()
   {
      super();
   }
   
   
   /**
    * Creates a parser that is linked to the given input file.
    *
    * @param fileName The name of the input file this object should read
    *                 data from.
    * @exception AssertionException If no filename is given (null or
    *            empty string).
    */
   public AbstractFileParser(String fileName)
      throws AssertionException
   {
      // The private setter validates the name before storing it.
      this.fileName(fileName);
   }
   

   //////////////////////////////////////////////////////////////////////
   //
   // Public section
   //
   //////////////////////////////////////////////////////////////////////


   /**
    * Tests the object. Subclasses decide what the test does; it can, for
    * example, print the contents held by the parser so that one can
    * verify the file was read correctly.
    *
    * @exception FileParserException If an error occurs when accessing
    *            the FileParser object.
    */
   public abstract void test()
      throws FileParserException;


   /**
    * Validates and parses the input file linked to this object. The file
    * type is checked first: the object- and format type name found in the
    * file header must match one of the file type definitions in the given
    * vector. Only then is the file read and parsed.
    *
    * @param fileTypeDefinitions A vector with valid file type definitions.
    * @exception InputDataFileException If anything is wrong with the
    *            input file.
    * @exception FileParserException If no file type definitions are
    *            passed (null or empty vector).
    */
   public void Parse(Vector fileTypeDefinitions)
      throws InputDataFileException, FileParserException
   {
      final String noVectorMessage =
         "An Vector with known file type definitions " +
         "has to be passed to the Parse()-method";
      final String emptyVectorMessage =
         "Vector with file type definitions passed to " +
         "Parse()-method contains no file type definitions.";

      try
      {
         // A non-empty vector of definitions is a precondition
         Assertion.assertMsg(fileTypeDefinitions != null, noVectorMessage);
         Assertion.assertMsg(!fileTypeDefinitions.isEmpty(),
                             emptyVectorMessage);

         // First make sure the file is of a known type, then parse it
         validateFileType(fileTypeDefinitions);
         parseInputFile();
      }
      catch (AssertionException assertionFailure)
      {
         final String reason = assertionFailure.getMessage();
         throw new FileParserException("Error parsing file: " + reason);
      }
   }

   
   /**
    * Gets the objecttype name that was read from the file header.
    *
    * @return The name of the object type the data belongs to.
    */
   public String objectTypeName()
   {
      return this.mObjectTypeName;
   }


   /**
    * Gets the formattype name that was read from the file header.
    *
    * @return The name of the formattype the file is structured in.
    */
   public String formatTypeName()
   {
      return this.mFormatTypeName;
   }
   

   /**
    * Gets the file format version that was read from the file header.
    *
    * @return The version of the format.
    */
   public int formatVersion()
   {
      return this.mFormatVersion;
   }
   

   /**
    * Gets the delimiter that separates the data fields in the file, as
    * read from the file header.
    *
    * @return The field delimiter.
    */
   public Character fieldDelimiter()
   {
      return this.mFieldDelimiter;
   }
   

   /**
    * Gets the number of data rows in the file. Rows that do not contain
    * data are not included in the count.
    *
    * @return The number of data rows.
    */
   public int dataRows() 
   {
      return this.mDataRows;
   }


   /**
    * Returns the row number in the file that the data row with the given
    * number is located at. The method is zero-based, which means that the
    * first data row is row 0, the second row is row 1 and so on. The
    * returned value is also zero-based.
    *
    * @param dataRow The number of a data row to look up.
    * @return The row number in the file of the given data row. <BR>
    *         -1 if the row was not found, i.e. the number is negative,
    *         not smaller than the number of data rows, or no file has
    *         been parsed yet.
    */
   public int dataRow2FileRow(int dataRow)
   {
      // The mapping table is allocated with one slot per FILE row, so the
      // slots from mDataRows and upwards are unused and still hold 0. The
      // lookup must therefore be limited by the number of data rows, not
      // by the length of the array, or unknown rows would map to row 0.
      if (mDataRow2FileRow == null ||
          dataRow < 0 ||
          dataRow >= mDataRows ||
          dataRow >= mDataRow2FileRow.length)
      {
         return -1;
      }
      return mDataRow2FileRow[dataRow];
   }
   

   /**
    * Scans the file header of the given file and constructs a FileHeader
    * based on the found information. Only the first line of the file is
    * read. The file is always closed before the method returns.
    *
    * @param fileName The file to scan.
    * @return A FileHeader with the information found in the file.
    * @exception InputDataFileException If anything is wrong with the
    *            given input file: it can not be found, can not be read,
    *            is empty or has an invalid header.
    */
   public static FileHeader scanFileHeader(String fileName)
      throws InputDataFileException
   {
      RandomAccessFile file = null;
      try
      {
         file = new RandomAccessFile(fileName, "r");

         // readLine() returns null if end of file is reached before any
         // byte has been read, i.e. if the file is empty
         String headerRow = file.readLine();
         if (headerRow == null)
         {
            throw new InputDataFileException("Given input file (" +
                                             fileName + ") is empty");
         }
         return parseHeader(headerRow);
      }
      catch (FileNotFoundException e)
      {
         throw new InputDataFileException("File not found: " + fileName);
      }
      catch (IOException e)
      {
         throw new InputDataFileException("Error accessing file: " + fileName);
      }
      finally
      {
         // Always release the file handle, also when the header is invalid
         if (file != null)
         {
            try
            {
               file.close();
            }
            catch (IOException ignored)
            {
               // The header has already been read (or a more relevant
               // exception is on its way), a failing close adds nothing
            }
         }
      }
   }
   

   //////////////////////////////////////////////////////////////////////
   //
   // Protected section
   //
   //////////////////////////////////////////////////////////////////////
   
   
   /**
    * Parses the file: reads all rows of the file into the file array and
    * then extracts the header data from the first row.
    *
    * @exception InputDataFileException If the file can not be read or if
    *            its header is invalid. Any other exception raised while
    *            parsing is converted to an InputDataFileException.
    */
   protected void parseInputFile()
      throws InputDataFileException
   {
      try
      {
         // Read the inputfile into the string array
         readInputFile();

         // parse the header information 
         getHeaderData(fileData()[0]);
      }
      catch (InputDataFileException e)
      {
         // Already of the right type. Pass it on untouched so the
         // original stack trace is not lost
         throw e;
      }
      catch (Exception e)
      {
         // Runtime exceptions often have no message, fall back on the
         // string representation of the exception so that the caller
         // always gets a description of what went wrong
         String message = e.getMessage();
         if (message == null)
         {
            message = e.toString();
         }
         throw new InputDataFileException(message);
      }
   }
   


   /**
    * Determines if the given row is a data row. A row is a data row if
    * it is not the first row of the file, is not empty and does not
    * start with the comment char.
    *
    * @param fileRow The row to examine
    * @param rowNumber The position of the row in the file. First row is
    * row number 0
    * @return true if the row is a data row<BR>
    *         false if the row is not a data row
    */
   protected boolean isDataRow(String fileRow, int rowNumber)
   {
      // The first row is the header, never data
      if (rowNumber == 0)
      {
         return false;
      }

      // An empty row holds no data
      if (fileRow.length() == 0)
      {
         return false;
      }

      // Everything but comment rows is data
      return fileRow.charAt(0) != commentChar();
   }   
   
   
   /**
    * Gets the name of the input file linked to this object.
    *
    * @return The name of the file this object reads data from.
    */
   protected String fileName()
   {
      return this.mFileName;
   }


   /**
    * Gets the char that marks a row in the input file as a comment.
    *
    * @return The comment char.
    */
   protected static char commentChar()
   {
      return AbstractFileParser.mCommentChar;
   }


   /**
    * Gets the total number of rows in the file, which is the length of
    * the file array.
    *
    * @return Number of rows in the file.
    */
   protected int fileRows()
   {
      final String[] rows = fileData();
      return rows.length;
   }


   /**
    * Gets the file array, which holds one string per row of the file.
    *
    * @return An array with the file rows.
    */
   protected String[] fileData()
   {
      return this.mFileData;
   }
   
   
   //////////////////////////////////////////////////////////////////////
   //
   // Private section
   //
   //////////////////////////////////////////////////////////////////////



   /**
    * Stores the name of the file this object reads data from, after
    * verifying that a name was actually given.
    *
    * @param fileName The name of the file
    * @exception AssertionException If no filename is given (null or
    *            empty string)
    */
   private void fileName(String fileName)
      throws AssertionException
   {
      final boolean nameGiven =
         (fileName != null) && (fileName.length() > 0);
      Assertion.assertMsg(nameGiven,
                          "No filename given, could not create the file parser");
      this.mFileName = fileName;
   }


   /**
    * Stores the name of the objecttype.
    *
    * @param name The name of the objecttype
    */
   private void objectTypeName(String name)
   {
      this.mObjectTypeName = name;
   }

   
   /**
    * Stores the name of the formattype.
    *
    * @param name The name of the formattype
    */
   private void formatTypeName(String name)
   {
      this.mFormatTypeName = name;
   }

   
   /**
    * Stores the version of the file format.
    *
    * @param version The file format version
    */
   private void formatVersion(int version)
   {
      this.mFormatVersion = version;
   }
   

   /**
    * Stores the field delimiter used in the file.
    *
    * @param delimiter The delimiter
    */
   private void fieldDelimiter(Character delimiter)
   {
      this.mFieldDelimiter = delimiter;
   }


   /**
    * Stores the number of data rows found in the input file.
    *
    * @param numberOfDataRows The number of data rows in the file
    */
   private void dataRows(int numberOfDataRows)
   {
      this.mDataRows = numberOfDataRows;
   }


   /**
    * If the last character on the given row is a CR, a string where the
    * CR is removed is returned. If there is no CR at the end, which
    * includes the case where the row is empty, the original string is
    * returned.
    *
    * @param fileRow A string from which a trailing CR should be removed.
    * @return The string with the trailing CR removed. Null if null was
    *         passed.
    */
   private String removeCR(String fileRow)
   {
      // endsWith() is false for an empty string, while charAt() on the
      // last index would throw StringIndexOutOfBoundsException
      if (fileRow != null && fileRow.endsWith("\r"))
      {
         return fileRow.substring(0, fileRow.length() - 1);
      }
      return fileRow;
   }
   
   /**
    * Reads from file!
    *
    * Opens the file linked to this object and reads it through
    * <I>readInputFile(FileInputStream, int)</I>, which stores each line
    * of the file in the file array and counts the data rows. The stream
    * is always closed before the method returns.
    *
    * @exception InputDataFileException If no file name is set, if the
    *            file can not be opened, if it is too large to be read
    *            into memory or if an error occurs while reading it.
    */
   private void readInputFile()
      throws InputDataFileException
   {
      // The empty constructor leaves the file name unset
      if (fileName() == null)
      {
         throw new InputDataFileException("No filename given, could not " +
                                          "read the input file");
      }

      File inputFile = new File(fileName());
      FileInputStream inputStream;
      try
      {
         inputStream = new FileInputStream(inputFile);
      }
      catch (FileNotFoundException e)
      {
         // Report the problem right away. Going on with a null stream
         // only results in a NullPointerException later on.
         Errors.log(e.getMessage());
         throw new InputDataFileException("File not found: " + fileName());
      }

      try
      {
         // The whole file is read into one array, whose size is an int.
         // Casting a larger length would silently truncate it.
         long fileLength = inputFile.length();
         if (fileLength > Integer.MAX_VALUE)
         {
            throw new InputDataFileException("Given input file (" +
                                             fileName() +
                                             ") is too large");
         }
         readInputFile(inputStream, (int) fileLength);
      }
      finally
      {
         // Release the file handle no matter how the reading went
         try
         {
            inputStream.close();
         }
         catch (IOException e)
         {
            // Nothing more to do than to make a note of it; the data has
            // already been read or a more relevant exception is on its way
            Errors.log(e.getMessage());
         }
      }
   }


   /**
    * Read from stream!
    *
    * Reads the input stream and stores each line in the "file" as a string in
    * the file array. Each line added to the array is 'cleaned' from LF/CR
    * characters. The method also counts the number of data rows found in
    * the file and maps each data row to its file row.
    *
    * @param inputStream The stream to read the file contents from. The
    *                    stream is not closed by this method.
    * @param length The number of bytes to read from the stream, normally
    *               the size of the file.
    * @exception InputDataFileException If the file is empty or if an
    *            error occurs when reading from the stream.
    */
   private void readInputFile(FileInputStream inputStream, int length)
      throws InputDataFileException
   {
      try 
      {
         // A file without contents can not be parsed. Check before the
         // buffer is allocated, a negative size can not be allocated.
         Assertion.assertMsg(length > 0, "Given input file (" +
                          fileName() + ") is empty");
         byte[] inputAsByte = new byte[length];

         // A single call to read() may deliver fewer bytes than asked
         // for, so keep reading until the buffer is full or the stream
         // ends.
         int bytesRead = 0;
         while (bytesRead < inputAsByte.length)
         {
            int count = inputStream.read(inputAsByte, bytesRead,
                                         inputAsByte.length - bytesRead);
            if (count < 0)
            {
               break;
            }
            bytesRead += count;
         }
         Assertion.assertMsg(bytesRead > 0, "Given input file (" +
                          fileName() + ") is empty");
         
         // Build a string from the bytes actually read and convert it to
         // Dos format. The bytes are decoded using the default charset
         // of the platform.
         String inputAsStr = new String(inputAsByte, 0, bytesRead);
         inputAsStr = convertToDOS(inputAsStr);

         // The contents of the file is now stored in inputAsStr. The
         // input should now be split into rows and each row should be
         // added to the file array (mFileData). To do this, we use a
         // tokenizer. Each token will represent one row in the file. A
         // row in the file ends with two chars, CR (13, \r) and LF (10,
         // \n). However, we can't use that pattern as separator in the
         // tokenizer, as this will remove any empty lines (which only
         // contains CR and LF). Hence we use just the LF as the separator
         // in the tokenizer. The CR remains on the line and is removed
         // before the row is added to the file array
         StringTokenizer tokenizer =
            new StringTokenizer(inputAsStr, "\n", false); 

         // Build the file array with the same size as there are tokens (eg
         // rows in the file). Also build the dataRow2FileArray with the
         // same size as there are rows.
         int rowCount = tokenizer.countTokens();
         fileData(new String[rowCount]);
         initDataRow2FileRow(rowCount);
         
         // Step through the tokens (eg all rows in the file) and add them
         // to the file array. Before each row is added, the CR at the end
         // of each row is removed. Also count the number of data rows, eg
         // rows that actually contains data. Header-, column-, comment-
         // and blanklines are not counted
         int rowIndex = 0;
         int dataRowCount = 0;
         String currentRow;
         while (tokenizer.hasMoreElements())
         {
            // Get the row, remove the CR and add it to the file array
            currentRow = removeCR(tokenizer.nextToken());
            fileData()[rowIndex] = currentRow;
            
            // If current row is a data row, map it against the file
            // row. Also increase the number of data rows 
            if (isDataRow(currentRow, rowIndex))
            {
               mapDataRow(dataRowCount, rowIndex);
               dataRowCount++;
            }

            rowIndex++;
         }
         // Store the number of data rows for future use
         dataRows(dataRowCount);
      }
      catch (FileNotFoundException e)
      {
         throw new InputDataFileException("File not found: " + fileName());
      }
      catch (IOException e)
      {
         throw new InputDataFileException("I/O exception when reading file "
                                          + fileName());
      }
      catch (AssertionException e)
      {
         throw new InputDataFileException(e.getMessage());
      }
   }


   /**
    * Extracts the header information from the passed string and stores
    * it in this object. The string is supposed to be a header row with
    * the following structure:
    *
    * <P><CODE>
    *    objecttypename/formattypename/version/delimiter<BR>
    * </CODE>
    *
    * <P>Objecttypename and formattypename should be strings<BR>
    * Version should be an integer<BR>
    * Delimiter should be one character<BR>
    *
    * @param headerRow The string to parse
    * @exception InputDataFileException If the headerRow has an unknown format
    */
   private void getHeaderData(String headerRow)
      throws InputDataFileException
   {
      // Let the shared header parser do the actual work
      final FileHeader parsedHeader = parseHeader(headerRow);

      // Copy each header field to the corresponding member
      final String objectType = parsedHeader.objectTypeName();
      objectTypeName(objectType);

      final String formatType = parsedHeader.formatTypeName();
      formatTypeName(formatType);

      formatVersion(parsedHeader.version());
      fieldDelimiter(parsedHeader.delimiter());
   }


   /**
    * Allocates the mapping table for data rows => file rows with one
    * slot per file row.
    *
    * @param rows The number of rows in the file
    */
   private void initDataRow2FileRow(int rows)
   {
      final int[] mappingTable = new int[rows];
      this.mDataRow2FileRow = mappingTable;
   }


   /**
    * Registers in the mapping table that the given data row is located
    * at the given file row. Values should be zero-based, eg row 1 is 0,
    * row 2 is 1 and so on. This applies to both data rows and file rows.
    *
    * @param dataRow The data row number
    * @param fileRow The file row number
    */
   private void mapDataRow(int dataRow, int fileRow)
   {
      this.mDataRow2FileRow[dataRow] = fileRow;
   }
   

   /**
    * Replaces the file array with the given array.
    *
    * @param data The array to initialise the file array with 
    */
   private void fileData(String[] data)
   {
      this.mFileData = data;
   }


   /**
    * Ensures the data file has the correct object- and format
    * type. Reads the header of the file and looks for a file type
    * definition with matching object- and format type in the vector passed
    * to the method. All definitions in the vector are examined, and the
    * names are compared without regard to case.
    *
    * @param fileTypeDefinitions A vector with valid file type definitions.
    * @exception InputDataFileException If no format type definition in the
    *            given vector matches the object- and format type name found
    *            in the header of the file, or if the header of the file
    *            can not be read.
    */
   private void validateFileType(Vector fileTypeDefinitions)
      throws InputDataFileException
   {
      // Get the header of the file
      FileHeader fileHeader = scanFileHeader(fileName());
      
      // Build an iterator and iterate the given definitions
      Iterator defIterator = fileTypeDefinitions.iterator();
      FileTypeDefinition definition;
      while (defIterator.hasNext())
      {
         // Compare the current definition to the header object. If object-
         // and format type name is equal, the file is of the correct
         // type. Means we can exit.
         definition = (FileTypeDefinition) defIterator.next();
         if (definition.objectTypeName().equalsIgnoreCase(fileHeader.objectTypeName()) &&
             definition.formatTypeName().equalsIgnoreCase(fileHeader.formatTypeName()))
         {
            return;
         }
      }

      // Every definition has been examined without finding a match, so
      // the file is of an incorrect type. This must not be decided inside
      // the loop, or only the first definition would ever be considered.
      throw new InputDataFileException("Input file contains unknown " +
                                       "object- and/or format type name: " +
                                       fileHeader.objectTypeName() + " " +
                                       fileHeader.formatTypeName());
   }


   /**
    * Converts the line breaks of a string to DOS line breaks (carriage
    * return + newline). A string that already contains at least one DOS
    * line break is assumed to be in DOS format and is returned as it is.
    * Otherwise every Mac line break (carriage return) and every Unix line
    * break (newline) is converted to a DOS line break.
    *
    * @param originalString The string to convert.
    * @return The converted string with all line breaks converted to DOS
    *         line breaks.
    */
   private String convertToDOS(String originalString)
   {
      final char newline = '\n';          // Unix line break, 10
      final char carriageReturn = '\r';   // Mac line break, 13

      // Any Dos line break means the string is treated as Dos format
      // already and handed back untouched
      if (originalString.indexOf("\r\n") >= 0)
      {
         return originalString;
      }

      // Turn Mac line breaks into Unix line breaks, leaving only one kind
      // of line break to deal with
      final String unixString =
         originalString.replace(carriageReturn, newline);

      // Without any line breaks there is nothing to convert
      if (unixString.indexOf(newline) < 0)
      {
         return unixString;
      }

      // Copy the string char by char, putting a carriage return in front
      // of each newline. Together they make up a Dos line break.
      final StringBuffer dosString = new StringBuffer();
      final int charCount = unixString.length();
      for (int pos = 0; pos < charCount; pos++)
      {
         final char current = unixString.charAt(pos);
         if (current == newline)
         {
            dosString.append(carriageReturn);
         }
         dosString.append(current);
      }
      return dosString.toString();
   }

   /**
    * Parses header information from a string and builds a FileHeader
    * object based on the parsed information. The string is expected to
    * have the following layout:
    *
    * <P><CODE>
    *    objecttypename/formattypename/version/delimiter<BR>
    * </CODE>
    *
    * @param headerRow The string to parse. Should be a valid header string
    * @return A FileHeader object containing the information parsed from
    *         the string.
    * @exception InputDataFileException If anything is wrong with the
    *            given string: it is null or empty, it is a comment, it
    *            does not have exactly four fields or the version is not
    *            an integer.
    */
   public static FileHeader parseHeader(String headerRow)
      throws InputDataFileException
   {
      // A missing or empty row can not be examined any further. Without
      // this check the call to charAt() below would fail with a runtime
      // exception instead of the documented InputDataFileException.
      if (headerRow == null || headerRow.length() == 0)
      {
         throw new InputDataFileException(
            "File header is missing. First line must " +
            "have the following layout: " +
            "objecttypename/formattypename/version/delimiter");
      }

      try
      {
         // Ensure there is not a comment on the line
         Assertion.assertMsg(headerRow.charAt(0) != commentChar(),
                          "First line should not be a comment");
         
         // Build a tokenizer to used when parsing the fields from the row
         StringTokenizer aTokenizer =
            new StringTokenizer(headerRow, "/", false);

         // Ensure there are four fields in the header
         Assertion.assertMsg(aTokenizer.countTokens() == 4,
                          "File header has wrong format. First line must " +
                          "have the following layout: " +
                          "objecttypename/formattypename/version/delimiter");
         
         // Parse the fields from the header
         String objectTypeName = aTokenizer.nextToken();
         String formatTypeName = aTokenizer.nextToken();
         int formatVersion = Integer.parseInt(aTokenizer.nextToken());

         // Read the delimiter as a string. We will accept a delimiter with
         // more than one char, but only the first char will be used, the
         // rest of the string will be ignored. The tokenizer never returns
         // an empty token, so there is always a first char.
         String delimiter = aTokenizer.nextToken();

         // Construct a new header object, based on the data in the string
         // and return it.
         FileHeader header = new FileHeader(objectTypeName, formatTypeName,
                                            formatVersion,
                                            delimiter.charAt(0));
         return header;
      }
      catch (NumberFormatException e)
      {
         throw new InputDataFileException("File version has wrong format, "
                                          + "should be an integer.");
      }
      catch (AssertionException e)
      {
         throw new InputDataFileException(e.getMessage());
      }
   }

   
}