CSVLoader.java example

Explorer
TimeSeriesClassification-master
- TimeSeriesClassification
  - src
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CSVLoader.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;

/**
 <!-- globalinfo-start --> 
 * Reads a source that is in comma separated format
 * (the default). One can also change the column separator from comma to tab or
 * another character, specify string enclosures, specify whether a header row is
 * present or not and specify which attributes are to be forced to be nominal or
 * date. Can operate in batch or incremental mode. In batch mode, a buffer is
 * used to process a fixed number of rows in memory at any one time and the data
 * is dumped to a temporary file. This allows the legal values for nominal
 * attributes to be automatically determined. The final ARFF file is produced in
 * a second pass over the temporary file using the structure determined on the
 * first pass. In incremental mode, the first buffer full of rows is used to
 * determine the structure automatically. Following this all rows are read and
 * output incrementally. An error will occur if a row containing nominal values
 * not seen in the initial buffer is encountered. In this case, the size of the
 * initial buffer can be increased, or the user can explicitly provide the legal
 * values of all nominal attributes using the -L (setNominalLabelSpecs) option.
 * *
 * <p/>
 <!-- globalinfo-end -->
 * 
 <!-- options-start --> 
 * Valid options are:
 * <p/>
 * 
 * <pre>
 * -H
 *  No header row present in the data.
 * </pre>
 * 
 * <pre>
 * -N <range>
 *  The range of attributes to force type to be NOMINAL.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * </pre>
 * 
 * <pre>
 * -L <nominal label spec>
 *  Optional specification of legal labels for nominal
 *  attributes. May be specified multiple times.
 *  Batch mode can determine this
 *  automatically (and so can incremental mode if
 *  the first in memory buffer load of instances
 *  contains an example of each legal value). The
 *  spec contains two parts separated by a ":". The
 *  first part can be a range of attribute indexes or
 *  a comma-separated list of attribute names; the
 *  second part is a comma-separated list of labels. E.g
 *  "1,2,4-6:red,green,blue" or "att1,att2:red,green,blue"
 * </pre>
 * 
 * <pre>
 * -S <range>
 *  The range of attribute to force type to be STRING.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * </pre>
 * 
 * <pre>
 * -D <range>
 *  The range of attribute to force type to be DATE.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * </pre>
 * 
 * <pre>
 * -format <date format>
 *  The date formatting string to use to parse date values.
 *  (default: "yyyy-MM-dd'T'HH:mm:ss")
 * </pre>
 * 
 * <pre>
 * -M <str>
 *  The string representing a missing value.
 *  (default: ?)
 * </pre>
 * 
 * <pre>
 * -F <separator>
 *  The field separator to be used.
 *  '\t' can be used as well.
 *  (default: ',')
 * </pre>
 * 
 * <pre>
 * -E <enclosures>
 *  The enclosure character(s) to use for strings.
 *  Specify as a comma separated list (e.g. ",') (default: ",')
 * </pre>
 * 
 * <pre>
 * -B <num>
 *  The size of the in memory buffer (in rows).
 *  (default: 100)
 * </pre>
 * 
 <!-- options-end -->
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 9858 $
 */
public class CSVLoader extends AbstractFileLoader implements BatchConverter,
    IncrementalConverter, OptionHandler {

  /** For serialization */
  private static final long serialVersionUID = -1300595850715808438L;

  /** the file extension. */
  public static String FILE_EXTENSION = ".csv";

  /** The reader for the data. */
  protected transient BufferedReader m_sourceReader;

  /** Tokenizer for the data. */
  protected transient StreamTokenizer m_st;

  /**
   * Temporary file that getDataSet() reads back through an ArffReader once
   * the first pass over the source has finished.
   */
  protected transient File m_tempFile;

  /**
   * Writer that is flushed and closed by getDataSet() before the temporary
   * file is read, and closed by getNextInstance() in incremental mode.
   * NOTE(review): presumably writes to m_tempFile; it is opened outside this
   * part of the file - confirm.
   */
  protected transient PrintWriter m_dataDumper;

  /** the field separator. */
  protected String m_FieldSeparator = ",";

  /** The placeholder for missing values. */
  protected String m_MissingValue = "?";

  /** The range of attributes to force to type nominal. */
  protected Range m_NominalAttributes = new Range();

  /** The user-supplied legal nominal values - each entry in the list is a spec */
  protected List<String> m_nominalLabelSpecs = new ArrayList<String>();

  /** The range of attributes to force to type string. */
  protected Range m_StringAttributes = new Range();

  /** The range of attributes to force to type date */
  protected Range m_dateAttributes = new Range();

  /** The formatting string to use to parse dates */
  protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss";

  /** The formatter to use on dates */
  protected SimpleDateFormat m_formatter;

  /** whether the csv file contains a header row with att names */
  protected boolean m_noHeaderRow = false;

  /** enclosure character(s) to use for strings */
  protected String m_Enclosures = "\",\'";

  /** The in memory row buffer */
  protected List<String> m_rowBuffer;

  /** The maximum number of rows to hold in memory at any one time */
  protected int m_bufferSize = 100;

  /** Lookup for nominal values */
  protected Map<Integer, LinkedHashSet<String>> m_nominalVals;

  /** Reader used to process and output data incrementally */
  protected ArffReader m_incrementalReader;

  /**
   * Returns a string describing this loader.
   * 
   * @return a description of the loader suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // Every fragment ends with a blank so that joining the pieces never runs
    // two words together (previously "aheader" and "beforced").
    return "Reads a source that is in comma separated format (the default). "
        + "One can also change the column separator from comma to tab or "
        + "another character, specify string enclosures, specify whether a "
        + "header row is present or not and specify which attributes are to be "
        + "forced to be nominal or date. Can operate in batch or incremental mode. "
        + "In batch mode, a buffer is used to process a fixed number of rows in "
        + "memory at any one time and the data is dumped to a temporary file. This "
        + "allows the legal values for nominal attributes to be automatically "
        + "determined. The final ARFF file is produced in a second pass over the "
        + "temporary file using the structure determined on the first pass. In "
        + "incremental mode, the first buffer full of rows is used to determine "
        + "the structure automatically. Following this all rows are read and output "
        + "incrementally. An error will occur if a row containing nominal values not "
        + "seen in the initial buffer is encountered. In this case, the size of the "
        + "initial buffer can be increased, or the user can explicitly provide the "
        + "legal values of all nominal attributes using the -L (setNominalLabelSpecs) "
        + "option.";
  }

  /**
   * default constructor.
   */
  public CSVLoader() {
    // A freshly built loader has not handed out any instances yet, so the
    // retrieval mode starts out as NONE.
    this.setRetrieval(NONE);
  }

  @Override
  public String getFileExtension() {
    // The extension lives in a single class-level field.
    return CSVLoader.FILE_EXTENSION;
  }

  @Override
  public String[] getFileExtensions() {
    // Only one extension is handled, so it is wrapped in a one-element array.
    final String[] extensions = new String[1];
    extensions[0] = getFileExtension();
    return extensions;
  }

  @Override
  public String getFileDescription() {
    // Human-readable label for the kind of file this loader handles.
    final String description = "CSV data files";
    return description;
  }

  @Override
  public String getRevision() {
    // The former literal "$Revisoon: $" misspelled the revision keyword and
    // carried no number. The number used here is the one recorded in the
    // class-level @version tag; RevisionUtils is fully qualified so that no
    // additional import is required.
    return weka.core.RevisionUtils.extract("$Revision: 9858 $");
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String noHeaderRowPresentTipText() {
    // Tool tip shown for the "no header row" property.
    final String tip = "First row of data does not contain attribute names";
    return tip;
  }

  /**
   * Set whether there is no header row in the data.
   * 
   * @param b true if there is no header row in the data
   */
  public void setNoHeaderRowPresent(boolean b) {
    // Plain setter; the flag is stored as given.
    this.m_noHeaderRow = b;
  }

  /**
   * Get whether there is no header row in the data.
   * 
   * @return true if there is no header row in the data
   */
  public boolean getNoHeaderRowPresent() {
    // Plain getter for the "no header row" flag.
    return this.m_noHeaderRow;
  }

  /**
   * Sets the placeholder for missing values.
   * 
   * @param value the placeholder
   */
  public void setMissingValue(String value) {
    // Stored as given; no validation is applied here.
    this.m_MissingValue = value;
  }

  /**
   * Returns the current placeholder for missing values.
   * 
   * @return the placeholder
   */
  public String getMissingValue() {
    // Plain getter for the missing-value placeholder.
    return this.m_MissingValue;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String missingValueTipText() {
    // Tool tip shown for the missing-value property.
    final String tip = "The placeholder for missing values, default is '?'.";
    return tip;
  }

  /**
   * Sets the attribute range to be forced to type string.
   * 
   * @param value the range
   */
  public void setStringAttributes(String value) {
    // The range object is reused; only its range expression is replaced.
    final Range stringRange = this.m_StringAttributes;
    stringRange.setRanges(value);
  }

  /**
   * Returns the current attribute range to be forced to type string.
   * 
   * @return the range
   */
  public String getStringAttributes() {
    // Reports the range expression held by the STRING range object.
    final Range stringRange = this.m_StringAttributes;
    return stringRange.getRanges();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String stringAttributesTipText() {
    // Tool tip: what the property does, followed by sample range expressions.
    final String description = "The range of attributes to force to be of type STRING, example ";
    final String examples = "ranges: 'first-last', '1,4,7-14,50-last'.";
    return description + examples;
  }

  /**
   * Sets the attribute range to be forced to type nominal.
   * 
   * @param value the range
   */
  public void setNominalAttributes(String value) {
    // The range object is reused; only its range expression is replaced.
    final Range nominalRange = this.m_NominalAttributes;
    nominalRange.setRanges(value);
  }

  /**
   * Returns the current attribute range to be forced to type nominal.
   * 
   * @return the range
   */
  public String getNominalAttributes() {
    // Reports the range expression held by the NOMINAL range object.
    final Range nominalRange = this.m_NominalAttributes;
    return nominalRange.getRanges();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String nominalAttributesTipText() {
    // Tool tip: what the property does, followed by sample range expressions.
    final String description = "The range of attributes to force to be of type NOMINAL, example ";
    final String examples = "ranges: 'first-last', '1,4,7-14,50-last'.";
    return description + examples;
  }

  /**
   * Set the format to use for parsing date values.
   * 
   * @param value the format to use.
   */
  public void setDateFormat(String value) {
    // Drop the formatter held in m_formatter along with the old pattern so
    // the two fields never disagree.
    this.m_formatter = null;
    this.m_dateFormat = value;
  }

  /**
   * Get the format to use for parsing date values.
   * 
   * @return the format to use for parsing date values.
   * 
   */
  public String getDateFormat() {
    // Plain getter for the date pattern string.
    return this.m_dateFormat;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dateFormatTipText() {
    // Tool tip shown for the date-format property.
    final String tip = "The format to use for parsing date values.";
    return tip;
  }

  /**
   * Set the attribute range to be forced to type date.
   * 
   * @param value the range
   */
  public void setDateAttributes(String value) {
    // The range object is reused; only its range expression is replaced.
    final Range dateRange = this.m_dateAttributes;
    dateRange.setRanges(value);
  }

  /**
   * Returns the current attribute range to be forced to type date.
   * 
   * @return the range.
   */
  public String getDateAttributes() {
    // Reports the range expression held by the DATE range object.
    final Range dateRange = this.m_dateAttributes;
    return dateRange.getRanges();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dateAttributesTipText() {
    // The sample range is written without an embedded blank so that it reads
    // the same as the examples in the STRING and NOMINAL tip texts.
    return "The range of attributes to force to type DATE, example "
        + "ranges: 'first-last', '1,4,7-14,50-last'.";
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String enclosureCharactersTipText() {
    // Tool tip shown for the enclosure-characters property.
    final String tip = "The characters to use as enclosures for strings. E.g. \",'";
    return tip;
  }

  /**
   * Set the character(s) to use/recognize as string enclosures
   * 
   * @param enclosure the characters to use as string enclosures
   */
  public void setEnclosureCharacters(String enclosure) {
    // Stored as given; the string is not parsed here.
    this.m_Enclosures = enclosure;
  }

  /**
   * Get the character(s) to use/recognize as string enclosures
   * 
   * @return the characters to use as string enclosures
   */
  public String getEnclosureCharacters() {
    // Plain getter for the enclosure specification string.
    return this.m_Enclosures;
  }

  /**
   * Sets the character used as column separator.
   * 
   * @param value the character to use
   */
  public void setFieldSeparator(String value) {
    // Convert back-quoted sequences in the input before checking the length.
    m_FieldSeparator = Utils.unbackQuoteChars(value);
    if (m_FieldSeparator.length() != 1) {
      // Anything that is not exactly one character falls back to a comma.
      m_FieldSeparator = ",";
      // "\\t" keeps the two-character escape visible in the message; the
      // former "\t" printed a raw TAB character instead.
      System.err
          .println("Field separator can only be a single character (exception being '\\t'), "
              + "defaulting back to '" + m_FieldSeparator + "'!");
    }
  }

  /**
   * Returns the character used as column separator.
   * 
   * @return the character to use
   */
  public String getFieldSeparator() {
    // The stored separator is passed through Utils.backQuoteChars before it
    // is handed out.
    final String rawSeparator = this.m_FieldSeparator;
    return Utils.backQuoteChars(rawSeparator);
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String fieldSeparatorTipText() {
    // Tool tip shown for the field-separator property.
    final String tip = "The character to use as separator for the columns/fields (use '\\t' for TAB).";
    return tip;
  }

  /**
   * Set the buffer size to use - i.e. the number of rows to load and process in
   * memory at any one time
   * 
   * @param buff the buffer size (number of rows)
   */
  public void setBufferSize(int buff) {
    // No range check happens here; setOptions() rejects values below 1
    // before it calls this setter.
    this.m_bufferSize = buff;
  }

  /**
   * Get the buffer size to use - i.e. the number of rows to load and process in
   * memory at any one time
   * 
   * @return the buffer size (number of rows)
   */
  public int getBufferSize() {
    // Plain getter for the row-buffer capacity.
    return this.m_bufferSize;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String bufferSizeTipText() {
    // Tool tip shown for the buffer-size property.
    final String tip = "The number of rows to process in memory at any one time.";
    return tip;
  }

  /**
   * Set label specifications for nominal attributes.
   * 
   * @param specs an array of label specifications
   */
  public void setNominalLabelSpecs(Object[] specs) {
    // Replace, rather than extend, whatever specs were held before.
    m_nominalLabelSpecs.clear();
    for (int i = 0; i < specs.length; i++) {
      // Each entry is stored in its string form.
      final String specText = specs[i].toString();
      m_nominalLabelSpecs.add(specText);
    }
  }

  /**
   * Get label specifications for nominal attributes.
   * 
   * @return an array of label specifications
   */
  public Object[] getNominalLabelSpecs() {
    // Copy the specs into a String[] so callers get an independent array.
    final String[] specs = new String[m_nominalLabelSpecs.size()];
    return m_nominalLabelSpecs.toArray(specs);
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String nominalLabelSpecsTipText() {
    // Wording fix: "list off attruibute names" -> "list of attribute names".
    return "Optional specification of legal labels for nominal "
        + "attributes. May be specified multiple times. "
        + "Batch mode can determine this "
        + "automatically (and so can incremental mode if "
        + "the first in memory buffer load of instances "
        + "contains an example of each legal value). The "
        + "spec contains two parts separated by a \":\". The "
        + "first part can be a range of attribute indexes or "
        + "a comma-separated list of attribute names; the "
        + "second part is a comma-separated list of labels. E.g "
        + "\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,blue\"";
  }

  @Override
  public Enumeration listOptions() {
    // Builds the command-line option descriptions for this loader: one
    // Option per switch, carrying help text, flag name, argument count and
    // synopsis. The raw Enumeration return type is kept as originally
    // declared so the @Override stays valid.
    Vector<Option> result = new Vector<Option>();

    result
        .add(new Option("\tNo header row present in the data.", "H", 0, "-H"));

    result.add(new Option(
        "\tThe range of attributes to force type to be NOMINAL.\n"
            + "\t'first' and 'last' are accepted as well.\n"
            + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
            + "\t(default: -none-)", "N", 1, "-N <range>"));

    // Wording fix: "list off attruibute names" -> "list of attribute names".
    result.add(new Option(
        "\tOptional specification of legal labels for nominal\n"
            + "\tattributes. May be specified multiple times.\n"
            + "\tBatch mode can determine this\n"
            + "\tautomatically (and so can incremental mode if\n"
            + "\tthe first in memory buffer load of instances\n"
            + "\tcontains an example of each legal value). The\n"
            + "\tspec contains two parts separated by a \":\". The\n"
            + "\tfirst part can be a range of attribute indexes or\n"
            + "\ta comma-separated list of attribute names; the\n"
            + "\tsecond part is a comma-separated list of labels. E.g\n"
            + "\t\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,"
            + "blue\"", "L", 1, "-L <nominal label spec>"));

    result.add(new Option(
        "\tThe range of attribute to force type to be STRING.\n"
            + "\t'first' and 'last' are accepted as well.\n"
            + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
            + "\t(default: -none-)", "S", 1, "-S <range>"));

    result.add(new Option(
        "\tThe range of attribute to force type to be DATE.\n"
            + "\t'first' and 'last' are accepted as well.\n"
            + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
            + "\t(default: -none-)", "D", 1, "-D <range>"));

    result.add(new Option(
        "\tThe date formatting string to use to parse date values.\n"
            + "\t(default: \"yyyy-MM-dd'T'HH:mm:ss\")", "format", 1,
        "-format <date format>"));

    result.add(new Option("\tThe string representing a missing value.\n"
        + "\t(default: ?)", "M", 1, "-M <str>"));

    result.add(new Option("\tThe field separator to be used.\n"
        + "\t'\\t' can be used as well.\n" + "\t(default: ',')", "F", 1,
        "-F <separator>"));

    // The example now closes its parenthesis before the default is listed.
    result.add(new Option(
        "\tThe enclosure character(s) to use for strings.\n"
            + "\tSpecify as a comma separated list (e.g. \",')"
            + " (default: \",')", "E", 1, "-E <enclosures>"));

    result.add(new Option("\tThe size of the in memory buffer (in rows).\n"
        + "\t(default: 100)", "B", 1, "-B <num>"));

    return result.elements();
  }

  @Override
  public void setOptions(String[] options) throws Exception {
    // Applies the command-line style options to this loader. -N, -S, -M and
    // -F fall back to their defaults when absent; -D, -format, -B and -E are
    // only applied when present. Throws Exception when -B is smaller than 1
    // (a non-numeric -B surfaces as NumberFormatException from parseInt).
    String tmpStr;

    setNoHeaderRowPresent(Utils.getFlag('H', options));

    // An empty value resets the range, so the value can be passed straight on.
    setNominalAttributes(Utils.getOption('N', options));
    setStringAttributes(Utils.getOption('S', options));

    tmpStr = Utils.getOption('D', options);
    if (tmpStr.length() > 0) {
      setDateAttributes(tmpStr);
    }
    tmpStr = Utils.getOption("format", options);
    if (tmpStr.length() > 0) {
      setDateFormat(tmpStr);
    }

    tmpStr = Utils.getOption('M', options);
    setMissingValue(tmpStr.length() != 0 ? tmpStr : "?");

    tmpStr = Utils.getOption('F', options);
    setFieldSeparator(tmpStr.length() != 0 ? tmpStr : ",");

    tmpStr = Utils.getOption('B', options);
    if (tmpStr.length() > 0) {
      int buff = Integer.parseInt(tmpStr);
      if (buff < 1) {
        throw new Exception("Buffer size must be >= 1");
      }
      setBufferSize(buff);
    }

    tmpStr = Utils.getOption("E", options);
    if (tmpStr.length() > 0) {
      setEnclosureCharacters(tmpStr);
    }

    // Start from an empty list: without this, every call appended its -L
    // specs to those of earlier calls, so feeding getOptions() back into
    // setOptions() on the same object duplicated every spec.
    m_nominalLabelSpecs.clear();
    // -L may be given several times; keep pulling it until none is left.
    while (true) {
      tmpStr = Utils.getOption('L', options);
      if (tmpStr.length() == 0) {
        break;
      }

      m_nominalLabelSpecs.add(tmpStr);
    }
  }

  @Override
  public String[] getOptions() {
    // Serialises the current configuration into the same switches that
    // setOptions() understands.
    List<String> result = new ArrayList<String>();

    // -H was previously never emitted, so the "no header row" setting was
    // lost whenever the options were read out and applied again.
    if (getNoHeaderRowPresent()) {
      result.add("-H");
    }

    if (getNominalAttributes().length() > 0) {
      result.add("-N");
      result.add(getNominalAttributes());
    }

    if (getStringAttributes().length() > 0) {
      result.add("-S");
      result.add(getStringAttributes());
    }

    // The date format is only reported together with a date range.
    if (getDateAttributes().length() > 0) {
      result.add("-D");
      result.add(getDateAttributes());
      result.add("-format");
      result.add(getDateFormat());
    }

    result.add("-M");
    result.add(getMissingValue());

    result.add("-B");
    result.add("" + getBufferSize());

    result.add("-E");
    result.add(getEnclosureCharacters());

    result.add("-F");
    result.add(getFieldSeparator());

    // One -L switch per stored label specification.
    for (String spec : m_nominalLabelSpecs) {
      result.add("-L");
      result.add(spec);
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Number of buffered rows still to be served by getNextInstance() through
   * the in-memory ArffReader. It is set from the row buffer size, decremented
   * on each call while positive, and set to -1 once reading has switched to
   * tokenizing the source reader directly.
   */
  private int m_numBufferedRows;

  @Override
  public Instance getNextInstance(Instances structure) throws IOException {
    /*
     * Incremental read path. Rows held in m_rowBuffer are served first via an
     * ArffReader over an in-memory copy; once m_numBufferedRows reaches 0 the
     * method tokenizes m_sourceReader directly. Returns null (after closing
     * the source reader) when no further instance could be read. Throws
     * IOException when batch retrieval has already been used, or when the
     * row buffer has not been created yet.
     */
    m_structure = structure;
    if (getRetrieval() == BATCH) {
      throw new IOException(
          "Cannot mix getting instances in both incremental and batch modes");
    }
    setRetrieval(INCREMENTAL);

    if (m_dataDumper != null) {
      // close the unneeded temp file writer (if necessary)
      m_dataDumper.close();
      m_dataDumper = null;
    }

    if (m_rowBuffer == null) {
      // The buffer is created while the header is read; without it the
      // size() call below used to fail with a bare NullPointerException.
      throw new IOException(
          "Structure has not been read yet - call getStructure() first");
    }

    if (m_rowBuffer.size() > 0 && m_incrementalReader == null) {
      // First call: move the buffered rows into an in-memory reader.
      StringBuilder tempB = new StringBuilder();
      for (String r : m_rowBuffer) {
        tempB.append(r).append("\n");
      }
      m_numBufferedRows = m_rowBuffer.size();
      Reader batchReader = new BufferedReader(
          new StringReader(tempB.toString()));
      m_incrementalReader = new ArffReader(batchReader, m_structure, 0, 0);
      m_rowBuffer.clear();
    }

    if (m_numBufferedRows == 0) {
      // Buffered rows are used up: mark the switch with -1 so this branch
      // runs only once, and read the remaining rows from the source itself.
      m_numBufferedRows = -1;

      m_st = new StreamTokenizer(m_sourceReader);
      initTokenizer(m_st);
      m_st.ordinaryChar(m_FieldSeparator.charAt(0));
      m_incrementalReader = null;
    }

    Instance current = null;
    if (m_sourceReader != null) {
      if (m_incrementalReader != null) {
        current = m_incrementalReader.readInstance(m_structure);
      } else if (getInstance(m_st) != null) {
        current = makeInstance();
      }
      if (m_numBufferedRows > 0) {
        m_numBufferedRows--;
      }
    }

    if ((m_sourceReader != null) && (current == null)) {
      // Nothing more to read: release the stream. A failure to close is
      // reported but deliberately not propagated (best effort).
      try {
        m_sourceReader.close();
        m_sourceReader = null;
      } catch (Exception ex) {
        ex.printStackTrace();
      }
    }

    return current;
  }

  @Override
  public Instances getDataSet() throws IOException {
    /*
     * Batch read path: consumes the whole source via readData(true), then
     * builds the final structure and reads the temporary file back through
     * an ArffReader. Throws IOException when no source is set or when
     * incremental retrieval has already been used.
     */
    if (m_sourceReader == null) {
      throw new IOException("No source has been specified");
    }

    if (getRetrieval() == INCREMENTAL) {
      throw new IOException(
          "Cannot mix getting instances in both incremental and batch modes");
    }
    setRetrieval(BATCH);

    if (m_structure == null) {
      getStructure();
    }

    // First pass: keep reading until readData reports there is nothing left.
    while (readData(true)) {
      // all work happens inside readData
    }

    m_dataDumper.flush();
    m_dataDumper.close();

    // make final structure
    makeStructure();

    // Second pass over the temporary file. The reader is closed in a finally
    // block so it is released even when parsing fails part-way through.
    Reader sr = new BufferedReader(new FileReader(m_tempFile));
    try {
      ArffReader initialArff = new ArffReader(sr, m_structure, 0);
      return initialArff.getData();
    } finally {
      sr.close();
    }
  }

  /**
   * Reads rows from the source into the in-memory row buffer until the buffer
   * holds exactly {@code m_bufferSize} rows or no further row is available.
   * In batch retrieval mode the buffer is emptied again once it is full.
   * 
   * @param dump true if each row should also be passed to dumpRow
   * @return true if the buffer filled up, false if the source ran out of rows
   *         first
   * @throws IOException if no source has been specified or reading fails
   */
  private boolean readData(boolean dump) throws IOException {
    if (m_sourceReader == null) {
      throw new IOException("No source has been specified");
    }

    while (true) {
      String checked = getInstance(m_st);
      if (checked == null) {
        // no more rows
        return false;
      }

      if (dump) {
        dumpRow(checked);
      }
      m_rowBuffer.add(checked);

      if (m_rowBuffer.size() == m_bufferSize) {
        if (getRetrieval() == BATCH) {
          // batch mode does not keep the rows once the buffer has filled
          m_rowBuffer.clear();
        }
        return true;
      }
    }
  }

  /**
   * Resets the Loader object and sets the source of the data set to be the
   * supplied Stream object.
   * 
   * @param input the input stream
   * @exception IOException if an error occurs
   */
  @Override
  public void setSource(InputStream input) throws IOException {
    // Forget everything that was tied to a previous source.
    m_File = null;
    m_sourceFile = null;
    m_structure = null;

    // Wrap the raw stream: bytes -> characters -> buffered line access.
    final Reader decoded = new InputStreamReader(input);
    m_sourceReader = new BufferedReader(decoded);
  }

  /**
   * Resets the Loader object and sets the source of the data set to be the
   * supplied File object.
   * 
   * @param file the source file.
   * @exception IOException if an error occurs
   */
  @Override
  public void setSource(File file) throws IOException {
    // All of the work is delegated to the superclass implementation.
    super.setSource(file);
  }

  @Override
  public Instances getStructure() throws IOException {
    // A source must be in place before any structure can be reported.
    if (m_sourceReader == null) {
      throw new IOException("No source has been specified");
    }

    // Already determined: hand it out without touching the source again.
    if (m_structure != null) {
      return m_structure;
    }

    // Otherwise read the header first, which fills in m_structure.
    readHeader();
    return m_structure;
  }

  /**
   * Converts the values held in {@code m_current} into a dense instance that
   * is attached to the current structure.
   * 
   * @return the new instance, or null if there is no current row
   * @throws IOException if a date or numeric value cannot be parsed, or a
   *           nominal value is not among the attribute's declared labels
   */
  protected Instance makeInstance() throws IOException {

    if (m_current == null) {
      return null;
    }

    final int numAtts = m_structure.numAttributes();
    double[] vals = new double[numAtts];
    for (int i = 0; i < numAtts; i++) {
      final Attribute att = m_structure.attribute(i);
      final String raw = m_current.get(i).toString();

      if (raw.equals("?")) {
        vals[i] = Utils.missingValue();
      } else if (att.isString()) {
        // the single string value is stored on the attribute itself
        vals[i] = 0;
        att.setStringValue(Utils.unquote(raw));
      } else if (att.isDate()) {
        // NOTE(review): the date test is kept ahead of the numeric test as in
        // the original ordering - presumably date attributes also report as
        // numeric; confirm.
        String format = att.getDateFormat();
        SimpleDateFormat sdf = new SimpleDateFormat(format);
        try {
          vals[i] = sdf.parse(raw).getTime();
        } catch (ParseException e) {
          // keep the parse failure as the cause for easier diagnosis
          throw new IOException("Unable to parse date value " + raw
              + " using date format " + format + " for date attribute "
              + att, e);
        }
      } else if (att.isNumeric()) {
        try {
          vals[i] = Double.parseDouble(raw);
        } catch (NumberFormatException ex) {
          throw new IOException("Was expecting a number for attribute "
              + att.name() + " but read " + raw + " instead.", ex);
        }
      } else {
        // nominal
        double index = att.indexOfValue(Utils.unquote(raw));
        if (index < 0) {
          // a blank now separates the value from "for attribute"
          throw new IOException("Read unknown nominal value " + raw
              + " for attribute " + att.name());
        }
        vals[i] = index;
      }
    }

    DenseInstance inst = new DenseInstance(1.0, vals);
    inst.setDataset(m_structure);

    return inst;
  }

  /**
   * Rebuilds {@code m_structure} so that every attribute carries the type
   * recorded for it in {@code m_types}; attribute names and the relation name
   * are carried over from the existing structure.
   */
  protected void makeStructure() {
    ArrayList<Attribute> finalAtts = new ArrayList<Attribute>();

    for (int idx = 0; idx < m_types.length; idx++) {
      final String attName = m_structure.attribute(idx).name();
      final TYPE type = m_types[idx];
      final Attribute att;

      if (type == TYPE.NUMERIC) {
        att = new Attribute(attName);
      } else if (type == TYPE.NOMINAL) {
        LinkedHashSet<String> seen = m_nominalVals.get(idx);
        ArrayList<String> labels = new ArrayList<String>();
        if (seen.size() > 0) {
          // keep the labels in the order in which they were recorded
          labels.addAll(seen);
        } else {
          // no label recorded: fall back to a single placeholder label
          labels.add("*unknown*");
        }
        att = new Attribute(attName, labels);
      } else if (type == TYPE.STRING || type == TYPE.UNDETERMINED) {
        // undetermined columns are treated like string columns
        att = new Attribute(attName, (java.util.List<String>) null);
      } else {
        // every remaining type becomes a date attribute
        att = new Attribute(attName, m_dateFormat);
      }

      finalAtts.add(att);
    }

    m_structure = new Instances(m_structure.relationName(), finalAtts, 0);
  }

  private void readHeader() throws IOException {
    m_incrementalReader = null;
    m_current = new ArrayList<Object>();
    openTempFiles();

    m_rowBuffer = new ArrayList<String>();

    String firstRow = m_sourceReader.readLine();
    if (firstRow == null) {
      throw new IOException("No data in the file!");
    }
    if (m_noHeaderRow) {
      m_rowBuffer.add(firstRow);
    }

    ArrayList<Attribute> attribNames = new ArrayList<Attribute>();

    // now tokenize to determine attribute names (or create att names if
    // no header row
    StringReader sr = new StringReader(firstRow + "\n");
    // System.out.print(firstRow + "\n");
    m_st = new StreamTokenizer(sr);
    initTokenizer(m_st);

    m_st.ordinaryChar(m_FieldSeparator.charAt(0));

    int attNum = 1;
    StreamTokenizerUtils.getFirstToken(m_st);
    if (m_st.ttype == StreamTokenizer.TT_EOF) {
      StreamTokenizerUtils.errms(m_st, "premature end of file");
    }
    boolean first = true;
    boolean wasSep;

    while (m_st.ttype != StreamTokenizer.TT_EOL
        && m_st.ttype != StreamTokenizer.TT_EOF) {
      // Get next token

      if (!first) {
        StreamTokenizerUtils.getToken(m_st);
      }

      if (m_st.ttype == m_FieldSeparator.charAt(0)
          || m_st.ttype == StreamTokenizer.TT_EOL) {
        wasSep = true;
      } else {
        wasSep = false;

        String attName = null;

        if (m_noHeaderRow) {
          attName = "att" + attNum;
          attNum++;
        } else {
          attName = m_st.sval;
        }

        attribNames.add(new Attribute(attName, (java.util.List<String>) null));
      }
      if (!wasSep) {
        StreamTokenizerUtils.getToken(m_st);
      }
      first = false;
    }
    String relationName;
    if (m_sourceFile != null) {
      relationName = (m_sourceFile.getName())
          .replaceAll("\\.[cC][sS][vV]$", "");
    } else {
      relationName = "stream";
    }
    m_structure = new Instances(relationName, attribNames, 0);
    m_NominalAttributes.setUpper(m_structure.numAttributes() - 1);
    m_StringAttributes.setUpper(m_structure.numAttributes() - 1);
    m_dateAttributes.setUpper(m_structure.numAttributes() - 1);
    m_nominalVals = new HashMap<Integer, LinkedHashSet<String>>();

    m_types = new TYPE[m_structure.numAttributes()];
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      if (m_NominalAttributes.isInRange(i)) {
        m_types[i] = TYPE.NOMINAL;
        LinkedHashSet<String> ts = new LinkedHashSet<String>();
        m_nominalVals.put(i, ts);
      } else if (m_StringAttributes.isInRange(i)) {
        m_types[i] = TYPE.STRING;
      } else if (m_dateAttributes.isInRange(i)) {
        m_types[i] = TYPE.DATE;
      } else {
        m_types[i] = TYPE.UNDETERMINED;
      }
    }

    if (m_nominalLabelSpecs.size() > 0) {
      for (String spec : m_nominalLabelSpecs) {
        String[] attsAndLabels = spec.split(":");
        if (attsAndLabels.length == 2) {
          String[] labels = attsAndLabels[1].split(",");
          try {
            // try as a range string first
            Range tempR = new Range();
            tempR.setRanges(attsAndLabels[0].trim());
            tempR.setUpper(m_structure.numAttributes() - 1);

            int[] rangeIndexes = tempR.getSelection();
            for (int i = 0; i < rangeIndexes.length; i++) {
              m_types[rangeIndexes[i]] = TYPE.NOMINAL;
              LinkedHashSet<String> ts = new LinkedHashSet<String>();
              for (String lab : labels) {
                ts.add(lab);
              }
              m_nominalVals.put(rangeIndexes[i], ts);
            }
          } catch (IllegalArgumentException e) {
            // one or more named attributes?
            String[] attNames = attsAndLabels[0].split(",");
            for (String attN : attNames) {
              Attribute a = m_structure.attribute(attN.trim());
              if (a != null) {
                int attIndex = a.index();
                m_types[attIndex] = TYPE.NOMINAL;
                LinkedHashSet<String> ts = new LinkedHashSet<String>();
                for (String lab : labels) {
                  ts.add(lab);
                }
                m_nominalVals.put(attIndex, ts);
              }
            }
          }
        }
      }
    }

    m_st = new StreamTokenizer(m_sourceReader);
    initTokenizer(m_st);
    m_st.ordinaryChar(m_FieldSeparator.charAt(0));

    // try and determine a more accurate structure from the first batch
    readData(false || getRetrieval() == BATCH);
    makeStructure();
  }

  /**
   * Creates a new temporary file (registered for deletion on JVM exit) and
   * opens a buffered {@link PrintWriter} on it. The file is stored in
   * {@code m_tempFile} and the writer in {@code m_dataDumper}.
   *
   * @throws IOException if the temporary file cannot be created or opened
   */
  protected void openTempFiles() throws IOException {
    // random prefix followed by a fixed marker; default ".tmp" suffix is used
    final String prefix = Math.random() + "arffOut";

    m_tempFile = File.createTempFile(prefix, null);
    m_tempFile.deleteOnExit();

    m_dataDumper =
        new PrintWriter(new BufferedWriter(new FileWriter(m_tempFile)));
  }

  /**
   * Writes a single row, followed by a line terminator, to the writer held in
   * {@code m_dataDumper}.
   *
   * @param row the row text to write
   * @throws IOException declared for subclasses/callers; the underlying
   *           {@code println} call itself does not raise it
   */
  protected void dumpRow(String row) throws IOException {
    this.m_dataDumper.println(row);
  }

  /**
   * Configures the syntax table of a stream tokenizer for reading delimited
   * rows: control characters are whitespace, everything from space up to
   * {@code \u00FF} is a word character, the first character of the field
   * separator is marked as whitespace, each configured enclosure is a quote
   * character and line ends are reported as tokens.
   * 
   * @param tokenizer the tokenizer to initialize
   * @throws IllegalArgumentException if any comma-separated entry of the
   *           enclosures setting is not exactly one character long
   */
  private void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax();
    tokenizer.whitespaceChars(0, (' ' - 1));
    tokenizer.wordChars(' ', '\u00FF');

    final char separator = m_FieldSeparator.charAt(0);
    tokenizer.whitespaceChars(separator, separator);
    // tokenizer.commentChar('%');

    for (String enclosure : m_Enclosures.split(",")) {
      if (enclosure.length() != 1) {
        throw new IllegalArgumentException(
            "Enclosures can only be single characters");
      }
      tokenizer.quoteChar(enclosure.charAt(0));
    }

    tokenizer.eolIsSignificant(true);
  }

  /**
   * The kinds of column type tracked for each attribute while the data is
   * being parsed.
   */
  enum TYPE {
    /** No type has been settled on yet. */
    UNDETERMINED,
    /** Numeric column. */
    NUMERIC,
    /** Nominal (enumerated) column. */
    NOMINAL,
    /** String column. */
    STRING,
    /** Date column. */
    DATE
  }

  /**
   * Values of the row currently being parsed; cleared and refilled on every
   * call to getInstance().
   */
  protected ArrayList<Object> m_current;
  /**
   * Type of each attribute as determined so far, indexed by attribute
   * position; entries are refined while rows are parsed.
   */
  protected TYPE[] m_types;

  /**
   * Attempts to parse a line of the data set.
   * <p>
   * Every field read is appended to {@code m_current}: empty fields and fields
   * equal to the configured missing value marker become {@code "?"}, fields
   * that parse as a number are kept verbatim and all other fields are quoted
   * via {@link Utils#quote(String)}. As a side effect the per-column entries
   * of {@code m_types} are refined (an undetermined column becomes numeric or
   * nominal; a numeric column that meets a non-number becomes string) and
   * newly seen labels of nominal columns are recorded in
   * {@code m_nominalVals}.
   * 
   * @param tokenizer the tokenizer
   * @return a String version of the instance that has had String and nominal
   *         attribute values quoted if necessary, or {@code null} if the end
   *         of the input has been reached
   * @exception IOException if an error occurs, in particular if the row does
   *              not contain exactly as many values as the current structure
   *              has attributes
   */
  private String getInstance(StreamTokenizer tokenizer) throws IOException {

    // Check if end of file reached.
    StreamTokenizerUtils.getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      return null;
    }

    boolean first = true;
    boolean wasSep;
    m_current.clear();

    int i = 0;
    while (tokenizer.ttype != StreamTokenizer.TT_EOL
        && tokenizer.ttype != StreamTokenizer.TT_EOF) {

      // Get next token
      if (!first) {
        StreamTokenizerUtils.getToken(tokenizer);
      }

      if (tokenizer.ttype == m_FieldSeparator.charAt(0)
          || tokenizer.ttype == StreamTokenizer.TT_EOL) {
        // a separator or line end where a value was expected: empty field
        m_current.add("?");
        wasSep = true;
      } else {
        wasSep = false;
        if (i >= m_types.length) {
          // Surplus value: the row has more fields than there are attributes.
          // Record it without touching the per-column type tables so that the
          // size check below reports the problem as an IOException instead of
          // failing here with an ArrayIndexOutOfBoundsException.
          m_current.add(Utils.quote(tokenizer.sval));
        } else if (tokenizer.sval.equals(m_MissingValue)) {
          m_current.add("?");
        } else if (m_types[i] == TYPE.NUMERIC
            || m_types[i] == TYPE.UNDETERMINED) {
          // try to parse as a number
          try {
            Double.parseDouble(tokenizer.sval);
            m_current.add(tokenizer.sval);
            m_types[i] = TYPE.NUMERIC;
          } catch (NumberFormatException e) {
            // otherwise assume its an enumerated value
            m_current.add(Utils.quote(tokenizer.sval));
            if (m_types[i] == TYPE.UNDETERMINED) {
              m_types[i] = TYPE.NOMINAL;
              LinkedHashSet<String> ts = new LinkedHashSet<String>();
              ts.add(tokenizer.sval);
              m_nominalVals.put(i, ts);
            } else {
              // column was numeric so far but now holds a non-number
              m_types[i] = TYPE.STRING;
            }
          }
        } else if (m_types[i] == TYPE.STRING || m_types[i] == TYPE.DATE) {
          m_current.add(Utils.quote(tokenizer.sval));
        } else if (m_types[i] == TYPE.NOMINAL) {
          m_current.add(Utils.quote(tokenizer.sval));
          m_nominalVals.get(i).add(tokenizer.sval);
        }
      }

      // after a value, consume the separator (or line end) that follows it
      if (!wasSep) {
        StreamTokenizerUtils.getToken(tokenizer);
      }
      first = false;
      i++;
    }

    // check number of values read
    if (m_current.size() != m_structure.numAttributes()) {
      for (Object o : m_current) {
        System.out.print(o.toString() + "|||");
      }
      System.out.println();
      StreamTokenizerUtils.errms(tokenizer, "wrong number of values. Read "
          + m_current.size() + ", expected " + m_structure.numAttributes());

    }

    // join the values with the field separator (no trailing separator)
    StringBuilder temp = new StringBuilder();
    for (int j = 0; j < m_current.size(); j++) {
      if (j > 0) {
        temp.append(m_FieldSeparator);
      }
      temp.append(m_current.get(j).toString());
    }
    return temp.toString();
  }

  /**
   * Resets the loader: discards the current structure and row buffer, closes
   * the temp-file writer and the source reader when they are set, and, if a
   * file name is held in {@code m_File}, re-applies it via
   * {@code setFile}.
   *
   * @throws IOException if closing the source reader or re-setting the file
   *           fails
   */
  @Override
  public void reset() throws IOException {
    // forget everything derived from the previous read
    m_rowBuffer = null;
    m_structure = null;

    // close the unneeded temp files (if necessary)
    if (m_dataDumper != null) {
      m_dataDumper.close();
      m_dataDumper = null;
    }

    if (m_sourceReader != null) {
      m_sourceReader.close();
    }

    // point the loader back at its file, when one is known
    if (m_File != null) {
      final File source = new File(m_File);
      setFile(source);
    }
  }

  /**
   * Command-line entry point: hands a new {@code CSVLoader} and the given
   * arguments to {@code runFileLoader}.
   * 
   * @param args should contain the name of an input file.
   */
  public static void main(String[] args) {
    CSVLoader loader = new CSVLoader();
    runFileLoader(loader, args);
  }
}