CSVLoader.java example

Explorer
AIWekaProject-master
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    CSVLoader.java
 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StreamTokenizer;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Reads a source that is in comma separated or tab separated format. Assumes that the first row in the file determines the number of and names of the attributes.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -N <range>
 *  The range of attributes to force type to be NOMINAL.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)</pre>
 * 
 * <pre> -S <range>
 *  The range of attribute to force type to be STRING.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)</pre>
 * 
 * <pre> -M <str>
 *  The string representing a missing value.
 *  (default: ?)</pre>
 * 
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 6002 $
 * @see Loader
 */
public class CSVLoader 
  extends AbstractFileLoader 
  implements BatchConverter, OptionHandler {

  /** for serialization. */
  static final long serialVersionUID = 5607529739745491340L;
  
  /** the file extension. */
  public static String FILE_EXTENSION = ".csv";

  /**
   * A list of hash tables for accumulating nominal values during parsing.
   */
  protected FastVector m_cumulativeStructure;

  /**
   * Holds instances accumulated so far.
   */
  protected FastVector m_cumulativeInstances;
  
  /** The reader for the data. */         
  protected transient BufferedReader m_sourceReader;
  
  /** Tokenizer for the data. */
  protected transient StreamTokenizer m_st;
  
  /** The range of attributes to force to type nominal. */
  protected Range m_NominalAttributes = new Range();
  
  /** The range of attributes to force to type string. */
  protected Range m_StringAttributes = new Range();
  
  /** The placeholder for missing values. */
  protected String m_MissingValue = "?";
  
  /** whether the first row has been read. */
  protected boolean m_FirstCheck;
  
  /**
   * default constructor.
   */
  public CSVLoader() {
    // No instances retrieved yet
    setRetrieval(NONE);
  }

  /**
   * Get the file extension used for arff files.
   *
   * @return the file extension
   */
  public String getFileExtension() {
    return FILE_EXTENSION;
  }

  /**
   * Returns a description of the file type.
   *
   * @return a short file description
   */
  public String getFileDescription() {
    return "CSV data files";
  }

  /**
   * Gets all the file extensions used for this type of file.
   *
   * @return the file extensions
   */
  public String[] getFileExtensions() {
    return new String[]{getFileExtension()};
  }

  /**
   * Returns a string describing this attribute evaluator.
   * 
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Reads a source that is in comma separated or tab separated format. "
      +"Assumes that the first row in the file determines the number of "
      +"and names of the attributes.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector result = new Vector();
    
    result.addElement(new Option(
        "\tThe range of attributes to force type to be NOMINAL.\n"
        + "\t'first' and 'last' are accepted as well.\n"
        + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
        + "\t(default: -none-)",
        "N", 1, "-N <range>"));
    
    result.addElement(new Option(
        "\tThe range of attribute to force type to be STRING.\n"
        + "\t'first' and 'last' are accepted as well.\n"
        + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
        + "\t(default: -none-)",
        "S", 1, "-S <range>"));
    
    result.addElement(new Option(
        "\tThe string representing a missing value.\n"
        + "\t(default: ?)",
        "M", 1, "-M <str>"));
      
    return result.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -N <range>
   *  The range of attributes to force type to be NOMINAL.
   *  'first' and 'last' are accepted as well.
   *  Examples: "first-last", "1,4,5-27,50-last"
   *  (default: -none-)</pre>
   * 
   * <pre> -S <range>
   *  The range of attribute to force type to be STRING.
   *  'first' and 'last' are accepted as well.
   *  Examples: "first-last", "1,4,5-27,50-last"
   *  (default: -none-)</pre>
   * 
   * <pre> -M <str>
   *  The string representing a missing value.
   *  (default: ?)</pre>
   * 
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String	tmpStr;

    tmpStr = Utils.getOption('N', options);
    if (tmpStr.length() != 0)
      setNominalAttributes(tmpStr);
    else
      setNominalAttributes("");

    tmpStr = Utils.getOption('S', options);
    if (tmpStr.length() != 0)
      setStringAttributes(tmpStr);
    else
      setStringAttributes("");

    tmpStr = Utils.getOption('M', options);
    if (tmpStr.length() != 0)
      setMissingValue(tmpStr);
    else
      setMissingValue("?");
  }

  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector<String>	result;
    
    result  = new Vector<String>();

    if (getNominalAttributes().length() > 0) {
      result.add("-N");
      result.add(getNominalAttributes());
    }
    
    if (getStringAttributes().length() > 0) {
      result.add("-S");
      result.add(getStringAttributes());
    }

    result.add("-M");
    result.add(getMissingValue());
    
    return result.toArray(new String[result.size()]);
  }
  
  /**
   * Sets the attribute range to be forced to type nominal.
   * 
   * @param value	the range
   */
  public void setNominalAttributes(String value) {
    m_NominalAttributes.setRanges(value);
  }
  
  /**
   * Returns the current attribute range to be forced to type nominal.
   * 
   * @return		the range
   */
  public String getNominalAttributes() {
    return m_NominalAttributes.getRanges();
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   *         		displaying in the explorer/experimenter gui
   */
  public String nominalAttributesTipText() {
    return 
        "The range of attributes to force to be of type NOMINAL, example "
      + "ranges: 'first-last', '1,4,7-14,50-last'.";
  }
  
  /**
   * Sets the attribute range to be forced to type string.
   * 
   * @param value	the range
   */
  public void setStringAttributes(String value) {
    m_StringAttributes.setRanges(value);
  }
  
  /**
   * Returns the current attribute range to be forced to type string.
   * 
   * @return		the range
   */
  public String getStringAttributes() {
    return m_StringAttributes.getRanges();
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   *         		displaying in the explorer/experimenter gui
   */
  public String stringAttributesTipText() {
    return 
        "The range of attributes to force to be of type STRING, example "
      + "ranges: 'first-last', '1,4,7-14,50-last'.";
  }
  
  /**
   * Sets the placeholder for missing values.
   * 
   * @param value	the placeholder
   */
  public void setMissingValue(String value) {
    m_MissingValue = value;
  }
  
  /**
   * Returns the current placeholder for missing values.
   * 
   * @return		the placeholder
   */
  public String getMissingValue() {
    return m_MissingValue;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   *         		displaying in the explorer/experimenter gui
   */
  public String missingValueTipText() {
    return "The placeholder for missing values, default is '?'.";
  }
  
  /**
   * Resets the Loader object and sets the source of the data set to be 
   * the supplied Stream object.
   *
   * @param input the input stream
   * @exception IOException if an error occurs
   */
  @Override
  public void setSource(InputStream input) throws IOException {    
    m_structure    = null;
    m_sourceFile   = null;
    m_File         = null;
    m_FirstCheck     = true;

    m_sourceReader = new BufferedReader(new InputStreamReader(input));
  }

  /**
   * Resets the Loader object and sets the source of the data set to be 
   * the supplied File object.
   *
   * @param file the source file.
   * @exception IOException if an error occurs
   */
  @Override
  public void setSource(File file) throws IOException {
    super.setSource(file);
  }

  /**
   * Determines and returns (if possible) the structure (internally the 
   * header) of the data set as an empty set of instances.
   *
   * @return the structure of the data set as an empty set of Instances
   * @exception IOException if an error occurs
   */
  @Override
  public Instances getStructure() throws IOException {
    if ((m_sourceFile == null) && (m_sourceReader == null)) {
      throw new IOException("No source has been specified");
    }

    if (m_structure == null) {
      try {
	m_st = new StreamTokenizer(m_sourceReader);
	initTokenizer(m_st);
	readStructure(m_st);
      } catch (FileNotFoundException ex) {
      }
    }
    
    return m_structure;
  }

  /**
   * reads the structure.
   * 
   * @param st the stream tokenizer to read from
   * @throws IOException if reading fails
   */
  private void readStructure(StreamTokenizer st) throws IOException {
    readHeader(st);
  }

  /**
   * Return the full data set. If the structure hasn't yet been determined
   * by a call to getStructure then method should do so before processing
   * the rest of the data set.
   *
   * @return the structure of the data set as an empty set of Instances
   * @exception IOException if there is no source or parsing fails
   */
  @Override
  public Instances getDataSet() throws IOException {
    if ((m_sourceFile == null) && (m_sourceReader == null)) {
      throw new IOException("No source has been specified");
    }
    
    if (m_structure == null) {
      getStructure();
    }
    
    if (m_st == null) {
      m_st = new StreamTokenizer(m_sourceReader);
      initTokenizer(m_st);      
    }
        
    m_st.ordinaryChar(',');
    m_st.ordinaryChar('\t');
    
    m_cumulativeStructure = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      m_cumulativeStructure.addElement(new Hashtable());
    }
    
    m_cumulativeInstances = new FastVector();
    FastVector current;
    while ((current = getInstance(m_st)) != null) {
      m_cumulativeInstances.addElement(current);
    }

    FastVector atts = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      String attname = m_structure.attribute(i).name();
      Hashtable tempHash = ((Hashtable)m_cumulativeStructure.elementAt(i));
      if (tempHash.size() == 0) {
	atts.addElement(new Attribute(attname));
      } else {
	if (m_StringAttributes.isInRange(i)) {
	  atts.addElement(new Attribute(attname, (FastVector) null));
	}
	else {
	  FastVector values = new FastVector(tempHash.size());
	  // add dummy objects in order to make the FastVector's size == capacity
	  for (int z = 0; z < tempHash.size(); z++) {
	    values.addElement("dummy");
	  }
	  Enumeration e = tempHash.keys();
	  while (e.hasMoreElements()) {
	    Object ob = e.nextElement();
	    //	  if (ob instanceof Double) {
	    int index = ((Integer)tempHash.get(ob)).intValue();
	    String s = ob.toString();
	    if (s.startsWith("'") || s.startsWith("\""))
	      s = s.substring(1, s.length() - 1);
	    values.setElementAt(new String(s), index);
	    //	  }
	  }
	  atts.addElement(new Attribute(attname, values));
	}
      }
    }

    // make the instances
    String relationName;
    if (m_sourceFile != null)
      relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$","");
    else
      relationName = "stream";
    Instances dataSet = new Instances(relationName, 
				      atts, 
				      m_cumulativeInstances.size());

    for (int i = 0; i < m_cumulativeInstances.size(); i++) {
      current = ((FastVector)m_cumulativeInstances.elementAt(i));
      double [] vals = new double[dataSet.numAttributes()];
      for (int j = 0; j < current.size(); j++) {
	Object cval = current.elementAt(j);
	if (cval instanceof String) {
	  if (((String)cval).compareTo(m_MissingValue) == 0) {
	    vals[j] = Instance.missingValue();
	  } else {
	    if (dataSet.attribute(j).isString()) {
	      vals[j] = dataSet.attribute(j).addStringValue((String) cval);
	    }
	    else if (dataSet.attribute(j).isNominal()) {
	      // find correct index
	      Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
	      int index = ((Integer)lookup.get(cval)).intValue();
	      vals[j] = index;
	    }
	    else {
	      throw new IllegalStateException("Wrong attribute type at position " + (i+1) + "!!!");
	    }
	  }
	} else if (dataSet.attribute(j).isNominal()) {
	  // find correct index
	  Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
	  int index = ((Integer)lookup.get(cval)).intValue();
	  vals[j] = index;
	} else if (dataSet.attribute(j).isString()) {
	  vals[j] = dataSet.attribute(j).addStringValue("" + cval);
	} else {
	  vals[j] = ((Double)cval).doubleValue();
	}
      }
      dataSet.add(new Instance(1.0, vals));
    }
    m_structure = new Instances(dataSet, 0);
    setRetrieval(BATCH);
    m_cumulativeStructure = null; // conserve memory
    
    // close the stream
    m_sourceReader.close();
    
    return dataSet;
  }

  /**
   * CSVLoader is unable to process a data set incrementally.
   *
   * @param structure ignored
   * @return never returns without throwing an exception
   * @exception IOException always. CSVLoader is unable to process a data
   * set incrementally.
   */
  @Override
  public Instance getNextInstance(Instances structure) throws IOException {
    throw new IOException("CSVLoader can't read data sets incrementally.");
  }

  /**
   * Attempts to parse a line of the data set.
   *
   * @param tokenizer the tokenizer
   * @return a FastVector containg String and Double objects representing
   * the values of the instance.
   * @exception IOException if an error occurs
   *
   * <pre><jml>
   *    private_normal_behavior
   *      requires: tokenizer != null;
   *      ensures: \result  != null;
   *  also
   *    private_exceptional_behavior
   *      requires: tokenizer == null
   *                || (* unsucessful parse *);
   *      signals: (IOException);
   * </jml></pre>
   */
  private FastVector getInstance(StreamTokenizer tokenizer) 
    throws IOException {

    FastVector current = new FastVector();

    // Check if end of file reached.
    ConverterUtils.getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      return null;
    }
    boolean first = true;
    boolean wasSep;

    while (tokenizer.ttype != StreamTokenizer.TT_EOL &&
	   tokenizer.ttype != StreamTokenizer.TT_EOF) {
      
      // Get next token
      if (!first) {
	ConverterUtils.getToken(tokenizer);
      }

      if (tokenizer.ttype == ',' || tokenizer.ttype == '\t' || 
	  tokenizer.ttype == StreamTokenizer.TT_EOL) {
	current.addElement(m_MissingValue);
	wasSep = true;
      } else {
	wasSep = false;
	if (tokenizer.sval.equals(m_MissingValue)) {
	  current.addElement(new String(m_MissingValue));
	}
	else {
	  // try to parse as a number
	  try {
	    double val = Double.valueOf(tokenizer.sval).doubleValue();
	    current.addElement(new Double(val));
	  } catch (NumberFormatException e) {
	    // otherwise assume its an enumerated value
	    current.addElement(new String(tokenizer.sval));
	  }
	}
      }
      
      if (!wasSep) {
	ConverterUtils.getToken(tokenizer);
      }
      first = false;
    }
    
    // check number of values read
    if (current.size() != m_structure.numAttributes()) {
      ConverterUtils.errms(tokenizer, 
			   "wrong number of values. Read "+current.size()
			   +", expected "+m_structure.numAttributes());
    }

    // check for structure update
    try {
      checkStructure(current);
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    return current;
  }

  /**
   * Checks the current instance against what is known about the structure
   * of the data set so far. If there is a nominal value for an attribute
   * that was beleived to be numeric then all previously seen values for this
   * attribute are stored in a Hashtable.
   *
   * @param current a <code>FastVector</code> value
   * @exception Exception if an error occurs
   *
   * <pre><jml>
   *    private_normal_behavior
   *      requires: current != null;
   *  also
   *    private_exceptional_behavior
   *      requires: current == null
   *                || (* unrecognized object type in current *);
   *      signals: (Exception);
   * </jml></pre>
   */
  private void checkStructure(FastVector current) throws Exception {
    if (current == null) {
      throw new Exception("current shouldn't be null in checkStructure");
    }

    // initialize ranges, if necessary
    if (m_FirstCheck) {
      m_NominalAttributes.setUpper(current.size() - 1);
      m_StringAttributes.setUpper(current.size() - 1);
      m_FirstCheck = false;
    }
    
    for (int i = 0; i < current.size(); i++) {
      Object ob = current.elementAt(i);
      if ((ob instanceof String) || (m_NominalAttributes.isInRange(i)) || (m_StringAttributes.isInRange(i))) {
	if (ob.toString().compareTo(m_MissingValue) == 0) {
	  // do nothing
	} else {
	  Hashtable tempHash = (Hashtable)m_cumulativeStructure.elementAt(i);
	  if (!tempHash.containsKey(ob)) {
	    // may have found a nominal value in what was previously thought to
	    // be a numeric variable.
	    if (tempHash.size() == 0) {
	      for (int j = 0; j < m_cumulativeInstances.size(); j++) {
		FastVector tempUpdate = 
		  ((FastVector)m_cumulativeInstances.elementAt(j));
		Object tempO = tempUpdate.elementAt(i);
		if (tempO instanceof String) {
		  // must have been a missing value
		} else {
		  if (!tempHash.containsKey(tempO)) {
		    tempHash.put(new Double(((Double)tempO).doubleValue()), 
				 new Integer(tempHash.size()));
		  }
		}
	      }
	    }
	    int newIndex = tempHash.size();
	    tempHash.put(ob, new Integer(newIndex));
	  }
	}
      } else if (ob instanceof Double) {
	Hashtable tempHash = (Hashtable)m_cumulativeStructure.elementAt(i);
	if (tempHash.size() != 0) {
	  if (!tempHash.containsKey(ob)) {
	    int newIndex = tempHash.size();
	    tempHash.put(new Double(((Double)ob).doubleValue()), 
				    new Integer(newIndex));
	  }
	}
      } else {
	throw new Exception("Wrong object type in checkStructure!");
      }
    }
  }

  /**
   * Assumes the first line of the file contains the attribute names.
   * Assumes all attributes are real (Reading the full data set with
   * getDataSet will establish the true structure).
   *
   * @param tokenizer a <code>StreamTokenizer</code> value
   * @exception IOException if an error occurs
   *
   * <pre><jml>
   *    private_normal_behavior
   *      requires: tokenizer != null;
   *      modifiable: m_structure;
   *      ensures: m_structure != null;
   *  also
   *    private_exceptional_behavior
   *      requires: tokenizer == null
   *                || (* unsucessful parse *);
   *      signals: (IOException);
   * </jml></pre>
   */
  private void readHeader(StreamTokenizer tokenizer) throws IOException {
   
    FastVector attribNames = new FastVector();
    ConverterUtils.getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      ConverterUtils.errms(tokenizer,"premature end of file");
    }

    while (tokenizer.ttype != StreamTokenizer.TT_EOL) {
      attribNames.addElement(new Attribute(tokenizer.sval));
      ConverterUtils.getToken(tokenizer);
    }
    String relationName;
    if (m_sourceFile != null)
      relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$","");
    else
      relationName = "stream";
    m_structure = new Instances(relationName, attribNames, 0);
  }

  /**
   * Initializes the stream tokenizer.
   *
   * @param tokenizer the tokenizer to initialize
   */
  private void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax();         
    tokenizer.whitespaceChars(0, (' '-1));    
    tokenizer.wordChars(' ','\u00FF');
    tokenizer.whitespaceChars(',',',');
    tokenizer.whitespaceChars('\t','\t');
    tokenizer.commentChar('%');
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    tokenizer.eolIsSignificant(true);
  }
  
  /**
   * Resets the Loader ready to read a new data set or the
   * same data set again.
   * 
   * @throws IOException if something goes wrong
   */
  @Override
  public void reset() throws IOException {
    m_structure = null;
    m_st = null;
    setRetrieval(NONE);
    
    if (m_File != null) {
      setFile(new File(m_File));
    }
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 6002 $");
  }

  /**
   * Main method.
   *
   * @param args should contain the name of an input file.
   */
  public static void main(String [] args) {
    runFileLoader(new CSVLoader(), args);
  }
}