/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CSVLoader.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;
/**
<!-- globalinfo-start -->
* Reads a source that is in comma separated format
* (the default). One can also change the column separator from comma to tab or
* another character, specify string enclosures, specify whether a header row is
* present or not and specify which attributes are to be forced to be nominal or
* date. Can operate in batch or incremental mode. In batch mode, a buffer is
* used to process a fixed number of rows in memory at any one time and the data
* is dumped to a temporary file. This allows the legal values for nominal
* attributes to be automatically determined. The final ARFF file is produced in
* a second pass over the temporary file using the structure determined on the
* first pass. In incremental mode, the first buffer full of rows is used to
* determine the structure automatically. Following this all rows are read and
* output incrementally. An error will occur if a row containing nominal values
* not seen in the initial buffer is encountered. In this case, the size of the
* initial buffer can be increased, or the user can explicitly provide the legal
* values of all nominal attributes using the -L (setNominalLabelSpecs) option.
* *
* <p/>
<!-- globalinfo-end -->
*
<!-- options-start -->
* Valid options are:
* <p/>
*
* <pre>
* -H
* No header row present in the data.
* </pre>
*
* <pre>
* -N <range>
* The range of attributes to force type to be NOMINAL.
* 'first' and 'last' are accepted as well.
* Examples: "first-last", "1,4,5-27,50-last"
* (default: -none-)
* </pre>
*
* <pre>
* -L <nominal label spec>
* Optional specification of legal labels for nominal
* attributes. May be specified multiple times.
* Batch mode can determine this
* automatically (and so can incremental mode if
* the first in memory buffer load of instances
* contains an example of each legal value). The
* spec contains two parts separated by a ":". The
* first part can be a range of attribute indexes or
* a comma-separated list of attribute names; the
* second part is a comma-separated list of labels. E.g
* "1,2,4-6:red,green,blue" or "att1,att2:red,green,blue"
* </pre>
*
* <pre>
* -S <range>
* The range of attribute to force type to be STRING.
* 'first' and 'last' are accepted as well.
* Examples: "first-last", "1,4,5-27,50-last"
* (default: -none-)
* </pre>
*
* <pre>
* -D <range>
* The range of attribute to force type to be DATE.
* 'first' and 'last' are accepted as well.
* Examples: "first-last", "1,4,5-27,50-last"
* (default: -none-)
* </pre>
*
* <pre>
* -format <date format>
* The date formatting string to use to parse date values.
* (default: "yyyy-MM-dd'T'HH:mm:ss")
* </pre>
*
* <pre>
* -M <str>
* The string representing a missing value.
* (default: ?)
* </pre>
*
* <pre>
* -F <separator>
* The field separator to be used.
* '\t' can be used as well.
* (default: ',')
* </pre>
*
* <pre>
* -E <enclosures>
* The enclosure character(s) to use for strings.
* Specify as a comma separated list (e.g. ",') (default: ",')
* </pre>
*
* <pre>
* -B <num>
* The size of the in memory buffer (in rows).
* (default: 100)
* </pre>
*
<!-- options-end -->
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 9858 $
*/
public class CSVLoader extends AbstractFileLoader implements BatchConverter,
IncrementalConverter, OptionHandler {
/** For serialization */
private static final long serialVersionUID = -1300595850715808438L;
/** the file extension. */
public static String FILE_EXTENSION = ".csv";
/** The reader for the data. */
protected transient BufferedReader m_sourceReader;
/** Tokenizer for the data. */
protected transient StreamTokenizer m_st;
/**
 * Temporary file that parsed rows are written to (via the data dumper) and
 * that getDataSet() reads back once the final structure is known.
 */
protected transient File m_tempFile;
/** Writer used to dump parsed rows to the temporary file. */
protected transient PrintWriter m_dataDumper;
/** the field separator. */
protected String m_FieldSeparator = ",";
/** The placeholder for missing values. */
protected String m_MissingValue = "?";
/** The range of attributes to force to type nominal. */
protected Range m_NominalAttributes = new Range();
/** The user-supplied legal nominal values - each entry in the list is a spec */
protected List<String> m_nominalLabelSpecs = new ArrayList<String>();
/** The range of attributes to force to type string. */
protected Range m_StringAttributes = new Range();
/** The range of attributes to force to type date */
protected Range m_dateAttributes = new Range();
/** The formatting string to use to parse dates */
protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss";
/** The formatter to use on dates (cleared whenever the date format is set) */
protected SimpleDateFormat m_formatter;
/** whether the csv file contains a header row with att names */
protected boolean m_noHeaderRow = false;
/** enclosure character(s) to use for strings, as a comma separated list */
protected String m_Enclosures = "\",\'";
/** The in memory row buffer */
protected List<String> m_rowBuffer;
/** The maximum number of rows to hold in memory at any one time */
protected int m_bufferSize = 100;
/** Lookup for nominal values, keyed by attribute index */
protected Map<Integer, LinkedHashSet<String>> m_nominalVals;
/** Reader used to process and output data incrementally */
protected ArffReader m_incrementalReader;
/**
 * Returns a string describing this loader.
 *
 * @return a description of the loader suitable for displaying in the
 *         explorer/experimenter gui
 */
public String globalInfo() {
  // NB: every fragment must end with a space (or punctuation plus space) so
  // that words from adjacent fragments do not run together.
  return "Reads a source that is in comma separated format (the default). "
    + "One can also change the column separator from comma to tab or "
    + "another character, specify string enclosures, specify whether a "
    + "header row is present or not and specify which attributes are to be "
    + "forced to be nominal or date. Can operate in batch or incremental mode. "
    + "In batch mode, a buffer is used to process a fixed number of rows in "
    + "memory at any one time and the data is dumped to a temporary file. This "
    + "allows the legal values for nominal attributes to be automatically "
    + "determined. The final ARFF file is produced in a second pass over the "
    + "temporary file using the structure determined on the first pass. In "
    + "incremental mode, the first buffer full of rows is used to determine "
    + "the structure automatically. Following this all rows are read and output "
    + "incrementally. An error will occur if a row containing nominal values not "
    + "seen in the initial buffer is encountered. In this case, the size of the "
    + "initial buffer can be increased, or the user can explicitly provide the "
    + "legal values of all nominal attributes using the -L (setNominalLabelSpecs) "
    + "option.";
}
/**
 * Creates a loader that has not retrieved any instances yet.
 */
public CSVLoader() {
  // start out in the "nothing retrieved" state
  this.setRetrieval(NONE);
}
/**
 * Get the file extension used for CSV files.
 *
 * @return the file extension
 */
@Override
public String getFileExtension() {
  return CSVLoader.FILE_EXTENSION;
}
/**
 * Gets all the file extensions used for this type of file.
 *
 * @return a one-element array holding the file extension
 */
@Override
public String[] getFileExtensions() {
  final String[] extensions = new String[1];
  extensions[0] = getFileExtension();
  return extensions;
}
/**
 * Returns a description of the file type.
 *
 * @return a short file description
 */
@Override
public String getFileDescription() {
  final String description = "CSV data files";
  return description;
}
/**
 * Returns the revision string.
 *
 * @return the revision keyword string, matching the class-level version tag
 */
@Override
public String getRevision() {
  // the keyword was misspelt ("Revisoon"), so it neither matched the
  // version tag of the class nor could it be expanded by version control
  return "$Revision: 9858 $";
}
/**
 * Tip text for the "no header row present" property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String noHeaderRowPresentTipText() {
  final String tip = "First row of data does not contain attribute names";
  return tip;
}
/**
 * Tell the loader whether the data lacks a header row.
 *
 * @param b true if there is no header row in the data
 */
public void setNoHeaderRowPresent(boolean b) {
  this.m_noHeaderRow = b;
}
/**
 * Report whether the data lacks a header row.
 *
 * @return true if there is no header row in the data
 */
public boolean getNoHeaderRowPresent() {
  return this.m_noHeaderRow;
}
/**
 * Define the string that marks a missing value in the data.
 *
 * @param value the placeholder
 */
public void setMissingValue(String value) {
  this.m_MissingValue = value;
}
/**
 * Get the string that marks a missing value in the data.
 *
 * @return the placeholder
 */
public String getMissingValue() {
  return this.m_MissingValue;
}
/**
 * Tip text for the missing value property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String missingValueTipText() {
  final String tip = "The placeholder for missing values, default is '?'.";
  return tip;
}
/**
 * Define which attributes are forced to type string.
 *
 * @param value the range
 */
public void setStringAttributes(String value) {
  final Range stringRange = this.m_StringAttributes;
  stringRange.setRanges(value);
}
/**
 * Get the range of attributes that are forced to type string.
 *
 * @return the range
 */
public String getStringAttributes() {
  final String ranges = this.m_StringAttributes.getRanges();
  return ranges;
}
/**
 * Tip text for the string attributes property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String stringAttributesTipText() {
  final StringBuilder tip = new StringBuilder();
  tip.append("The range of attributes to force to be of type STRING, example ");
  tip.append("ranges: 'first-last', '1,4,7-14,50-last'.");
  return tip.toString();
}
/**
 * Define which attributes are forced to type nominal.
 *
 * @param value the range
 */
public void setNominalAttributes(String value) {
  final Range nominalRange = this.m_NominalAttributes;
  nominalRange.setRanges(value);
}
/**
 * Get the range of attributes that are forced to type nominal.
 *
 * @return the range
 */
public String getNominalAttributes() {
  final String ranges = this.m_NominalAttributes.getRanges();
  return ranges;
}
/**
 * Tip text for the nominal attributes property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String nominalAttributesTipText() {
  final StringBuilder tip = new StringBuilder();
  tip.append("The range of attributes to force to be of type NOMINAL, example ");
  tip.append("ranges: 'first-last', '1,4,7-14,50-last'.");
  return tip.toString();
}
/**
 * Define the pattern used for parsing date values.
 *
 * @param value the format to use.
 */
public void setDateFormat(String value) {
  // the formatter field is cleared whenever the pattern changes
  this.m_formatter = null;
  this.m_dateFormat = value;
}
/**
 * Get the pattern used for parsing date values.
 *
 * @return the format to use for parsing date values.
 */
public String getDateFormat() {
  return this.m_dateFormat;
}
/**
 * Tip text for the date format property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String dateFormatTipText() {
  final String tip = "The format to use for parsing date values.";
  return tip;
}
/**
 * Define which attributes are forced to type date.
 *
 * @param value the range
 */
public void setDateAttributes(String value) {
  final Range dateRange = this.m_dateAttributes;
  dateRange.setRanges(value);
}
/**
 * Get the range of attributes that are forced to type date.
 *
 * @return the range.
 */
public String getDateAttributes() {
  final String ranges = this.m_dateAttributes.getRanges();
  return ranges;
}
/**
 * Tip text for the date attributes property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String dateAttributesTipText() {
  final StringBuilder tip = new StringBuilder();
  tip.append("The range of attributes to force to type DATE, example ");
  tip.append("ranges: 'first-last', '1,4,7-14, 50-last'.");
  return tip.toString();
}
/**
 * Tip text for the enclosure characters property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String enclosureCharactersTipText() {
  final String tip = "The characters to use as enclosures for strings. E.g. \",'";
  return tip;
}
/**
 * Define the character(s) recognized as string enclosures.
 *
 * @param enclosure the characters to use as string enclosures
 */
public void setEnclosureCharacters(String enclosure) {
  this.m_Enclosures = enclosure;
}
/**
 * Get the character(s) recognized as string enclosures.
 *
 * @return the characters to use as string enclosures
 */
public String getEnclosureCharacters() {
  return this.m_Enclosures;
}
/**
 * Sets the character used as column separator. The value is passed through
 * Utils.unbackQuoteChars first; if the result is not exactly one character
 * (or the value is null) a warning is printed to stderr and the separator
 * falls back to ",".
 *
 * @param value the character to use
 */
public void setFieldSeparator(String value) {
  final String fallback = ",";
  // validate before touching the field so it never holds an unusable value
  String separator = (value == null) ? "" : Utils.unbackQuoteChars(value);
  if (separator.length() != 1) {
    separator = fallback;
    // '\\t' so that the message shows the two characters the user has to
    // type rather than a literal TAB
    System.err
      .println("Field separator can only be a single character (exception being '\\t'), "
        + "defaulting back to '" + fallback + "'!");
  }
  m_FieldSeparator = separator;
}
/**
 * Returns the column separator, passed through Utils.backQuoteChars.
 *
 * @return the character to use
 */
public String getFieldSeparator() {
  final String rawSeparator = this.m_FieldSeparator;
  return Utils.backQuoteChars(rawSeparator);
}
/**
 * Tip text for the field separator property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String fieldSeparatorTipText() {
  final String tip = "The character to use as separator for the columns/fields (use '\\t' for TAB).";
  return tip;
}
/**
 * Define the buffer size, i.e. how many rows are loaded and processed in
 * memory at any one time.
 *
 * @param buff the buffer size (number of rows)
 */
public void setBufferSize(int buff) {
  this.m_bufferSize = buff;
}
/**
 * Get the buffer size, i.e. how many rows are loaded and processed in memory
 * at any one time.
 *
 * @return the buffer size (number of rows)
 */
public int getBufferSize() {
  return this.m_bufferSize;
}
/**
 * Tip text for the buffer size property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String bufferSizeTipText() {
  final String tip = "The number of rows to process in memory at any one time.";
  return tip;
}
/**
 * Replace the current label specifications for nominal attributes. Each
 * element is stored via its toString() representation.
 *
 * @param specs an array of label specifications
 */
public void setNominalLabelSpecs(Object[] specs) {
  m_nominalLabelSpecs.clear();
  for (int idx = 0; idx < specs.length; idx++) {
    final String asText = specs[idx].toString();
    m_nominalLabelSpecs.add(asText);
  }
}
/**
 * Get the label specifications for nominal attributes as a String array.
 *
 * @return an array of label specifications
 */
public Object[] getNominalLabelSpecs() {
  final String[] specs = new String[m_nominalLabelSpecs.size()];
  return m_nominalLabelSpecs.toArray(specs);
}
/**
 * Returns the tip text for the nominal label specs property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String nominalLabelSpecsTipText() {
  return "Optional specification of legal labels for nominal "
    + "attributes. May be specified multiple times. "
    + "Batch mode can determine this "
    + "automatically (and so can incremental mode if "
    + "the first in memory buffer load of instances "
    + "contains an example of each legal value). The "
    + "spec contains two parts separated by a \":\". The "
    + "first part can be a range of attribute indexes or "
    // fixed typo: was "list off attruibute names"
    + "a comma-separated list of attribute names; the "
    + "second part is a comma-separated list of labels. E.g "
    + "\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,blue\"";
}
/**
 * Returns an enumeration describing the available options.
 *
 * @return an enumeration of all the available options (H, N, L, S, D,
 *         format, M, F, E and B)
 */
@Override
public Enumeration listOptions() {
  Vector<Option> result = new Vector<Option>();

  result.add(new Option("\tNo header row present in the data.", "H", 0, "-H"));

  result.add(new Option(
    "\tThe range of attributes to force type to be NOMINAL.\n"
      + "\t'first' and 'last' are accepted as well.\n"
      + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
      + "\t(default: -none-)", "N", 1, "-N <range>"));

  result.add(new Option(
    "\tOptional specification of legal labels for nominal\n"
      + "\tattributes. May be specified multiple times.\n"
      + "\tBatch mode can determine this\n"
      + "\tautomatically (and so can incremental mode if\n"
      + "\tthe first in memory buffer load of instances\n"
      + "\tcontains an example of each legal value). The\n"
      + "\tspec contains two parts separated by a \":\". The\n"
      + "\tfirst part can be a range of attribute indexes or\n"
      // fixed typo: was "list off attruibute names"
      + "\ta comma-separated list of attribute names; the\n"
      + "\tsecond part is a comma-separated list of labels. E.g\n"
      + "\t\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,"
      + "blue\"", "L", 1, "-L <nominal label spec>"));

  result.add(new Option(
    "\tThe range of attribute to force type to be STRING.\n"
      + "\t'first' and 'last' are accepted as well.\n"
      + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
      + "\t(default: -none-)", "S", 1, "-S <range>"));

  result.add(new Option(
    "\tThe range of attribute to force type to be DATE.\n"
      + "\t'first' and 'last' are accepted as well.\n"
      + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
      + "\t(default: -none-)", "D", 1, "-D <range>"));

  result.add(new Option(
    "\tThe date formatting string to use to parse date values.\n"
      + "\t(default: \"yyyy-MM-dd'T'HH:mm:ss\")", "format", 1,
    "-format <date format>"));

  result.add(new Option("\tThe string representing a missing value.\n"
    + "\t(default: ?)", "M", 1, "-M <str>"));

  result.add(new Option("\tThe field separator to be used.\n"
    + "\t'\\t' can be used as well.\n" + "\t(default: ',')", "F", 1,
    "-F <separator>"));

  // the example's parenthesis was never closed in the original help text
  result.add(new Option(
    "\tThe enclosure character(s) to use for strings.\n"
      + "\tSpecify as a comma separated list (e.g. \",')"
      + " (default: \",')", "E", 1, "-E <enclosures>"));

  result.add(new Option("\tThe size of the in memory buffer (in rows).\n"
    + "\t(default: 100)", "B", 1, "-B <num>"));

  return result.elements();
}
/**
 * Parses a given list of options.
 * <p/>
 * Absent -N and -S options reset the respective range, absent -M and -F
 * options restore "?" and ",". Absent -D, -format, -B and -E options leave
 * the current setting untouched. The -L specs given here replace any
 * previously stored specs.
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported, or the buffer size is
 *           smaller than 1 or not an integer
 */
@Override
public void setOptions(String[] options) throws Exception {
  String tmpStr;

  setNoHeaderRowPresent(Utils.getFlag('H', options));

  // an absent option yields the empty string, which also clears the range
  setNominalAttributes(Utils.getOption('N', options));
  setStringAttributes(Utils.getOption('S', options));

  tmpStr = Utils.getOption('D', options);
  if (tmpStr.length() > 0) {
    setDateAttributes(tmpStr);
  }

  tmpStr = Utils.getOption("format", options);
  if (tmpStr.length() > 0) {
    setDateFormat(tmpStr);
  }

  tmpStr = Utils.getOption('M', options);
  setMissingValue(tmpStr.length() != 0 ? tmpStr : "?");

  tmpStr = Utils.getOption('F', options);
  setFieldSeparator(tmpStr.length() != 0 ? tmpStr : ",");

  tmpStr = Utils.getOption('B', options);
  if (tmpStr.length() > 0) {
    int buff = Integer.parseInt(tmpStr);
    if (buff < 1) {
      throw new Exception("Buffer size must be >= 1");
    }
    setBufferSize(buff);
  }

  tmpStr = Utils.getOption("E", options);
  if (tmpStr.length() > 0) {
    setEnclosureCharacters(tmpStr);
  }

  // start from a clean slate: without this, calling setOptions() repeatedly
  // (e.g. setOptions(getOptions())) kept appending duplicate specs
  m_nominalLabelSpecs.clear();
  while (true) {
    tmpStr = Utils.getOption('L', options);
    if (tmpStr.length() == 0) {
      break;
    }
    m_nominalLabelSpecs.add(tmpStr);
  }
}
/**
 * Gets the current settings of the loader as an option array that can be
 * handed back to setOptions().
 *
 * @return an array of strings suitable for passing to setOptions
 */
@Override
public String[] getOptions() {
  Vector<String> result = new Vector<String>();

  // -H was parsed by setOptions() but never reported here, so the flag got
  // lost whenever the loader was re-created from its option array
  if (getNoHeaderRowPresent()) {
    result.add("-H");
  }

  if (getNominalAttributes().length() > 0) {
    result.add("-N");
    result.add(getNominalAttributes());
  }

  if (getStringAttributes().length() > 0) {
    result.add("-S");
    result.add(getStringAttributes());
  }

  // the date format is only reported when date attributes are set
  if (getDateAttributes().length() > 0) {
    result.add("-D");
    result.add(getDateAttributes());
    result.add("-format");
    result.add(getDateFormat());
  }

  result.add("-M");
  result.add(getMissingValue());

  result.add("-B");
  result.add("" + getBufferSize());

  result.add("-E");
  result.add(getEnclosureCharacters());

  result.add("-F");
  result.add(getFieldSeparator());

  for (String spec : m_nominalLabelSpecs) {
    result.add("-L");
    result.add(spec);
  }

  return result.toArray(new String[result.size()]);
}
/**
 * Number of rows from the initial in-memory buffer that still have to be
 * handed out in incremental mode. Once it reaches 0 reading switches to a
 * tokenizer on the source reader and the counter is set to -1.
 */
private int m_numBufferedRows;

/**
 * Reads the next instance incrementally. The rows buffered while determining
 * the structure are served first (through an ArffReader over the buffer),
 * afterwards rows are tokenized directly from the source. The source reader
 * is closed once no further instance can be read.
 *
 * @param structure the dataset header information, stored as the current
 *          structure
 * @return the next instance, or null if there are no more instances
 * @throws IOException if batch retrieval has already been used, if the
 *           structure has not been determined yet, or if a row cannot be
 *           parsed
 */
@Override
public Instance getNextInstance(Instances structure) throws IOException {
  m_structure = structure;
  if (getRetrieval() == BATCH) {
    throw new IOException(
      "Cannot mix getting instances in both incremental and batch modes");
  }
  setRetrieval(INCREMENTAL);

  if (m_dataDumper != null) {
    // close the unneeded temp files (if necessary)
    m_dataDumper.close();
    m_dataDumper = null;
  }

  // the row buffer only exists once the header has been read; fail with a
  // descriptive error instead of a NullPointerException
  if (m_rowBuffer == null) {
    throw new IOException(
      "No structure has been determined yet - call getStructure() first");
  }

  if (m_rowBuffer.size() > 0 && m_incrementalReader == null) {
    // first call: replay the buffered rows through an ARFF reader
    StringBuilder tempB = new StringBuilder();
    for (String r : m_rowBuffer) {
      tempB.append(r).append("\n");
    }
    m_numBufferedRows = m_rowBuffer.size();
    Reader batchReader = new BufferedReader(
      new StringReader(tempB.toString()));
    m_incrementalReader = new ArffReader(batchReader, m_structure, 0, 0);
    m_rowBuffer.clear();
  }

  if (m_numBufferedRows == 0) {
    // buffered rows are used up: continue with the raw source from now on
    m_numBufferedRows = -1;
    m_st = new StreamTokenizer(m_sourceReader);
    initTokenizer(m_st);
    m_st.ordinaryChar(m_FieldSeparator.charAt(0));
    m_incrementalReader = null;
  }

  Instance current = null;
  if (m_sourceReader != null) {
    if (m_incrementalReader != null) {
      current = m_incrementalReader.readInstance(m_structure);
    } else if (getInstance(m_st) != null) {
      current = makeInstance();
    }
    if (m_numBufferedRows > 0) {
      m_numBufferedRows--;
    }
  }

  if ((m_sourceReader != null) && (current == null)) {
    try {
      // nothing left to read - close the stream
      m_sourceReader.close();
      m_sourceReader = null;
    } catch (Exception ex) {
      // best effort: a failure to close must not hide the end-of-data signal
      ex.printStackTrace();
    }
  }
  return current;
}
/**
 * Reads the full data set in batch mode. All remaining rows are parsed and
 * dumped to the temporary file, the final structure is built from the types
 * and nominal values collected on the way, and the temporary file is then
 * read back using that structure.
 *
 * @return the complete data set
 * @throws IOException if no source has been specified, incremental retrieval
 *           has already been used, or reading/parsing fails
 */
@Override
public Instances getDataSet() throws IOException {
  if (m_sourceReader == null) {
    throw new IOException("No source has been specified");
  }
  if (getRetrieval() == INCREMENTAL) {
    throw new IOException(
      "Cannot mix getting instances in both incremental and batch modes");
  }
  setRetrieval(BATCH);

  // NOTE(review): if getStructure() was called before this method, the first
  // buffer of rows was read with dumping disabled - it looks like those rows
  // never reach the temp file. Confirm against callers before relying on
  // that call order.
  if (m_structure == null) {
    getStructure();
  }

  // first pass: parse and dump everything that is left in the source
  while (readData(true)) {
    // readData does all the work
  }
  m_dataDumper.flush();
  m_dataDumper.close();

  // make final structure
  makeStructure();

  // second pass: read the dumped rows using the final structure; the reader
  // is closed even when parsing fails
  Reader sr = new BufferedReader(new FileReader(m_tempFile));
  try {
    ArffReader initialArff = new ArffReader(sr, m_structure, 0);
    return initialArff.getData();
  } finally {
    sr.close();
  }
}
/**
 * Parses rows from the current tokenizer into the row buffer until the
 * buffer has reached the configured size or the end of the data is hit.
 * In batch mode a full buffer is emptied again.
 *
 * @param dump true if each parsed row should also be written to the
 *          temporary file
 * @return false if the end of the data was reached, true if the buffer was
 *         filled (more data may follow)
 * @throws IOException if no source has been specified or a row cannot be
 *           parsed
 */
private boolean readData(boolean dump) throws IOException {
  if (m_sourceReader == null) {
    throw new IOException("No source has been specified");
  }

  boolean finished = false;
  do {
    String checked = getInstance(m_st);
    if (checked == null) {
      return false;
    }
    if (dump) {
      dumpRow(checked);
    }
    m_rowBuffer.add(checked);

    // ">=" rather than "==": if the buffer already holds bufferSize rows
    // or more on entry, an equality test would never fire again and the
    // buffer would grow until the end of the data
    if (m_rowBuffer.size() >= m_bufferSize) {
      finished = true;
      if (getRetrieval() == BATCH) {
        m_rowBuffer.clear();
      }
    }
  } while (!finished);

  return true;
}
/**
 * Points the loader at the supplied stream. Structure, source file and file
 * name are cleared and a buffered reader is opened on the stream.
 *
 * @param input the input stream
 * @exception IOException if an error occurs
 */
@Override
public void setSource(InputStream input) throws IOException {
  // forget everything that belonged to a previous source
  this.m_File = null;
  this.m_sourceFile = null;
  this.m_structure = null;

  final InputStreamReader decoder = new InputStreamReader(input);
  this.m_sourceReader = new BufferedReader(decoder);
}
/**
 * Sets the source of the data set to be the supplied File object. This
 * override only delegates to the superclass implementation.
 *
 * @param file the source file.
 * @exception IOException if an error occurs
 */
@Override
public void setSource(File file) throws IOException {
super.setSource(file);
}
/**
 * Returns the structure (header) of the data, reading it from the source
 * first if it has not been determined yet.
 *
 * @return the structure of the data set
 * @throws IOException if no source has been specified or reading the header
 *           fails
 */
@Override
public Instances getStructure() throws IOException {
  if (m_sourceReader != null) {
    if (m_structure == null) {
      readHeader();
    }
    return m_structure;
  }
  throw new IOException("No source has been specified");
}
/**
 * Converts the values of the most recently parsed row into an instance that
 * belongs to the current structure.
 * <p/>
 * "?" becomes a missing value; string values are stored in the attribute
 * (index 0 is used as the value); date values are parsed with the
 * attribute's date format; numeric values are parsed as doubles; everything
 * else is looked up as a nominal label.
 *
 * @return the new instance, or null if no row holder exists
 * @throws IOException if a date or number cannot be parsed or a nominal
 *           label is unknown
 */
protected Instance makeInstance() throws IOException {
  if (m_current == null) {
    return null;
  }

  final int numAtts = m_structure.numAttributes();
  double[] vals = new double[numAtts];
  for (int i = 0; i < numAtts; i++) {
    Attribute att = m_structure.attribute(i);
    String text = m_current.get(i).toString();

    if (text.equals("?")) {
      vals[i] = Utils.missingValue();
    } else if (att.isString()) {
      vals[i] = 0;
      att.setStringValue(Utils.unquote(text));
    } else if (att.isDate()) {
      // date has to be tested before numeric
      String format = att.getDateFormat();
      SimpleDateFormat sdf = new SimpleDateFormat(format);
      try {
        vals[i] = sdf.parse(text).getTime();
      } catch (ParseException e) {
        throw new IOException("Unable to parse date value " + text
          + " using date format " + format + " for date attribute " + att, e);
      }
    } else if (att.isNumeric()) {
      try {
        vals[i] = Double.parseDouble(text);
      } catch (NumberFormatException ex) {
        throw new IOException("Was expecting a number for attribute "
          + att.name() + " but read " + text + " instead.", ex);
      }
    } else {
      // nominal
      double index = att.indexOfValue(Utils.unquote(text));
      if (index < 0) {
        // the blank before "for" was missing, gluing the value to the text
        throw new IOException("Read unknown nominal value " + text
          + " for attribute " + att.name());
      }
      vals[i] = index;
    }
  }

  DenseInstance inst = new DenseInstance(1.0, vals);
  inst.setDataset(m_structure);
  return inst;
}
/**
 * Rebuilds the structure from the attribute types and nominal values known
 * so far. String and still undetermined attributes become string
 * attributes, numeric ones numeric, nominal ones get the collected labels
 * (or the single label "*unknown*" if none were seen) and all others become
 * date attributes using the configured date format.
 */
protected void makeStructure() {
  final ArrayList<Attribute> finalAtts = new ArrayList<Attribute>();

  for (int idx = 0; idx < m_types.length; idx++) {
    final TYPE type = m_types[idx];
    final String attName = m_structure.attribute(idx).name();
    final Attribute att;

    if (type == TYPE.NUMERIC) {
      att = new Attribute(attName);
    } else if (type == TYPE.NOMINAL) {
      final LinkedHashSet<String> seen = m_nominalVals.get(idx);
      // copying keeps the insertion order of the set
      final ArrayList<String> labels = new ArrayList<String>(seen);
      if (labels.isEmpty()) {
        labels.add("*unknown*");
      }
      att = new Attribute(attName, labels);
    } else if (type == TYPE.STRING || type == TYPE.UNDETERMINED) {
      att = new Attribute(attName, (java.util.List<String>) null);
    } else {
      // only DATE is left
      att = new Attribute(attName, m_dateFormat);
    }
    finalAtts.add(att);
  }

  m_structure = new Instances(m_structure.relationName(), finalAtts, 0);
}
/**
 * Reads the first row of the source to set up the attributes, initializes
 * the per-attribute types from the user-supplied ranges and nominal label
 * specs, then reads the first buffer of rows to refine the types and builds
 * the structure.
 *
 * @throws IOException if the source contains no data or a row cannot be
 *           parsed
 */
private void readHeader() throws IOException {
// start from a clean state for this source
m_incrementalReader = null;
m_current = new ArrayList<Object>();
openTempFiles();
m_rowBuffer = new ArrayList<String>();
String firstRow = m_sourceReader.readLine();
if (firstRow == null) {
throw new IOException("No data in the file!");
}
// without a header row the first line is data, so it is kept in the buffer.
// NOTE(review): the line is buffered as read - it does not pass through
// getInstance() and is not dumped to the temp file here; confirm that this
// is intended for batch mode.
if (m_noHeaderRow) {
m_rowBuffer.add(firstRow);
}
ArrayList<Attribute> attribNames = new ArrayList<Attribute>();
// now tokenize to determine attribute names (or create att names if
// no header row
StringReader sr = new StringReader(firstRow + "\n");
// System.out.print(firstRow + "\n");
m_st = new StreamTokenizer(sr);
initTokenizer(m_st);
// make the separator a token of its own
m_st.ordinaryChar(m_FieldSeparator.charAt(0));
int attNum = 1;
StreamTokenizerUtils.getFirstToken(m_st);
if (m_st.ttype == StreamTokenizer.TT_EOF) {
StreamTokenizerUtils.errms(m_st, "premature end of file");
}
boolean first = true;
boolean wasSep;
// walk over the tokens of the first row; every non-separator token yields
// one attribute
while (m_st.ttype != StreamTokenizer.TT_EOL
&& m_st.ttype != StreamTokenizer.TT_EOF) {
// Get next token
if (!first) {
StreamTokenizerUtils.getToken(m_st);
}
if (m_st.ttype == m_FieldSeparator.charAt(0)
|| m_st.ttype == StreamTokenizer.TT_EOL) {
// a separator or end of line where a name was expected: no attribute is
// created for this position
wasSep = true;
} else {
wasSep = false;
String attName = null;
if (m_noHeaderRow) {
// generate names att1, att2, ...
attName = "att" + attNum;
attNum++;
} else {
attName = m_st.sval;
}
// all attributes start out as string attributes
attribNames.add(new Attribute(attName, (java.util.List<String>) null));
}
if (!wasSep) {
// consume the token following the name (separator or end of line)
StreamTokenizerUtils.getToken(m_st);
}
first = false;
}
// relation name: file name without a trailing .csv, or "stream"
String relationName;
if (m_sourceFile != null) {
relationName = (m_sourceFile.getName())
.replaceAll("\\.[cC][sS][vV]$", "");
} else {
relationName = "stream";
}
m_structure = new Instances(relationName, attribNames, 0);
// the ranges need the highest attribute index before they can be queried
m_NominalAttributes.setUpper(m_structure.numAttributes() - 1);
m_StringAttributes.setUpper(m_structure.numAttributes() - 1);
m_dateAttributes.setUpper(m_structure.numAttributes() - 1);
m_nominalVals = new HashMap<Integer, LinkedHashSet<String>>();
m_types = new TYPE[m_structure.numAttributes()];
// initial types from the user-supplied ranges; nominal wins over string,
// string over date
for (int i = 0; i < m_structure.numAttributes(); i++) {
if (m_NominalAttributes.isInRange(i)) {
m_types[i] = TYPE.NOMINAL;
LinkedHashSet<String> ts = new LinkedHashSet<String>();
m_nominalVals.put(i, ts);
} else if (m_StringAttributes.isInRange(i)) {
m_types[i] = TYPE.STRING;
} else if (m_dateAttributes.isInRange(i)) {
m_types[i] = TYPE.DATE;
} else {
m_types[i] = TYPE.UNDETERMINED;
}
}
// explicit label specs override the types set above; specs that do not
// split into exactly two parts at ":" are skipped
if (m_nominalLabelSpecs.size() > 0) {
for (String spec : m_nominalLabelSpecs) {
String[] attsAndLabels = spec.split(":");
if (attsAndLabels.length == 2) {
String[] labels = attsAndLabels[1].split(",");
try {
// try as a range string first
Range tempR = new Range();
tempR.setRanges(attsAndLabels[0].trim());
tempR.setUpper(m_structure.numAttributes() - 1);
int[] rangeIndexes = tempR.getSelection();
for (int i = 0; i < rangeIndexes.length; i++) {
m_types[rangeIndexes[i]] = TYPE.NOMINAL;
LinkedHashSet<String> ts = new LinkedHashSet<String>();
for (String lab : labels) {
ts.add(lab);
}
m_nominalVals.put(rangeIndexes[i], ts);
}
} catch (IllegalArgumentException e) {
// one or more named attributes?
String[] attNames = attsAndLabels[0].split(",");
for (String attN : attNames) {
Attribute a = m_structure.attribute(attN.trim());
// names that do not match an attribute are ignored
if (a != null) {
int attIndex = a.index();
m_types[attIndex] = TYPE.NOMINAL;
LinkedHashSet<String> ts = new LinkedHashSet<String>();
for (String lab : labels) {
ts.add(lab);
}
m_nominalVals.put(attIndex, ts);
}
}
}
}
}
}
// from here on tokens come from the source reader itself
m_st = new StreamTokenizer(m_sourceReader);
initTokenizer(m_st);
m_st.ordinaryChar(m_FieldSeparator.charAt(0));
// try and determine a more accurate structure from the first batch
// (rows are only dumped to the temp file when already in batch mode)
readData(false || getRetrieval() == BATCH);
makeStructure();
}
/**
 * Creates a fresh temporary file (deleted on JVM exit) and opens the writer
 * used to dump parsed rows to it. A dumper and temporary file left over from
 * a previous call are closed and deleted first, so that repeated loads do
 * not pile up open writers and stale files.
 *
 * @throws IOException if the temporary file cannot be created or opened
 */
protected void openTempFiles() throws IOException {
  if (m_dataDumper != null) {
    m_dataDumper.close();
    m_dataDumper = null;
  }
  if (m_tempFile != null) {
    // best effort - deleteOnExit() remains as the fallback
    m_tempFile.delete();
    m_tempFile = null;
  }

  String tempPrefix = "" + Math.random() + "arffOut";
  m_tempFile = File.createTempFile(tempPrefix, null);
  m_tempFile.deleteOnExit();
  Writer os2 = new FileWriter(m_tempFile);
  m_dataDumper = new PrintWriter(new BufferedWriter(os2));
}
/**
 * Writes one row, followed by a line break, to the temporary file.
 *
 * @param row the row to write
 * @throws IOException declared for subclasses; not thrown by this
 *           implementation
 */
protected void dumpRow(String row) throws IOException {
  final PrintWriter out = this.m_dataDumper;
  out.println(row);
}
/**
 * Sets up the syntax table of the tokenizer: control characters and the
 * field separator are whitespace, everything from blank to \u00FF is part of
 * a word, each configured enclosure is a quote character and end of line is
 * reported as a token.
 *
 * @param tokenizer the tokenizer to initialize
 * @throws IllegalArgumentException if an enclosure is not exactly one
 *           character long
 */
private void initTokenizer(StreamTokenizer tokenizer) {
  tokenizer.resetSyntax();
  tokenizer.whitespaceChars(0, (' ' - 1));
  tokenizer.wordChars(' ', '\u00FF');

  final char separator = m_FieldSeparator.charAt(0);
  tokenizer.whitespaceChars(separator, m_FieldSeparator.charAt(0));

  final String[] enclosures = m_Enclosures.split(",");
  for (int idx = 0; idx < enclosures.length; idx++) {
    final String enclosure = enclosures[idx];
    if (enclosure.length() != 1) {
      throw new IllegalArgumentException(
        "Enclosures can only be single characters");
    }
    tokenizer.quoteChar(enclosure.charAt(0));
  }

  tokenizer.eolIsSignificant(true);
}
/**
 * The attribute types tracked while parsing. UNDETERMINED marks an attribute
 * for which no type has been fixed yet.
 */
enum TYPE {
UNDETERMINED, NUMERIC, NOMINAL, STRING, DATE
};
/** Holds the values of the most recently parsed row. */
protected ArrayList<Object> m_current;
/** The current type of each attribute, indexed by attribute position. */
protected TYPE[] m_types;
/**
 * Attempts to parse a line of the data set. The parsed values are stored in
 * the current-row holder and the attribute types (and the nominal value
 * lookup) are updated on the way: an undetermined attribute becomes numeric
 * if the value parses as a number and nominal otherwise; a numeric attribute
 * that meets a non-number becomes a string attribute.
 *
 * @param tokenizer the tokenizer
 * @return a String version of the instance that has had String and nominal
 *         attribute values quoted if necessary, or null if the end of the
 *         data has been reached
 * @exception IOException if an error occurs, e.g. the row does not contain
 *              the expected number of values
 */
private String getInstance(StreamTokenizer tokenizer) throws IOException {
  // Check if end of file reached.
  StreamTokenizerUtils.getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    return null;
  }

  boolean first = true;
  boolean wasSep;
  m_current.clear();
  int i = 0;

  while (tokenizer.ttype != StreamTokenizer.TT_EOL
    && tokenizer.ttype != StreamTokenizer.TT_EOF) {
    // Get next token
    if (!first) {
      StreamTokenizerUtils.getToken(tokenizer);
    }

    if (tokenizer.ttype == m_FieldSeparator.charAt(0)
      || tokenizer.ttype == StreamTokenizer.TT_EOL
      || tokenizer.ttype == StreamTokenizer.TT_EOF) {
      // no value where one was expected: missing. End of file is treated
      // like end of line so that a last line ending in a separator without
      // a line break does not fall through to the value branch, where no
      // token text is available.
      m_current.add("?");
      wasSep = true;
    } else {
      wasSep = false;
      String token = tokenizer.sval;
      if (token.equals(m_MissingValue)) {
        m_current.add("?");
      } else if (i >= m_types.length) {
        // surplus value: just record it so that the count check below
        // reports the malformed row instead of an index error occurring
        m_current.add(token);
      } else if (m_types[i] == TYPE.NUMERIC
        || m_types[i] == TYPE.UNDETERMINED) {
        // try to parse as a number
        try {
          Double.parseDouble(token);
          m_current.add(token);
          m_types[i] = TYPE.NUMERIC;
        } catch (NumberFormatException e) {
          // otherwise assume its an enumerated value
          m_current.add(Utils.quote(token));
          if (m_types[i] == TYPE.UNDETERMINED) {
            m_types[i] = TYPE.NOMINAL;
            LinkedHashSet<String> ts = new LinkedHashSet<String>();
            ts.add(token);
            m_nominalVals.put(i, ts);
          } else {
            // was numeric so far - fall back to string
            m_types[i] = TYPE.STRING;
          }
        }
      } else if (m_types[i] == TYPE.STRING || m_types[i] == TYPE.DATE) {
        m_current.add(Utils.quote(token));
      } else if (m_types[i] == TYPE.NOMINAL) {
        m_current.add(Utils.quote(token));
        m_nominalVals.get(i).add(token);
      }
    }

    if (!wasSep) {
      // consume the token after the value (separator or end of line)
      StreamTokenizerUtils.getToken(tokenizer);
    }
    first = false;
    i++;
  }

  // check number of values read
  if (m_current.size() != m_structure.numAttributes()) {
    // diagnostics go to stderr so they cannot get mixed into data that is
    // written to stdout
    for (Object o : m_current) {
      System.err.print(o.toString() + "|||");
    }
    System.err.println();
    StreamTokenizerUtils.errms(tokenizer, "wrong number of values. Read "
      + m_current.size() + ", expected " + m_structure.numAttributes());
  }

  // join the values with the field separator (no trailing separator)
  StringBuilder temp = new StringBuilder();
  boolean firstValue = true;
  for (Object o : m_current) {
    if (!firstValue) {
      temp.append(m_FieldSeparator);
    }
    temp.append(o.toString());
    firstValue = false;
  }
  return temp.toString();
}
/**
 * Resets the loader: structure and row buffer are dropped, the dumper and
 * the source reader are closed and, if a file name is known, the file is set
 * as the source again.
 *
 * @throws IOException if closing the reader or re-opening the file fails
 */
@Override
public void reset() throws IOException {
  this.m_structure = null;
  this.m_rowBuffer = null;

  final PrintWriter dumper = this.m_dataDumper;
  if (dumper != null) {
    // the temp file writer is no longer needed
    dumper.close();
    this.m_dataDumper = null;
  }

  if (this.m_sourceReader != null) {
    this.m_sourceReader.close();
  }

  if (this.m_File != null) {
    final File source = new File(this.m_File);
    setFile(source);
  }
}
/**
 * Command line entry point; hands a new loader and the arguments to
 * runFileLoader.
 *
 * @param args should contain the name of an input file.
 */
public static void main(String[] args) {
  final CSVLoader loader = new CSVLoader();
  runFileLoader(loader, args);
}
}