/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.FileDataRowReader;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.gui.tools.dialogs.wizards.dataimport.csv.CSVFileReader;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeChar;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.LineParser;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.StrictDecimalFormat;
import com.rapidminer.tools.att.AttributeDataSource;
import com.rapidminer.tools.att.AttributeDataSourceCreator;
import com.rapidminer.tools.att.AttributeDataSources;
import com.rapidminer.tools.att.AttributeSet;
import com.rapidminer.tools.io.Encoding;
/**
* <p>
* This operator reads an example set from (a) file(s). Probably you can use the
* default parameter values for the most file formats (including the format
* produced by the ExampleSetWriter, CSV, ...). In fact, in many cases this operator
* is more appropriate for CSV based file formats than the {@link CSVExampleSource} operator
* itself since you can better control some of the necessary settings like column separators etc.
* </p>
*
* <p>
* In contrast to the usual ExampleSource operator this operator is able to read the
* attribute names from the first line of the data file. However, there is one
* restriction: the data can only be read from one file instead of multiple
* files. If you need a fully flexible operator for data loading you should use
* the more powerful ExampleSource operator which also provides more parameters
* tuning for example the quoting mechanism and other specialized settings.
* </p>
*
* <p>
* The column split points can be defined with regular expressions (please refer to the
* annex of the RapidMiner tutorial). The default split parameter ",\s*|;\s*|\s+" should work
* for most file formats. This regular expression describes the following column
* separators
* <ul>
* <li>the character "," followed by a whitespace of arbitrary length (also no white space)</li>
* <li>the character ";" followed by a whitespace of arbitrary length (also no white space)</li>
* <li>a whitespace of arbitrary length (min. 1)</li>
* </ul>
* Alternatives are combined with "|" (a logical OR). Other useful separators might be
* "\t" for tabs, " " for a single space, and
* "\s" for any whitespace character.
* </p>
*
* <p>
* Quoting is also possible with ". Escaping a quote is done with \".
* Additionally you can specify comment characters which can be used at
* arbitrary locations of the data lines and will skip the remaining part of the lines.
* Unknown attribute values can be marked with empty strings or a question mark.
* </p>
*
* @rapidminer.index csv
* @author Ingo Mierswa
*/
@Deprecated
public class SimpleExampleSource extends AbstractExampleSource {

    /** The parameter name for "Name of the label attribute (if empty, the column defined by label_column will be used)" */
    public static final String PARAMETER_LABEL_NAME = "label_name";

    /** The parameter name for "Column number of the label attribute (only used if label_name is empty; 0 = none; negative values are counted from the last column)" */
    public static final String PARAMETER_LABEL_COLUMN = "label_column";

    /** The parameter name for "Name of the id attribute (if empty, the column defined by id_column will be used)" */
    public static final String PARAMETER_ID_NAME = "id_name";

    /** The parameter name for "Column number of the id attribute (only used if id_name is empty; 0 = none; negative values are counted from the last column)" */
    public static final String PARAMETER_ID_COLUMN = "id_column";

    /** The parameter name for "Name of the weight attribute (if empty, the column defined by weight_column will be used)" */
    public static final String PARAMETER_WEIGHT_NAME = "weight_name";

    /** The parameter name for "Column number of the weight attribute (only used if weight_name is empty; 0 = none, negative values are counted from the last column)" */
    public static final String PARAMETER_WEIGHT_COLUMN = "weight_column";

    /** The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)" */
    public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio";

    /** The parameter name for "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)" */
    public static final String PARAMETER_SAMPLE_SIZE = "sample_size";

    /** The parameter name for "Name of the file to read the data from." */
    public static final String PARAMETER_FILENAME = "file_name";

    /** The parameter name for "Read attribute names from file (assumes the attribute names are in the first line of the file)." */
    public static final String PARAMETER_USE_FIRST_ROW_AS_ATTRIBUTE_NAMES = "use_first_row_as_attribute_names";

    /** The parameter name for "Indicates if lines should be trimmed (empty spaces are removed at the beginning and the end) before the column split is performed." */
    public static final String PARAMETER_TRIM_LINES = "trim_lines";

    // public static final String PARAMETER_SKIP_ERROR_LINES = "skip_error_lines";

    /** The parameter name for the flag which switches comment handling on or off. */
    public static final String PARAMETER_SKIP_COMMENTS = "skip_comments";

    /** The parameter name for "Lines beginning with these characters are ignored." */
    public static final String PARAMETER_COMMENT_CHARS = "comment_characters";

    /** The parameter name for "Indicates if quotes should be regarded (slower!)." */
    public static final String PARAMETER_USE_QUOTES = "use_quotes";

    /** The parameter name for "The quotes character." */
    public static final String PARAMETER_QUOTES_CHARACTER = "quotes_character";

    /** The parameter name for "Column separators for data files (regular expression)" */
    public static final String PARAMETER_COLUMN_SEPARATORS = "column_separators";

    /**
     * Creates the operator.
     *
     * @param description the operator description, handed on to the super class
     */
    public SimpleExampleSource(OperatorDescription description) {
        super(description);
    }

    /**
     * Creates a {@link CSVFileReader} for the given file. The reader uses a {@link LineParser}
     * which is configured from the trimming, comment, quoting and column separator parameters
     * of this operator, and a number format obtained from {@link StrictDecimalFormat}.
     *
     * @param file the file the reader should work on
     * @return the configured reader
     * @throws UndefinedParameterError if a parameter cannot be resolved or the parser rejects the
     *             column separator expression (the {@link OperatorException} raised in that case
     *             is converted, only its message is kept)
     */
    public CSVFileReader createReader(File file) throws UndefinedParameterError {
        final LineParser parser = new LineParser();
        parser.setTrimLine(getParameterAsBoolean(PARAMETER_TRIM_LINES));
        parser.setSkipComments(getParameterAsBoolean(PARAMETER_SKIP_COMMENTS));
        try {
            parser.setSplitExpression(getParameterAsString(PARAMETER_COLUMN_SEPARATORS));
        } catch (OperatorException e) {
            // NOTE(review): only the message survives here; no constructor of
            // UndefinedParameterError accepting a cause is visible from this file.
            throw new UndefinedParameterError(e.getMessage());
        }
        parser.setUseQuotes(getParameterAsBoolean(PARAMETER_USE_QUOTES));
        parser.setQuoteCharacter(getParameterAsChar(PARAMETER_QUOTES_CHARACTER));
        parser.setCommentCharacters(getParameterAsString(PARAMETER_COMMENT_CHARS));

        final NumberFormat numberFormat = StrictDecimalFormat.getInstance(this);
        final boolean firstRowAsNames = getParameterAsBoolean(PARAMETER_USE_FIRST_ROW_AS_ATTRIBUTE_NAMES);
        return new CSVFileReader(file, firstRowAsNames, parser, numberFormat);
    }

    /**
     * Delivers the meta data reported by the {@link CSVFileReader} for the configured file.
     *
     * @return the meta data as delivered by the reader
     * @throws OperatorException a {@link UserError} (code 302) if the file is missing or cannot
     *             be read; other operator exceptions if the parameters cannot be resolved
     */
    @Override
    public MetaData getGeneratedMetaData() throws OperatorException {
        final File file = getParameterAsFile(PARAMETER_FILENAME);
        final CSVFileReader reader = createReader(file);
        try {
            return reader.getMetaData();
        } catch (IOException e) {
            // Covers FileNotFoundException as well. Read errors used to be swallowed silently,
            // which made this method return null without any hint about the cause.
            throw new UserError(this, 302, file, e.getMessage());
        }
    }

    /**
     * Reads the configured file with a {@link CSVFileReader} and delivers the resulting example set.
     *
     * <p>
     * The former implementation of this method delegated to the static
     * {@code createExampleSet(File, ...)} method of this class and thereby applied the
     * label / id / weight and the sampling parameters. This method does not read those
     * parameters any longer.
     * </p>
     *
     * @return the example set created by the reader
     * @throws OperatorException a {@link UserError} (code 302) if the file cannot be read;
     *             other operator exceptions if the parameters cannot be resolved
     */
    @Override
    public ExampleSet createExampleSet() throws OperatorException {
        final File file = getParameterAsFile(PARAMETER_FILENAME);
        final CSVFileReader reader = createReader(file);
        try {
            return reader.createExampleSet();
        } catch (IOException e) {
            // Formerly ignored: the caller received null instead of an error message.
            throw new UserError(this, 302, file, e.getMessage());
        }
    }

    /**
     * Loads an example set from a file without needing an operator instance. The file is scanned
     * once by an {@link AttributeDataSourceCreator} to derive the attribute data sources, the
     * special attributes (label, id, weight) are marked, and the rows are then read into a
     * {@link MemoryExampleTable}.
     *
     * @param file the data file
     * @param firstRowAsColumnNames if true, the first line is skipped when the rows are read
     * @param sampleRatio handed on to the {@link FileDataRowReader}
     * @param maxLines handed on to the {@link FileDataRowReader}
     * @param separatorRegExpr the column separator expression
     * @param comments the comment characters
     * @param dataRowType the data row type used for the {@link DataRowFactory}
     * @param useQuotes whether quotes should be regarded
     * @param trimLines whether lines should be trimmed
     * @param skipErrorLines whether lines with errors should be skipped
     * @param decimalPointCharacter the decimal point character
     * @param encoding the encoding of the file
     * @param labelName name of the label attribute; if null or empty, labelColumn is used
     * @param labelColumn 1-based label column; 0 = none, negative values count from the last column
     * @param idName name of the id attribute; if null or empty, idColumn is used
     * @param idColumn 1-based id column; 0 = none, negative values count from the last column
     * @param weightName name of the weight attribute; if null or empty, weightColumn is used
     * @param weightColumn 1-based weight column; 0 = none, negative values count from the last column
     * @return the example set backed by an in-memory table
     * @throws IOException declared for the file access performed by the helper classes
     * @throws UserError declared for the helper classes
     * @throws IndexOutOfBoundsException if one of the column numbers does not denote an existing column
     */
    public static ExampleSet createExampleSet(File file, boolean firstRowAsColumnNames, double sampleRatio, int maxLines, String separatorRegExpr, char[] comments, int dataRowType, boolean useQuotes, boolean trimLines, boolean skipErrorLines, char decimalPointCharacter, Charset encoding, String labelName, int labelColumn, String idName, int idColumn, String weightName, int weightColumn) throws IOException, UserError, IndexOutOfBoundsException {
        // create attribute data sources and guess value types (performs a data scan);
        // '"' and '\\' are fixed here (presumably quote and escape character - confirm against loadData)
        AttributeDataSourceCreator adsCreator = new AttributeDataSourceCreator();
        adsCreator.loadData(file, comments, separatorRegExpr, decimalPointCharacter, useQuotes, '"', '\\', trimLines, firstRowAsColumnNames, -1, skipErrorLines, encoding, null);
        List<AttributeDataSource> attributeDataSources = adsCreator.getAttributeDataSources();

        // set special attributes
        resetAttributeType(attributeDataSources, labelName, labelColumn, Attributes.LABEL_NAME);
        resetAttributeType(attributeDataSources, idName, idColumn, Attributes.ID_NAME);
        resetAttributeType(attributeDataSources, weightName, weightColumn, Attributes.WEIGHT_NAME);

        // read data
        FileDataRowReader reader = new FileDataRowReader(new DataRowFactory(dataRowType, decimalPointCharacter), attributeDataSources, sampleRatio, maxLines, separatorRegExpr, comments, useQuotes, '"', '\\', trimLines, skipErrorLines, encoding, RandomGenerator.getGlobalRandomGenerator());
        if (firstRowAsColumnNames) {
            // the first line holds the names and must not end up as a data row
            reader.skipLine();
        }
        AttributeSet attributeSet = new AttributeSet(new AttributeDataSources(attributeDataSources, file, encoding));

        // create table and example set
        ExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes(), reader);
        return table.createExampleSet(attributeSet);
    }

    /**
     * Marks one attribute data source with the given type name. If an attribute name is given,
     * the first data source whose attribute carries this name is used (nothing happens if no
     * such attribute exists). Otherwise the data source is selected by its column number.
     *
     * @param attributeDataSources the data sources in column order
     * @param attribute the attribute name; null or empty to select by column
     * @param column 1-based column number; 0 = do nothing, negative values are counted from the
     *            last column (-1 = last column)
     * @param typeName the type name to set, also used in the error message
     * @throws IndexOutOfBoundsException if the column number does not denote an existing column
     */
    private static void resetAttributeType(List<AttributeDataSource> attributeDataSources, String attribute, int column, String typeName) throws IndexOutOfBoundsException {
        if ((attribute == null) || (attribute.length() == 0)) {
            if (column == 0) {
                return;
            }
            final int numberOfColumns = attributeDataSources.size();
            // translate "counted from the end" into a regular 1-based position
            final int position = (column < 0) ? numberOfColumns + column + 1 : column;
            if ((position < 1) || (position > numberOfColumns)) {
                // name the affected role instead of always claiming "label"
                throw new IndexOutOfBoundsException("column = " + position + " as " + typeName);
            }
            attributeDataSources.get(position - 1).setType(typeName);
            return;
        }
        for (AttributeDataSource ads : attributeDataSources) {
            if (ads.getAttribute().getName().equals(attribute)) {
                ads.setType(typeName);
                break;
            }
        }
    }

    /**
     * Declares the parameters of this operator: the file name, the encoding parameters, the
     * line parsing options, the decimal format parameters, and the label / id / weight and
     * sampling parameters.
     *
     * <p>
     * NOTE(review): the label / id / weight and sampling parameters are declared here, but
     * neither {@code createReader(File)} nor the instance method {@code createExampleSet()} of
     * this class reads them; whether they are evaluated elsewhere is not visible from this class.
     * </p>
     *
     * @return the parameter types of the super class followed by the ones of this operator
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeFile(PARAMETER_FILENAME, "Name of the file to read the data from.", "dat", false));
        types.addAll(Encoding.getParameterTypes(this));

        // line parsing options
        types.add(new ParameterTypeBoolean(PARAMETER_TRIM_LINES, "Indicates if lines should be trimmed (empty spaces are removed at the beginning and the end) before the column split is performed.", false));
        types.add(new ParameterTypeBoolean(PARAMETER_SKIP_COMMENTS, "Indicates if a comment character should be used.", true));
        ParameterType type = new ParameterTypeString(PARAMETER_COMMENT_CHARS, "Lines beginning with these characters are ignored.", "#", true);
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_SKIP_COMMENTS, false, true));
        types.add(type);

        type = new ParameterTypeBoolean(PARAMETER_USE_FIRST_ROW_AS_ATTRIBUTE_NAMES, "Read attribute names from file (assumes the attribute names are in the first line of the file).", false);
        type.setExpert(false);
        types.add(type);

        types.add(new ParameterTypeBoolean(PARAMETER_USE_QUOTES, "Indicates if quotes should be regarded (slower!).", false));
        type = new ParameterTypeChar(PARAMETER_QUOTES_CHARACTER, "The quotes character.", '"', true);
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_QUOTES, false, true));
        types.add(type);

        types.add(new ParameterTypeString(PARAMETER_COLUMN_SEPARATORS, "Column separators for data files (regular expression)", ",\\s*|;\\s*|\\s+"));
        // types.add(new ParameterTypeBoolean(PARAMETER_SKIP_ERROR_LINES, "Indicates if lines which can not be read should be skipped instead of letting this operator fail its execution.", false));
        types.addAll(StrictDecimalFormat.getParameterTypes(this));

        // special attributes
        type = new ParameterTypeString(PARAMETER_LABEL_NAME, "Name of the label attribute (if empty, the column defined by label_column will be used)", true);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeInt(PARAMETER_LABEL_COLUMN, "Column number of the label attribute (only used if label_name is empty; 0 = none; negative values are counted from the last column)", Integer.MIN_VALUE, Integer.MAX_VALUE, 0);
        type.setExpert(false);
        types.add(type);
        types.add(new ParameterTypeString(PARAMETER_ID_NAME, "Name of the id attribute (if empty, the column defined by id_column will be used)", true));
        types.add(new ParameterTypeInt(PARAMETER_ID_COLUMN, "Column number of the id attribute (only used if id_name is empty; 0 = none; negative values are counted from the last column)", Integer.MIN_VALUE, Integer.MAX_VALUE, 0));
        types.add(new ParameterTypeString(PARAMETER_WEIGHT_NAME, "Name of the weight attribute (if empty, the column defined by weight_column will be used)", true));
        types.add(new ParameterTypeInt(PARAMETER_WEIGHT_COLUMN, "Column number of the weight attribute (only used if weight_name is empty; 0 = none, negative values are counted from the last column)", Integer.MIN_VALUE, Integer.MAX_VALUE, 0));

        // sampling
        types.add(new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d));
        types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1));
        return types;
    }
}