/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.io; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.charset.Charset; import java.text.NumberFormat; import java.util.Iterator; import java.util.List; import com.rapidminer.example.Attributes; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.DataRowFactory; import com.rapidminer.example.table.ExampleTable; import com.rapidminer.example.table.FileDataRowReader; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.gui.tools.dialogs.wizards.dataimport.csv.CSVFileReader; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeChar; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeFile; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.BooleanParameterCondition; import com.rapidminer.tools.LineParser; import com.rapidminer.tools.RandomGenerator; import com.rapidminer.tools.StrictDecimalFormat; import com.rapidminer.tools.att.AttributeDataSource; import com.rapidminer.tools.att.AttributeDataSourceCreator; import com.rapidminer.tools.att.AttributeDataSources; import com.rapidminer.tools.att.AttributeSet; import com.rapidminer.tools.io.Encoding; /** * <p> * This operator reads an example set from (a) file(s). Probably you can use the * default parameter values for the most file formats (including the format * produced by the ExampleSetWriter, CSV, ...). In fact, in many cases this operator * is more appropriate for CSV based file formats than the {@link CSVExampleSource} operator * itself since you can better control some of the necessary settings like column separators etc. * </p> * * <p> * In contrast to the usual ExampleSource operator this operator is able to read the * attribute names from the first line of the data file. However, there is one * restriction: the data can only be read from one file instead of multiple * files. If you need a fully flexible operator for data loading you should use * the more powerful ExampleSource operator which also provides more parameters * tuning for example the quoting mechanism and other specialized settings. * </p> * * <p> * The column split points can be defined with regular expressions (please refer to the * annex of the RapidMiner tutorial). The default split parameter ",\s*|;\s*|\s+" should work * for most file formats. This regular expression describes the following column * separators * <ul> * <li>the character "," followed by a whitespace of arbitrary length (also no white space)</li> * <li>the character ";" followed by a whitespace of arbitrary length (also no white space)</li> * <li>a whitespace of arbitrary length (min. 1)</li> * </ul> * A logical XOR is defined by "|". Other useful separators might be * "\t" for tabulars, " " for a single whitespace, and * "\s" for any whitespace. * </p> * * <p> * Quoting is also possible with ". Escaping a quote is done with \". * Additionally you can specify comment characters which can be used at * arbitrary locations of the data lines and will skip the remaining part of the lines. * Unknown attribute values can be marked with empty strings or a question mark. * </p> * * @rapidminer.index csv * @author Ingo Mierswa */ @Deprecated public class SimpleExampleSource extends AbstractExampleSource { /** The parameter name for "Name of the label attribute (if empty, the column defined by label_column will be used)" */ public static final String PARAMETER_LABEL_NAME = "label_name"; /** The parameter name for "Column number of the label attribute (only used if label_name is empty; 0 = none; negative values are counted from the last column)" */ public static final String PARAMETER_LABEL_COLUMN = "label_column"; /** The parameter name for "Name of the id attribute (if empty, the column defined by id_column will be used)" */ public static final String PARAMETER_ID_NAME = "id_name"; /** The parameter name for "Column number of the id attribute (only used if id_name is empty; 0 = none; negative values are counted from the last column)" */ public static final String PARAMETER_ID_COLUMN = "id_column"; /** The parameter name for "Name of the weight attribute (if empty, the column defined by weight_column will be used)" */ public static final String PARAMETER_WEIGHT_NAME = "weight_name"; /** The parameter name for "Column number of the weight attribute (only used if weight_name is empty; 0 = none, negative values are counted from the last column)" */ public static final String PARAMETER_WEIGHT_COLUMN = "weight_column"; /** The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)" */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; /** The parameter name for "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)" */ public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; public static final String PARAMETER_FILENAME = "file_name"; public static final String PARAMETER_USE_FIRST_ROW_AS_ATTRIBUTE_NAMES = "use_first_row_as_attribute_names"; public static final String PARAMETER_TRIM_LINES = "trim_lines"; // public static final String PARAMETER_SKIP_ERROR_LINES = "skip_error_lines"; public static final String PARAMETER_SKIP_COMMENTS = "skip_comments"; public static final String PARAMETER_COMMENT_CHARS = "comment_characters"; public static final String PARAMETER_USE_QUOTES = "use_quotes"; public static final String PARAMETER_QUOTES_CHARACTER = "quotes_character"; public static final String PARAMETER_COLUMN_SEPARATORS = "column_separators"; public SimpleExampleSource(OperatorDescription description) { super(description); } public CSVFileReader createReader(File file) throws UndefinedParameterError { final LineParser parser = new LineParser(); parser.setTrimLine(getParameterAsBoolean(PARAMETER_TRIM_LINES)); parser.setSkipComments(getParameterAsBoolean(PARAMETER_SKIP_COMMENTS)); try { parser.setSplitExpression(getParameterAsString(PARAMETER_COLUMN_SEPARATORS)); } catch (OperatorException e) { throw new UndefinedParameterError(e.getMessage()); } parser.setUseQuotes(getParameterAsBoolean(PARAMETER_USE_QUOTES)); parser.setQuoteCharacter(getParameterAsChar(PARAMETER_QUOTES_CHARACTER)); parser.setCommentCharacters(getParameterAsString(PARAMETER_COMMENT_CHARS)); final NumberFormat numberFormat = StrictDecimalFormat.getInstance(this); final CSVFileReader reader = new CSVFileReader(file, getParameterAsBoolean(PARAMETER_USE_FIRST_ROW_AS_ATTRIBUTE_NAMES), parser, numberFormat); return reader; } @Override public MetaData getGeneratedMetaData() throws OperatorException { File file = getParameterAsFile(PARAMETER_FILENAME); CSVFileReader reader = createReader(getParameterAsFile(PARAMETER_FILENAME)); MetaData metaData = null; try { metaData = reader.getMetaData(); } catch (FileNotFoundException e) { throw new UserError(this, 302, file, e.getMessage()); } catch (IOException e) { } return metaData; } @Override public ExampleSet createExampleSet() throws OperatorException { File file = getParameterAsFile(PARAMETER_FILENAME); final CSVFileReader reader = createReader(file); ExampleSet result = null; try { result = reader.createExampleSet(); } catch (IOException e) { } return result; // File file = getParameterAsFile(PARAMETER_FILENAME); // double sampleRatio = getParameterAsDouble(PARAMETER_SAMPLE_RATIO); // int maxLines = getParameterAsInt(PARAMETER_SAMPLE_SIZE); // String separatorRegExpr = getParameterAsString(PARAMETER_COLUMN_SEPARATORS); // char[] comments = null; // if (getParameterAsBoolean(PARAMETER_USE_COMMENT_CHARACTERS)) { // comments = getParameterAsString(PARAMETER_COMMENT_CHARS).toCharArray(); // } // int dataRowType = getParameterAsInt(PARAMETER_DATAMANAGEMENT); // boolean useQuotes = getParameterAsBoolean(PARAMETER_USE_QUOTES); // boolean trimLines = getParameterAsBoolean(PARAMETER_TRIM_LINES); // boolean skipErrorLines = getParameterAsBoolean(PARAMETER_SKIP_ERROR_LINES); // char decimalPointCharacter = getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0); // boolean firstRowAsColumnNames = getParameterAsBoolean(PARAMETER_READ_ATTRIBUTE_NAMES); // // String labelName = getParameterAsString("label_name"); // int labelColumn = getParameterAsInt("label_column"); // String idName = getParameterAsString("id_name"); // int idColumn = getParameterAsInt("id_column"); // String weightName = getParameterAsString("weight_name"); // int weightColumn = getParameterAsInt("weight_column"); // // ExampleSet result = null; // try { // result = createExampleSet(file, firstRowAsColumnNames, sampleRatio, maxLines, separatorRegExpr, comments, dataRowType, useQuotes, trimLines, skipErrorLines, decimalPointCharacter, Encoding.getEncoding(this), labelName, labelColumn, idName, idColumn, weightName, weightColumn); // } catch (IOException e) { // throw new UserError(this, 302, file, e.getMessage()); // } catch (IndexOutOfBoundsException i) { // throw new UserError(this, 111, i.getMessage()); // } // return result; } public static ExampleSet createExampleSet(File file, boolean firstRowAsColumnNames, double sampleRatio, int maxLines, String separatorRegExpr, char[] comments, int dataRowType, boolean useQuotes, boolean trimLines, boolean skipErrorLines, char decimalPointCharacter, Charset encoding, String labelName, int labelColumn, String idName, int idColumn, String weightName, int weightColumn) throws IOException, UserError, IndexOutOfBoundsException { // create attribute data sources and guess value types (performs a data scan) AttributeDataSourceCreator adsCreator = new AttributeDataSourceCreator(); adsCreator.loadData(file, comments, separatorRegExpr, decimalPointCharacter, useQuotes, '"', '\\', trimLines, firstRowAsColumnNames, -1, skipErrorLines, encoding, null); List<AttributeDataSource> attributeDataSources = adsCreator.getAttributeDataSources(); // set special attributes resetAttributeType(attributeDataSources, labelName, labelColumn, Attributes.LABEL_NAME); resetAttributeType(attributeDataSources, idName, idColumn, Attributes.ID_NAME); resetAttributeType(attributeDataSources, weightName, weightColumn, Attributes.WEIGHT_NAME); // read data FileDataRowReader reader = new FileDataRowReader(new DataRowFactory(dataRowType, decimalPointCharacter), attributeDataSources, sampleRatio, maxLines, separatorRegExpr, comments, useQuotes, '"', '\\', trimLines, skipErrorLines, encoding, RandomGenerator.getGlobalRandomGenerator()); if (firstRowAsColumnNames) { reader.skipLine(); } AttributeSet attributeSet = new AttributeSet(new AttributeDataSources(attributeDataSources, file, encoding)); // create table and example set ExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes(), reader); ExampleSet result = table.createExampleSet(attributeSet); return result; } private static void resetAttributeType(List<AttributeDataSource> attributeDataSources, String attribute, int column, String typeName) throws IndexOutOfBoundsException { if ((attribute == null) || (attribute.length() == 0)) { if (column != 0) { if (column < 0) column = attributeDataSources.size() + column + 1; if ((column < 1) || (column >= attributeDataSources.size() + 1)) { throw new IndexOutOfBoundsException("column = " + column + " as label"); } column--; attributeDataSources.get(column).setType(typeName); } } else { Iterator<AttributeDataSource> i = attributeDataSources.iterator(); while (i.hasNext()) { AttributeDataSource ads = i.next(); if (ads.getAttribute().getName().equals(attribute)) { ads.setType(typeName); break; } } } } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.add(new ParameterTypeFile(PARAMETER_FILENAME, "Name of the file to read the data from.", "dat", false)); types.addAll(Encoding.getParameterTypes(this)); types.add(new ParameterTypeBoolean(PARAMETER_TRIM_LINES, "Indicates if lines should be trimmed (empty spaces are removed at the beginning and the end) before the column split is performed.", false)); types.add(new ParameterTypeBoolean(PARAMETER_SKIP_COMMENTS, "Indicates if qa comment character should be used.", true)); ParameterType type = new ParameterTypeString(PARAMETER_COMMENT_CHARS, "Lines beginning with these characters are ignored.", "#", true); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_SKIP_COMMENTS, false, true)); types.add(type); type = new ParameterTypeBoolean(PARAMETER_USE_FIRST_ROW_AS_ATTRIBUTE_NAMES, "Read attribute names from file (assumes the attribute names are in the first line of the file).", false); type.setExpert(false); types.add(type); types.add(new ParameterTypeBoolean(PARAMETER_USE_QUOTES, "Indicates if quotes should be regarded (slower!).", false)); type = new ParameterTypeChar(PARAMETER_QUOTES_CHARACTER, "The quotes character.", '"', true); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_QUOTES, false, true)); types.add(type); types.add(new ParameterTypeString(PARAMETER_COLUMN_SEPARATORS, "Column separators for data files (regular expression)", ",\\s*|;\\s*|\\s+")); // types.add(new ParameterTypeBoolean(PARAMETER_SKIP_ERROR_LINES, "Indicates if lines which can not be read should be skipped instead of letting this operator fail its execution.", false)); types.addAll(StrictDecimalFormat.getParameterTypes(this)); type = new ParameterTypeString(PARAMETER_LABEL_NAME, "Name of the label attribute (if empty, the column defined by label_column will be used)", true); type.setExpert(false); types.add(type); type = new ParameterTypeInt(PARAMETER_LABEL_COLUMN, "Column number of the label attribute (only used if label_name is empty; 0 = none; negative values are counted from the last column)", Integer.MIN_VALUE, Integer.MAX_VALUE, 0); type.setExpert(false); types.add(type); types.add(new ParameterTypeString(PARAMETER_ID_NAME, "Name of the id attribute (if empty, the column defined by id_column will be used)", true)); types.add(new ParameterTypeInt(PARAMETER_ID_COLUMN, "Column number of the id attribute (only used if id_name is empty; 0 = none; negative values are counted from the last column)", Integer.MIN_VALUE, Integer.MAX_VALUE, 0)); types.add(new ParameterTypeString(PARAMETER_WEIGHT_NAME, "Name of the weight attribute (if empty, the column defined by weight_column will be used)", true)); types.add(new ParameterTypeInt(PARAMETER_WEIGHT_COLUMN, "Column number of the weight attribute (only used if weight_name is empty; 0 = none, negative values are counted from the last column)", Integer.MIN_VALUE, Integer.MAX_VALUE, 0)); types.add(new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d)); types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1)); return types; } }