/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.io; import java.io.File; import java.io.IOException; import java.util.List; import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.DataRowFactory; import com.rapidminer.example.table.ExampleTable; import com.rapidminer.example.table.FileDataRowReader; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.gui.wizards.ExampleSourceConfigurationWizardCreator; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeAttributeFile; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeConfiguration; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.RandomGenerator; import com.rapidminer.tools.att.AttributeDataSource; import com.rapidminer.tools.att.AttributeDataSources; import com.rapidminer.tools.att.AttributeSet; /** * <p> * This operator reads an example set from (a) file(s). Probably you can use the * default parameter values for the most file formats (including the format * produced by the ExampleSetWriter, CSV, ...). Please refer to section * {@rapidminer.ref sec:inputfiles|First steps/File formats} for details on the * attribute description file set by the parameter <var>attributes</var> used * to specify attribute types. * </p> * * <p> * This operator supports the reading of data from multiple source files. Each * attribute (including special attributes like labels, weights, ...) might be * read from another file. Please note that only the minimum number of lines of * all files will be read, i.e. if one of the data source files has less lines * than the others, only this number of examples will be read. * </p> * * <p> * The split points can be defined with regular expressions (please refer to the * Java API). The default split parameter ",\s*|;\s*|\s+" should work * for most file formats. This regular expression describes the following column * separators * <ul> * <li>the character "," followed by a whitespace of arbitrary length (also no white space)</li> * <li>the character ";" followed by a whitespace of arbitrary length (also no white space)</li> * <li>a whitespace of arbitrary length (min. 1)</li> * </ul> * A logical XOR is defined by "|". Other useful separators might be * "\t" for tabulars, " " for a single whitespace, and * "\s" for any whitespace. * </p> * * <p> * Quoting is also possible with ". However, since using quotes might slow down * the parsing it is therefore recommended to ensure that the * split characters are not included in the data columns and that quotes are not * needed. * </p> * * <p> * Additionally you can specify comment characters which can be used at * arbitrary locations of the data lines. Any content after the comment character * will be ignored. Unknown attribute values can be marked with empty strings * (if this is possible for your column separators) or by a question mark (recommended). * </p> * * @author Simon Fischer, Ingo Mierswa * @version $Id: ExampleSource.java,v 1.14 2008/07/07 07:06:38 ingomierswa Exp $ */ public class ExampleSource extends Operator { /** The parameter name for "Filename for the xml attribute description file. This file also contains the names of the files to read the data from." */ public static final String PARAMETER_ATTRIBUTES = "attributes"; /** The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)" */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; /** The parameter name for "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)" */ public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; /** The parameter name for "Determines, how the data is represented internally." */ public static final String PARAMETER_DATAMANAGEMENT = "datamanagement"; /** The parameter name for "Column separators for data files (regular expression)" */ public static final String PARAMETER_COLUMN_SEPARATORS = "column_separators"; /** The parameter name for "Indicates if a comment character should be used" */ public static final String PARAMETER_USE_COMMENT_CHARACTERS = "use_comment_characters"; /** The parameter name for "Lines beginning with these characters are ignored." */ public static final String PARAMETER_COMMENT_CHARS = "comment_chars"; /** The parameter name for "Character that is used as decimal point." */ public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character"; /** The parameter name for "Indicates if quotes should be regarded (slower!)." */ public static final String PARAMETER_USE_QUOTES = "use_quotes"; public static final String PARAMETER_TRIM_LINES = "trim_lines"; /** The parameter name for "Indicates if the loaded data should be permutated." */ public static final String PARAMETER_PERMUTATE = "permutate"; /** The parameter name for "Use the given random seed instead of global random numbers (only for permutation, -1: use global)." */ public static final String PARAMETER_LOCAL_RANDOM_SEED = "local_random_seed"; private static final Class[] INPUT_CLASSES = {}; private static final Class[] OUTPUT_CLASSES = { ExampleSet.class }; public ExampleSource(OperatorDescription description) { super(description); } public IOObject[] apply() throws OperatorException { AttributeDataSources attributeDataSources = null; FileDataRowReader reader = null; File attributeFile = getParameterAsFile(PARAMETER_ATTRIBUTES); try { attributeDataSources = AttributeDataSource.createAttributeDataSources(attributeFile, true, this); char[] commentCharacters = null; if (getParameterAsBoolean(PARAMETER_USE_COMMENT_CHARACTERS)) { commentCharacters = getParameterAsString(PARAMETER_COMMENT_CHARS).toCharArray(); } reader = new FileDataRowReader(new DataRowFactory(getParameterAsInt(PARAMETER_DATAMANAGEMENT), getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0)), attributeDataSources.getDataSources(), getParameterAsDouble(PARAMETER_SAMPLE_RATIO), getParameterAsInt(PARAMETER_SAMPLE_SIZE), getParameterAsString(PARAMETER_COLUMN_SEPARATORS), commentCharacters, getParameterAsBoolean(PARAMETER_USE_QUOTES), getParameterAsBoolean(PARAMETER_TRIM_LINES), getEncoding(), RandomGenerator.getRandomGenerator(getParameterAsInt(PARAMETER_LOCAL_RANDOM_SEED))); } catch (IOException e) { throw new UserError(this, e, 302, new Object[] { attributeFile, e.getMessage() }); } catch (com.rapidminer.tools.XMLException e) { throw new UserError(this, e, 401, e.getMessage()); } catch (ParserConfigurationException e) { throw new UserError(this, e, 401, e.toString()); } catch (SAXException e) { throw new UserError(this, e, 401, e.toString()); } AttributeSet attributeSet = new AttributeSet(attributeDataSources); ExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes(), reader, getParameterAsBoolean(PARAMETER_PERMUTATE)); ExampleSet result = table.createExampleSet(attributeSet); if (result.size() == 0) { throw new UserError(this, 117); } return new IOObject[] { result }; } public Class<?>[] getInputClasses() { return INPUT_CLASSES; } public Class<?>[] getOutputClasses() { return OUTPUT_CLASSES; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeConfiguration(ExampleSourceConfigurationWizardCreator.class, this); type.setExpert(false); types.add(type); types.add(new ParameterTypeAttributeFile(PARAMETER_ATTRIBUTES, "Filename for the xml attribute description file. This file also contains the names of the files to read the data from.", false)); type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d); type.setExpert(false); types.add(type); types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1)); types.add(new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY)); types.add(new ParameterTypeString(PARAMETER_COLUMN_SEPARATORS, "Column separators for data files (regular expression)", ",\\s*|;\\s*|\\s+")); types.add(new ParameterTypeBoolean(PARAMETER_USE_COMMENT_CHARACTERS, "Indicates if qa comment character should be used.", true)); types.add(new ParameterTypeString(PARAMETER_COMMENT_CHARS, "Any content in a line after one of these characters will be ignored.", "#")); types.add(new ParameterTypeString(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", ".")); types.add(new ParameterTypeBoolean(PARAMETER_USE_QUOTES, "Indicates if quotes should be regarded.", true)); types.add(new ParameterTypeBoolean(PARAMETER_TRIM_LINES, "Indicates if lines should be trimmed (empty spaces are removed at the beginning and the end) before the column split is performed.", false)); types.add(new ParameterTypeBoolean(PARAMETER_PERMUTATE, "Indicates if the loaded data should be permutated.", false)); types.add(new ParameterTypeInt(PARAMETER_LOCAL_RANDOM_SEED, "Use the given random seed instead of global random numbers (only for permutation, -1: use global).", -1, Integer.MAX_VALUE, -1)); return types; } }