/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.operator.Annotations;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.io.AbstractDataReader.AttributeColumn;
import com.rapidminer.operator.io.AbstractExampleSource;
import com.rapidminer.operator.io.ExampleSource;
import com.rapidminer.operator.nio.file.FileObject;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.Port;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SimplePrecondition;
import com.rapidminer.operator.preprocessing.filter.AbstractDateDataProcessing;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDateFormat;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.ParameterTypeStringCategory;
import com.rapidminer.parameter.ParameterTypeTupel;
import com.rapidminer.parameter.PortProvider;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.parameter.conditions.InputPortNotConnectedCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
/**
 * This class uses {@link DataResultSet}s to load data from a file and then delivers the data as an
 * example set.
 * <p>
 * The file is taken either from the file parameter named by {@link #getFileParameterName()} or, if the
 * optional "file" input port is connected, from the {@link FileObject} delivered at that port.
 * Subclasses supply the concrete {@link DataResultSetFactory}, the number format, the name of the file
 * parameter and the allowed file extension.
 *
 * @author Sebastian Land
 */
public abstract class AbstractDataResultSetReader extends AbstractExampleSource {

    /** Pseudo-annotation to be used for attribute names. */
    public static final String ANNOTATION_NAME = "Name";

    /**
     * This parameter holds the whole information about the attribute columns, i.e. which attributes are
     * defined, their names, what value type they have and whether the attribute is selected.
     */
    public static final String PARAMETER_META_DATA = "data_set_meta_data_information";

    /*
     * Parameters being part of the list for PARAMETER_META_DATA.
     */
    public static final String PARAMETER_COLUMN_INDEX = "column_index";
    public static final String PARAMETER_COLUMN_META_DATA = "attribute_meta_data_information";
    // NOTE(review): this key contains a space, unlike its sibling keys. It is left unchanged here because
    // the key is a runtime string; confirm whether stored processes rely on it before normalizing.
    public static final String PARAMETER_COLUMN_NAME = "attribute name";
    public static final String PARAMETER_COLUMN_SELECTED = "column_selected";
    public static final String PARAMETER_COLUMN_VALUE_TYPE = "attribute_value_type";
    public static final String PARAMETER_COLUMN_ROLE = "attribute_role";

    public static final String PARAMETER_DATE_FORMAT = "date_format";
    public static final String PARAMETER_TIME_ZONE = "time_zone";
    public static final String PARAMETER_LOCALE = "locale";

    /**
     * The parameter name for "Determines, how the data is represented internally."
     */
    public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";
    public static final String PARAMETER_FIRST_ROW_AS_NAMES = "first_row_as_names";
    public static final String PARAMETER_ANNOTATIONS = "annotations";
    public static final String PARAMETER_ERROR_TOLERANT = "read_not_matching_values_as_missings";

    /** Optional input port through which the file to read can be delivered instead of the file parameter. */
    private final InputPort fileInputPort = getInputPorts().createPort("file");

    /**
     * Creates the reader and registers a non-mandatory precondition on the file input port that expects
     * a {@link FileObject}.
     *
     * @param description the operator description passed on to the super class
     */
    public AbstractDataResultSetReader(OperatorDescription description) {
        super(description);
        fileInputPort.addPrecondition(new SimplePrecondition(fileInputPort, new MetaData(FileObject.class)) {
            @Override
            protected boolean isMandatory() {
                // the port is only an alternative to the file parameter, so it may stay unconnected
                return false;
            }
        });
    }

    /**
     * Reads the data via the {@link DataResultSetFactory} of the subclass and translates it into an
     * example set.
     * <p>
     * If the translation configuration built from this operator is not complete, it is reconfigured from
     * the data result set and the value types are guessed before reading. If the file input port is
     * connected and delivers an object carrying a source annotation, that annotation is copied to the
     * resulting example set.
     * <p>
     * The data result set and the factory are closed in any case, also if reading fails.
     *
     * @return the example set read from the data result set
     * @throws OperatorException if creating, configuring, reading or closing the data result set fails
     */
    @Override
    public ExampleSet createExampleSet() throws OperatorException {
        DataResultSetFactory dataResultSetFactory = getDataResultSetFactory();
        ExampleSet exampleSet;
        try {
            // loading data result set
            DataResultSet dataResultSet = dataResultSetFactory.makeDataResultSet(this);
            try {
                // loading configuration
                DataResultSetTranslationConfiguration configuration = new DataResultSetTranslationConfiguration(this);
                final boolean configIncomplete = !configuration.isComplete();
                if (configIncomplete) {
                    configuration.reconfigure(dataResultSet);
                }

                // now use translator to read, translate and return example set
                DataResultSetTranslator translator = new DataResultSetTranslator(this);
                NumberFormat numberFormat = getNumberFormat();
                if (numberFormat != null) {
                    configuration.setNumberFormat(numberFormat);
                }
                // value types are guessed only after the number format has been applied
                if (configIncomplete) {
                    translator.guessValueTypes(configuration, dataResultSet, null);
                }
                exampleSet = translator.read(dataResultSet, configuration, false, null);
            } finally {
                // release the result set before the factory, also on failure
                dataResultSet.close();
            }
        } finally {
            dataResultSetFactory.close();
        }

        if (fileInputPort.isConnected()) {
            IOObject fileObject = fileInputPort.getDataOrNull();
            if (fileObject != null) {
                String sourceAnnotation = fileObject.getAnnotations().getAnnotation(Annotations.KEY_SOURCE);
                if (sourceAnnotation != null) {
                    exampleSet.getAnnotations().setAnnotation(Annotations.KEY_SOURCE, sourceAnnotation);
                }
            }
        }
        return exampleSet;
    }

    /**
     * Builds the meta data of the example set this operator will generate: the meta data made by the
     * {@link DataResultSetFactory}, extended by the column meta data of the translation configuration.
     * The factory is closed in any case.
     *
     * @return the generated example set meta data
     * @throws OperatorException if the factory or the configuration cannot be created or queried
     */
    @Override
    public MetaData getGeneratedMetaData() throws OperatorException {
        DataResultSetFactory dataResultSetFactory = getDataResultSetFactory();
        try {
            ExampleSetMetaData result = dataResultSetFactory.makeMetaData();
            DataResultSetTranslationConfiguration configuration = new DataResultSetTranslationConfiguration(this);
            configuration.addColumnMetaData(result);
            return result;
        } finally {
            dataResultSetFactory.close();
        }
    }

    /**
     * Must be implemented by subclasses to return the factory creating the DataResultSet.
     */
    protected abstract DataResultSetFactory getDataResultSetFactory() throws OperatorException;

    /**
     * Returns the configured number format or null if a default number format should be used.
     */
    protected abstract NumberFormat getNumberFormat() throws OperatorException;

    /**
     * This method might be overwritten by subclasses to avoid that the first row
     * might be misinterpreted as attribute names.
     */
    protected boolean isSupportingFirstRowAsNames() {
        return true;
    }

    /**
     * Returns either the selected file referenced by the value of the parameter with the name
     * {@link #getFileParameterName()} or the file delivered at {@link #fileInputPort}.
     * The port is used if and only if it is connected.
     *
     * @return the file to read from
     * @throws OperatorException if the parameter or the port data cannot be retrieved
     */
    public File getSelectedFile() throws OperatorException {
        if (fileInputPort.isConnected()) {
            return fileInputPort.getData(FileObject.class).getFile();
        }
        return getParameterAsFile(getFileParameterName());
    }

    /**
     * Same as {@link #getSelectedFile()}, but opens the stream.
     *
     * @return a newly opened stream; the caller is responsible for closing it
     * @throws OperatorException if the parameter or the port data cannot be retrieved
     * @throws IOException if the stream cannot be opened
     */
    public InputStream openSelectedFile() throws OperatorException, IOException {
        if (fileInputPort.isConnected()) {
            return fileInputPort.getData(FileObject.class).openStream();
        }
        return new FileInputStream(getParameterAsFile(getFileParameterName()));
    }

    /**
     * Same as {@link #getSelectedFile()}, but returns true if file is specified (in the respective way):
     * either the file parameter is set, or the connected port delivers a {@link FileObject}.
     *
     * @return true if a file is specified
     */
    public boolean isFileSpecified() {
        if (!fileInputPort.isConnected()) {
            return isParameterSet(getFileParameterName());
        }
        try {
            return fileInputPort.getData() instanceof FileObject;
        } catch (OperatorException e) {
            // failing to obtain data from the port is treated as "no file specified"
            return false;
        }
    }

    /**
     * Returns the name of the {@link ParameterTypeFile} to be added through which the user
     * can specify the file name.
     */
    protected abstract String getFileParameterName();

    /** Returns the allowed file extension. */
    protected abstract String getFileExtension();

    /**
     * Creates (but does not add) the file parameter named by {@link #getFileParameterName()}
     * that depends on whether or not {@link #fileInputPort} is connected.
     *
     * @return the non-expert file parameter with the port dependency condition registered
     */
    protected ParameterType makeFileParameterType() {
        final ParameterTypeFile fileParam = new ParameterTypeFile(getFileParameterName(), "Name of the file to read the data from.", getFileExtension(), true);
        fileParam.setExpert(false);
        fileParam.registerDependencyCondition(new InputPortNotConnectedCondition(this, new PortProvider() {
            @Override
            public Port getPort() {
                return fileInputPort;
            }
        }, true));
        return fileParam;
    }

    /**
     * Assembles the parameters of this operator in this order: first-row-as-names (only if supported),
     * annotations, date format, time zone, locale, the parameters of the super class, the column meta
     * data list, error tolerance and data management.
     *
     * @return the list of parameter types
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = new LinkedList<ParameterType>();
        final boolean firstRowAsNamesSupported = isSupportingFirstRowAsNames();

        if (firstRowAsNamesSupported) {
            types.add(new ParameterTypeBoolean(PARAMETER_FIRST_ROW_AS_NAMES, "Indicates if the first row should be used for the attribute names. If activated no annotations can be used.", true, false));
        }

        // selectable annotation names: the pseudo annotation for names followed by all attribute annotation keys
        List<String> annotations = new LinkedList<String>();
        annotations.add(ANNOTATION_NAME);
        annotations.addAll(Arrays.asList(Annotations.ALL_KEYS_ATTRIBUTE));
        ParameterType annotationsType = new ParameterTypeList(PARAMETER_ANNOTATIONS, "Maps row numbers to annotation names.", //
                new ParameterTypeInt("row_number", "Row number which contains an annotation", 0, Integer.MAX_VALUE), //
                new ParameterTypeCategory("annotation", "Name of the annotation to assign this row.", annotations.toArray(new String[annotations.size()]), 0), true);
        if (firstRowAsNamesSupported) {
            // annotations depend on the first-row-as-names parameter being false
            annotationsType.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_FIRST_ROW_AS_NAMES, false, false));
        }
        types.add(annotationsType);

        ParameterType dateFormatType = new ParameterTypeDateFormat(PARAMETER_DATE_FORMAT, "The parse format of the date values, for example \"yyyy/MM/dd\".", false);
        dateFormatType.setExpert(false);
        types.add(dateFormatType);

        types.add(new ParameterTypeCategory(PARAMETER_TIME_ZONE, "The time zone used for the date objects if not specified in the date string itself.", Tools.getAllTimeZones(), Tools.getPreferredTimeZoneIndex()));
        types.add(new ParameterTypeCategory(PARAMETER_LOCALE, "The used locale for date texts, for example \"Wed\" (English) in contrast to \"Mi\" (German).", AbstractDateDataProcessing.availableLocaleNames, AbstractDateDataProcessing.defaultLocale));

        types.addAll(super.getParameterTypes());

        ParameterType metaDataType = new ParameterTypeList(PARAMETER_META_DATA, "The meta data information", //
                new ParameterTypeInt(PARAMETER_COLUMN_INDEX, "The column index", 0, Integer.MAX_VALUE), //
                new ParameterTypeTupel(PARAMETER_COLUMN_META_DATA, "The meta data definition of one column", //
                        new ParameterTypeString(PARAMETER_COLUMN_NAME, "Describes the attributes name."), //
                        new ParameterTypeBoolean(PARAMETER_COLUMN_SELECTED, "Indicates if a column is selected", true), //
                        new ParameterTypeCategory(PARAMETER_COLUMN_VALUE_TYPE, "Indicates the value type of an attribute", Ontology.VALUE_TYPE_NAMES, Ontology.NOMINAL), //
                        new ParameterTypeStringCategory(PARAMETER_COLUMN_ROLE, "Indicates the role of an attribute", Attributes.KNOWN_ATTRIBUTE_TYPES, AttributeColumn.REGULAR)), true);
        types.add(metaDataType);

        types.add(new ParameterTypeBoolean(PARAMETER_ERROR_TOLERANT, "Values which does not match to the specified value typed are considered as missings.", true, true));
        // NOTE(review): uses ExampleSource.PARAMETER_DATAMANAGEMENT rather than the constant declared in
        // this class; presumably both hold the same key, confirm before unifying.
        types.add(new ParameterTypeCategory(ExampleSource.PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY, true));
        return types;
    }
}