/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeTypeException;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.nio.ImportWizardUtils;
import com.rapidminer.operator.nio.model.DataResultSet.ValueType;
import com.rapidminer.operator.nio.model.ParsingError.ErrorCode;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.ParameterService;
import com.rapidminer.tools.ProgressListener;
import com.rapidminer.tools.container.Pair;
/**
* This class encapsulates the translation step from a {@link DataResultSet} to an
* {@link ExampleSet} which is controlled by the {@link DataResultSetTranslationConfiguration}.
*
* @author Sebastian Land
*/
public class DataResultSetTranslator {
/**
 * Remembers up to two distinct non-null strings and records whether a third distinct
 * string has been registered.
 */
private static class NominalValueSet {
    private String first = null;
    private String second = null;
    private boolean moreThanTwo = false;

    /**
     * Registers a value.
     *
     * @param value the value to register; <code>null</code> is ignored
     * @return <code>true</code> once more than two distinct values have been registered,
     *         <code>false</code> otherwise
     */
    private boolean register(String value) {
        // Once the limit has been exceeded there is nothing left to track.
        if (moreThanTwo) {
            return true;
        }
        if (value == null) {
            return false;
        }
        if (first == null) {
            first = value;
            return false;
        }
        if (first.equals(value)) {
            return false;
        }
        if (second == null) {
            second = value;
            return false;
        }
        if (second.equals(value)) {
            return false;
        }
        // A third distinct value has shown up.
        moreThanTwo = true;
        return true;
    }
}
// The four flags below are written by public control methods (close / cancel*) and polled
// inside the row loops of read / guessValueTypes. They are volatile so that a request made
// from another thread becomes visible to the thread that is running the loop.

/** Set by {@link #close()} while a read is in progress; checked in the row loop of the read method. */
private volatile boolean shouldStop = false;
/** Set by the read method when it starts and cleared when it finishes. */
private volatile boolean isReading = false;
/** Set by {@link #cancelGuessing()}; reset whenever value type guessing starts. */
private volatile boolean cancelGuessingRequested = false;
/** Set by {@link #cancelLoading()}; reset whenever the read method starts. */
private volatile boolean cancelLoadingRequested = false;

/** Collected parsing errors, keyed by (example index, column). */
private final Map<Pair<Integer,Integer>,ParsingError> errors = new HashMap<Pair<Integer,Integer>, ParsingError>();

/** The operator handed to the {@link UserError}s thrown by this translator. */
private final Operator operator;

/**
 * Creates a translator.
 *
 * @param operator the operator passed on to the {@link UserError}s created by this translator
 */
public DataResultSetTranslator(Operator operator) {
    this.operator = operator;
}
/**
 * Translates the given {@link DataResultSet} into an {@link ExampleSet}.
 * <p>
 * One attribute is created for every index returned by
 * {@link DataResultSetTranslationConfiguration#getSelectedIndices()}. Rows for which the configuration
 * holds an annotation are not added as examples: a row annotated with
 * {@link AbstractDataResultSetReader#ANNOTATION_NAME} supplies the attribute names, any other
 * annotated row is stored in the annotations of the attributes. All remaining rows become examples.
 * Missing cells are stored as <code>NaN</code>.
 * <p>
 * The row loop ends early when {@link #close()} or {@link #cancelLoading()} has been called, or, in
 * preview mode, once {@link ImportWizardUtils#getPreviewLength()} rows have been consumed.
 *
 * @param dataResultSet the data source; it is reset before reading
 * @param configuration describes the selected columns, their types, names, roles and annotations
 * @param previewOnly if <code>true</code>, only the preview length is read
 * @param listener progress listener handed to the result set; <code>complete()</code> is called on
 *        it after a successful read unless it is <code>null</code>
 * @return the translated example set
 * @throws OperatorException if a selected column does not exist in the result set, or if a cell
 *         cannot be read or parsed and the configuration is not fault tolerant
 */
public ExampleSet read(DataResultSet dataResultSet, DataResultSetTranslationConfiguration configuration, boolean previewOnly, ProgressListener listener) throws OperatorException {
    int maxRows = previewOnly ? ImportWizardUtils.getPreviewLength() : -1;
    cancelLoadingRequested = false;
    boolean isFaultTolerant = configuration.isFaultTolerant();

    // NOTE(review): shouldStop is never reset, so after close() has interrupted a read every later
    // read on this instance stops before the first row. Left as is; confirm whether that is intended.
    ExampleSet exampleSet;
    isReading = true;
    try {
        int[] attributeColumns = configuration.getSelectedIndices();
        int numberOfAttributes = attributeColumns.length;
        Attribute[] attributes = new Attribute[numberOfAttributes];
        for (int i = 0; i < attributes.length; i++) {
            int attributeValueType = configuration.getColumnMetaData(attributeColumns[i]).getAttributeValueType();
            if (attributeValueType == Ontology.ATTRIBUTE_VALUE) {
                // fallback for uninitialized reading
                attributeValueType = Ontology.POLYNOMINAL;
            }
            attributes[i] = AttributeFactory.createAttribute(configuration.getColumnMetaData(attributeColumns[i]).getOriginalAttributeName(), attributeValueType);
        }

        // check whether all columns are accessible
        int numberOfAvailableColumns = dataResultSet.getNumberOfColumns();
        for (int attributeColumn : attributeColumns) {
            if (attributeColumn >= numberOfAvailableColumns) {
                // pass the operator like every other UserError thrown by this class
                throw new UserError(operator, "data_import.specified_more_columns_than_exist", configuration.getColumnMetaData(attributeColumn).getUserDefinedAttributeName(), attributeColumn);
            }
        }

        // building example table
        MemoryExampleTable exampleTable = new MemoryExampleTable(attributes);

        // now iterate over complete dataResultSet and copy data
        int currentRow = 0; // The row in the underlying DataResultSet
        int exampleIndex = 0; // The row in the example set
        dataResultSet.reset(listener);
        DataRowFactory factory = new DataRowFactory(configuration.getDataManagementType(), '.');
        int maxAnnotatedRow = configuration.getLastAnnotatedRowIndex();
        while (dataResultSet.hasNext() && !shouldStop && (currentRow < maxRows || maxRows < 0)) {
            if (cancelLoadingRequested) {
                break;
            }
            dataResultSet.next(listener);

            // checking for annotation
            String currentAnnotation;
            if (currentRow <= maxAnnotatedRow) {
                currentAnnotation = configuration.getAnnotation(currentRow);
            } else {
                currentAnnotation = null;
            }

            if (currentAnnotation != null) {
                // registering annotation on all attributes
                int attributeIndex = 0;
                for (Attribute attribute : attributes) {
                    if (AbstractDataResultSetReader.ANNOTATION_NAME.equals(currentAnnotation)) {
                        // resetting name
                        String newAttributeName = getString(dataResultSet, exampleIndex, attributeColumns[attributeIndex], isFaultTolerant);
                        if (newAttributeName != null && !newAttributeName.isEmpty()) {
                            attribute.setName(newAttributeName);
                            // We also remember the name in the CMD since we otherwise would override the attribute name later in this method
                            ColumnMetaData cmd = configuration.getColumnMetaData(attributeColumns[attributeIndex]);
                            if (cmd != null) {
                                if (!cmd.isAttributeNameSpecified()) {
                                    cmd.setUserDefinedAttributeName(newAttributeName);
                                }
                            }
                        }
                    } else {
                        // setting annotation
                        String annotationValue = getString(dataResultSet, exampleIndex, attributeColumns[attributeIndex], isFaultTolerant);
                        if (annotationValue != null && !annotationValue.isEmpty()) {
                            attribute.getAnnotations().put(currentAnnotation, annotationValue);
                        }
                    }
                    attributeIndex++;
                }
            } else {
                // creating data row
                DataRow row = factory.create(attributes.length);
                exampleTable.addDataRow(row);
                int attributeIndex = 0;
                for (Attribute attribute : attributes) {
                    // check for missing
                    if (dataResultSet.isMissing(attributeColumns[attributeIndex])) {
                        row.set(attribute, Double.NaN);
                    } else {
                        switch (attribute.getValueType()) {
                            case Ontology.INTEGER:
                                row.set(attribute, getOrParseNumber(configuration, dataResultSet, exampleIndex, attributeColumns[attributeIndex], isFaultTolerant));
                                break;
                            case Ontology.NUMERICAL:
                            case Ontology.REAL:
                                row.set(attribute, getOrParseNumber(configuration, dataResultSet, exampleIndex, attributeColumns[attributeIndex], isFaultTolerant));
                                break;
                            case Ontology.DATE_TIME:
                            case Ontology.TIME:
                            case Ontology.DATE:
                                row.set(attribute, getOrParseDate(configuration, dataResultSet, exampleIndex, attributeColumns[attributeIndex], isFaultTolerant));
                                break;
                            default:
                                // everything else is stored as index into the nominal mapping
                                row.set(attribute, getStringIndex(attribute, dataResultSet, exampleIndex, attributeColumns[attributeIndex], isFaultTolerant));
                        }
                    }
                    attributeIndex++;
                }
                exampleIndex++;
            }
            currentRow++;
        }

        // derive ExampleSet from exampleTable and assigning roles
        exampleSet = exampleTable.createExampleSet();
        Attributes exampleSetAttributes = exampleSet.getAttributes();
        int attributeIndex = 0;
        for (Attribute attribute : attributes) {
            // if user defined names have been found, rename accordingly
            final ColumnMetaData cmd = configuration.getColumnMetaData(attributeColumns[attributeIndex]);
            if (!cmd.isSelected()) {
                attributeIndex++;
                continue;
            }
            String userDefinedName = cmd.getUserDefinedAttributeName();
            if (userDefinedName != null && !userDefinedName.isEmpty()) {
                attribute.setName(userDefinedName);
            }
            String roleId = cmd.getRole();
            if (!Attributes.ATTRIBUTE_NAME.equals(roleId)) {
                exampleSetAttributes.setSpecialAttribute(attribute, roleId);
            }
            attributeIndex++;
        }
    } finally {
        // Always clear the flag, also when an exception escapes. Otherwise a later close()
        // would still see a read in progress and set shouldStop for good.
        isReading = false;
    }

    if (listener != null) {
        listener.complete();
    }
    return exampleSet;
}
/**
 * Returns the date value of the current cell as milliseconds since the epoch.
 * If the native type of the column is a date, the date delivered by the result set is used.
 * Otherwise the string representation is parsed with the date format of the configuration.
 *
 * @param config supplies the date format used for parsing strings
 * @param dataResultSet the result set, positioned on the row to read
 * @param row the example index recorded with any parsing error
 * @param column the column to read
 * @param isFaultTolerant if <code>true</code>, errors are recorded and <code>NaN</code> is returned;
 *        if <code>false</code>, errors are thrown
 * @return the timestamp, or <code>NaN</code> if the value could not be read or parsed
 * @throws OperatorException on a read or parse error when not fault tolerant
 */
private double getOrParseDate(DataResultSetTranslationConfiguration config, DataResultSet dataResultSet, int row, int column, boolean isFaultTolerant) throws OperatorException {
    ValueType nativeValueType;
    try {
        nativeValueType = dataResultSet.getNativeValueType(column);
    } catch (com.rapidminer.operator.nio.model.ParseException e1) {
        addOrThrow(isFaultTolerant, e1.getError(), row);
        return Double.NaN;
    }
    if (nativeValueType == ValueType.DATE) {
        return getDate(dataResultSet, row, column, isFaultTolerant);
    }

    String value = getString(dataResultSet, row, column, isFaultTolerant);
    if (value == null) {
        // getString returns null after recording a tolerated parse error. DateFormat.parse(null)
        // would throw a NullPointerException, so treat the cell as missing instead.
        return Double.NaN;
    }
    try {
        return config.getDateFormat().parse(value).getTime();
    } catch (ParseException e) {
        ParsingError error = new ParsingError(dataResultSet.getCurrentRow(), column, ErrorCode.UNPARSEABLE_DATE, value, e);
        addOrThrow(isFaultTolerant, error, row);
        return Double.NaN;
    }
}
/**
 * Reads the native date of the given column and returns it as milliseconds since the epoch.
 * A parse error is recorded (fault tolerant) or thrown; in the recorded case <code>NaN</code> is returned.
 */
private double getDate(DataResultSet dataResultSet, int row, int column, boolean isFaultTolerant) throws OperatorException {
    double timestamp;
    try {
        timestamp = dataResultSet.getDate(column).getTime();
    } catch (com.rapidminer.operator.nio.model.ParseException parseFailure) {
        addOrThrow(isFaultTolerant, parseFailure.getError(), row);
        return Double.NaN;
    }
    return timestamp;
}
/**
 * Reads the string of the given column and maps it to its index in the nominal mapping of the attribute.
 * A parse error, or a rejection of the value by the mapping, is recorded (fault tolerant) or thrown;
 * in the recorded case <code>NaN</code> is returned.
 */
private double getStringIndex(Attribute attribute, DataResultSet dataResultSet, int row, int column, boolean isFaultTolerant) throws UserError {
    // Declared outside the try block so that the mapping failure below can report the value.
    String cellValue = null;
    try {
        cellValue = dataResultSet.getString(column);
        return attribute.getMapping().mapString(cellValue);
    } catch (com.rapidminer.operator.nio.model.ParseException readFailure) {
        addOrThrow(isFaultTolerant, readFailure.getError(), row);
    } catch (AttributeTypeException mappingFailure) {
        ParsingError tooManyValues = new ParsingError(dataResultSet.getCurrentRow(), column, ErrorCode.MORE_THAN_TWO_VALUES, cellValue, mappingFailure);
        addOrThrow(isFaultTolerant, tooManyValues, row);
    }
    // Only reached when an error was tolerated.
    return Double.NaN;
}
/**
 * Reads the string of the given column. A parse error is recorded (fault tolerant) or thrown;
 * in the recorded case <code>null</code> is returned.
 */
private String getString(DataResultSet dataResultSet, int row, int column, boolean isFaultTolerant) throws UserError {
    String cellValue;
    try {
        cellValue = dataResultSet.getString(column);
    } catch (com.rapidminer.operator.nio.model.ParseException readFailure) {
        addOrThrow(isFaultTolerant, readFailure.getError(), row);
        cellValue = null;
    }
    return cellValue;
}
/**
 * Returns the numerical value of the current cell.
 * If the native type of the column is a number, the number delivered by the result set is used.
 * Otherwise the string representation is parsed: with the number format of the configuration if
 * there is one, with {@link Double#parseDouble(String)} otherwise.
 *
 * @param config supplies the optional number format used for parsing strings
 * @param dataResultSet the result set, positioned on the row to read
 * @param row the example index recorded with any parsing error
 * @param column the column to read
 * @param isFaultTolerant if <code>true</code>, errors are recorded and <code>NaN</code> is returned;
 *        if <code>false</code>, errors are thrown
 * @return the value, or <code>NaN</code> if it could not be read or parsed
 * @throws OperatorException on a read or parse error when not fault tolerant
 */
private double getOrParseNumber(DataResultSetTranslationConfiguration config, DataResultSet dataResultSet, int row, int column, boolean isFaultTolerant) throws OperatorException {
    ValueType nativeValueType;
    try {
        nativeValueType = dataResultSet.getNativeValueType(column);
    } catch (com.rapidminer.operator.nio.model.ParseException e1) {
        addOrThrow(isFaultTolerant, e1.getError(), row);
        return Double.NaN;
    }
    if (nativeValueType == ValueType.NUMBER) {
        return getNumber(dataResultSet, row, column, isFaultTolerant).doubleValue();
    }

    String value = getString(dataResultSet, row, column, isFaultTolerant);
    if (value == null) {
        // getString returns null after recording a tolerated parse error. Both parsers below
        // throw a NullPointerException on null, so treat the cell as missing instead.
        return Double.NaN;
    }
    NumberFormat numberFormat = config.getNumberFormat();
    if (numberFormat != null) {
        try {
            return numberFormat.parse(value).doubleValue();
        } catch (ParseException e) {
            ParsingError error = new ParsingError(dataResultSet.getCurrentRow(), column, ErrorCode.UNPARSEABLE_REAL, value, e);
            addOrThrow(isFaultTolerant, error, row);
            return Double.NaN;
        }
    } else {
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException e) {
            ParsingError error = new ParsingError(dataResultSet.getCurrentRow(), column, ErrorCode.UNPARSEABLE_REAL, value, e);
            addOrThrow(isFaultTolerant, error, row);
            return Double.NaN;
        }
    }
}
/**
 * Reads the native number of the given column. On a parse error the error is recorded and
 * <code>NaN</code> is returned when fault tolerant; otherwise a {@link UserError} is thrown.
 */
private Number getNumber(DataResultSet dataResultSet, int row, int column, boolean isFaultTolerant) throws OperatorException {
    try {
        return dataResultSet.getNumber(column);
    } catch (com.rapidminer.operator.nio.model.ParseException readFailure) {
        if (!isFaultTolerant) {
            throw new UserError(operator, "data_parsing_error", readFailure.toString());
        }
        addError(readFailure.getError(), row);
        return Double.NaN;
    }
}
/**
 * Guesses the value types of all columns and stores them in the configuration. The number of
 * probed rows is taken from the parameter
 * {@link RapidMiner#PROPERTY_RAPIDMINER_GENERAL_MAX_TEST_ROWS}; if that value is not an integer,
 * 100 rows are probed.
 */
public void guessValueTypes(DataResultSetTranslationConfiguration configuration, DataResultSet dataResultSet, ProgressListener listener) throws OperatorException {
    final String configuredLimit = ParameterService.getParameterValue(RapidMiner.PROPERTY_RAPIDMINER_GENERAL_MAX_TEST_ROWS);
    int probeLimit;
    try {
        probeLimit = Integer.parseInt(configuredLimit);
    } catch (NumberFormatException notAnInteger) {
        // fall back to the built-in default
        probeLimit = 100;
    }
    guessValueTypes(configuration, dataResultSet, probeLimit, listener);
}
/**
 * Guesses the value types of all columns from at most <code>maxNumberOfRows</code> rows, starting
 * from the types currently stored in the column meta data, and writes the result back into the
 * column meta data of the configuration.
 */
public void guessValueTypes(DataResultSetTranslationConfiguration configuration, DataResultSet dataResultSet, int maxNumberOfRows, ProgressListener listener) throws OperatorException {
    // collect the types the configuration currently holds
    final int columnCount = configuration.getNumerOfColumns();
    final int[] configuredTypes = new int[columnCount];
    int index = 0;
    while (index < columnCount) {
        configuredTypes[index] = configuration.getColumnMetaData(index).getAttributeValueType();
        index++;
    }

    final int[] guessedTypes = guessValueTypes(configuredTypes, configuration, dataResultSet, maxNumberOfRows, listener);

    // write the guessed types back
    for (int column = 0; column < guessedTypes.length; column++) {
        configuration.getColumnMetaData(column).setAttributeValueType(guessedTypes[column]);
    }
}
/**
 * Selects the most appropriate value type for each column by probing rows of the result set.
 * <p>
 * The given types are the starting point and are generalized in place as values are seen that do
 * not fit. Columns already defined as polynominal and missing cells are skipped, as are rows for
 * which the configuration holds an annotation. For columns whose native type is not a string, the
 * type delivered by the data source is trusted; strings are probed by {@code guessValueType}.
 * Errors encountered while probing are recorded, never thrown by this method itself.
 *
 * @param definedTypes the initial type of each column; modified and returned
 * @param configuration supplies date format, number format and row annotations
 * @param dataResultSet the data source; it is reset before probing
 * @param maxProbeRows maximal number of rows to read; a value of 0 or less means no limit
 * @param listener progress listener, may be <code>null</code>
 * @return <code>definedTypes</code>, holding the guessed types
 * @throws OperatorException if thrown by the result set
 */
private int[] guessValueTypes(int[] definedTypes, DataResultSetTranslationConfiguration configuration, DataResultSet dataResultSet, int maxProbeRows, ProgressListener listener) throws OperatorException {
    cancelGuessingRequested = false;
    if (listener != null) {
        listener.setTotal(1 + maxProbeRows);
    }
    DateFormat dateFormat = configuration.getDateFormat();
    NumberFormat numberFormat = configuration.getNumberFormat();
    if (listener != null) {
        listener.setCompleted(1);
    }

    // TODO: The following could be made more efficient using an indirect indexing to access the columns
    dataResultSet.reset(listener);
    // the row in the underlying DataResultSet
    int currentRow = 0;
    // the example row in the ExampleTable
    int exampleIndex = 0;

    // One tracker per defined column. The column loop below runs up to definedTypes.length, so the
    // array must be sized by that length; sizing it by the result set's column count could overrun.
    NominalValueSet[] nominalValues = new NominalValueSet[definedTypes.length];
    for (int i = 0; i < nominalValues.length; i++) {
        nominalValues[i] = new NominalValueSet();
    }

    int maxAnnotatedRow = configuration.getLastAnnotatedRowIndex();
    while (dataResultSet.hasNext() && (currentRow < maxProbeRows || maxProbeRows <= 0)) {
        if (cancelGuessingRequested) {
            break;
        }
        dataResultSet.next(listener);
        if (listener != null) {
            listener.setCompleted(1 + currentRow);
        }

        // skip rows with annotations
        if (currentRow > maxAnnotatedRow || configuration.getAnnotation(currentRow) == null) {
            int numCols = dataResultSet.getNumberOfColumns();
            // number of columns can change as we read the data set.
            if (numCols > definedTypes.length) {
                String excessString;
                try {
                    excessString = dataResultSet.getString(definedTypes.length);
                } catch (com.rapidminer.operator.nio.model.ParseException e) {
                    excessString = null;
                }
                addError(new ParsingError(dataResultSet.getCurrentRow(), 0, ErrorCode.ROW_TOO_LONG, excessString, null),
                        exampleIndex);
            }

            for (int column = 0; column < definedTypes.length; column++) {
                // No more guessing necessary if guessed type is polynominal (this is the most general case)
                if (definedTypes[column] == Ontology.POLYNOMINAL || dataResultSet.isMissing(column)) {
                    continue;
                }

                ValueType nativeType;
                String stringRepresentation;
                try {
                    nativeType = dataResultSet.getNativeValueType(column);
                    stringRepresentation = dataResultSet.getString(column);
                } catch (com.rapidminer.operator.nio.model.ParseException e) {
                    final ParsingError error = e.getError();
                    addError(error, exampleIndex);
                    continue;
                }
                nominalValues[column].register(stringRepresentation);

                if (nativeType != ValueType.STRING) {
                    // Native representation is not a string, so we trust the data source
                    // and adapt the type accordingly.
                    int isType = nativeType.getRapidMinerAttributeType();
                    if (nativeType == ValueType.NUMBER) {
                        Number value = getNumber(dataResultSet, exampleIndex, column, true);
                        if (!Double.isNaN(value.doubleValue())) {
                            if (value.intValue() == value.doubleValue()) {
                                isType = Ontology.INTEGER;
                            } else {
                                isType = Ontology.REAL;
                            }
                        }
                    }

                    if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(isType, definedTypes[column])) {
                        // We're good, nothing to do
                        if (definedTypes[column] == Ontology.ATTRIBUTE_VALUE) {
                            // First row, just use the one delivered
                            definedTypes[column] = isType;
                        }
                        continue;
                    } else {
                        // otherwise, generalize until we are good
                        while (!Ontology.ATTRIBUTE_VALUE_TYPE.isA(isType, definedTypes[column])) {
                            definedTypes[column] = Ontology.ATTRIBUTE_VALUE_TYPE.getParent(definedTypes[column]);
                        }
                        // in the most general case, we switch to a nominal type
                        if (definedTypes[column] == Ontology.ATTRIBUTE_VALUE) {
                            definedTypes[column] =
                                    nominalValues[column].moreThanTwo ? Ontology.POLYNOMINAL : Ontology.BINOMINAL;
                        }
                    }
                } else {
                    // for strings, we try parsing ourselves
                    definedTypes[column] = guessValueType(definedTypes[column], stringRepresentation, !nominalValues[column].moreThanTwo, dateFormat, numberFormat);
                }
            }
            exampleIndex++;
        }
        currentRow++;
    }

    if (listener != null) {
        listener.complete();
    }
    return definedTypes;
}
/**
 * Guesses the value type of a column from its current guess and one more string value.
 * The guess is only ever moved along the chain integer, real, date, binominal, polynominal:
 * starting at the current guess, the first type in that chain that accepts the value is returned.
 * Any current type other than polynominal, binominal, date or real starts at integer.
 *
 * @param currentValueType the type guessed so far
 * @param value the string to probe
 * @param onlyTwoValues whether at most two distinct values have been seen in the column
 * @param dateFormat used to probe for dates
 * @param numberFormat used to probe for reals; if <code>null</code>,
 *        {@link Double#parseDouble(String)} is used instead
 * @return the possibly generalized type
 */
private int guessValueType(int currentValueType, String value, boolean onlyTwoValues, DateFormat dateFormat, NumberFormat numberFormat) {
    int candidate = currentValueType;

    // Step 1: every type without a dedicated step below is probed as an integer first.
    final boolean hasDedicatedStep = candidate == Ontology.POLYNOMINAL
            || candidate == Ontology.BINOMINAL
            || candidate == Ontology.DATE
            || candidate == Ontology.REAL;
    if (!hasDedicatedStep) {
        try {
            Integer.parseInt(value);
            return Ontology.INTEGER;
        } catch (NumberFormatException notAnInteger) {
            candidate = Ontology.REAL;
        }
    }

    // Step 2: real, using the configured number format when there is one.
    if (candidate == Ontology.REAL) {
        boolean parsesAsReal;
        if (numberFormat != null) {
            try {
                numberFormat.parse(value);
                parsesAsReal = true;
            } catch (ParseException notAReal) {
                parsesAsReal = false;
            }
        } else {
            try {
                Double.parseDouble(value);
                parsesAsReal = true;
            } catch (NumberFormatException notAReal) {
                parsesAsReal = false;
            }
        }
        if (parsesAsReal) {
            return Ontology.REAL;
        }
        candidate = Ontology.DATE;
    }

    // Step 3: date.
    if (candidate == Ontology.DATE) {
        try {
            dateFormat.parse(value);
            return Ontology.DATE;
        } catch (ParseException notADate) {
            candidate = Ontology.BINOMINAL;
        }
    }

    // Step 4: binominal as long as no third value has been seen, polynominal otherwise.
    if (candidate == Ontology.BINOMINAL) {
        return onlyTwoValues ? Ontology.BINOMINAL : Ontology.POLYNOMINAL;
    }

    // Only polynominal is left; it is the most general type and never changes.
    return candidate;
}
/**
 * Requests that a read which is currently in progress stops. The request is picked up by the
 * row loop of the read method the next time it checks its loop condition. If no read is in
 * progress, this method does nothing.
 * <p>
 * This method only sets a flag: it neither waits for the read to stop nor touches the
 * underlying DataResultSet.
 *
 * @throws OperatorException declared for callers; not thrown by this implementation
 */
public void close() throws OperatorException {
    if (!isReading) {
        return;
    }
    shouldStop = true;
}
/** Discards all parsing errors collected so far. */
public void clearErrors() {
errors.clear();
}
/**
 * Handles a parsing error according to the fault tolerance setting: the error is recorded for the
 * given example row when fault tolerant, and turned into a {@link UserError} otherwise.
 *
 * @throws UserError (error code 403) if <code>isFaultTolerant</code> is <code>false</code>
 */
private void addOrThrow(boolean isFaultTolerant, ParsingError error, int row) throws UserError {
    if (!isFaultTolerant) {
        throw new UserError(operator, 403, error.toString());
    }
    addError(error, row);
}
/**
 * Stamps the error with the given example index and stores it under the key
 * (example index, column). An error already stored under that key is replaced.
 */
private void addError(final ParsingError error, int exampleIndex) {
    error.setExampleIndex(exampleIndex);
    final Pair<Integer,Integer> cellKey = new Pair<Integer,Integer>(error.getExampleIndex(), error.getColumn());
    errors.put(cellKey, error);
}
/**
* Returns the parsing errors collected so far.
* The returned collection is the value view of the internal error map, not a copy.
*/
public Collection<ParsingError> getErrors() {
return errors.values();
}
/**
 * Looks up the parsing error recorded for a cell.
 *
 * @param row the example index of the cell
 * @param column the column of the cell
 * @return the error recorded for that cell, or <code>null</code> if there is none
 */
public ParsingError getErrorByExampleIndexAndColumn(int row, int column) {
    // errors is a final field initialised at its declaration, so no null check is needed.
    return errors.get(new Pair<Integer,Integer>(row, column));
}
/** Cancels {@link #guessValueTypes(int[], DataResultSetTranslationConfiguration, DataResultSet, int, ProgressListener)}
* after the next row. The request is stored in a flag that the guessing loop checks before it
* reads a row; the flag is reset whenever guessing is started. */
public void cancelGuessing() {
cancelGuessingRequested = true;
}
/** Cancels {@link #read(DataResultSet, DataResultSetTranslationConfiguration, boolean, ProgressListener)} after the
* next row. The request is stored in a flag that the read loop checks before it reads a row;
* the flag is reset whenever a read is started. */
public void cancelLoading() {
cancelLoadingRequested = true;
}
/**
* Returns whether {@link #cancelGuessing()} has been called since value type guessing was last
* started (starting the guessing resets the flag).
*/
public boolean isGuessingCancelled() {
return cancelGuessingRequested;
}
}