/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.studio.io.data.internal;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.rapidminer.core.io.data.ColumnMetaData.ColumnType;
import com.rapidminer.core.io.data.DataSet;
import com.rapidminer.core.io.data.DataSetException;
import com.rapidminer.core.io.data.DataSetMetaData;
import com.rapidminer.core.io.data.source.DataSource;
import com.rapidminer.operator.Annotations;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.nio.model.ColumnMetaData;
import com.rapidminer.operator.nio.model.DataResultSet;
import com.rapidminer.operator.nio.model.DataResultSetTranslationConfiguration;
import com.rapidminer.operator.nio.model.DataResultSetTranslator;
import com.rapidminer.operator.nio.model.ParseException;
import com.rapidminer.studio.io.data.DefaultDataSetMetaData;
import com.rapidminer.studio.io.data.HeaderRowBehindStartRowException;
import com.rapidminer.studio.io.data.HeaderRowNotFoundException;
import com.rapidminer.studio.io.data.StartRowNotFoundException;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.ProgressListener;
/**
* Utility class that contains helper methods for {@link DataSource}s that return a
* {@link ResultSetAdapter} as {@link DataSet}.
*
* @author Nils Woehler
* @since 7.0.0
*/
public final class ResultSetAdapterUtils {
/**
* Utility class constructor.
*/
private ResultSetAdapterUtils() {
throw new AssertionError("Utility class");
}
/**
* Creates a new {@link DataSetMetaData} instance for the provided {@link DataResultSet} based
* on the provided data range and header row index (if any). This includes reading the column
* names and guessing the column types for the selected columns. For guessing the column types
* the logic from {@link DataResultSetTranslator} is used.
*
* @param resultSet
* the {@link DataResultSet} that should be used to extract the meta data
* @param numberFormat
* the number format that should be used during column type guessing
* @param startingRowIndex
* the 0-based index of the first data row (not including the header row)
* @param headerRowIndex
* the 0-based index for the header row (if any,
* {@link ResultSetAdapter#NO_HEADER_ROW} otherwise)
* @return the new {@link DataSetMetaData} instance which contains meta data retrieved from the
* column name extraction and column type guessing
* @throws HeaderRowNotFoundException
* if the header row was not found
* @throws StartRowNotFoundException
* if the data start row was not found
* @throws HeaderRowBehindStartRowException
* in case the headerRowIndex > startingRowIndex
* @throws DataSetException
* if the meta data fetching fails
*/
public static DataSetMetaData createMetaData(DataResultSet resultSet, NumberFormat numberFormat, int startingRowIndex,
int headerRowIndex) throws HeaderRowNotFoundException, StartRowNotFoundException,
HeaderRowBehindStartRowException, DataSetException {
// check whether the header row index is lower or equal to the starting row
if (headerRowIndex > startingRowIndex) {
throw new HeaderRowBehindStartRowException();
}
try {
int numberOfColumns = resultSet.getNumberOfColumns();
String[] columnNames = getColumnNames(resultSet, headerRowIndex, startingRowIndex, numberOfColumns);
List<ColumnType> columnTypes = guessColumnTypes(resultSet, startingRowIndex, headerRowIndex, numberOfColumns,
numberFormat);
return new DefaultDataSetMetaData(Arrays.asList(columnNames), columnTypes);
} catch (OperatorException e) {
throw new DataSetException(e.getMessage(), e);
}
}
/**
* Checks if the start row is contained in the resultSet.
*
* @param startingRowIndex
* the 0-based starting row index
* @throws StartRowNotFoundException
* in case the starting row was not found
* @throws OperatorException
* in case the underlying file could not be read
*/
private static void checkStartRow(DataResultSet resultSet, int startingRowIndex)
throws StartRowNotFoundException, OperatorException {
while (resultSet.getCurrentRow() < startingRowIndex) {
if (!resultSet.hasNext()) {
throw new StartRowNotFoundException();
}
resultSet.next(null);
}
}
/**
* Reads the column names from the resultSet given the configuration.
*
* @param resultSet
* the data set
* @param headerRowIndex
* the index of the row that should be used to extract column names from or
* {@link ResultSetAdapter#NO_HEADER_ROW} in case the default names should be used
* @param startingRowIndex
* the index of the actual data start row
* @param numberOfColumns
* the number of columns for the {@link DataSource}
* @return the column names as a String array
* @throws HeaderRowNotFoundException
* if the header row was not found
* @throws StartRowNotFoundException
* if the data start row was not found
* @throws OperatorException
* if reading the resultSet failed
*/
private static String[] getColumnNames(DataResultSet resultSet, int headerRowIndex, int startingRowIndex,
int numberOfColumns) throws HeaderRowNotFoundException, OperatorException, StartRowNotFoundException {
resultSet.reset(null);
String[] defaultNames = resultSet.getColumnNames();
// check which last column index to use and create array of names
String[] columnNames = new String[numberOfColumns];
// read column names from specified row
while (resultSet.getCurrentRow() < headerRowIndex) {
if (!resultSet.hasNext()) {
throw new HeaderRowNotFoundException();
}
resultSet.next(null);
}
for (int i = 0; i < numberOfColumns; i++) {
if (headerRowIndex > ResultSetAdapter.NO_HEADER_ROW) {
if (resultSet.isMissing(i)) {
columnNames[i] = defaultNames[i];
} else {
try {
// retrieve data with native value type and convert to String
switch (resultSet.getNativeValueType(i)) {
case DATE:
columnNames[i] = String.valueOf(resultSet.getDate(i));
break;
case EMPTY:
columnNames[i] = defaultNames[i];
break;
case NUMBER:
columnNames[i] = String.valueOf(resultSet.getNumber(i));
break;
case STRING:
default:
columnNames[i] = resultSet.getString(i);
break;
}
} catch (ParseException e) {
columnNames[i] = defaultNames[i];
}
}
} else {
columnNames[i] = defaultNames[i];
}
}
checkStartRow(resultSet, startingRowIndex);
return columnNames;
}
/**
* Guesses column types by using the
* {@link DataResultSetTranslator#guessValueTypes(DataResultSetTranslationConfiguration, DataResultSet, ProgressListener)}
* logic and transforming the guessed value types into {@link ColumnType}s.
*/
private static List<ColumnType> guessColumnTypes(DataResultSet dataResultSet, int startingRow, int headerRow,
int numberOfColumns, NumberFormat numberFormat) throws DataSetException {
try {
int[] valueTypes = getValueTypes(dataResultSet, startingRow, headerRow, numberOfColumns, numberFormat);
List<ColumnType> columnTypes = new ArrayList<>(valueTypes.length);
for (int type : valueTypes) {
columnTypes.add(transformValueType(type));
}
return columnTypes;
} catch (OperatorException e) {
throw new DataSetException(e.getMessage(), e);
}
}
/**
* Transforms a {@link Ontology#ATTRIBUTE_VALUE_TYPE} into a {@link ColumnType}.
*/
public static ColumnType transformValueType(int valueType) {
switch (valueType) {
case Ontology.TIME:
return ColumnType.TIME;
case Ontology.DATE:
return ColumnType.DATE;
case Ontology.DATE_TIME:
return ColumnType.DATETIME;
case Ontology.NUMERICAL:
case Ontology.REAL:
return ColumnType.REAL;
case Ontology.INTEGER:
return ColumnType.INTEGER;
case Ontology.BINOMINAL:
return ColumnType.BINARY;
default:
return ColumnType.CATEGORICAL;
}
}
/**
* Transforms a {@link ColumnType} into a {@link Ontology#ATTRIBUTE_VALUE_TYPE} .
*/
public static int transformColumnType(ColumnType columnType) {
switch (columnType) {
case DATETIME:
return Ontology.DATE_TIME;
case DATE:
return Ontology.DATE;
case TIME:
return Ontology.TIME;
case INTEGER:
return Ontology.INTEGER;
case REAL:
return Ontology.REAL;
case BINARY:
return Ontology.BINOMINAL;
default:
case CATEGORICAL:
return Ontology.POLYNOMINAL;
}
}
/**
* Uses the {@link DataResultSetTranslator} to guess the valueTypes.
*
* @param dataResultSet
* the data set
* @param startingRow
* the starting row
* @param headerRow
* the header row
* @param numberOfColumns
* the number of columns the {@link DataSource} is going to have
* @param numberFormat
* the number format used to guess the value types (or {@code null} in case of the
* default format)
* @return the guessed value types
* @throws OperatorException
* if the guessing failed because of an IOException
*/
private static int[] getValueTypes(DataResultSet dataResultSet, int startingRow, int headerRow, int numberOfColumns,
NumberFormat numberFormat) throws OperatorException {
// generate a DataResultSetTranslationConfiguration
DataResultSetTranslationConfiguration translationConfiguration = new DataResultSetTranslationConfiguration(
dataResultSet, getAnnotations(startingRow, headerRow));
translationConfiguration.setNumberFormat(numberFormat);
// use a translator to guess value types given the configuration
new DataResultSetTranslator(null).guessValueTypes(translationConfiguration, dataResultSet, null);
ColumnMetaData[] metadata = translationConfiguration.getColumnMetaData();
// check which index to use as last column and create valueType array
int[] valueTypes = new int[numberOfColumns];
for (int i = 0; i < numberOfColumns; i++) {
valueTypes[i] = metadata[i].getAttributeValueType();
}
return valueTypes;
}
/**
* Creates a list of annotations used by the {@link DataResultSet} to define comment and name
* rows.
*
* @param startingRow
* the starting row index
* @param headerRow
* the header row index
* @return the annotations associated to the configuration
*/
private static List<String> getAnnotations(int startingRow, int headerRow) {
int lastCommentRow = startingRow - 1;
int max = Math.max(headerRow, lastCommentRow);
List<String> annotations = new ArrayList<>(max + 1);
for (int i = 0; i <= max; i++) {
if (i == headerRow) {
annotations.add(Annotations.ANNOTATION_NAME);
} else if (i < startingRow) {
annotations.add(Annotations.KEY_COMMENT);
} else {
annotations.add(null);
}
}
return annotations;
}
}