/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.studio.io.data;

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import com.rapidminer.Process;
import com.rapidminer.core.io.data.ColumnMetaData;
import com.rapidminer.core.io.data.DataSet;
import com.rapidminer.core.io.data.DataSetException;
import com.rapidminer.core.io.data.DataSetRow;
import com.rapidminer.core.io.data.ParseException;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeTypeException;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.utils.ExampleSetBuilder;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.gui.tools.ProgressThread;
import com.rapidminer.gui.tools.ProgressThreadStoppedException;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.studio.io.data.internal.ResultSetAdapterUtils;
import com.rapidminer.studio.io.gui.internal.steps.StoreToRepositoryStep;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.ProgressListener;


/**
 * Can read a {@link DataSet} into an {@link ExampleSet}. Uses a list of {@link ColumnMetaData} to
 * create {@link Attribute}s and fill the data for them by going through the rows of the
 * {@link DataSet}. Used by the {@link StoreToRepositoryStep} to store the data in the repository
 * at the end of the import wizard.
 *
 * @author Gisa Schaefer
 * @since 7.0.0
 */
public class DataSetReader {

	private int dataManagementType = DataRowFactory.TYPE_DOUBLE_ARRAY;
	private boolean shouldStop = false;
	private boolean isReading = false;
	private boolean isFaultTolerant = true;

	private final List<ColumnMetaData> metaData;
	private final Operator operator;

	/**
	 * Creates a new reader.
	 *
	 * @param operator
	 *            the operator that should be checked for stop, can be {@code null}
	 * @param metaData
	 *            the metaData to use for reading
	 * @param isFaultTolerant
	 *            {@code true} if the reader puts missing values on parsing errors, {@code false}
	 *            if the reader throws an exception on parsing errors
	 */
	public DataSetReader(Operator operator, List<ColumnMetaData> metaData, boolean isFaultTolerant) {
		this.operator = operator;
		this.isFaultTolerant = isFaultTolerant;
		this.metaData = metaData;
	}
	/**
	 * @return {@code true} if the reader puts missing values on parsing errors, {@code false} if
	 *         the reader throws an exception on parsing errors
	 */
	public boolean isFaultTolerant() {
		return isFaultTolerant;
	}

	/**
	 * Sets whether the reader should throw an exception or put a missing value on parsing errors.
	 *
	 * @param isFaultTolerant
	 *            {@code true} if the reader puts missing values on parsing errors, {@code false}
	 *            if the reader throws an exception on parsing errors
	 */
	public void setFaultTolerant(boolean isFaultTolerant) {
		this.isFaultTolerant = isFaultTolerant;
	}

	/**
	 * Sets the data management type to use for the data rows in the example set created by
	 * {@link #read}. Default is {@link DataRowFactory#TYPE_DOUBLE_ARRAY}.
	 *
	 * @param dataManagmentType
	 *            the data management type to use for creating the example set, must be one of the
	 *            constants TYPE_DOUBLE_ARRAY, TYPE_FLOAT_ARRAY, TYPE_LONG_ARRAY, TYPE_INT_ARRAY,
	 *            TYPE_SHORT_ARRAY, TYPE_BYTE_ARRAY, TYPE_BOOLEAN_ARRAY, TYPE_DOUBLE_SPARSE_ARRAY,
	 *            TYPE_FLOAT_SPARSE_ARRAY, TYPE_LONG_SPARSE_ARRAY, TYPE_INT_SPARSE_ARRAY,
	 *            TYPE_SHORT_SPARSE_ARRAY, TYPE_BYTE_SPARSE_ARRAY, TYPE_BOOLEAN_SPARSE_ARRAY, or
	 *            TYPE_SPARSE_MAP of {@link DataRowFactory}.
	 */
	public void setDataManagmentType(int dataManagmentType) {
		this.dataManagementType = dataManagmentType;
	}
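	// A minimal usage sketch, not part of the original class: "columns" and "dataSet" are
	// hypothetical values, assumed to come from an earlier import wizard step. Passing
	// {@code null} for the operator and the listener is allowed, since both are optional.
	//
	//   List<ColumnMetaData> columns = ...;  // one entry per column of the DataSet
	//   DataSet dataSet = ...;               // e.g. provided by a data source
	//   DataSetReader reader = new DataSetReader(null, columns, true);
	//   reader.setDataManagmentType(DataRowFactory.TYPE_FLOAT_ARRAY);
	//   ExampleSet exampleSet = reader.read(dataSet, null);
	//
	// (Handling of the checked exceptions declared by read(...) is omitted for brevity.)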
	/**
	 * Transforms the {@link DataSet} dataSet to an {@link ExampleSet} with respect to the
	 * metaData. Uses the metaData to create {@link Attribute}s and fill the data for them by
	 * going through the rows of the {@link DataSet}. The reading process can be stopped at each
	 * new row by the operator this reader is associated with, by the listener, or by calling
	 * {@link #stop()}.
	 *
	 * @param dataSet
	 *            the data set to transform, will not be closed in this method
	 * @param listener
	 *            the progress listener, can be {@code null}
	 * @return the created example set
	 * @throws UserError
	 *             if a column specified in the metaData is not found or the column names are not
	 *             unique
	 * @throws DataSetException
	 *             when reading the data set fails
	 * @throws ProcessStoppedException
	 *             if the associated process is stopped or the {@link #stop()} method is called
	 * @throws ParseException
	 *             if parsing failed and the reading is not fault tolerant
	 * @throws ProgressThreadStoppedException
	 *             if the {@link ProgressThread} associated to the listener was stopped
	 */
	public ExampleSet read(DataSet dataSet, ProgressListener listener)
			throws UserError, DataSetException, ProcessStoppedException, ParseException {
		isReading = true;
		if (listener != null) {
			listener.setTotal(120);
		}

		// check the meta data and create the needed attributes
		int[] attributeColumns = parseMetaDataInformation();
		checkColumnsInDataSet(dataSet, attributeColumns);
		Attribute[] attributes = createAttributes(attributeColumns);
		if (listener != null) {
			listener.setCompleted(5);
		}

		// build the example table
		ExampleSetBuilder builder = ExampleSets.from(attributes);
		fillExampleTable(dataSet, listener, attributeColumns, builder, attributes);

		// derive the ExampleSet from the example table and assign the roles
		ExampleSet exampleSet = builder.build();
		assignRoles(attributeColumns, exampleSet);

		isReading = false;
		if (listener != null) {
			listener.setCompleted(110);
		}
		return exampleSet;
	}

	/**
	 * Stops the reading process. If the reading process is in progress, sets a flag that will
	 * stop the reading before the next row is read.
	 */
	public void stop() {
		if (isReading) {
			shouldStop = true;
		}
	}

	/**
	 * Assigns the roles stored in the metaData to the attributes of the exampleSet.
	 */
	private void assignRoles(int[] attributeColumns, ExampleSet exampleSet) {
		// copy the attribute list to avoid a concurrent modification when setting roles to special
		List<Attribute> allAttributes = new LinkedList<>();
		for (Attribute att : exampleSet.getAttributes()) {
			allAttributes.add(att);
		}

		int attributeIndex = 0;
		for (Attribute attribute : allAttributes) {
			String roleId = metaData.get(attributeColumns[attributeIndex]).getRole();
			if (roleId != null && !Attributes.ATTRIBUTE_NAME.equals(roleId)) {
				exampleSet.getAttributes().setSpecialAttribute(attribute, roleId);
			}
			attributeIndex++;
		}
	}
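	// A sketch of cancelling a running read from another thread. The executor setup is
	// hypothetical and not part of this class; only reader.read(...) and reader.stop() are.
	//
	//   ExecutorService executor = Executors.newSingleThreadExecutor();
	//   Future<ExampleSet> result = executor.submit(() -> reader.read(dataSet, listener));
	//   ...
	//   reader.stop();  // read(...) then throws ProcessStoppedException before the next row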
	/**
	 * Fills the example table with the data from the dataSet.
	 */
	private void fillExampleTable(DataSet dataSet, ProgressListener listener, int[] attributeColumns,
			ExampleSetBuilder builder, Attribute[] attributes)
			throws DataSetException, ProcessStoppedException, ParseException {
		dataSet.reset();
		int numberOfRows = dataSet.getNumberOfRows();
		DataRowFactory factory = new DataRowFactory(dataManagementType, DataRowFactory.POINT_AS_DECIMAL_CHARACTER);

		// detect if this is executed in a process
		boolean isRunningInProcess = isOperatorRunning();

		// now iterate over the complete dataSet and copy the data
		while (dataSet.hasNext()) {
			if (isRunningInProcess) {
				operator.checkForStop();
			}
			if (shouldStop) {
				throw new ProcessStoppedException();
			}
			DataSetRow currentRow = dataSet.nextRow();
			if (listener != null) {
				updateProcess(listener, dataSet.getCurrentRowIndex(), numberOfRows);
			}

			// create the data row
			DataRow row = factory.create(attributes.length);
			int attributeIndex = 0;
			for (Attribute attribute : attributes) {
				// check for a missing value
				if (currentRow.isMissing(attributeColumns[attributeIndex])) {
					row.set(attribute, Double.NaN);
				} else {
					switch (attribute.getValueType()) {
						case Ontology.INTEGER:
						case Ontology.NUMERICAL:
						case Ontology.REAL:
							row.set(attribute, getNumber(currentRow, attributeColumns[attributeIndex]));
							break;
						case Ontology.DATE_TIME:
						case Ontology.TIME:
						case Ontology.DATE:
							row.set(attribute, getDate(currentRow, attributeColumns[attributeIndex]));
							break;
						default:
							row.set(attribute,
									getStringIndex(attribute, currentRow, attributeColumns[attributeIndex]));
					}
				}
				attributeIndex++;
			}
			builder.addDataRow(row);
		}
	}

	/**
	 * @return whether the operator is running in a process
	 */
	private boolean isOperatorRunning() {
		boolean isRunningInProcess = false;
		if (operator != null) {
			Process process = operator.getProcess();
			if (process != null && process.getProcessState() == Process.PROCESS_STATE_RUNNING) {
				isRunningInProcess = true;
			}
		}
		return isRunningInProcess;
	}

	/**
	 * Creates {@link Attribute}s from the metaData.
	 */
	private Attribute[] createAttributes(int[] attributeColumns) {
		int numberOfAttributes = attributeColumns.length;

		// create the associated attributes
		Attribute[] attributes = new Attribute[numberOfAttributes];
		for (int i = 0; i < attributes.length; i++) {
			int attributeValueType = ResultSetAdapterUtils.transformColumnType(metaData.get(attributeColumns[i]).getType());
			attributes[i] = AttributeFactory.createAttribute(metaData.get(attributeColumns[i]).getName(),
					attributeValueType);
		}
		return attributes;
	}

	/**
	 * Checks that the columns which were not removed in the metaData are contained in the dataSet.
	 *
	 * @throws UserError
	 *             if a needed column is not found
	 */
	private void checkColumnsInDataSet(DataSet dataSet, int[] attributeColumns) throws UserError {
		// check whether all columns are accessible
		int numberOfAvailableColumns = dataSet.getNumberOfColumns();
		for (int attributeColumn : attributeColumns) {
			if (attributeColumn >= numberOfAvailableColumns) {
				throw new UserError(null, "data_import.specified_more_columns_than_exist",
						metaData.get(attributeColumn).getName(), attributeColumn);
			}
		}
	}
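	// Worked example for parseMetaDataInformation() below (the column names are made up):
	// for a metaData list [id, comment (removed), age] the returned array is {0, 2}, i.e. the
	// example set's first attribute maps to original column 0 and its second to original column 2.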
	/**
	 * Goes once through the metaData, checks which columns are removed and checks that the
	 * remaining columns have unique names.
	 *
	 * @return an array that contains at position i the original index of the column that will be
	 *         the i-th column in the example set
	 * @throws UserError
	 *             if the column names provided by the metaData are not unique
	 */
	private int[] parseMetaDataInformation() throws UserError {
		// create an array that contains at position i the original index of the column that is
		// now the i-th column because of removals
		int[] selectedColumns = new int[metaData.size()];

		// set of the column names that are already in use
		Set<String> usedColumnNames = new HashSet<>();

		int columnIndex = 0;
		int usedColumnIndex = 0;
		for (ColumnMetaData column : metaData) {
			if (!column.isRemoved()) {
				selectedColumns[usedColumnIndex] = columnIndex;
				usedColumnIndex++;

				String columnName = column.getName();
				// Set#add returns false if the name was already present
				if (!usedColumnNames.add(columnName)) {
					throw new UserError(null, "data_import.non_unique_column_name", columnName);
				}
			}
			columnIndex++;
		}
		return Arrays.copyOf(selectedColumns, usedColumnIndex);
	}

	/**
	 * Updates the progress listener depending on whether the numberOfRows is known.
	 */
	private void updateProcess(ProgressListener listener, int currentRow, int numberOfRows) {
		if (numberOfRows > 0) {
			listener.setCompleted(5 + 100 * currentRow / numberOfRows);
		} else {
			listener.setCompleted(50);
		}
	}

	/**
	 * Returns the nominal mapping index of the String entry at columnIndex of the row.
	 *
	 * @throws ParseException
	 *             if the parsing failed and we are not fault tolerant
	 */
	private double getStringIndex(Attribute attribute, DataSetRow row, int columnIndex) throws ParseException {
		try {
			String value = row.getString(columnIndex);
			return attribute.getMapping().mapString(value);
		} catch (ParseException e) {
			checkFaultTolerance(e);
			return Double.NaN;
		} catch (AttributeTypeException e) {
			// happens for a binominal attribute with too many values
			checkFaultTolerance(new ParseException(e.getMessage(), e, columnIndex));
			return Double.NaN;
		}
	}

	/**
	 * Returns the date at index columnIndex of the row.
	 *
	 * @throws ParseException
	 *             if the parsing failed and we are not fault tolerant
	 */
	private double getDate(DataSetRow row, int columnIndex) throws ParseException {
		try {
			return row.getDate(columnIndex).getTime();
		} catch (ParseException e) {
			checkFaultTolerance(e);
			return Double.NaN;
		}
	}

	/**
	 * Returns the number at index columnIndex of the row.
	 *
	 * @throws ParseException
	 *             if the parsing failed and we are not fault tolerant
	 */
	private double getNumber(DataSetRow row, int columnIndex) throws ParseException {
		try {
			return row.getDouble(columnIndex);
		} catch (ParseException e) {
			checkFaultTolerance(e);
			return Double.NaN;
		}
	}

	/**
	 * Rethrows the given exception unless the reader is fault tolerant.
	 */
	private void checkFaultTolerance(ParseException e) throws ParseException {
		if (!isFaultTolerant) {
			throw e;
		}
	}
}