/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.studio.io.data;

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import com.rapidminer.Process;
import com.rapidminer.core.io.data.ColumnMetaData;
import com.rapidminer.core.io.data.DataSet;
import com.rapidminer.core.io.data.DataSetException;
import com.rapidminer.core.io.data.DataSetRow;
import com.rapidminer.core.io.data.ParseException;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeTypeException;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.utils.ExampleSetBuilder;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.gui.tools.ProgressThread;
import com.rapidminer.gui.tools.ProgressThreadStoppedException;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.studio.io.data.internal.ResultSetAdapterUtils;
import com.rapidminer.studio.io.gui.internal.steps.StoreToRepositoryStep;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.ProgressListener;


/**
 * Can read a {@link DataSet} into an {@link ExampleSet}. Uses a list of {@link ColumnMetaData} to
 * create {@link Attribute}s and fill the data for them by going through the rows of the
 * {@link DataSet}. Used by the {@link StoreToRepositoryStep} to store the data in the repository
 * at the end of the import wizard.
 *
 * @author Gisa Schaefer
 * @since 7.0.0
 */
public class DataSetReader {

	private int dataManagementType = DataRowFactory.TYPE_DOUBLE_ARRAY;
	private boolean shouldStop = false;
	private boolean isReading = false;
	private boolean isFaultTolerant = true;

	private final List<ColumnMetaData> metaData;
	private final Operator operator;

	/**
	 * Creates a new reader.
	 *
	 * @param operator
	 *            the operator that should be checked for stop, can be {@code null}
	 * @param metaData
	 *            the metaData to use for reading
	 * @param isFaultTolerant
	 *            {@code true} if the reader puts missing values on parsing errors, {@code false}
	 *            if the reader throws an exception on parsing errors
	 */
	public DataSetReader(Operator operator, List<ColumnMetaData> metaData, boolean isFaultTolerant) {
		this.operator = operator;
		this.isFaultTolerant = isFaultTolerant;
		this.metaData = metaData;
	}
	/**
	 * @return {@code true} if the reader puts missing values on parsing errors, {@code false} if
	 *         the reader throws an exception on parsing errors
	 */
	public boolean isFaultTolerant() {
		return isFaultTolerant;
	}

	/**
	 * Sets whether the reader should throw an exception or put a missing value on parsing errors.
	 *
	 * @param isFaultTolerant
	 *            {@code true} if the reader puts missing values on parsing errors, {@code false}
	 *            if the reader throws an exception on parsing errors
	 */
	public void setFaultTolerant(boolean isFaultTolerant) {
		this.isFaultTolerant = isFaultTolerant;
	}

	/**
	 * Sets the data management type to use for the data rows in the example set created by
	 * {@link #read}. Default is {@link DataRowFactory#TYPE_DOUBLE_ARRAY}.
	 *
	 * @param dataManagmentType
	 *            the data management type to use for creating the example set, must be one of the
	 *            constants TYPE_DOUBLE_ARRAY, TYPE_FLOAT_ARRAY, TYPE_LONG_ARRAY, TYPE_INT_ARRAY,
	 *            TYPE_SHORT_ARRAY, TYPE_BYTE_ARRAY, TYPE_BOOLEAN_ARRAY, TYPE_DOUBLE_SPARSE_ARRAY,
	 *            TYPE_FLOAT_SPARSE_ARRAY, TYPE_LONG_SPARSE_ARRAY, TYPE_INT_SPARSE_ARRAY,
	 *            TYPE_SHORT_SPARSE_ARRAY, TYPE_BYTE_SPARSE_ARRAY, TYPE_BOOLEAN_SPARSE_ARRAY, or
	 *            TYPE_SPARSE_MAP of {@link DataRowFactory}.
	 */
	public void setDataManagmentType(int dataManagmentType) {
		this.dataManagementType = dataManagmentType;
	}
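	// A minimal usage sketch, not part of the original class: "columns" and "dataSet" are
	// hypothetical values, assumed to come from an earlier import wizard step. Passing
	// {@code null} for the operator and the listener is allowed, since both are optional.
	//
	//   List<ColumnMetaData> columns = ...;  // one entry per column of the DataSet
	//   DataSet dataSet = ...;               // e.g. provided by a data source
	//   DataSetReader reader = new DataSetReader(null, columns, true);
	//   reader.setDataManagmentType(DataRowFactory.TYPE_FLOAT_ARRAY);
	//   ExampleSet exampleSet = reader.read(dataSet, null);
	//
	// (Handling of the checked exceptions declared by read(...) is omitted for brevity.)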
	/**
	 * Transforms the {@link DataSet} dataSet to an {@link ExampleSet} with respect to the
	 * metaData. Uses the metaData to create {@link Attribute}s and fill the data for them by
	 * going through the rows of the {@link DataSet}. The reading process can be stopped at each
	 * new row by the operator this reader is associated with, by the listener, or by calling
	 * {@link #stop()}.
	 *
	 * @param dataSet
	 *            the data set to transform, will not be closed in this method
	 * @param listener
	 *            the progress listener, can be {@code null}
	 * @return the created example set
	 * @throws UserError
	 *             if a column specified in the metaData is not found or the column names are not
	 *             unique
	 * @throws DataSetException
	 *             when reading the data set fails
	 * @throws ProcessStoppedException
	 *             if the associated process is stopped or the {@link #stop()} method is called
	 * @throws ParseException
	 *             if parsing failed and the reading is not fault tolerant
	 * @throws ProgressThreadStoppedException
	 *             if the {@link ProgressThread} associated to the listener was stopped
	 */
	public ExampleSet read(DataSet dataSet, ProgressListener listener)
			throws UserError, DataSetException, ProcessStoppedException, ParseException {
		isReading = true;
		if (listener != null) {
			listener.setTotal(120);
		}

		// check the meta data and create the needed attributes
		int[] attributeColumns = parseMetaDataInformation();
		checkColumnsInDataSet(dataSet, attributeColumns);
		Attribute[] attributes = createAttributes(attributeColumns);
		if (listener != null) {
			listener.setCompleted(5);
		}

		// build the example table
		ExampleSetBuilder builder = ExampleSets.from(attributes);
		fillExampleTable(dataSet, listener, attributeColumns, builder, attributes);

		// derive the ExampleSet from the example table and assign the roles
		ExampleSet exampleSet = builder.build();
		assignRoles(attributeColumns, exampleSet);

		isReading = false;
		if (listener != null) {
			listener.setCompleted(110);
		}
		return exampleSet;
	}

	/**
	 * Stops the reading process. If the reading process is in progress, sets a flag that will
	 * stop the reading before the next row is read.
	 */
	public void stop() {
		if (isReading) {
			shouldStop = true;
		}
	}

	/**
	 * Assigns the roles stored in the metaData to the attributes of the exampleSet.
	 */
	private void assignRoles(int[] attributeColumns, ExampleSet exampleSet) {
		// copy the attribute list to avoid a concurrent modification when setting roles to special
		List<Attribute> allAttributes = new LinkedList<>();
		for (Attribute att : exampleSet.getAttributes()) {
			allAttributes.add(att);
		}

		int attributeIndex = 0;
		for (Attribute attribute : allAttributes) {
			String roleId = metaData.get(attributeColumns[attributeIndex]).getRole();
			if (roleId != null && !Attributes.ATTRIBUTE_NAME.equals(roleId)) {
				exampleSet.getAttributes().setSpecialAttribute(attribute, roleId);
			}
			attributeIndex++;
		}
	}
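	// A sketch of cancelling a running read from another thread. The executor setup is
	// hypothetical and not part of this class; only reader.read(...) and reader.stop() are.
	//
	//   ExecutorService executor = Executors.newSingleThreadExecutor();
	//   Future<ExampleSet> result = executor.submit(() -> reader.read(dataSet, listener));
	//   ...
	//   reader.stop();  // read(...) then throws ProcessStoppedException before the next row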
	/**
	 * Fills the example table with the data from the dataSet.
	 */
	private void fillExampleTable(DataSet dataSet, ProgressListener listener, int[] attributeColumns,
			ExampleSetBuilder builder, Attribute[] attributes)
			throws DataSetException, ProcessStoppedException, ParseException {
		dataSet.reset();
		int numberOfRows = dataSet.getNumberOfRows();
		DataRowFactory factory = new DataRowFactory(dataManagementType, DataRowFactory.POINT_AS_DECIMAL_CHARACTER);

		// detect if this is executed in a process
		boolean isRunningInProcess = isOperatorRunning();

		// now iterate over the complete dataSet and copy the data
		while (dataSet.hasNext()) {
			if (isRunningInProcess) {
				operator.checkForStop();
			}
			if (shouldStop) {
				throw new ProcessStoppedException();
			}
			DataSetRow currentRow = dataSet.nextRow();
			if (listener != null) {
				updateProcess(listener, dataSet.getCurrentRowIndex(), numberOfRows);
			}

			// create the data row
			DataRow row = factory.create(attributes.length);
			int attributeIndex = 0;
			for (Attribute attribute : attributes) {
				// check for a missing value
				if (currentRow.isMissing(attributeColumns[attributeIndex])) {
					row.set(attribute, Double.NaN);
				} else {
					switch (attribute.getValueType()) {
						case Ontology.INTEGER:
						case Ontology.NUMERICAL:
						case Ontology.REAL:
							row.set(attribute, getNumber(currentRow, attributeColumns[attributeIndex]));
							break;
						case Ontology.DATE_TIME:
						case Ontology.TIME:
						case Ontology.DATE:
							row.set(attribute, getDate(currentRow, attributeColumns[attributeIndex]));
							break;
						default:
							row.set(attribute,
									getStringIndex(attribute, currentRow, attributeColumns[attributeIndex]));
					}
				}
				attributeIndex++;
			}
			builder.addDataRow(row);
		}
	}

	/**
	 * @return whether the operator is running in a process
	 */
	private boolean isOperatorRunning() {
		boolean isRunningInProcess = false;
		if (operator != null) {
			Process process = operator.getProcess();
			if (process != null && process.getProcessState() == Process.PROCESS_STATE_RUNNING) {
				isRunningInProcess = true;
			}
		}
		return isRunningInProcess;
	}

	/**
	 * Creates {@link Attribute}s from the metaData.
	 */
	private Attribute[] createAttributes(int[] attributeColumns) {
		int numberOfAttributes = attributeColumns.length;

		// create the associated attributes
		Attribute[] attributes = new Attribute[numberOfAttributes];
		for (int i = 0; i < attributes.length; i++) {
			int attributeValueType = ResultSetAdapterUtils.transformColumnType(metaData.get(attributeColumns[i]).getType());
			attributes[i] = AttributeFactory.createAttribute(metaData.get(attributeColumns[i]).getName(),
					attributeValueType);
		}
		return attributes;
	}

	/**
	 * Checks that the columns which were not removed in the metaData are contained in the dataSet.
	 *
	 * @throws UserError
	 *             if a needed column is not found
	 */
	private void checkColumnsInDataSet(DataSet dataSet, int[] attributeColumns) throws UserError {
		// check whether all columns are accessible
		int numberOfAvailableColumns = dataSet.getNumberOfColumns();
		for (int attributeColumn : attributeColumns) {
			if (attributeColumn >= numberOfAvailableColumns) {
				throw new UserError(null, "data_import.specified_more_columns_than_exist",
						metaData.get(attributeColumn).getName(), attributeColumn);
			}
		}
	}
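	// Worked example for parseMetaDataInformation() below (the column names are made up):
	// for a metaData list [id, comment (removed), age] the returned array is {0, 2}, i.e. the
	// example set's first attribute maps to original column 0 and its second to original column 2.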
	/**
	 * Goes once through the metaData, checks which columns are removed and checks that the
	 * remaining columns have unique names.
	 *
	 * @return an array that contains at position i the original index of the column that will be
	 *         the i-th column in the example set
	 * @throws UserError
	 *             if the column names provided by the metaData are not unique
	 */
	private int[] parseMetaDataInformation() throws UserError {
		// create an array that contains at position i the original index of the column that is
		// now the i-th column because of removals
		int[] selectedColumns = new int[metaData.size()];

		// set of the column names that are already in use
		Set<String> usedColumnNames = new HashSet<>();

		int columnIndex = 0;
		int usedColumnIndex = 0;
		for (ColumnMetaData column : metaData) {
			if (!column.isRemoved()) {
				selectedColumns[usedColumnIndex] = columnIndex;
				usedColumnIndex++;

				String columnName = column.getName();
				// Set#add returns false if the name was already present
				if (!usedColumnNames.add(columnName)) {
					throw new UserError(null, "data_import.non_unique_column_name", columnName);
				}
			}
			columnIndex++;
		}
		return Arrays.copyOf(selectedColumns, usedColumnIndex);
	}

	/**
	 * Updates the progress listener depending on whether the numberOfRows is known.
	 */
	private void updateProcess(ProgressListener listener, int currentRow, int numberOfRows) {
		if (numberOfRows > 0) {
			listener.setCompleted(5 + 100 * currentRow / numberOfRows);
		} else {
			listener.setCompleted(50);
		}
	}

	/**
	 * Returns the nominal mapping index of the String entry at columnIndex of the row.
	 *
	 * @throws ParseException
	 *             if the parsing failed and we are not fault tolerant
	 */
	private double getStringIndex(Attribute attribute, DataSetRow row, int columnIndex) throws ParseException {
		try {
			String value = row.getString(columnIndex);
			return attribute.getMapping().mapString(value);
		} catch (ParseException e) {
			checkFaultTolerance(e);
			return Double.NaN;
		} catch (AttributeTypeException e) {
			// happens for a binominal attribute with too many values
			checkFaultTolerance(new ParseException(e.getMessage(), e, columnIndex));
			return Double.NaN;
		}
	}

	/**
	 * Returns the date at index columnIndex of the row.
	 *
	 * @throws ParseException
	 *             if the parsing failed and we are not fault tolerant
	 */
	private double getDate(DataSetRow row, int columnIndex) throws ParseException {
		try {
			return row.getDate(columnIndex).getTime();
		} catch (ParseException e) {
			checkFaultTolerance(e);
			return Double.NaN;
		}
	}

	/**
	 * Returns the number at index columnIndex of the row.
	 *
	 * @throws ParseException
	 *             if the parsing failed and we are not fault tolerant
	 */
	private double getNumber(DataSetRow row, int columnIndex) throws ParseException {
		try {
			return row.getDouble(columnIndex);
		} catch (ParseException e) {
			checkFaultTolerance(e);
			return Double.NaN;
		}
	}

	/**
	 * Rethrows the given exception unless the reader is fault tolerant.
	 */
	private void checkFaultTolerance(ParseException e) throws ParseException {
		if (!isFaultTolerant) {
			throw e;
		}
	}
}