/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing;
import java.util.Iterator;
import java.util.List;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeRole;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.NominalMapping;
import com.rapidminer.example.utils.ExampleSetBuilder;
import com.rapidminer.example.utils.ExampleSetBuilder.DataManagement;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.operator.MemoryCleanUp;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.generator.ExampleSetGenerator;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
import com.rapidminer.tools.ParameterService;
import com.rapidminer.tools.parameter.internal.DataManagementParameterHelper;
/**
 * Creates a fresh and clean copy of the data in memory. Might be very useful in combination with
 * the {@link MemoryCleanUp} operator after large preprocessing trees using a lot of views or data
 * copies.
 *
 * @author Ingo Mierswa
 */
public class MaterializeDataInMemory extends AbstractDataProcessing {

    public MaterializeDataInMemory(OperatorDescription description) {
        super(description);
    }

    /**
     * Materializes the given example set.
     * <p>
     * If the legacy data management property is set, the data row type is read from the operator
     * parameter {@link ExampleSetGenerator#PARAMETER_DATAMANAGEMENT} and {@link DataManagement#AUTO}
     * is passed as the new data management. Otherwise {@link DataRowFactory#TYPE_COLUMN_VIEW} is
     * used together with the {@link DataManagement} selected via
     * {@link DataManagementParameterHelper}.
     *
     * @param exampleSet
     *            the example set to materialize
     * @return the materialized copy
     * @throws OperatorException
     *             if a required parameter cannot be read
     */
    @Override
    public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
        if (isLegacyDataManagementActive()) {
            int dataManagement = getParameterAsInt(ExampleSetGenerator.PARAMETER_DATAMANAGEMENT);
            return materialize(exampleSet, dataManagement, DataManagement.AUTO);
        }
        DataManagement newDataManagement = DataManagementParameterHelper.getSelectedDataManagement(this);
        return materialize(exampleSet, DataRowFactory.TYPE_COLUMN_VIEW, newDataManagement);
    }

    /**
     * Returns the parameter types of the super class extended by the ones added by
     * {@link DataManagementParameterHelper#addParameterTypes}.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        DataManagementParameterHelper.addParameterTypes(types, this);
        return types;
    }

    /**
     * Delegates to {@link OperatorResourceConsumptionHandler} using this operator's input port and
     * class.
     */
    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(),
                MaterializeDataInMemory.class, null);
    }

    /**
     * Creates a materialized copy of the given example set, i.e., a hard copy with all unnecessary
     * abstraction layers being removed. The data row type is taken from the first example of the
     * given example set; if it cannot be determined, {@link DataRowFactory#TYPE_DOUBLE_ARRAY} is
     * used. If you want to use a different strategy call
     * {@link #materializeExampleSet(ExampleSet, int)} instead.
     *
     * @param exampleSet
     *            the example set to materialize
     * @return the materialized example set
     */
    public static ExampleSet materializeExampleSet(ExampleSet exampleSet) {
        return materialize(exampleSet, findDataRowType(exampleSet));
    }

    /**
     * Creates a materialized copy of the given example set, i.e., a hard copy with all unnecessary
     * abstraction layers being removed.
     *
     * @param exampleSet
     *            the example set to materialize
     * @param dataManagement
     *            the data management strategy (see {@link DataRowFactory} for available types)
     * @return the materialized example set
     * @throws UndefinedParameterError
     *             part of the declared signature and kept for backward compatibility; the
     *             implementation this method delegates to does not declare it
     */
    public static ExampleSet materializeExampleSet(ExampleSet exampleSet, int dataManagement)
            throws UndefinedParameterError {
        return materialize(exampleSet, dataManagement);
    }

    /**
     * Creates a materialized copy of the given example set using {@link DataManagement#AUTO} as
     * the new data management strategy.
     *
     * @param exampleSet
     *            the example set to materialize
     * @param dataManagement
     *            the data management strategy
     * @return the materialized example set
     */
    private static ExampleSet materialize(ExampleSet exampleSet, int dataManagement) {
        return materialize(exampleSet, dataManagement, DataManagement.AUTO);
    }

    /**
     * Creates a materialized copy of the given example set, i.e., a hard copy with all unnecessary
     * abstraction layers being removed.
     * <p>
     * The data is copied row by row if legacy data management is active or if the requested data
     * row type is neither {@link DataRowFactory#TYPE_DOUBLE_ARRAY} nor
     * {@link DataRowFactory#TYPE_COLUMN_VIEW}; in that case {@code newDataManagement} is not used.
     * Otherwise the data is copied column by column with {@code newDataManagement} as optimization
     * hint.
     *
     * @param exampleSet
     *            the example set to materialize
     * @param dataManagement
     *            the data management strategy
     * @param newDataManagement
     *            the new data management strategy
     * @return the materialized example set
     */
    private static ExampleSet materialize(ExampleSet exampleSet, int dataManagement, DataManagement newDataManagement) {
        // create new attributes and remember the special roles of the source attributes
        int attributeCount = exampleSet.getAttributes().allSize();
        Attribute[] sourceAttributes = new Attribute[attributeCount];
        Attribute[] targetAttributes = new Attribute[attributeCount];
        String[] targetRoles = new String[attributeCount];
        Iterator<AttributeRole> roleIterator = exampleSet.getAttributes().allAttributeRoles();
        for (int i = 0; i < attributeCount; i++) {
            AttributeRole sourceRole = roleIterator.next();
            sourceAttributes[i] = sourceRole.getAttribute();
            targetAttributes[i] = copyAttribute(sourceAttributes[i]);
            if (sourceRole.isSpecial()) {
                targetRoles[i] = sourceRole.getSpecialName();
            }
        }

        ExampleSetBuilder builder = ExampleSets.from(targetAttributes);

        // Copy column-wise only if legacy data management is off and the data management is double
        // array or column view. For any other row type there can be value changes when copying to
        // a "smaller" row which we need to keep, so those are copied row by row.
        boolean columnWiseType = dataManagement == DataRowFactory.TYPE_DOUBLE_ARRAY
                || dataManagement == DataRowFactory.TYPE_COLUMN_VIEW;
        if (isLegacyDataManagementActive() || !columnWiseType) {
            copyRowWise(exampleSet, builder, sourceAttributes, targetAttributes, dataManagement);
        } else {
            copyColumnWise(exampleSet, builder, sourceAttributes, targetAttributes, newDataManagement);
        }

        // transfer the roles (entries stay null for attributes without a special role)
        for (int i = 0; i < attributeCount; i++) {
            builder.withRole(targetAttributes[i], targetRoles[i]);
        }

        // create and return result
        ExampleSet createdSet = builder.build();
        createdSet.getAnnotations().addAll(exampleSet.getAnnotations());
        return createdSet;
    }

    /**
     * Returns whether the legacy data management property
     * {@link RapidMiner#PROPERTY_RAPIDMINER_SYSTEM_LEGACY_DATA_MGMT} is set to {@code true}.
     */
    private static boolean isLegacyDataManagementActive() {
        return Boolean
                .parseBoolean(ParameterService.getParameterValue(RapidMiner.PROPERTY_RAPIDMINER_SYSTEM_LEGACY_DATA_MGMT));
    }

    /**
     * Creates a new attribute with the name and value type of the given one, a clone of its
     * mapping if it is nominal, and all of its annotations.
     */
    private static Attribute copyAttribute(Attribute source) {
        Attribute target = AttributeFactory.createAttribute(source.getName(), source.getValueType());
        if (source.isNominal()) {
            target.setMapping((NominalMapping) source.getMapping().clone());
        }
        target.getAnnotations().addAll(source.getAnnotations());
        return target;
    }

    /**
     * Copies the data example by example into new data rows of the given type and adds them to the
     * builder.
     */
    private static void copyRowWise(ExampleSet exampleSet, ExampleSetBuilder builder, Attribute[] sourceAttributes,
            Attribute[] targetAttributes, int dataManagement) {
        builder.withExpectedSize(exampleSet.size());
        DataRowFactory rowFactory = new DataRowFactory(dataManagement, '.');
        // copying data differently for sparse and non sparse for speed reasons
        if (isSparseType(dataManagement)) {
            for (Example example : exampleSet) {
                DataRow targetRow = rowFactory.create(targetAttributes.length);
                for (int i = 0; i < sourceAttributes.length; i++) {
                    double value = example.getValue(sourceAttributes[i]);
                    // we have a fresh sparse row, so everything is currently empty and we only
                    // need to set non default value attributes to avoid unnecessary binary
                    // searches
                    if (value != 0) {
                        targetRow.set(targetAttributes[i], value);
                    }
                }
                builder.addDataRow(targetRow);
            }
        } else {
            // dense data we copy entirely without condition
            for (Example example : exampleSet) {
                DataRow targetRow = rowFactory.create(targetAttributes.length);
                for (int i = 0; i < sourceAttributes.length; i++) {
                    targetRow.set(targetAttributes[i], example.getValue(sourceAttributes[i]));
                }
                builder.addDataRow(targetRow);
            }
        }
    }

    /**
     * Registers one column filler per attribute that reads the values from the source example set,
     * using the given data management as optimization hint.
     */
    private static void copyColumnWise(ExampleSet exampleSet, ExampleSetBuilder builder, Attribute[] sourceAttributes,
            Attribute[] targetAttributes, DataManagement newDataManagement) {
        builder.withBlankSize(exampleSet.size());
        builder.withOptimizationHint(newDataManagement);
        for (int i = 0; i < sourceAttributes.length; i++) {
            // the source array is not modified after this point, so capturing the element is
            // equivalent to capturing array and index
            final Attribute sourceAttribute = sourceAttributes[i];
            builder.withColumnFiller(targetAttributes[i], row -> exampleSet.getExample(row).getValue(sourceAttribute));
        }
    }

    /**
     * Returns whether the given type is sparse.
     */
    private static boolean isSparseType(int dataRowType) {
        switch (dataRowType) {
            case DataRowFactory.TYPE_BOOLEAN_SPARSE_ARRAY:
            case DataRowFactory.TYPE_BYTE_SPARSE_ARRAY:
            case DataRowFactory.TYPE_DOUBLE_SPARSE_ARRAY:
            case DataRowFactory.TYPE_FLOAT_SPARSE_ARRAY:
            case DataRowFactory.TYPE_INT_SPARSE_ARRAY:
            case DataRowFactory.TYPE_LONG_SPARSE_ARRAY:
            case DataRowFactory.TYPE_SHORT_SPARSE_ARRAY:
            case DataRowFactory.TYPE_SPARSE_MAP:
                return true;
            default:
                return false;
        }
    }

    /**
     * Determines the data row type of the first example of the given example set. Falls back to
     * {@link DataRowFactory#TYPE_DOUBLE_ARRAY} if the example set is empty or the first example
     * has no data row.
     */
    private static int findDataRowType(ExampleSet exampleSet) {
        if (exampleSet.size() > 0) {
            // determine current representation: get first row
            DataRow usedRow = exampleSet.getExample(0).getDataRow();
            if (usedRow != null) {
                return usedRow.getType();
            }
        }
        // default type
        return DataRowFactory.TYPE_DOUBLE_ARRAY;
    }

    @Override
    public boolean writesIntoExistingData() {
        return false;
    }
}