/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.DoubleArrayDataRow; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.operator.AbstractExampleSetProcessing; import com.rapidminer.operator.OperatorCreationException; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetPrecondition; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.BooleanParameterCondition; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.OperatorService; import com.rapidminer.tools.math.MathFunctions; /** * <p>This operator fills gaps in the data based on the ID attribute of the data set. * The ID attribute must either have the value type "integer" or one * of the data value types.</p> * * <p>The operator performs the following steps:</p> * <ol> * <li>The data is sorted according to the ID attribute</li> * <li>All occurring distances between consecutive ID values are calculated</li> * <li>The greatest common divisor (GCD) of all distances is calculated</li> * <li>All rows which would have an ID value which is a multiple of the GCD but are missing are added to the data set</li> * </ol> * * <p>Please note that all values of attributes beside the ID attribute will have a missing value which * often must be replaced as a next step.</p> * * @author Ingo Mierswa */ public class FillDataGaps extends AbstractExampleSetProcessing { public static final String PARAMETER_USE_GCD_FOR_STEP_SIZE = "use_gcd_for_step_size"; public static final String PARAMETER_STEP_SIZE = "step_size"; public static final String PARAMETER_START = "start"; public static final String PARAMETER_END = "end"; public FillDataGaps(OperatorDescription description) { super(description); getExampleSetInputPort().addPrecondition(new ExampleSetPrecondition(getExampleSetInputPort(), Ontology.VALUE_TYPE, Attributes.ID_NAME)); } @Override protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError { metaData.getNumberOfExamples().increaseByUnknownAmount(); for (AttributeMetaData amd: metaData.getAllAttributes()) { if (amd.getRole() == null || !amd.getRole().equals(Attributes.ID_NAME)) { amd.getNumberOfMissingValues().increaseByUnknownAmount(); } } return metaData; } @Override public ExampleSet apply(ExampleSet inputSet) throws OperatorException { // init and checks Attribute idAttribute = inputSet.getAttributes().getId(); if (idAttribute == null) { throw new UserError(this, 129); } if (!Ontology.ATTRIBUTE_VALUE_TYPE.isA(idAttribute.getValueType(), Ontology.DATE_TIME) && !Ontology.ATTRIBUTE_VALUE_TYPE.isA(idAttribute.getValueType(), Ontology.INTEGER)) { throw new UserError(this, 120, idAttribute.getName(), new Object[] { Ontology.VALUE_TYPE_NAMES[idAttribute.getValueType()], Ontology.VALUE_TYPE_NAMES[Ontology.DATE_TIME] + " or " + Ontology.VALUE_TYPE_NAMES[Ontology.INTEGER] }); } // sort data according to ID attribute Sorting sorting = null; try { sorting = OperatorService.createOperator(Sorting.class); } catch (OperatorCreationException e) { throw new OperatorException(getName() + ": Cannot create discretization operator (" + e + ")."); } sorting.setParameter(Sorting.PARAMETER_ATTRIBUTE_NAME, idAttribute.getName()); ExampleSet sortedSet = sorting.apply(inputSet); // determine step size long stepSize = 1; if (!getParameterAsBoolean(PARAMETER_USE_GCD_FOR_STEP_SIZE)) { stepSize = getParameterAsInt(PARAMETER_STEP_SIZE); } else { // calculate all distances List<Long> distances = new LinkedList<Long>(); boolean first = true; long lastValue = 0; // start value defined? if (isParameterSet(PARAMETER_START)) { first = false; lastValue = getParameterAsInt(PARAMETER_START); } // add data distances for (Example example : sortedSet) { long value = (long)example.getValue(idAttribute); if (first) { first = false; } else { if (value > lastValue) distances.add(value - lastValue); } lastValue = value; } // end value defined? if (isParameterSet(PARAMETER_END)) { int endValue = getParameterAsInt(PARAMETER_END); if (endValue > lastValue) distances.add(endValue - lastValue); } // calculate the GCD stepSize = MathFunctions.getGCD(distances); distances.clear(); } // find gaps List<Long> missingValues = new LinkedList<Long>(); long lastValue = 0; boolean first = true; long minValue = Long.MAX_VALUE; long maxValue = - Long.MAX_VALUE; for (Example example : sortedSet) { long value = (long)example.getValue(idAttribute); minValue = Math.min(minValue, value); maxValue = Math.max(maxValue, value); if (first) { first = false; lastValue = value; } else { while (lastValue + stepSize < value) { lastValue += stepSize; missingValues.add(lastValue); } lastValue = value; } } if (isParameterSet(PARAMETER_START)) { long start = getParameterAsInt(PARAMETER_START); if (start < minValue) { lastValue = start; while (lastValue + stepSize <= minValue) { missingValues.add(lastValue); lastValue += stepSize; } } } if (isParameterSet(PARAMETER_END)) { long end = getParameterAsInt(PARAMETER_END); if (end > maxValue) { lastValue = maxValue + stepSize; while (lastValue <= end) { missingValues.add(lastValue); lastValue += stepSize; } } } // create table List<Attribute> attributes = new ArrayList<Attribute>(sortedSet.getAttributes().allSize()); Map<Attribute, String> specialAttributes = new HashMap<Attribute, String>(); Iterator<AttributeRole> a = sortedSet.getAttributes().allAttributeRoles(); int idIndex = -1; int index = 0; while (a.hasNext()) { AttributeRole role = a.next(); Attribute cloneAttribute = (Attribute)role.getAttribute().clone(); attributes.add(cloneAttribute); if (role.isSpecial()) { specialAttributes.put(cloneAttribute, role.getSpecialName()); if (role.getSpecialName().equals(Attributes.ID_NAME)) idIndex = index; } index++; } MemoryExampleTable table = new MemoryExampleTable(attributes); // copy data for (Example example : sortedSet) { double[] data = new double[attributes.size()]; index = 0; Iterator<Attribute> i = sortedSet.getAttributes().allAttributes(); while (i.hasNext()) { data[index++] = example.getValue(i.next()); } table.addDataRow(new DoubleArrayDataRow(data)); } // create missing rows for (long missingValue : missingValues) { double[] data = new double[attributes.size()]; for (int d = 0; d < data.length; d++) { data[d] = Double.NaN; } data[idIndex] = missingValue; table.addDataRow(new DoubleArrayDataRow(data)); } // create final example set ExampleSet resultSet = table.createExampleSet(specialAttributes); // sort final result resultSet = sorting.apply(resultSet); return resultSet; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.add(new ParameterTypeBoolean(PARAMETER_USE_GCD_FOR_STEP_SIZE, "Indicates if the greatest common divisor should be calculated and used as the underlying distance between all data points.", true)); ParameterType type = new ParameterTypeInt(PARAMETER_STEP_SIZE, "The used step size for filling the gaps (only used if GCD calculation is not checked).", 1, Integer.MAX_VALUE, 1); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_GCD_FOR_STEP_SIZE, false, false)); types.add(type); types.add(new ParameterTypeInt(PARAMETER_START, "If this parameter is defined gaps at the beginning (if they occur) before the first data point will also be filled.", 1, Integer.MAX_VALUE, true)); types.add(new ParameterTypeInt(PARAMETER_END, "If this parameter is defined gaps at the end (if they occur) after the last data point will also be filled.", 1, Integer.MAX_VALUE, true)); return types; } @Override public boolean writesIntoExistingData() { return false; //creates new table } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), FillDataGaps.class, null); } }