/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.transformation; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DoubleArrayDataRow; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.example.table.NominalMapping; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeList; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.Ontology; /** * This operator converts an example set by dividing examples * which consist of multiple observations (at different times) * into multiple examples, where each example covers on point * in time. An index attribute is added, which contains denotes * the actual point in time the example belongs to after the * transformation. The parameter <tt>keep_missings</tt> specifies * whether examples should be kept, even if if exhibits missing * values for all series at a certain point in time. * * @author Tobias Malbrecht * @version $Id: Attribute2ExamplePivoting.java,v 1.1 2008/09/08 18:57:13 tobiasmalbrecht Exp $ */ public class Attribute2ExamplePivoting extends ExampleSetTransformationOperator { public static final String PARAMETER_ATTRIBUTE_NAME_REGEX = "attributes"; public static final String PARAMETER_SERIES = "attribute_name"; public static final String PARAMETER_INDEX_ATTRIBUTE = "index_attribute"; public static final String PARAMETER_KEEP_MISSINGS = "keep_missings"; public Attribute2ExamplePivoting(OperatorDescription description) { super(description); } public IOObject[] apply() throws OperatorException { ExampleSet exampleSet = getInput(ExampleSet.class); List seriesList = getParameterList(PARAMETER_SERIES); int numberOfSeries = seriesList.size(); String[] seriesNames = new String[numberOfSeries]; Pattern[] seriesPatterns = new Pattern[numberOfSeries]; ArrayList<Vector<Attribute>> seriesAttributes = new ArrayList<Vector<Attribute>>(numberOfSeries); int[] attributeTypes = new int[numberOfSeries]; Iterator iterator = seriesList.iterator(); int j = 0; while (iterator.hasNext()) { Object[] pair = (Object[]) iterator.next(); seriesNames[j] = (String) pair[0]; seriesPatterns[j] = Pattern.compile((String) pair[1]); seriesAttributes.add(j, new Vector<Attribute>()); attributeTypes[j] = Ontology.ATTRIBUTE_VALUE; j++; } Vector<Attribute> newAttributes = new Vector<Attribute>(); Vector<Attribute> constantAttributes = new Vector<Attribute>(); // identify series attributes and check attribute types for (Attribute attribute : exampleSet.getAttributes()) { boolean matched = false; for (int i = 0; i < numberOfSeries; i++) { Matcher matcher = seriesPatterns[i].matcher(attribute.getName()); if (matcher.matches()) { matched = true; seriesAttributes.get(i).add(attribute); if (attributeTypes[i] != Ontology.ATTRIBUTE_VALUE) { if (attribute.getValueType() != attributeTypes[i]) { throw new OperatorException("attributes have different value types: no conversion is performed"); } } else { attributeTypes[i] = attribute.getValueType(); } break; } } if (!matched) { Attribute attributeCopy = AttributeFactory.createAttribute(attribute.getName(), attribute.getValueType()); if (attribute.isNominal()) { attributeCopy.setMapping((NominalMapping) attribute.getMapping().clone()); } newAttributes.add(attributeCopy); constantAttributes.add(attribute); } } // check series length int seriesLength = 0; if (numberOfSeries >= 1) { seriesLength = seriesAttributes.get(0).size(); for (int i = 0; i < numberOfSeries - 1; i++) { seriesLength = seriesAttributes.get(i).size(); if (seriesLength != seriesAttributes.get(i+1).size()) { throw new OperatorException("series must have the same length: no conversion is performed"); } } } // index attributes Attribute indexAttribute = AttributeFactory.createAttribute(getParameterAsString(PARAMETER_INDEX_ATTRIBUTE), Ontology.INTEGER); newAttributes.add(indexAttribute); // series attribtues for (int i = 0; i < numberOfSeries; i++) { Attribute seriesAttribute = AttributeFactory.createAttribute(seriesNames[i], attributeTypes[i]); newAttributes.add(seriesAttribute); } MemoryExampleTable table = new MemoryExampleTable(newAttributes); for (Example example : exampleSet) { int l = 0; for (int k = 0; k < seriesLength; k++) { l++; double[] data = new double[newAttributes.size()]; for (int i = 0; i < data.length; i++) { data[i] = Double.NaN; } // set constant attribute values for (int i = 0; i < constantAttributes.size(); i++) { data[i] = example.getValue(constantAttributes.get(i)); } // set index attribute value data[data.length - numberOfSeries - 1] = l; // set series attribute values boolean onlyMissings = true; for (int i = 0; i < numberOfSeries; i++) { Attribute seriesAttribute = seriesAttributes.get(i).get(k); double seriesValue = example.getValue(seriesAttribute); double newValue = Double.NaN; if (!Double.isNaN(seriesValue)) { if (seriesAttribute.isNominal()) { newValue = newAttributes.get(newAttributes.size() - numberOfSeries + i).getMapping().mapString(seriesAttribute.getMapping().mapIndex((int) seriesValue)); } else { newValue = seriesValue; } onlyMissings = false; } data[data.length - numberOfSeries + i] = newValue; } checkForStop(); if (!getParameterAsBoolean(PARAMETER_KEEP_MISSINGS) && onlyMissings) { continue; } else { table.addDataRow(new DoubleArrayDataRow(data)); } } } // create and deliver example set ExampleSet result = table.createExampleSet(); result.recalculateAllAttributeStatistics(); return new IOObject[] { result }; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType attributeNames = new ParameterTypeString(PARAMETER_ATTRIBUTE_NAME_REGEX, "Attributes that forms series.", false); ParameterType type = new ParameterTypeList(PARAMETER_SERIES, "Name of resulting attribute.", attributeNames); type.setExpert(false); types.add(type); type = new ParameterTypeString(PARAMETER_INDEX_ATTRIBUTE, "Name of index attribute.", false); type.setExpert(false); types.add(type); type = new ParameterTypeBoolean(PARAMETER_KEEP_MISSINGS, "Keep missing values.", false); type.setExpert(false); types.add(type); return types; } }