/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.features.construction;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.example.set.AttributeWeightedExampleSet;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowReader;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.expression.ExampleResolver;
import com.rapidminer.tools.expression.ExpressionException;
import com.rapidminer.tools.expression.ExpressionParser;
import com.rapidminer.tools.expression.internal.ExpressionParserUtils;


/**
 * If the example set contains two equivalent attributes, the one with the longer representation
 * is removed. The length of a representation is the number of characters of the attribute's
 * construction description. The equivalency probe is not done by structural comparison. The
 * attribute values of the equations in question are randomly sampled and the equation results
 * compared. If the difference is not greater than <i>epsilon</i> for <i>k</i> trials, the
 * equations are probably equivalent. At least they produce similar values. <br/>
 *
 * The values of the attributes are sampled in the range of the minimum and maximum values of the
 * attribute. This ensures equivalency or at least very similar values for the definition range in
 * question. Therefore an {@link ExampleTable} is constructed and filled with random values. Then
 * an {@link ExpressionParser} is used to construct the attribute values.
 *
 * @author Ingo Mierswa
 */
public class EquivalentAttributeRemoval extends ExampleSetBasedIndividualOperator {

	/**
	 * Indicates the number of examples which should be randomly generated to check equivalency.
	 */
	private final int numberOfSamples;

	/**
	 * If the difference is not greater than epsilon, the attributes are considered as equivalent.
	 */
	private final double epsilon;

	/** Recalculates attribute statistics before sampling. */
	private final boolean recalculateAttributeStatistics;

	/** The random generator for the example values. */
	private final RandomGenerator random;

	/** The operator which calls this class. */
	private final Operator operator;

	/**
	 * Creates a new equivalent attribute removal population operator.
	 *
	 * @param numberOfSamples
	 *            number of random examples generated for each equivalency check
	 * @param epsilon
	 *            maximal absolute difference up to which two values are regarded as equal
	 * @param recalculateAttributeStatistics
	 *            if {@code true}, all attribute statistics are recalculated before sampling
	 * @param random
	 *            the random generator used to create the sample values
	 * @param op
	 *            the calling operator, passed on to the expression parser
	 */
	public EquivalentAttributeRemoval(int numberOfSamples, double epsilon, boolean recalculateAttributeStatistics,
			RandomGenerator random, Operator op) {
		this.numberOfSamples = numberOfSamples;
		this.epsilon = epsilon;
		this.recalculateAttributeStatistics = recalculateAttributeStatistics;
		this.random = random;
		this.operator = op;
	}

	/**
	 * Compares all pairs of regular attributes of the individual's example set and removes one
	 * attribute of each pair that is found to be equivalent. Attributes with identical
	 * construction descriptions are treated as equivalent without sampling; for all other pairs
	 * both constructions are evaluated on a freshly generated random data set and compared via
	 * {@link #equivalent(ExampleSet, Attribute, Attribute)}.
	 *
	 * @param individual
	 *            the individual whose example set is cleaned up (the example set is modified)
	 * @return a list containing a single new individual wrapping the modified example set
	 */
	@Override
	public List<ExampleSetBasedIndividual> operate(ExampleSetBasedIndividual individual) throws ProcessStoppedException {
		AttributeWeightedExampleSet exampleSet = individual.getExampleSet();
		if (recalculateAttributeStatistics) {
			exampleSet.recalculateAllAttributeStatistics();
		}

		// "simple" attributes are those whose construction description is just their own name;
		// they form the columns of the random data sets
		List<Attribute> simpleAttributesList = new ArrayList<Attribute>();
		for (Attribute attribute : exampleSet.getExampleTable().getAttributes()) {
			if (attribute != null && attribute.getConstruction().equals(attribute.getName())) {
				simpleAttributesList.add(attribute);
			}
		}

		Map<String, Attribute> removeMap = new HashMap<String, Attribute>();
		Attribute[] attributeArray = exampleSet.getAttributes().createRegularAttributeArray();
		for (int i = 0; i < attributeArray.length; i++) {
			Attribute att1 = attributeArray[i];
			for (int j = i + 1; j < attributeArray.length; j++) {
				Attribute att2 = attributeArray[j];

				// identical constructions: no sampling necessary
				if (att1.getConstruction().equals(att2.getConstruction())) {
					removeMap.put(att2.getName(), att2);
					continue;
				}

				// create data set and attributes to check; a fresh set is needed for each pair
				// since the test attributes are added to it
				ExampleSet randomSet = ExampleSets.from(simpleAttributesList)//
						.withBlankSize(numberOfSamples)//
						.build();
				fillTableWithRandomValues(randomSet.getExampleTable(), exampleSet, random);

				try {
					ExampleResolver resolver = new ExampleResolver(exampleSet);
					ExpressionParser parser = ExpressionParserUtils.createAllModulesParser(operator, resolver);
					Attribute test1 = ExpressionParserUtils.addAttribute(randomSet, "test1", att1.getConstruction(),
							parser, resolver, operator);
					Attribute test2 = ExpressionParserUtils.addAttribute(randomSet, "test2", att2.getConstruction(),
							parser, resolver, operator);

					// add longer attribute to remove map if equivalent
					if (equivalent(randomSet, test1, test2)) {
						boolean firstIsLonger = att1.getConstruction().length() > att2.getConstruction().length();
						Attribute redundant = firstIsLonger ? att1 : att2;
						Attribute kept = firstIsLonger ? att2 : att1;
						removeMap.put(redundant.getName(), redundant);
						exampleSet.getLog().log(
								"Removing attribute " + redundant.getName() + "=" + redundant.getConstruction()
										+ " which is equivalent to " + kept.getName() + "=" + kept.getConstruction()
										+ ".");
					}
				} catch (ExpressionException e) {
					exampleSet.getLog().logWarning(
							"Cannot generate test attribute: " + e.getShortMessage()
									+ ". We just keep both attributes for sure...");
				}
			}
		}

		for (Attribute attribute : removeMap.values()) {
			exampleSet.getAttributes().remove(attribute);
		}

		List<ExampleSetBasedIndividual> l = new LinkedList<ExampleSetBasedIndividual>();
		l.add(new ExampleSetBasedIndividual(exampleSet));
		return l;
	}

	/**
	 * Checks whether the two test attributes have (almost) the same value in every example of the
	 * given example set.
	 *
	 * @param exampleSet
	 *            the sampled example set containing both test attributes
	 * @param test1
	 *            the first attribute to compare
	 * @param test2
	 *            the second attribute to compare
	 * @return {@code true} if the example set has fewer than two attributes or if for all examples
	 *         the absolute difference of both values is not greater than epsilon; {@code false}
	 *         otherwise, in particular if exactly one of the two values is NaN
	 */
	private boolean equivalent(ExampleSet exampleSet, Attribute test1, Attribute test2) {
		if (exampleSet.getAttributes().size() < 2) {
			return true;
		}
		Iterator<Example> reader = exampleSet.iterator();
		while (reader.hasNext()) {
			Example example = reader.next();
			double value1 = example.getValue(test1);
			double value2 = example.getValue(test2);
			// Any comparison involving NaN is false, so the epsilon check below can never reject
			// a pair in which only one side is NaN; such a pair must be rejected explicitly.
			if (Double.isNaN(value1) != Double.isNaN(value2)) {
				return false;
			}
			if (Math.abs(value1 - value2) > epsilon) {
				return false;
			}
		}
		return true;
	}

	/**
	 * After creation of a new ExampleTable with given size all values are 0. Use this method to
	 * fill the table with random values in the range specified by minimum and maximum values of the
	 * attributes. The minimum and maximum are read from the statistics of the given base set, so
	 * these statistics must already be available there. If the maximum is not greater than the
	 * minimum, the value {@code random.nextDouble() * 2 - 1} is used instead. This works only for
	 * numerical attributes. Nominal attribute values will be set to 0.
	 *
	 * @param exampleTable
	 *            the table whose data rows are filled
	 * @param baseSet
	 *            the example set providing the minimum and maximum statistics
	 * @param random
	 *            the random generator used to draw the values
	 */
	private static void fillTableWithRandomValues(ExampleTable exampleTable, ExampleSet baseSet,
			RandomGenerator random) {
		DataRowReader reader = exampleTable.getDataRowReader();
		Attribute[] attributes = exampleTable.getAttributes();
		while (reader.hasNext()) {
			DataRow dataRow = reader.next();
			for (Attribute attribute : attributes) {
				if (attribute == null) {
					continue;
				}
				if (attribute.isNominal()) {
					dataRow.set(attribute, 0);
					continue;
				}
				double min = baseSet.getStatistics(attribute, Statistics.MINIMUM);
				double max = baseSet.getStatistics(attribute, Statistics.MAXIMUM);
				if (max > min) {
					dataRow.set(attribute, random.nextDoubleInRange(min, max));
				} else {
					dataRow.set(attribute, random.nextDouble() * 2 - 1);
				}
			}
		}
	}
}