/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.discretization;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Map.Entry;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeRole;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.SimpleAttributes;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.NominalMapping;
import com.rapidminer.example.table.PolynominalMapping;
import com.rapidminer.example.table.ViewAttribute;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.preprocessing.PreprocessingModel;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.container.Tupel;

/**
 * The generic discretization model.
 * <p>
 * For every discretized attribute (identified by name) the model stores a sorted set of
 * {@code (upper limit, range name)} tuples. A numerical value is mapped to the position of the
 * first tuple, in the set's iteration order, whose upper limit is greater than or equal to the
 * value. Values that match no tuple (including {@code NaN}) are mapped to {@code NaN}.
 * <p>
 * NOTE(review): the iteration order of the sets is whatever {@link Tupel}'s natural ordering
 * yields; this class assumes it is ascending by upper limit - confirm against {@code Tupel}.
 *
 * @author Sebastian Land
 */
public class DiscretizationModel extends PreprocessingModel {

    private static final long serialVersionUID = -8732346419946567062L;

    /** Separator between lower and upper bound inside an interval range name. */
    private static final String INTERVAL_SEPARATOR = " - ";

    /** Exclusive upper bound of fraction digits tried while searching for distinct interval names. */
    private static final int MAX_NUMBER_OF_DIGITS = 30;

    /** Number of digits passed to {@code createRanges} for non-interval names if none was requested. */
    private static final int DEFAULT_NUMBER_OF_DIGITS = 3;

    /** Code of the {@link UserError} thrown when no distinct interval names can be generated. */
    private static final int ERROR_CODE_INDISTINCT_RANGE_NAMES = 938;

    /** Maps attribute names to their ranges; stays {@code null} until one of the {@code setRanges} methods is called. */
    private Map<String, SortedSet<Tupel<Double, String>>> rangesMap;

    /** Names of the numerical attributes that were present when the model was created. */
    private Set<String> attributeNames;

    /** If set, nominal attributes with fewer than two values are removed after application. */
    private boolean removeUseless = true;

    public static final String[] RANGE_NAME_TYPES = { "long", "short", "interval" };
    public static final int RANGE_NAME_LONG = 0;
    public static final int RANGE_NAME_SHORT = 1;
    public static final int RANGE_NAME_INTERVAL = 2;

    /**
     * Creates a model that does not remove useless attributes.
     *
     * @param exampleSet the example set whose numerical attributes will be discretized
     */
    public DiscretizationModel(ExampleSet exampleSet) {
        this(exampleSet, false);
    }

    /**
     * Creates a model and remembers the names of all numerical attributes returned by iterating
     * {@code exampleSet.getAttributes()}.
     *
     * @param exampleSet    the example set whose numerical attributes will be discretized
     * @param removeUseless whether nominal attributes with fewer than two values should be removed
     *                      when the model is applied
     */
    public DiscretizationModel(ExampleSet exampleSet, boolean removeUseless) {
        super(exampleSet);
        attributeNames = new HashSet<String>();
        for (Attribute attribute : exampleSet.getAttributes()) {
            if (attribute.isNumerical()) {
                attributeNames.add(attribute.getName());
            }
        }
        this.removeUseless = removeUseless;
    }

    /**
     * Replaces every remembered numerical attribute by a new nominal attribute holding the range
     * the original value falls into. The new attribute takes over name and role of the old one.
     * <p>
     * Values that are missing, or that exceed every stored upper limit, become missing
     * ({@code NaN}) in the new attribute. This matches {@link #getValue(Attribute, double)};
     * previously a value above all limits was simply not written.
     *
     * @param exampleSet the example set to modify
     * @return the given example set after modification
     * @throws OperatorException declared by the overridden method
     */
    @Override
    public ExampleSet applyOnData(ExampleSet exampleSet) throws OperatorException {
        // creating new nominal attributes for numerical ones
        Map<Attribute, Attribute> replacementMap = new LinkedHashMap<Attribute, Attribute>();
        Map<String, AttributeRole> replacementRoleMap = new HashMap<String, AttributeRole>();

        ExampleTable table = exampleSet.getExampleTable();
        Attributes attributes = exampleSet.getAttributes();
        Iterator<Attribute> iterator = attributes.allAttributes();
        while (iterator.hasNext()) {
            Attribute attribute = iterator.next();
            if (attribute.isNumerical() && attributeNames.contains(attribute.getName())) {
                Attribute newAttribute = AttributeFactory.createAttribute(Ontology.NOMINAL);
                replacementMap.put(attribute, newAttribute);
                AttributeRole role = attributes.getRole(attribute);
                if (role != null) {
                    replacementRoleMap.put(attribute.getName(), role);
                }
            }
        }
        Set<Entry<Attribute, Attribute>> replacements = replacementMap.entrySet();

        // creating mapping and adding to table and exampleSet
        for (Entry<Attribute, Attribute> replacement : replacements) {
            SortedSet<Tupel<Double, String>> ranges = rangesMap.get(replacement.getKey().getName());
            Attribute newAttribute = replacement.getValue();
            table.addAttribute(newAttribute);
            exampleSet.getAttributes().addRegular(newAttribute);
            if (ranges != null) {
                for (Tupel<Double, String> rangePair : ranges) {
                    newAttribute.getMapping().mapString(rangePair.getSecond());
                }
            }
        }

        // copying data
        for (Example example : exampleSet) {
            for (Entry<Attribute, Attribute> replacement : replacements) {
                Attribute originalAttribute = replacement.getKey();
                SortedSet<Tupel<Double, String>> ranges = rangesMap.get(originalAttribute.getName());
                if (ranges != null) {
                    // always write a value: NaN for missing input and for values above all limits
                    double rangeIndex = findRangeIndex(ranges, example.getValue(originalAttribute));
                    example.setValue(replacement.getValue(), rangeIndex);
                }
            }
        }

        // removing old attributes and assigning final names and role to new
        for (Map.Entry<Attribute, Attribute> entry : replacements) {
            Attribute oldAttribute = entry.getKey();
            Attribute newAttribute = entry.getValue();
            String name = oldAttribute.getName();
            exampleSet.getAttributes().remove(oldAttribute);
            if (replacementRoleMap.containsKey(name)) {
                exampleSet.getAttributes().getRole(newAttribute).setSpecial(replacementRoleMap.get(name).getSpecialName());
            }
            newAttribute.setName(name);
        }

        // removing useless nominal attributes
        if (removeUseless) {
            iterator = exampleSet.getAttributes().iterator();
            while (iterator.hasNext()) {
                Attribute attribute = iterator.next();
                if (attribute.isNominal() && attribute.getMapping().size() < 2) {
                    iterator.remove();
                }
            }
        }

        return exampleSet;
    }

    /**
     * Creates the ranges. If the range name type is 'Interval', the number of digits is increased,
     * starting at the given number (or at 1 if the given number is not positive), until the range
     * names do actually differ; that way they are rounded as far as possible. For all other
     * range name types the given number of digits is used, or 3 if it is not positive.
     * <p>
     * At least one attempt is always made, even if {@code numberOfDigits} is 30 or larger.
     *
     * @param rangesMap      maps each attribute to the upper limits of its ranges
     * @param rangeName      base name used for 'long' and 'short' range names
     * @param rangeNameType  one of {@link #RANGE_NAME_LONG}, {@link #RANGE_NAME_SHORT} and
     *                       {@link #RANGE_NAME_INTERVAL}
     * @param numberOfDigits requested number of digits, see above
     * @throws UserError (code 938) if no distinct interval names could be generated with fewer
     *                   than 30 digits
     */
    public void setRanges(Map<Attribute, double[]> rangesMap, String rangeName, int rangeNameType, int numberOfDigits) throws UserError {
        this.rangesMap = new HashMap<String, SortedSet<Tupel<Double, String>>>();
        for (Map.Entry<Attribute, double[]> entry : rangesMap.entrySet()) {
            double[] limits = entry.getValue();
            TreeSet<Tupel<Double, String>> ranges;
            if (rangeNameType == RANGE_NAME_INTERVAL) {
                ranges = createDistinctIntervalRanges(limits, rangeName, numberOfDigits);
            } else {
                int digits = (numberOfDigits > 0) ? numberOfDigits : DEFAULT_NUMBER_OF_DIGITS;
                ranges = createRanges(limits, rangeName, rangeNameType, digits);
            }
            this.rangesMap.put(entry.getKey().getName(), ranges);
        }
    }

    /**
     * Creates interval-named ranges, increasing the number of digits until all names are valid.
     *
     * @throws UserError (code 938) if no valid naming was found
     */
    private TreeSet<Tupel<Double, String>> createDistinctIntervalRanges(double[] limits, String rangeName, int numberOfDigits) throws UserError {
        int digits = Math.max(numberOfDigits, 1);
        while (true) {
            TreeSet<Tupel<Double, String>> ranges = createRanges(limits, rangeName, RANGE_NAME_INTERVAL, digits);
            if (hasDistinctIntervalNames(ranges)) {
                return ranges;
            }
            digits++;
            if (digits >= MAX_NUMBER_OF_DIGITS) {
                throw new UserError(null, ERROR_CODE_INDISTINCT_RANGE_NAMES);
            }
        }
    }

    /**
     * Checks interval names of the form {@code "[lower - upper]"}: no two consecutive names may
     * be equal and within each name the lower and upper bound texts must differ.
     */
    private static boolean hasDistinctIntervalNames(SortedSet<Tupel<Double, String>> ranges) {
        String previousName = null;
        for (Tupel<Double, String> range : ranges) {
            String name = range.getSecond();
            if (name.equals(previousName)) {
                return false;
            }
            int separatorIndex = name.indexOf(INTERVAL_SEPARATOR);
            // strip the enclosing brackets from both bounds
            String lowerBound = name.substring(1, separatorIndex);
            String upperBound = name.substring(separatorIndex + INTERVAL_SEPARATOR.length(), name.length() - 1);
            if (lowerBound.equals(upperBound)) {
                return false;
            }
            previousName = name;
        }
        return true;
    }

    /**
     * Builds one {@code (upper limit, name)} tuple per given limit. The lower bound shown in the
     * name of a range is the previous limit, or negative infinity for the first range. For an
     * unknown range name type the name of every tuple is {@code null}.
     */
    private TreeSet<Tupel<Double, String>> createRanges(double[] limits, String rangeBaseName, int rangeNameType, int numberOfDigits) {
        TreeSet<Tupel<Double, String>> ranges = new TreeSet<Tupel<Double, String>>();
        int i = 1;
        double lastLimit = Double.NEGATIVE_INFINITY;
        for (double rangeValue : limits) {
            String usedRangeName = null;
            switch (rangeNameType) {
            case RANGE_NAME_LONG:
                usedRangeName = (rangeBaseName + i) + " [" + Tools.formatIntegerIfPossible(lastLimit) + INTERVAL_SEPARATOR + Tools.formatIntegerIfPossible(rangeValue) + "]";
                break;
            case RANGE_NAME_SHORT:
                usedRangeName = (rangeBaseName + i);
                break;
            case RANGE_NAME_INTERVAL:
                usedRangeName = "[" + Tools.formatNumber(lastLimit, numberOfDigits) + INTERVAL_SEPARATOR + Tools.formatNumber(rangeValue, numberOfDigits) + "]";
                break;
            }
            ranges.add(new Tupel<Double, String>(rangeValue, usedRangeName));
            i++;
            lastLimit = rangeValue;
        }
        return ranges;
    }

    /**
     * Sets the ranges directly. The given map is stored as is, without copying.
     *
     * @param rangesMap maps attribute names to their sorted {@code (upper limit, name)} tuples
     */
    public void setRanges(Map<String, SortedSet<Tupel<Double, String>>> rangesMap) {
        this.rangesMap = rangesMap;
    }

    /**
     * Returns the internally stored ranges map (not a copy).
     *
     * @return the ranges map, or {@code null} if no ranges have been set yet
     */
    public Map<String, SortedSet<Tupel<Double, String>>> getRanges() {
        return this.rangesMap;
    }

    /**
     * Lists, for every attribute, the chain of range names and their upper limits. Returns an
     * empty string if no ranges have been set yet.
     */
    @Override
    public String toString() {
        StringBuilder buffer = new StringBuilder();
        if (rangesMap == null) {
            return buffer.toString();
        }
        for (Map.Entry<String, SortedSet<Tupel<Double, String>>> entry : rangesMap.entrySet()) {
            buffer.append(Tools.getLineSeparator());
            buffer.append(Tools.getLineSeparator());
            buffer.append(entry.getKey());
            buffer.append(Tools.getLineSeparator());
            buffer.append(Double.NEGATIVE_INFINITY);
            SortedSet<Tupel<Double, String>> set = entry.getValue();
            if (set != null) {
                for (Tupel<Double, String> tupel : set) {
                    buffer.append(" < ").append(tupel.getSecond()).append(" <= ").append(tupel.getFirst());
                }
            }
        }
        return buffer.toString();
    }

    /**
     * Builds the attribute set resulting from applying this model as a view on the given set.
     * Special attributes are copied. Regular attributes that are not discretized by this model
     * are kept. Discretized attributes with more than one range are replaced by a polynominal
     * {@link ViewAttribute} of the same name; those with at most one range are left out.
     * <p>
     * A selected attribute without any stored ranges is kept unchanged instead of causing a
     * {@code NullPointerException}.
     *
     * @param parentSet the example set the view is based on
     * @return the attributes of the view
     */
    public Attributes getTargetAttributes(ExampleSet parentSet) {
        SimpleAttributes attributes = new SimpleAttributes();
        // add special attributes to new attributes
        Iterator<AttributeRole> specialRoles = parentSet.getAttributes().specialAttributes();
        while (specialRoles.hasNext()) {
            attributes.add(specialRoles.next());
        }
        // add regular attributes
        for (Attribute attribute : parentSet.getAttributes()) {
            if (!attribute.isNumerical() || !attributeNames.contains(attribute.getName())) {
                attributes.addRegular(attribute);
                continue;
            }
            SortedSet<Tupel<Double, String>> ranges = rangesMap.get(attribute.getName());
            if (ranges == null) {
                // nothing known about this attribute: leave it untouched
                attributes.addRegular(attribute);
            } else if (ranges.size() > 1) {
                // create nominal mapping
                NominalMapping mapping = new PolynominalMapping();
                for (Tupel<Double, String> rangePair : ranges) {
                    mapping.mapString(rangePair.getSecond());
                }
                // giving new attributes old name: connection to rangesMap
                attributes.addRegular(new ViewAttribute(this, attribute, attribute.getName(), Ontology.POLYNOMINAL, mapping));
            }
        }
        return attributes;
    }

    /**
     * Maps a numerical value to the index of its range.
     *
     * @param targetAttribute the attribute whose name selects the ranges
     * @param value           the numerical value to discretize
     * @return the index of the first range whose upper limit is not smaller than the value,
     *         {@code NaN} if there is no such range, or the unchanged value if no ranges are
     *         stored for the attribute
     */
    public double getValue(Attribute targetAttribute, double value) {
        SortedSet<Tupel<Double, String>> ranges = rangesMap.get(targetAttribute.getName());
        if (ranges == null) {
            return value;
        }
        return findRangeIndex(ranges, value);
    }

    /**
     * Returns the position (in iteration order) of the first range whose upper limit is greater
     * than or equal to the value, or {@code NaN} if there is none. A {@code NaN} value never
     * matches because every comparison with {@code NaN} is false.
     */
    private static double findRangeIndex(SortedSet<Tupel<Double, String>> ranges, double value) {
        int index = 0;
        for (Tupel<Double, String> rangePair : ranges) {
            if (value <= rangePair.getFirst().doubleValue()) {
                return index;
            }
            index++;
        }
        return Double.NaN;
    }
}