AttributeValueSplit.java example

Explorer
rapidminer-studio-master
- doc
  - doc
- src
/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 * 
 * Complete list of developers available at our web site:
 * 
 * http://rapidminer.com
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorVersion;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.operator.preprocessing.AbstractDataProcessing;
import com.rapidminer.operator.tools.AttributeSubsetSelector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeRegexp;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;


/**
 * <p>
 * This operator creates new attributes from a nominal attribute by dividing the nominal values into
 * parts according to a split criterion (regular expression). This operator provides two different
 * modes, depending on the setting of the parameter "split_mode". In both modes the original
 * attribute is only replaced if at least one of its values is actually divided into two or more
 * parts; otherwise it is left untouched. The parts are used exactly as delivered by the regular
 * expression, i.e. they are not trimmed.
 * </p>
 *
 * <h3>Ordered Splits</h3>
 * <p>
 * In the first split mode, called ordered_split, the resulting attributes get the name of the
 * original attribute together with an underscore and a number indicating the order. For example,
 * if the original data contained the values<br/>
 * <br/>
 *
 * attribute-name <br/>
 * -------------- <br/>
 * value1 <br/>
 * value2, value3 <br/>
 * value3 <br/>
 * <br/>
 *
 * and should be divided by the separating commas (including the whitespace following them), the
 * resulting attributes would be attribute-name_1 and attribute-name_2 with the tuples (value1, ?),
 * (value2, value3), and (value3, ?), respectively. The number of new attributes is the largest
 * number of parts found in any single value. This mode is useful if the original values indicated
 * some order like, for example, a preference.
 * </p>
 *
 * <h3>Unordered Splits</h3>
 * <p>
 * In the second split mode, called unordered_split, the resulting attributes get the name of the
 * original attribute together with an underscore and the value for each of the occurring values.
 * For example, if the original data contained the values<br/>
 * <br/>
 *
 * attribute-name <br/>
 * -------------- <br/>
 * value1 <br/>
 * value2, value3 <br/>
 * value3 <br/>
 * <br/>
 *
 * and again should be divided by the separating commas (including the whitespace following them),
 * the resulting attributes would be attribute-name_value1, attribute-name_value2, and
 * attribute-name_value3 with the tuples (true, false, false), (false, true, true), and (false,
 * false, true), respectively. This mode is useful if the order is not important but the goal is a
 * basket like data set containing all occurring values.
 * </p>
 *
 * @author Ingo Mierswa, Nils Woehler
 */
public class AttributeValueSplit extends AbstractDataProcessing {

	/** Key of the parameter holding the regular expression at which the nominal values are divided. */
	public static final String PARAMETER_SPLIT_PATTERN = "split_pattern";

	/** Key of the parameter selecting one of the {@link #SPLIT_MODES}. */
	public static final String PARAMETER_SPLIT_MODE = "split_mode";

	/** Names of the available split modes; the array index is the value of {@link #PARAMETER_SPLIT_MODE}. */
	public static final String[] SPLIT_MODES = new String[] { "ordered_split", "unordered_split" };

	/** Index of the ordered split mode in {@link #SPLIT_MODES}. */
	public static final int SPLIT_MODE_ORDERED = 0;

	/** Index of the unordered split mode in {@link #SPLIT_MODES}. */
	public static final int SPLIT_MODE_UNORDERED = 1;

	/** last version where selected but missing attributes were silently ignored */
	private static final OperatorVersion OPERATOR_VERSION_6_0_3 = new OperatorVersion(6, 0, 3);

	/**
	 * Minimum number of ordered split attributes announced in the meta data when the value set of
	 * the source attribute is only known as {@link SetRelation#SUBSET} or
	 * {@link SetRelation#UNKNOWN}, i.e. when the known values alone do not determine the real
	 * number of parts.
	 */
	private static final int ASSUMED_PARTS_FOR_UNCERTAIN_VALUE_SET = 3;

	/** Selects the attributes this operator works on; restricted to nominal attributes. */
	private final AttributeSubsetSelector attributeSubsetSelector = new AttributeSubsetSelector(this,
			getExampleSetInputPort(), Ontology.NOMINAL);

	public AttributeValueSplit(OperatorDescription description) {
		super(description);
	}

	/**
	 * Predicts the effect of {@link #apply(ExampleSet)} on the meta data: every selected, non-special
	 * nominal attribute is removed and replaced by the attributes the chosen split mode derives from
	 * its known value set.
	 *
	 * @param metaData
	 *            the meta data of the incoming example set; it is modified in place
	 * @return the given {@code metaData} instance
	 * @throws UndefinedParameterError
	 *             if a required parameter cannot be read
	 */
	@Override
	protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError {
		String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN);
		try {
			Pattern splittingPattern = Pattern.compile(splittingRegex);
			ExampleSetMetaData subset = attributeSubsetSelector.getMetaDataSubset(metaData, false, true);
			SetRelation attributeSetRelation = SetRelation.EQUAL;
			int type = getParameterAsInt(PARAMETER_SPLIT_MODE);
			for (AttributeMetaData amd : subset.getAllAttributes()) {
				if (amd.isSpecial() || !amd.isNominal()) {
					continue;
				}
				// the certainty about the resulting attribute set cannot exceed the certainty
				// about the values the new attributes are derived from
				attributeSetRelation = attributeSetRelation.merge(amd.getValueSetRelation());

				// removing old attribute
				metaData.removeAttribute(metaData.getAttributeByName(amd.getName()));

				switch (type) {
					case SPLIT_MODE_ORDERED:
						addOrderedSplitMetaData(metaData, amd, splittingPattern);
						break;
					case SPLIT_MODE_UNORDERED:
						addUnorderedSplitMetaData(metaData, amd, splittingPattern);
						break;
					default:
						break;
				}
			}

			metaData.mergeSetRelation(attributeSetRelation);
		} catch (PatternSyntaxException ignored) {
			// Deliberately ignored: the meta data transformation is best effort only. If the split
			// pattern itself is invalid, nothing has been modified at this point, and apply()
			// reports the invalid pattern as a UserError.
		}
		return metaData;
	}

	/**
	 * Adds the meta data of the attributes created by the ordered split of {@code amd}: one nominal
	 * attribute named {@code <name>_<position>} per part position.
	 */
	private void addOrderedSplitMetaData(ExampleSetMetaData metaData, AttributeMetaData amd,
			Pattern splittingPattern) {
		SetRelation valueSetRelation = amd.getValueSetRelation();

		int maxNumber = 0;
		if (valueSetRelation == SetRelation.SUBSET || valueSetRelation == SetRelation.UNKNOWN) {
			maxNumber = ASSUMED_PARTS_FOR_UNCERTAIN_VALUE_SET;
		}

		// split every known value once and remember the largest number of parts
		String[][] valueParts = new String[amd.getValueSet().size()][];
		int valueIndex = 0;
		for (String value : amd.getValueSet()) {
			valueParts[valueIndex] = splittingPattern.split(value);
			maxNumber = Math.max(maxNumber, valueParts[valueIndex].length);
			valueIndex++;
		}

		// creating new attributes
		for (int position = 0; position < maxNumber; position++) {
			AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + (position + 1), Ontology.NOMINAL);
			Set<String> valueSet = new HashSet<>();
			for (String[] parts : valueParts) {
				if (parts.length > position) {
					valueSet.add(parts[position]);
				}
			}
			newAmd.setValueSet(valueSet, valueSetRelation);
			if (position > 0) {
				// values with fewer parts leave the later positions missing
				newAmd.getNumberOfMissingValues().increaseByUnknownAmount();
			}
			metaData.addAttribute(newAmd);
		}
	}

	/**
	 * Adds the meta data of the attributes created by the unordered split of {@code amd}: one
	 * binominal true/false attribute named {@code <name>_<part>} per distinct part. Type and
	 * (sorted) order match the attributes created by
	 * {@link #unorderedSplit(ExampleSet, Attribute, Pattern)}.
	 */
	private void addUnorderedSplitMetaData(ExampleSetMetaData metaData, AttributeMetaData amd,
			Pattern splittingPattern) {
		SortedSet<String> splitValues = new TreeSet<>();
		for (String value : amd.getValueSet()) {
			splitValues.addAll(Arrays.asList(splittingPattern.split(value)));
		}

		// creating new attributes
		for (String splitValue : splitValues) {
			AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + splitValue, Ontology.BINOMINAL);
			Set<String> valueSet = new HashSet<>();
			valueSet.add("true");
			valueSet.add("false");

			newAmd.setValueSet(valueSet, amd.getValueSetRelation());
			metaData.addAttribute(newAmd);
		}
	}

	/**
	 * Splits every selected nominal attribute of the example set according to the configured
	 * pattern and split mode.
	 *
	 * @param exampleSet
	 *            the example set to work on; new attributes are added to it and split attributes
	 *            are removed from its attribute set
	 * @return the given {@code exampleSet} instance
	 * @throws OperatorException
	 *             a {@link UserError} if the split pattern is not a valid regular expression, or
	 *             any error raised while reading parameters or selecting the attribute subset
	 */
	@Override
	public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
		String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN);
		Pattern splittingPattern;
		try {
			splittingPattern = Pattern.compile(splittingRegex);
		} catch (PatternSyntaxException e) {
			throw new UserError(this, 206, splittingRegex, e.getMessage());
		}

		int type = getParameterAsInt(PARAMETER_SPLIT_MODE);
		// Until version 6.0.3 there was thrown no UserError when attributes were missing.
		// Compatibility check to avoid older processes to fail.
		boolean errorOnMissing = !getCompatibilityLevel().isAtMost(OPERATOR_VERSION_6_0_3);

		for (Attribute attribute : attributeSubsetSelector.getAttributeSubset(exampleSet, false, errorOnMissing)) {
			if (!attribute.isNominal()) {
				continue;
			}
			switch (type) {
				case SPLIT_MODE_ORDERED:
					orderedSplit(exampleSet, attribute, splittingPattern);
					break;
				case SPLIT_MODE_UNORDERED:
				default:
					unorderedSplit(exampleSet, attribute, splittingPattern);
					break;
			}
		}

		return exampleSet;
	}

	/**
	 * Replaces {@code attribute} by the nominal attributes {@code <name>_1 ... <name>_n}, where n is
	 * the largest number of parts of any value; the i-th new attribute receives the i-th part of
	 * each value. Nothing is changed if no value has at least two parts.
	 */
	private void orderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) {
		// check for maximum number
		int maxNumber = 0;
		for (Example example : exampleSet) {
			String[] parts = splittingPattern.split(example.getNominalValue(attribute));
			maxNumber = Math.max(maxNumber, parts.length);
		}

		if (maxNumber < 2) {
			// no value is actually divided: keep the original attribute
			return;
		}

		// create new attributes
		Attribute[] newAttributes = new Attribute[maxNumber];
		for (int a = 0; a < maxNumber; a++) {
			newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + (a + 1), Ontology.NOMINAL);
			exampleSet.getExampleTable().addAttribute(newAttributes[a]);
			exampleSet.getAttributes().addRegular(newAttributes[a]);
		}

		// fill new attributes with values
		for (Example example : exampleSet) {
			int p = 0;
			// check if value is missing, otherwise a "?" string could be filled in.
			if (!Double.isNaN(example.getValue(attribute))) {
				String[] parts = splittingPattern.split(example.getNominalValue(attribute));
				for (String part : parts) {
					example.setValue(newAttributes[p], newAttributes[p].getMapping().mapString(part));
					p++;
				}
			}

			// positions without a part (or all positions of a missing value) stay missing
			while (p < maxNumber) {
				example.setValue(newAttributes[p], Double.NaN);
				p++;
			}
		}
		exampleSet.getAttributes().remove(attribute);
	}

	/**
	 * Replaces {@code attribute} by one binominal attribute {@code <name>_<part>} per distinct part
	 * (in sorted order); an example gets "true" for every part contained in its value and "false"
	 * for all others. Nothing is changed if no value has at least two parts.
	 */
	private void unorderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) {
		// collect all distinct parts
		// NOTE(review): unlike orderedSplit, no missing value check is done before splitting here.
		// Presumably a missing value is delivered as a placeholder string (the comment in
		// orderedSplit mentions "?") and is then treated like a regular part - confirm whether
		// that is intended.
		SortedSet<String> allValues = new TreeSet<>();
		boolean splitFound = false;
		for (Example example : exampleSet) {
			String[] parts = splittingPattern.split(example.getNominalValue(attribute));
			allValues.addAll(Arrays.asList(parts));
			if (parts.length > 1) {
				splitFound = true;
			}
		}

		if (!splitFound) {
			// no value is actually divided: keep the original attribute
			return;
		}

		// create new attributes
		Attribute[] newAttributes = new Attribute[allValues.size()];
		Map<String, Integer> indexMap = new HashMap<>();
		int a = 0;
		for (Iterator<String> v = allValues.iterator(); v.hasNext(); a++) {
			String value = v.next();
			newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + value, Ontology.BINOMINAL);
			// map "false" before "true", as the original registration order
			newAttributes[a].getMapping().mapString("false");
			newAttributes[a].getMapping().mapString("true");
			exampleSet.getExampleTable().addAttribute(newAttributes[a]);
			exampleSet.getAttributes().addRegular(newAttributes[a]);
			indexMap.put(value, a);
		}

		// fill new attributes with values
		for (Example example : exampleSet) {
			// set all new attributes to false
			for (Attribute newAttribute : newAttributes) {
				example.setValue(newAttribute, newAttribute.getMapping().mapString("false"));
			}

			// every part was registered in indexMap by the first pass over the same data
			String[] parts = splittingPattern.split(example.getNominalValue(attribute));
			for (String part : parts) {
				Attribute newAttribute = newAttributes[indexMap.get(part)];
				example.setValue(newAttribute, newAttribute.getMapping().mapString("true"));
			}
		}
		exampleSet.getAttributes().remove(attribute);
	}

	/**
	 * Returns the inherited parameters followed by those of the attribute subset selector, the
	 * split pattern (default ",") and the split mode (default ordered).
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.addAll(attributeSubsetSelector.getParameterTypes());

		ParameterType type = new ParameterTypeRegexp(PARAMETER_SPLIT_PATTERN,
				"The pattern which is used for dividing the nominal values into different parts.", ",");
		type.setExpert(false);
		types.add(type);

		type = new ParameterTypeCategory(PARAMETER_SPLIT_MODE,
				"The split mode of this operator, either ordered splits (keeping the original order) or unordered (keeping basket-like information).",
				SPLIT_MODES, SPLIT_MODE_ORDERED);
		type.setExpert(false);
		types.add(type);

		return types;
	}

	@Override
	public boolean writesIntoExistingData() {
		return false;
	}

	@Override
	public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
		return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), AttributeValueSplit.class,
				attributeSubsetSelector);
	}

	/**
	 * Appends {@link #OPERATOR_VERSION_6_0_3} to the inherited list of versions after which the
	 * behavior of this operator changed.
	 */
	@Override
	public OperatorVersion[] getIncompatibleVersionChanges() {
		OperatorVersion[] inherited = super.getIncompatibleVersionChanges();
		OperatorVersion[] changes = Arrays.copyOf(inherited, inherited.length + 1);
		changes[inherited.length] = OPERATOR_VERSION_6_0_3;
		return changes;
	}
}