/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.learner.associations.fpgrowth; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Tools; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessStoppedException; import com.rapidminer.operator.io.ExampleSource; import com.rapidminer.operator.learner.associations.BooleanAttributeItem; import com.rapidminer.operator.learner.associations.FrequentItemSet; import com.rapidminer.operator.learner.associations.FrequentItemSets; import com.rapidminer.operator.learner.associations.Item; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.OutputPort; import com.rapidminer.operator.ports.metadata.ExampleSetPrecondition; 
import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.BooleanParameterCondition; import com.rapidminer.tools.Ontology; /** * <p> * This operator calculates all frequent items sets from a data set by building a FPTree data structure on the * transaction data base. This is a very compressed copy of the data which in many cases fits into main memory even for * large data bases. From this FPTree all frequent item set are derived. A major advantage of FPGrowth compared to * Apriori is that it uses only 2 data scans and is therefore often applicable even on large data sets. * </p> * * <p> * Please note that the given data set is only allowed to contain binominal attributes, i.e. nominal attributes with * only two different values. Simply use the provided preprocessing operators in order to transform your data set. The * necessary operators are the discretization operators for changing the value types of numerical attributes to nominal * and the operator Nominal2Binominal for transforming nominal attributes into binominal / binary ones. * </p> * * <p> * The frequent item sets are mined for the positive entries in your data base, i.e. for those nominal values which are * defined as positive in your data base. If you use an attribute description file (.aml) for the {@link ExampleSource} * operator this corresponds to the second value which is defined via the classes attribute or inner value tags. * </p> * * <p> * If your data does not specify the positive entries correctly, you may set them using the parameter positive_value. * This only works if all your attributes contain this value! 
* </p>
 *
 * <p>
 * This operator has two basic working modes: finding at least the specified number of item sets with highest support
 * without taking the min_support into account (default) or finding all item sets with a support larger than min_support.
 * </p>
 *
 * @author Sebastian Land, Ingo Mierswa
 */
public class FPGrowth extends Operator {

    /**
     * Indicates if this operator should try to find a minimum number of item sets by iteratively decreasing the minimum
     * support.
     */
    public static final String PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS = "find_min_number_of_itemsets";

    /** Indicates the minimum number of item sets by iteratively decreasing the minimum support. */
    public static final String PARAMETER_MIN_NUMBER_OF_ITEMSETS = "min_number_of_itemsets";

    /** The parameter name for the maximal number of times the minimum support is lowered. */
    public static final String PARAMETER_MAX_REDUCTION_STEPS = "max_number_of_retries";

    /** The parameter name for the nominal value which is treated as the positive one. */
    public static final String PARAMETER_POSITIVE_VALUE = "positive_value";

    /** The parameter name for "Minimal Support" */
    public static final String PARAMETER_MIN_SUPPORT = "min_support";

    /** The parameter name the maximum number of items. */
    public static final String PARAMETER_MAX_ITEMS = "max_items";

    private static final String PARAMETER_MUST_CONTAIN = "must_contain";

    private static final String PARAMETER_KEEP_EXAMPLE_SET = "keep_example_set";

    /**
     * Factor the relative minimum support is multiplied with after every mining pass, i.e. the support is lowered by
     * 10 percent per retry. The parameter descriptions in {@link #getParameterTypes()} state this number.
     */
    private static final double SUPPORT_REDUCTION_FACTOR = 0.9;

    private final InputPort exampleSetInput = getInputPorts().createPort("example set");

    private final OutputPort exampleSetOutput = getOutputPorts().createPort("example set");

    private final OutputPort frequentSetsOutput = getOutputPorts().createPort("frequent sets");

    /**
     * Creates the operator, registers the binominal precondition on the input port and declares the meta data
     * rules for both output ports.
     *
     * @param description
     *            the operator description handed to the super class
     */
    public FPGrowth(OperatorDescription description) {
        super(description);
        exampleSetInput.addPrecondition(new ExampleSetPrecondition(exampleSetInput, Ontology.BINOMINAL));
        getTransformer().addGenerationRule(frequentSetsOutput, FrequentItemSets.class);
        getTransformer().addPassThroughRule(exampleSetInput, exampleSetOutput);
    }

    /**
     * Mines the frequent item sets of the input example set. The mining pass is repeated with a minimum support
     * lowered by {@link #SUPPORT_REDUCTION_FACTOR} as long as fewer item sets than requested were found and the
     * maximal number of retries is not reached. The unchanged input example set and the item sets of the last pass
     * are delivered at the output ports.
     *
     * @throws OperatorException
     *             if the input is not purely nominal, a parameter cannot be read or the process was stopped
     */
    @Override
    public void doWork() throws OperatorException {
        ExampleSet exampleSet = exampleSetInput.getData();

        // check
        Tools.onlyNominalAttributes(exampleSet, "FPGrowth");

        boolean shouldFindMinimumNumber = getParameterAsBoolean(PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS);
        int maximalNumberOfRetries = shouldFindMinimumNumber ? getParameterAsInt(PARAMETER_MAX_REDUCTION_STEPS) : 1;
        int minimumNumberOfItemsets = shouldFindMinimumNumber ? getParameterAsInt(PARAMETER_MIN_NUMBER_OF_ITEMSETS) : 1;
        int maxItems = getParameterAsInt(PARAMETER_MAX_ITEMS);
        double currentSupport = getParameterAsDouble(PARAMETER_MIN_SUPPORT);

        // parameters are constant during one execution, so they are read once instead of per attribute / per retry
        String positiveValue = getConfiguredPositiveValue();
        String mustContainItems = getParameterAsString(PARAMETER_MUST_CONTAIN);
        Pattern mustContainPattern = null;
        // an empty expression means "no restriction", exactly like an undefined one
        if (mustContainItems != null && mustContainItems.length() > 0) {
            mustContainPattern = Pattern.compile(mustContainItems);
        }

        // determine frequent items sets
        FrequentItemSets sets = null;
        int retryCount = 0;
        while ((sets == null) || (sets.size() < minimumNumberOfItemsets && retryCount < maximalNumberOfRetries)) {
            int currentMinTotalSupport = (int) Math.ceil(currentSupport * exampleSet.size());

            // pre-computing data properties; a fresh clone is needed per pass because
            // removeNonFrequentItems(...) removes attributes from the working set
            ExampleSet workingSet = preprocessExampleSet(exampleSet);

            // determining attributes and their positive indices
            int attributeCount = workingSet.getAttributes().size();
            Attribute[] attributes = new Attribute[attributeCount];
            double[] positiveIndices = new double[attributeCount];
            int i = 0;
            for (Attribute attribute : workingSet.getAttributes()) {
                attributes[i] = attribute;
                positiveIndices[i] = attribute.getMapping().getPositiveIndex();
                if (positiveValue != null) {
                    // user defined positive value overrides the one of the mapping
                    positiveIndices[i] = attribute.getMapping().mapString(positiveValue);
                }
                i++;
            }

            // map attributes to items
            Map<Attribute, Item> itemMapping = getAttributeMapping(workingSet);

            // computing frequency of 1-Item Sets
            getItemFrequency(workingSet, attributes, positiveIndices, itemMapping);

            // eliminating non frequent items
            removeNonFrequentItems(itemMapping, currentMinTotalSupport, workingSet);

            // generating FP Tree
            FPTree tree = getFPTree(workingSet, attributes, positiveIndices, itemMapping);

            // mine tree
            sets = new FrequentItemSets(workingSet.size());
            if (mustContainPattern == null) {
                mineTree(tree, sets, 0, currentMinTotalSupport, maxItems);
            } else {
                mineTreeWithRequiredItems(tree, sets, itemMapping, mustContainPattern, currentMinTotalSupport, maxItems);
            }

            currentSupport *= SUPPORT_REDUCTION_FACTOR;
            retryCount++;
        }

        exampleSetOutput.deliver(exampleSet);
        frequentSetsOutput.deliver(sets);
    }

    /**
     * Reads the optional positive value parameter.
     *
     * @return the configured positive value or <code>null</code> if it is undefined or empty, in which case the
     *         positive index of each attribute mapping is used
     */
    private String getConfiguredPositiveValue() {
        String positiveValue = null;
        try {
            positiveValue = getParameterAsString(PARAMETER_POSITIVE_VALUE);
        } catch (UndefinedParameterError ignored) {
            // deliberately ignored: the parameter is optional, an undefined value is treated like a blank one
        }
        if (positiveValue == null || positiveValue.equals("")) {
            return null;
        }
        return positiveValue;
    }

    /**
     * Mines the tree with every item whose attribute name matches the given pattern as fixed conditional items.
     * For each matching item its frequencies are propagated towards the root on the next depth, then the set of all
     * matching items is added to the result and the tree is mined starting below these items.
     *
     * @param tree
     *            the FPTree to mine
     * @param sets
     *            the collection the found frequent item sets are added to
     * @param itemMapping
     *            the mapping of attributes to items
     * @param mustContainPattern
     *            the regular expression which has to match the complete attribute name
     * @param minTotalSupport
     *            the absolute minimum support
     * @param maxItems
     *            the maximal item set length, values smaller than 1 disable the bound
     */
    private void mineTreeWithRequiredItems(FPTree tree, FrequentItemSets sets, Map<Attribute, Item> itemMapping,
            Pattern mustContainPattern, int minTotalSupport, int maxItems) throws ProcessStoppedException {
        FrequentItemSet conditionalItems = new FrequentItemSet();
        int depth = 0;
        for (Entry<Attribute, Item> attributeEntry : itemMapping.entrySet()) {
            Matcher matcher = mustContainPattern.matcher(attributeEntry.getKey().getName());
            if (matcher.matches()) {
                Item targetItem = attributeEntry.getValue();

                // building conditional items
                // NOTE(review): assumes the header table holds an entry for every matching item - confirm for
                // items which never occur in any transaction
                Header targetItemHeader = tree.getHeaderTable().get(targetItem);
                propagateFrequencies(tree, targetItemHeader, depth);

                // add item to conditional items
                int itemSupport = targetItemHeader.getFrequencies().getFrequency(depth);
                conditionalItems.addItem(targetItem, itemSupport);
                depth++;
            }
        }
        // add this conditional items to frequentSets
        sets.addFrequentSet(conditionalItems);
        mineTree(tree, sets, depth, conditionalItems, minTotalSupport, maxItems);
    }

    /**
     * Runs over the sibling chain of the given header and propagates every positive node frequency of the given
     * depth to all ancestors up to (excluding) the root. The frequency is added on depth + 1 both to the ancestor
     * nodes and to the header table entries of their items.
     *
     * @param tree
     *            the FPTree, which is also the root node terminating the upward walk
     * @param itemHeader
     *            the header whose sibling chain is processed
     * @param recursionDepth
     *            the depth the frequencies are read from; they are written to the next depth
     */
    private void propagateFrequencies(FPTree tree, Header itemHeader, int recursionDepth) {
        Map<Item, Header> headerTable = tree.getHeaderTable();
        // run over sibling chain
        for (FPTreeNode node : itemHeader.getSiblingChain()) {
            // and propagate frequency to root
            int frequency = node.getFrequency(recursionDepth);
            // if frequency is positive
            if (frequency > 0) {
                FPTreeNode currentNode = node.getFather();
                while (currentNode != tree) {
                    // increase node frequency
                    currentNode.increaseFrequency(recursionDepth + 1, frequency);
                    // increase item frequency in headerTable
                    headerTable.get(currentNode.getNodeItem()).getFrequencies().increaseFrequency(recursionDepth + 1, frequency);
                    // go up in tree
                    currentNode = currentNode.getFather();
                }
            }
        }
    }

    /**
     * Clones the given example set and removes all attributes which are not binominal from the clone. A warning is
     * logged if attributes had to be removed.
     *
     * @param exampleSet
     *            the example set to prepare; it is not modified itself
     * @return the clone restricted to nominal attributes with exactly two values
     */
    private ExampleSet preprocessExampleSet(ExampleSet exampleSet) {
        // precomputing data properties
        ExampleSet workingSet = (ExampleSet) exampleSet.clone();

        // remove unusable attributes
        int oldAttributeCount = workingSet.getAttributes().size();
        removeNonBooleanAttributes(workingSet);
        int newAttributeCount = workingSet.getAttributes().size();
        if (oldAttributeCount != newAttributeCount) {
            int removeCount = oldAttributeCount - newAttributeCount;
            String message;
            if (removeCount == 1) {
                message = "Removed 1 non-binominal attribute, frequent item set mining is only supported for the positive values of binominal attributes.";
            } else {
                message = "Removed " + removeCount + " non-binominal attributes, frequent item set mining is only supported for the positive values of binominal attributes.";
            }
            logWarning(message);
        }
        return workingSet;
    }

    /**
     * Mines the tree starting with an empty set of conditional items.
     *
     * @see #mineTree(FPTree, FrequentItemSets, int, FrequentItemSet, int, int)
     */
    private void mineTree(FPTree tree, FrequentItemSets sets, int recursionDepth, int minTotalSupport, int maxItems) throws ProcessStoppedException {
        mineTree(tree, sets, recursionDepth, new FrequentItemSet(), minTotalSupport, maxItems);
    }

    /**
     * Recursively mines the tree. For every item whose frequency on the current depth reaches the minimum support,
     * its frequencies are propagated to depth + 1, the conditional items extended by this item are added to the
     * result, the tree is mined on depth + 1 and finally the frequencies of depth + 1 are popped again.
     *
     * @param tree
     *            the FPTree to mine
     * @param sets
     *            the collection the found frequent item sets are added to
     * @param recursionDepth
     *            the current depth, used as index into the frequency stacks
     * @param conditionalItems
     *            the items every set found in this call contains; it is cloned before being extended
     * @param minTotalSupport
     *            the absolute minimum support
     * @param maxItems
     *            the maximal item set length, values smaller than 1 disable the bound
     * @throws ProcessStoppedException
     *             if the process was stopped
     */
    private void mineTree(FPTree tree, FrequentItemSets sets, int recursionDepth, FrequentItemSet conditionalItems, int minTotalSupport, int maxItems) throws ProcessStoppedException {
        checkForStop();
        if (treeIsEmpty(tree, recursionDepth)) {
            return;
        }
        if (maxItems > 0 && recursionDepth >= maxItems) {
            return;
        }

        // recursively mine tree
        Map<Item, Header> headerTable = tree.getHeaderTable();
        for (Map.Entry<Item, Header> headerEntry : headerTable.entrySet()) {
            Item item = headerEntry.getKey();
            Header itemHeader = headerEntry.getValue();

            // check for minSupport
            int itemSupport = itemHeader.getFrequencies().getFrequency(recursionDepth);
            if (itemSupport < minTotalSupport) {
                continue;
            }

            propagateFrequencies(tree, itemHeader, recursionDepth);

            FrequentItemSet recursiveConditionalItems = (FrequentItemSet) conditionalItems.clone();
            // add item to conditional items
            recursiveConditionalItems.addItem(item, itemSupport);
            // add this conditional items to frequentSets
            sets.addFrequentSet(recursiveConditionalItems);

            // recursively mine new tree
            mineTree(tree, sets, recursionDepth + 1, recursiveConditionalItems, minTotalSupport, maxItems);

            // run over sibling chain for popping frequency stack
            for (FPTreeNode node : itemHeader.getSiblingChain()) {
                // and remove propagation of frequency
                FPTreeNode currentNode = node.getFather();
                while (currentNode != tree) {
                    // pop frequency
                    currentNode.popFrequency(recursionDepth + 1);
                    // go up in tree
                    currentNode = currentNode.getFather();
                }
            }
            // pop frequencies of every header table on current recursion depth
            for (Header currentItemHeader : headerTable.values()) {
                currentItemHeader.getFrequencies().popFrequency(recursionDepth + 1);
            }
        }
    }

    /**
     * Removes every non boolean attribute, i.e. every attribute which is not nominal or whose mapping does not
     * contain exactly two values.
     *
     * @param exampleSet
     *            exampleSet, which attributes are tested
     */
    private void removeNonBooleanAttributes(ExampleSet exampleSet) {
        // collect first, remove afterwards in order not to modify the attributes while iterating over them
        Collection<Attribute> deleteAttributes = new ArrayList<Attribute>();
        for (Attribute attribute : exampleSet.getAttributes()) {
            if (!attribute.isNominal() || (attribute.getMapping().size() != 2)) {
                deleteAttributes.add(attribute);
            }
        }
        for (Attribute attribute : deleteAttributes) {
            exampleSet.getAttributes().remove(attribute);
        }
    }

    /**
     * This method maps the attributes of the given exampleSet to an Item.
     *
     * @param exampleSet
     *            the exampleSet which attributes are mapped
     * @return a map containing one new {@link BooleanAttributeItem} per attribute
     */
    private Map<Attribute, Item> getAttributeMapping(ExampleSet exampleSet) {
        // computing Attributes to test, because only boolean attributes are used
        Map<Attribute, Item> mapping = new HashMap<Attribute, Item>();
        for (Attribute attribute : exampleSet.getAttributes()) {
            mapping.put(attribute, new BooleanAttributeItem(attribute));
        }
        return mapping;
    }

    /**
     * This method scans the exampleSet and counts the frequency of every item
     *
     * @param exampleSet
     *            the exampleSet to be scanned
     * @param attributes
     *            the attributes to test
     * @param positiveIndices
     *            the value regarded as positive for the attribute at the same array position
     * @param mapping
     *            the mapping of attributes to items
     */
    private void getItemFrequency(ExampleSet exampleSet, Attribute[] attributes, double[] positiveIndices, Map<Attribute, Item> mapping) {
        // iterate over exampleSet, counting item frequency
        for (Example currentExample : exampleSet) {
            int i = 0;
            for (Attribute attribute : attributes) {
                // if the example has the positive value for this attribute --> increase frequency of item
                if (currentExample.getValue(attribute) == positiveIndices[i]) {
                    mapping.get(attribute).increaseFrequency();
                }
                i++;
            }
        }
    }

    /**
     * Removes the attributes of all items with a frequency below the given minimum from the example set. The
     * entries of the mapping itself are kept.
     *
     * @param mapping
     *            the mapping of attributes to items providing the frequencies
     * @param minFrequency
     *            the minimal frequency an item needs in order to keep its attribute
     * @param exampleSet
     *            the example set the attributes are removed from
     */
    private void removeNonFrequentItems(Map<Attribute, Item> mapping, int minFrequency, ExampleSet exampleSet) {
        Collection<Attribute> deleteMappings = new ArrayList<Attribute>();
        for (Map.Entry<Attribute, Item> entry : mapping.entrySet()) {
            if (entry.getValue().getFrequency() < minFrequency) {
                deleteMappings.add(entry.getKey());
            }
        }
        for (Attribute attribute : deleteMappings) {
            exampleSet.getAttributes().remove(attribute);
        }
    }

    /**
     * Returns a new FPTree, representing the complete ExampleSet.
     *
     * @param exampleSet
     *            is the exampleSet, which shall be represented
     * @param attributes
     *            the attributes to test for each example
     * @param positiveIndices
     *            the value regarded as positive for the attribute at the same array position
     * @param mapping
     *            is the mapping of attributes of the exampleSet to items
     */
    private FPTree getFPTree(ExampleSet exampleSet, Attribute[] attributes, double[] positiveIndices, Map<Attribute, Item> mapping) {
        FPTree tree = new FPTree();
        for (Example currentExample : exampleSet) {
            // collect the items of this transaction
            List<Item> itemSet = new ArrayList<Item>();
            int i = 0;
            for (Attribute currentAttribute : attributes) {
                if (currentExample.getValue(currentAttribute) == positiveIndices[i]) {
                    itemSet.add(mapping.get(currentAttribute));
                }
                i++;
            }
            // items are inserted in their natural order
            Collections.sort(itemSet);
            tree.addItemSet(itemSet, 1);
        }
        return tree;
    }

    /**
     * Checks if the tree is empty on the given depth.
     *
     * @return <code>true</code> if no child of the root node has a positive frequency on the given depth
     */
    private boolean treeIsEmpty(FPTree tree, int recursionDepth) {
        // tree is empty if every child of rootnode has frequency of 0 on top of stack
        for (FPTreeNode node : tree.getChildren().values()) {
            if (node.getFrequency(recursionDepth) > 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the parameter types of the super class extended by the parameters of this operator.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterType type = new ParameterTypeBoolean(PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS, "Indicates if the mininmal support should be decreased automatically until the specified minimum number of frequent item sets is found. The defined minimal support is lowered by 10 percent each time.", true);
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeInt(PARAMETER_MIN_NUMBER_OF_ITEMSETS, "Indicates the minimum number of itemsets which should be determined if the corresponding parameter is activated.", 0, Integer.MAX_VALUE, 100);
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS, true, true));
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeInt(PARAMETER_MAX_REDUCTION_STEPS, "This determines how many times the operator lowers min support to find the minimal number of item sets. Each time the minimal support is lowered by 10 percent.", 2, Integer.MAX_VALUE, 15);
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS, false, true));
        type.setExpert(true);
        types.add(type);

        type = new ParameterTypeString(PARAMETER_POSITIVE_VALUE, "This parameter determines, which value of the binominal attributes is treated as positive. Attributes with that value are considered as part of a transaction. If left blank, the example set determines, which is value is used.", true);
        type.setExpert(true);
        types.add(type);

        types.add(new ParameterTypeDouble(PARAMETER_MIN_SUPPORT, "The minimal support necessary in order to be a frequent item (set).", 0.0d, 1.0d, 0.95d));
        types.add(new ParameterTypeInt(PARAMETER_MAX_ITEMS, "The upper bound for the length of the item sets (-1: no upper bound)", -1, Integer.MAX_VALUE, -1));
        types.add(new ParameterTypeString(PARAMETER_MUST_CONTAIN, "The items any generated rule must contain as regular expression. Empty if none."));

        type = new ParameterTypeBoolean(PARAMETER_KEEP_EXAMPLE_SET, "indicates if example set is kept", false);
        type.setDeprecated();
        types.add(type);

        return types;
    }
}