/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.associations.fpgrowth;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.io.ExampleSource;
import com.rapidminer.operator.learner.associations.BooleanAttributeItem;
import com.rapidminer.operator.learner.associations.FrequentItemSet;
import com.rapidminer.operator.learner.associations.FrequentItemSets;
import com.rapidminer.operator.learner.associations.Item;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
// NOTE(review): the class below is JDK-internal and is no longer referenced by this
// file (java.util.regex.Pattern is used instead). The import is kept only so that this
// change does not remove file-level imports; it should be dropped in a follow-up.
import com.sun.org.apache.xerces.internal.impl.xpath.regex.RegularExpression;

/**
 * <p>This operator calculates all frequent item sets from a data set by building
 * an FPTree data structure on the transaction data base. This is a very compressed
 * copy of the data which in many cases fits into main memory even for large
 * data bases. From this FPTree all frequent item sets are derived. A major advantage
 * of FPGrowth compared to Apriori is that it uses only 2 data scans and is therefore
 * often applicable even on large data sets.</p>
 *
 * <p>Please note that the given data set is only allowed to contain binominal attributes,
 * i.e. nominal attributes with only two different values. Simply use the provided
 * preprocessing operators in order to transform your data set. The necessary operators
 * are the discretization operators for changing the value types of numerical
 * attributes to nominal and the operator Nominal2Binominal for transforming nominal
 * attributes into binominal / binary ones.
 * The frequent item sets are mined for the positive entries in your data base,
 * i.e. for those nominal values which are defined as positive in your data base.
 * If you use an attribute description file (.aml) for the {@link ExampleSource} operator
 * this corresponds to the second value which is defined via the classes attribute or inner
 * value tags.</p>
 *
 * <p>This operator has two basic working modes: finding at least the specified
 * number of item sets with highest support without taking the min_support into
 * account (default) or finding all item sets with a support of at least min_support.</p>
 *
 * @author Sebastian Land, Ingo Mierswa
 * @version $Id: FPGrowth.java,v 1.12 2008/07/13 14:16:10 ingomierswa Exp $
 */
public class FPGrowth extends Operator {

    /**
     * Indicates if this operator should try to find a minimum number of item
     * sets by iteratively decreasing the minimum support.
     */
    public static final String PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS = "find_min_number_of_itemsets";

    /** Indicates the minimum number of item sets to find by iteratively decreasing the minimum support. */
    public static final String PARAMETER_MIN_NUMBER_OF_ITEMSETS = "min_number_of_itemsets";

    /** The parameter name for "Minimal Support". */
    public static final String PARAMETER_MIN_SUPPORT = "min_support";

    /** The parameter name for the maximum number of items. */
    public static final String PARAMETER_MAX_ITEMS = "max_items";

    /** The parameter name for the regular expression selecting the items every item set must contain. */
    private static final String PARAMETER_MUST_CONTAIN = "must_contain";

    /** The parameter name indicating whether the input example set is part of the output. */
    private static final String PARAMETER_KEEP_EXAMPLE_SET = "keep_example_set";

    public FPGrowth(OperatorDescription description) {
        super(description);
    }

    /**
     * Mines the frequent item sets of the input example set.
     *
     * <p>If {@link #PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS} is set, mining starts with a
     * relative support of 0.95 and is repeated with the support lowered by 0.05 per round
     * until at least {@link #PARAMETER_MIN_NUMBER_OF_ITEMSETS} item sets were found or the
     * support has dropped to (about) 0.05. Otherwise a single pass with
     * {@link #PARAMETER_MIN_SUPPORT} is performed.</p>
     *
     * @return the frequent item sets, preceded by the input example set if
     *         {@link #PARAMETER_KEEP_EXAMPLE_SET} is set
     * @throws OperatorException if the input or a parameter cannot be retrieved or the
     *         nominal-attribute check fails
     */
    public IOObject[] apply() throws OperatorException {
        ExampleSet exampleSet = getInput(ExampleSet.class);

        // check
        Tools.onlyNominalAttributes(exampleSet, "FPGrowth");

        boolean shouldFindMinimumNumber = getParameterAsBoolean(PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS);
        int maxItems = getParameterAsInt(PARAMETER_MAX_ITEMS);
        int minimumNumberOfItemsets = getParameterAsInt(PARAMETER_MIN_NUMBER_OF_ITEMSETS);

        // The must-contain expression does not change between rounds, so it is read and
        // compiled once. An empty string means "no restriction" (see the parameter
        // description); compiling it would yield a pattern that matches every name.
        String mustContainItems = getParameterAsString(PARAMETER_MUST_CONTAIN);
        Pattern mustContainPattern = null;
        if ((mustContainItems != null) && (mustContainItems.length() > 0)) {
            mustContainPattern = Pattern.compile(mustContainItems);
        }

        FrequentItemSets rules = null;
        double currentSupport = 0.95;
        boolean foundEnough = false;
        while (!foundEnough) {
            // determine the absolute support threshold for this round
            double relativeSupport = shouldFindMinimumNumber ? currentSupport : getParameterAsDouble(PARAMETER_MIN_SUPPORT);
            int minTotalSupport = (int) Math.ceil(relativeSupport * exampleSet.size());

            // precomputing data properties; a fresh clone is needed in every round because
            // the non-frequent attributes are removed from the working set below
            ExampleSet workingSet = preprocessExampleSet(exampleSet);

            // map attributes to items
            Map<Attribute, Item> mapping = getAttributeMapping(workingSet);

            // computing frequency of 1-item sets
            getItemFrequency(workingSet, mapping);

            // eliminating non frequent items
            removeNonFrequentItems(mapping, minTotalSupport, workingSet);

            // generating FP tree
            FPTree tree = getFPTree(workingSet, mapping);

            // mine tree
            rules = new FrequentItemSets(workingSet.size());
            if (mustContainPattern == null) {
                mineTree(tree, rules, 0, minTotalSupport, maxItems);
            } else {
                FrequentItemSet conditionalItems = buildConditionalItems(mapping, mustContainPattern);
                rules.addFrequentSet(conditionalItems);
                mineTree(tree, rules, 0, conditionalItems, minTotalSupport, maxItems);
            }

            if (!shouldFindMinimumNumber) {
                // fixed support: a single round is sufficient
                break;
            }

            // enough? (0.06 instead of 0.05 absorbs floating point error of the repeated subtraction)
            if ((rules.size() >= minimumNumberOfItemsets) || (currentSupport <= 0.06)) {
                foundEnough = true;
            }
            currentSupport -= 0.05;
        }

        if (getParameterAsBoolean(PARAMETER_KEEP_EXAMPLE_SET)) {
            return new IOObject[] { exampleSet, rules };
        } else {
            return new IOObject[] { rules };
        }
    }

    /**
     * Collects the items of all attributes whose name contains a match of the given
     * pattern. Each item is added with its own frequency.
     *
     * <p>{@code find()} is used on purpose: the previously used Xerces
     * {@code RegularExpression.matches(String)} tests whether the text <em>contains</em>
     * a match, not whether the complete text matches.</p>
     *
     * @param mapping the mapping of attributes to items
     * @param mustContainPattern the compiled must-contain expression
     * @return the set of conditional items
     */
    private FrequentItemSet buildConditionalItems(Map<Attribute, Item> mapping, Pattern mustContainPattern) {
        FrequentItemSet conditionalItems = new FrequentItemSet();
        for (Map.Entry<Attribute, Item> entry : mapping.entrySet()) {
            if (mustContainPattern.matcher(entry.getKey().getName()).find()) {
                Item targetItem = entry.getValue();
                conditionalItems.addItem(targetItem, targetItem.getFrequency());
            }
        }
        return conditionalItems;
    }

    /**
     * Clones the given example set and removes all attributes from the clone which are
     * not nominal with exactly two values. A warning is logged if attributes were removed.
     *
     * @param exampleSet the original example set, which is not modified by this method itself
     * @return the reduced clone
     */
    private ExampleSet preprocessExampleSet(ExampleSet exampleSet) {
        // precomputing data properties
        ExampleSet workingSet = (ExampleSet) exampleSet.clone();

        // remove unusable attributes
        int oldAttributeCount = workingSet.getAttributes().size();
        removeNonBooleanAttributes(workingSet);
        int newAttributeCount = workingSet.getAttributes().size();

        if (oldAttributeCount != newAttributeCount) {
            int removeCount = oldAttributeCount - newAttributeCount;
            String message;
            if (removeCount == 1) {
                message = "Removed 1 non-binominal attribute, frequent item set mining is only supported for the positive values of binominal attributes.";
            } else {
                message = "Removed " + removeCount + " non-binominal attributes, frequent item set mining is only supported for the positive values of binominal attributes.";
            }
            logWarning(message);
        }
        return workingSet;
    }

    /** Mines the given tree starting with an empty set of conditional items. */
    private void mineTree(FPTree tree, FrequentItemSets rules, int recursionDepth, int minTotalSupport, int maxItems) {
        mineTree(tree, rules, recursionDepth, new FrequentItemSet(), minTotalSupport, maxItems);
    }

    /**
     * Recursively mines the tree. For every header table item with sufficient support at
     * the current depth, the frequencies of its sibling chain are propagated towards the
     * root for depth + 1, the extended item set is added to the result, the tree is mined
     * at depth + 1, and afterwards the frequencies for depth + 1 are popped again.
     *
     * @param tree the tree to mine
     * @param rules receives the frequent item sets found
     * @param recursionDepth the current depth, also used as index into the frequencies
     * @param conditionalItems the items every set found in this call is extended from; not modified
     * @param minTotalSupport the minimal absolute frequency of an item
     * @param maxItems the recursion stops at this depth; values &lt;= 0 disable the limit
     */
    private void mineTree(FPTree tree, FrequentItemSets rules, int recursionDepth, FrequentItemSet conditionalItems, int minTotalSupport, int maxItems) {
        if (treeIsEmpty(tree, recursionDepth)) {
            return;
        }
        if ((maxItems > 0) && (recursionDepth >= maxItems)) {
            return;
        }

        // recursively mine tree
        Map<Item, Header> headerTable = tree.getHeaderTable();
        for (Map.Entry<Item, Header> headerEntry : headerTable.entrySet()) {
            Item item = headerEntry.getKey();
            Header itemHeader = headerEntry.getValue();

            // check for minSupport
            int itemSupport = itemHeader.getFrequencies().getFrequency(recursionDepth);
            if (itemSupport < minTotalSupport) {
                continue;
            }

            // run over sibling chain
            for (FPTreeNode node : itemHeader.getSiblingChain()) {
                // and propagate frequency to root
                int frequency = node.getFrequency(recursionDepth);
                // only if frequency is positive
                if (frequency > 0) {
                    FPTreeNode currentNode = node.getFather();
                    while (currentNode != tree) {
                        // increase node frequency
                        currentNode.increaseFrequency(recursionDepth + 1, frequency);
                        // increase item frequency in headerTable
                        headerTable.get(currentNode.getNodeItem()).getFrequencies().increaseFrequency(recursionDepth + 1, frequency);
                        // go up in tree
                        currentNode = currentNode.getFather();
                    }
                }
            }

            // add item to (a copy of) the conditional items
            FrequentItemSet recursiveConditionalItems = (FrequentItemSet) conditionalItems.clone();
            recursiveConditionalItems.addItem(item, itemSupport);
            // add these conditional items to the frequent sets
            rules.addFrequentSet(recursiveConditionalItems);

            // recursively mine new tree
            mineTree(tree, rules, recursionDepth + 1, recursiveConditionalItems, minTotalSupport, maxItems);

            // run over sibling chain for popping the frequency stack
            for (FPTreeNode node : itemHeader.getSiblingChain()) {
                // and remove propagation of frequency
                FPTreeNode currentNode = node.getFather();
                while (currentNode != tree) {
                    // pop frequency
                    currentNode.popFrequency(recursionDepth + 1);
                    // go up in tree
                    currentNode = currentNode.getFather();
                }
            }

            // pop frequencies of every header table entry for the next recursion depth
            for (Header currentItemHeader : headerTable.values()) {
                currentItemHeader.getFrequencies().popFrequency(recursionDepth + 1);
            }
        }
    }

    /**
     * Removes every attribute which is not nominal or whose mapping does not contain
     * exactly two values.
     *
     * @param exampleSet the example set whose attributes are tested and removed
     */
    private void removeNonBooleanAttributes(ExampleSet exampleSet) {
        // collect first, remove afterwards: the attributes must not be changed while iterating
        Collection<Attribute> deleteAttributes = new ArrayList<Attribute>();
        for (Attribute attribute : exampleSet.getAttributes()) {
            if (!attribute.isNominal() || (attribute.getMapping().size() != 2)) {
                deleteAttributes.add(attribute);
            }
        }
        for (Attribute attribute : deleteAttributes) {
            exampleSet.getAttributes().remove(attribute);
        }
    }

    /**
     * Maps every attribute of the given example set to a new {@link BooleanAttributeItem}.
     *
     * @param exampleSet the example set whose attributes are mapped
     * @return the mapping of attributes to items
     */
    private Map<Attribute, Item> getAttributeMapping(ExampleSet exampleSet) {
        Map<Attribute, Item> mapping = new HashMap<Attribute, Item>();
        for (Attribute attribute : exampleSet.getAttributes()) {
            mapping.put(attribute, new BooleanAttributeItem(attribute));
        }
        return mapping;
    }

    /**
     * Scans the example set and increases the frequency of an item for every example in
     * which the corresponding attribute has its positive value.
     *
     * @param exampleSet the example set to be scanned
     * @param mapping the mapping of attributes to items
     */
    private void getItemFrequency(ExampleSet exampleSet, Map<Attribute, Item> mapping) {
        Attributes attributes = exampleSet.getAttributes();
        for (Example currentExample : exampleSet) {
            for (Attribute attribute : attributes) {
                // positive value --> increase frequency of item
                if (currentExample.getValue(attribute) == attribute.getMapping().getPositiveIndex()) {
                    mapping.get(attribute).increaseFrequency();
                }
            }
        }
    }

    /**
     * Removes all attributes from the example set whose item frequency is below the
     * given minimum.
     *
     * <p>NOTE(review): the entries stay in {@code mapping}; only the example set is
     * reduced. Items selected via the must-contain expression may therefore be
     * infrequent ones - confirm whether this is intended.</p>
     *
     * @param mapping the mapping of attributes to items carrying the frequencies
     * @param minFrequency the minimal absolute frequency
     * @param exampleSet the example set the attributes are removed from
     */
    private void removeNonFrequentItems(Map<Attribute, Item> mapping, int minFrequency, ExampleSet exampleSet) {
        Collection<Attribute> deleteMappings = new ArrayList<Attribute>();
        for (Map.Entry<Attribute, Item> entry : mapping.entrySet()) {
            if (entry.getValue().getFrequency() < minFrequency) {
                deleteMappings.add(entry.getKey());
            }
        }
        for (Attribute attribute : deleteMappings) {
            exampleSet.getAttributes().remove(attribute);
        }
    }

    /**
     * Returns a new FPTree representing the complete example set. Each example is added
     * as the sorted list of items whose attribute has its positive value.
     *
     * @param exampleSet the example set which shall be represented
     * @param mapping the mapping of attributes of the example set to items
     * @return the tree
     */
    private FPTree getFPTree(ExampleSet exampleSet, Map<Attribute, Item> mapping) {
        FPTree tree = new FPTree();
        for (Example currentExample : exampleSet) {
            List<Item> itemSet = new ArrayList<Item>();
            for (Attribute currentAttribute : exampleSet.getAttributes()) {
                if (currentExample.getValue(currentAttribute) == currentAttribute.getMapping().getPositiveIndex()) {
                    itemSet.add(mapping.get(currentAttribute));
                }
            }
            Collections.sort(itemSet);
            tree.addItemSet(itemSet, 1);
        }
        return tree;
    }

    /**
     * Returns true if no child of the root node has a positive frequency at the given
     * recursion depth.
     */
    private boolean treeIsEmpty(FPTree tree, int recursionDepth) {
        for (FPTreeNode node : tree.getChildren().values()) {
            if (node.getFrequency(recursionDepth) > 0) {
                return false;
            }
        }
        return true;
    }

    public Class<?>[] getInputClasses() {
        return new Class<?>[] { ExampleSet.class };
    }

    public Class<?>[] getOutputClasses() {
        return new Class<?>[] { FrequentItemSets.class };
    }

    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeBoolean(PARAMETER_KEEP_EXAMPLE_SET, "indicates if example set is kept", false));

        ParameterType type = new ParameterTypeBoolean(PARAMETER_FIND_MIN_NUMBER_OF_ITEMSETS, "Indicates if the support should be decreased until the specified minimum number of frequent item sets is found. Otherwise, FPGrowth simply uses the defined support.", true);
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeInt(PARAMETER_MIN_NUMBER_OF_ITEMSETS, "Indicates the minimum number of itemsets which should be determined if the corresponding parameter is activated.", 0, Integer.MAX_VALUE, 100);
        type.setExpert(false);
        types.add(type);

        types.add(new ParameterTypeDouble(PARAMETER_MIN_SUPPORT, "The minimal support necessary in order to be a frequent item (set).", 0.0d, 1.0d, 0.95d));
        types.add(new ParameterTypeInt(PARAMETER_MAX_ITEMS, "The upper bound for the length of the item sets (-1: no upper bound)", -1, Integer.MAX_VALUE, -1));
        types.add(new ParameterTypeString(PARAMETER_MUST_CONTAIN, "The items any generated rule must contain as regular expression. Empty if none."));
        return types;
    }
}