/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.learner.associations.gsp; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Tools; import com.rapidminer.example.set.SortedExampleSet; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.ProcessSetupError.Severity; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.OutputPort; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.AttributeParameterPrecondition; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.operator.ports.metadata.SimplePrecondition; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeAttribute; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.tools.LogService; import com.rapidminer.tools.Ontology; /** * This operator searches sequential patterns in a set of transactions. Each transaction must be encoded as an single * example and must contain one attribute for the time and for the customer. This pair of attribute is used for generate * one sequence per customer containing each single transaction ordered by the time of each transaction. The algorithm * then searches sequential patterns in the form of: If a customer bought a and c in one transaction, he bought b in the * next: <a, c> then <b>. The minimal support describes how many customer must support such a pattern for regarding it * as frequent. Infrequent patterns will be dropped. A customer supports such a pattern, if there are some parts of his * sequence including the pattern. The above pattern would be supported by a customer with this transactions: <s, g> * then <a, s, c> then <b> then <f, h>. * * The parameters min_gap, max_gap and window_size determine how transaction are handled. For example, if the above * customer forgot to by c, and had to return 5 minutes later to buy c, then his transactions would look like that: <s, * g> then <a, s> then <c> then <b> then <f, h> This would not support the pattern <a, c> then <b>. To avoid this * problem, the window size determines, how long a subsequent transaction is treated as the same transaction. If the * window size is larger than 5 minutes, the <c> would be treated as being part of the second transaction and hence this * customer would support the above pattern. * * The max_gap parameter causes a customers sequence not to support a pattern, if the transactions containing this * pattern are to widely separated in time. The min_gap parameter does the same if they are to near. * * @author Sebastian Land */ public class GSPOperator extends Operator { public static final String TIME_ROLE = "time"; public static final String CUSTOMER_ROLE = "customer"; public static final String PARAMETER_CUSTOMER_ATTRIBUTE = "customer_id"; public static final String PARAMETER_TIME_ATTRIBUTE = "time_attribute"; public static final String PARAMETER_WINDOW_SIZE = "window_size"; public static final String PARAMETER_MAX_GAP = "max_gap"; public static final String PARAMETER_MIN_GAP = "min_gap"; public static final String PARAMETER_POSITIVE_VALUE = "positive_value"; public static final String PARAMETER_MIN_SUPPORT = "min_support"; private InputPort exampleSetInput = getInputPorts().createPort("example set"); private OutputPort exampleSetOutput = getOutputPorts().createPort("example set"); private OutputPort patternOutput = getOutputPorts().createPort("patterns"); public GSPOperator(OperatorDescription description) { super(description); exampleSetInput.addPrecondition(new SimplePrecondition(exampleSetInput, new ExampleSetMetaData(), true) { @Override public void makeAdditionalChecks(MetaData metaData) { if (metaData instanceof ExampleSetMetaData) { ExampleSetMetaData emd = (ExampleSetMetaData)metaData; String customerAttribute = ""; String timeAttribute = ""; try { customerAttribute = getParameterAsString(PARAMETER_CUSTOMER_ATTRIBUTE); timeAttribute = getParameterAsString(PARAMETER_TIME_ATTRIBUTE); } catch (UndefinedParameterError e) { } // checking allowed types for (AttributeMetaData amd : emd.getAllAttributes()) { if (amd.isSpecial()) { continue; } // check if name is in ignore list if (amd.getName().equals(customerAttribute) || amd.getName().equals(timeAttribute)) continue; // otherwise do check if (!Ontology.ATTRIBUTE_VALUE_TYPE.isA(amd.getValueType(), Ontology.NOMINAL)) { createError(Severity.ERROR, "regular_type_mismatch", new Object[] { Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(Ontology.BINOMINAL)}); break; } } } } }); exampleSetInput.addPrecondition(new AttributeParameterPrecondition(exampleSetInput, this, PARAMETER_CUSTOMER_ATTRIBUTE)); exampleSetInput.addPrecondition(new AttributeParameterPrecondition(exampleSetInput, this, PARAMETER_TIME_ATTRIBUTE, Ontology.NUMERICAL)); getTransformer().addPassThroughRule(exampleSetInput, exampleSetOutput); getTransformer().addGenerationRule(patternOutput, GSPSet.class); } @Override public void doWork() throws OperatorException { ExampleSet exampleSet = exampleSetInput.getData(); Attributes attributes = exampleSet.getAttributes(); String timeAttributeName = getParameterAsString(PARAMETER_TIME_ATTRIBUTE); String customerAttributeName = getParameterAsString(PARAMETER_CUSTOMER_ATTRIBUTE); if (timeAttributeName.equals("")) throw new UserError(this, 205, PARAMETER_TIME_ATTRIBUTE); if (customerAttributeName.equals("")) throw new UserError(this, 205, PARAMETER_CUSTOMER_ATTRIBUTE); double minSupport = getParameterAsDouble(PARAMETER_MIN_SUPPORT); double maxGap = getParameterAsDouble(PARAMETER_MAX_GAP); double minGap = getParameterAsDouble(PARAMETER_MIN_GAP); double windowSize = getParameterAsDouble(PARAMETER_WINDOW_SIZE); Attribute timeAttribute = attributes.get(timeAttributeName); Attribute customerAttribute = attributes.get(customerAttributeName); if (timeAttribute == null) { throw new UserError(this, 111, timeAttributeName); } if (customerAttribute == null) { throw new UserError(this, 111, customerAttributeName); } if (!timeAttribute.isNumerical()) { throw new UserError(this, 144, timeAttribute.getName(), "GSP"); } // setting both attributes special attributes.setSpecialAttribute(timeAttribute, TIME_ROLE); attributes.setSpecialAttribute(customerAttribute, CUSTOMER_ROLE); // now check that only binominal attributes are present and fetch positive index Tools.onlyNominalAttributes(exampleSet, "GSP"); double positiveIndices[] = new double[attributes.size()]; Arrays.fill(positiveIndices, 1); if (isParameterSet(PARAMETER_POSITIVE_VALUE)) { int attributeIndex = 0; for (Attribute attribute : attributes) { positiveIndices[attributeIndex] = attribute.getMapping().mapString(getParameterAsString(PARAMETER_POSITIVE_VALUE)); attributeIndex++; } } // now build items from attributes Item[] items = new Item[attributes.size()]; int i = 0; for (Attribute attribute : attributes) { items[i] = new Item(attribute.getName(), i); i++; } // building sequences ArrayList<DataSequence> dataSequences = buildSequences(exampleSet, attributes, timeAttribute, customerAttribute, positiveIndices, items); double numberOfSequences = dataSequences.size(); if (numberOfSequences * minSupport < 5) { LogService.getGlobal().log("Found only " + numberOfSequences + " sequences. Together with the small minimal support, this could result in very many patterns and a long calculation time.", LogService.WARNING); } // find frequent items: Items are frequent if occur in enough sequences int minFrequency = (int) Math.floor(numberOfSequences * minSupport); LinkedHashSet<Item> frequentItems = findFrequentItems(dataSequences, items, minFrequency); // remove infrequent items from sequences Iterator<DataSequence> sequenceIterator = dataSequences.iterator(); while (sequenceIterator.hasNext()) { DataSequence sequence = sequenceIterator.next(); Iterator<Transaction> transactionIterator = sequence.iterator(); while (transactionIterator.hasNext()) { Transaction transaction = transactionIterator.next(); Iterator<Item> itemIterator = transaction.iterator(); while (itemIterator.hasNext()) { Item item = itemIterator.next(); if (!frequentItems.contains(item)) itemIterator.remove(); } if (transaction.isEmpty()) transactionIterator.remove(); } if (sequence.isEmpty()) sequenceIterator.remove(); } // build first seed set HashSet<Sequence> seeds = buildSeeds(frequentItems); // now iteratively build candidates and filter them to seeds GSPSet model = new GSPSet(); int round = 0; while (seeds.size() > 0) { checkForStop(); ArrayList<Sequence> candidates = generateCandidates(seeds, round == 0); if (candidates.size() == 0) break; checkForStop(); // if new candidates filter them from ones with to small support int[] supportCounter = countSupportingCustomer(candidates, dataSequences, windowSize, maxGap, minGap, minSupport); Iterator<Sequence> iterator = candidates.iterator(); for (i = 0; i < supportCounter.length; i++) { Sequence currentSequence = iterator.next(); double support = supportCounter[i] / numberOfSequences; if (support >= minSupport) { model.addSequence(currentSequence, support); } else { iterator.remove(); } } LogService.getGlobal().log("Filtered Candidates. Remaining: " + candidates.size(), LogService.INIT); // using filtered candidates as seeds seeds.clear(); seeds.addAll(candidates); round++; } exampleSetOutput.deliver(exampleSet); patternOutput.deliver(model); } private int[] countSupportingCustomer(ArrayList<Sequence> candidates, ArrayList<DataSequence> dataSequences, double windowSize, double maxGap, double minGap, double minSupport) { LogService.getGlobal().log("Building Hashtree for counting candidates of length " + candidates.get(0).getNumberOfItems(), LogService.INIT); // build hashtree: root becomes immediately inner node, since candidates will probably exceed limit HashTreeNode root = new HashTreeRootNode(); int i = 0; for (Sequence candidate : candidates) { root.addSequence(candidate, i, 0, null, candidates); i++; } LogService.getGlobal().log("Counting supporting sequences for candidates of length " + candidates.get(0).getNumberOfItems(), LogService.INIT); // now run through all data sequences and counting occurrences of candidate sequences int[] counter = new int[candidates.size()]; boolean[] occurs = new boolean[candidates.size()]; CountingInformations countingInformations = new CountingInformations(occurs, candidates, windowSize, maxGap, minGap); for (DataSequence dataSequence : dataSequences) { // calling tree to let it count the dataSequence root.countCoveredCandidates(dataSequence, 0, countingInformations); for (i = 0; i < occurs.length; i++) { counter[i] += occurs[i] ? 1 : 0; occurs[i] = false; } } return counter; } private static ArrayList<Sequence> generateCandidates(HashSet<Sequence> seeds, boolean isFirstRound) { LogService.getGlobal().log("Generating Candidates of length " + seeds.iterator().next().getNumberOfItems(), LogService.INIT); ArrayList<Sequence> candidates = new ArrayList<Sequence>(); int pruneCheckCounter = 0; // generate set of candidates for (Sequence sequence1 : seeds) { for (Sequence sequence2 : seeds) { if (sequence1.equals(0, sequence2, sequence2.getNumberOfItems() - 1)) { if (isFirstRound || sequence2.getLastTransaction().size() == 1) { Sequence candidate = Sequence.appendTransaction(sequence1, sequence2.getLastTransaction()); pruneCheckCounter++; if (pruneCheckCounter % 10000 == 0) LogService.getGlobal().log("....................................................................................................", LogService.INIT); if (!isPruned(seeds, candidate)) candidates.add(candidate); } if (isFirstRound || sequence2.getLastTransaction().size() > 1) { Sequence candidate = Sequence.appendItem(sequence1, sequence2.getLastTransaction().getLastItem()); pruneCheckCounter++; if (pruneCheckCounter % 10000 == 0) LogService.getGlobal().log("....................................................................................................", LogService.INIT); if (!isPruned(seeds, candidate)) candidates.add(candidate); } } } } LogService.getGlobal().log("Generated " + candidates.size() + " candidates", LogService.INIT); return candidates; } private static boolean isPruned(HashSet<Sequence> seeds, Sequence candidate) { if (candidate.getNumberOfItems() < seeds.iterator().next().getNumberOfItems() + 1) return true; boolean contained = true; // removing from first transaction for (int i = 0; i < candidate.get(0).size(); i++) { if (!isFrequent(Sequence.removeItem(candidate, 0, i), seeds)) return true; } if (contained) { // removing from last transaction int lastIndex = candidate.size() - 1; for (int i = 0; i < candidate.get(lastIndex).size(); i++) { if (!isFrequent(Sequence.removeItem(candidate, lastIndex, i), seeds)) return true; } } if (contained) { // removing from center, hence skip first and last for (int transactionIndex = 1; transactionIndex < candidate.size() - 1; transactionIndex++) { int transactionSize = candidate.get(transactionIndex).size(); if (transactionSize > 1) { for (int i = 0; i < transactionSize; i++) { if (!isFrequent(Sequence.removeItem(candidate, transactionIndex, i), seeds)) return true; } } } } return false; } // test if the candidate is frequent by checking if contained in seeds private static boolean isFrequent(Sequence testCandidate, HashSet<Sequence> seeds) { return seeds.contains(testCandidate); } private HashSet<Sequence> buildSeeds(LinkedHashSet<Item> frequentItems) { HashSet<Sequence> seeds = new HashSet<Sequence>(frequentItems.size()); for (Item item : frequentItems) { Transaction transaction = new Transaction(Double.NaN); transaction.add(item); Sequence sequence = new Sequence(); sequence.add(transaction); seeds.add(sequence); } return seeds; } private LinkedHashSet<Item> findFrequentItems(ArrayList<DataSequence> sequences, Item[] items, int minFrequency) { int[] itemCounters = new int[items.length]; for (Sequence sequence : sequences) { boolean[] itemCounted = new boolean[items.length]; for (Transaction transaction : sequence) { for (Item item : transaction) { int index = item.getIndex(); if (!itemCounted[index]) { itemCounters[index]++; itemCounted[index] = true; } } } } LinkedHashSet<Item> frequentItems = new LinkedHashSet<Item>(); for (int i = 0; i < items.length; i++) { if (itemCounters[i] > minFrequency) frequentItems.add(items[i]); } return frequentItems; } private ArrayList<DataSequence> buildSequences(ExampleSet exampleSet, Attributes attributes, Attribute timeAttribute, Attribute customerAttribute, double[] positiveIndices, Item[] items) { ArrayList<DataSequence> sequences = new ArrayList<DataSequence>(); // now sort exampleSet according to customer attribute and time attribute SortedExampleSet sortedSet = new SortedExampleSet(exampleSet, timeAttribute, SortedExampleSet.INCREASING); sortedSet = new SortedExampleSet(sortedSet, customerAttribute, SortedExampleSet.INCREASING); // now build sequences from exampleset: Each Customer is one sequence, each transaction one item set double lastCustomerId = Double.NEGATIVE_INFINITY; DataSequence currentSequence = null; for (Example example : sortedSet) { double customerId = example.getValue(customerAttribute); if (lastCustomerId != customerId) { // if completely filled: Build access structure if (currentSequence != null) currentSequence.buildAccessStructure(); // then create new sequence currentSequence = new DataSequence(items.length); sequences.add(currentSequence); // add reference already: Will be filled later lastCustomerId = customerId; } Transaction currentSet = new Transaction(example.getValue(timeAttribute)); int attributeIndex = 0; for (Attribute attribute : attributes) { if (example.getValue(attribute) == positiveIndices[attributeIndex]) currentSet.add(items[attributeIndex]); attributeIndex++; } if (currentSet.size() > 0) currentSequence.add(currentSet); } // building structure for last sequence. currentSequence.buildAccessStructure(); return sequences; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeAttribute(PARAMETER_CUSTOMER_ATTRIBUTE, "This attribute will be used to identify the customer of a transaction.", exampleSetInput, false); type.setExpert(false); types.add(type); type = new ParameterTypeAttribute(PARAMETER_TIME_ATTRIBUTE, "This numerical attribute specifies the time of a transaction.", exampleSetInput, false, Ontology.NUMERICAL); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_MIN_SUPPORT, "This specifies the minimal support of a pattern", 0, 1, 0.9); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_WINDOW_SIZE, "This specifies the window size", 0, Double.POSITIVE_INFINITY); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_MAX_GAP, "This specifies the maximal gap", 0, Double.POSITIVE_INFINITY); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_MIN_GAP, "This specifies the minimal gap", 0, Double.POSITIVE_INFINITY); type.setExpert(false); types.add(type); type = new ParameterTypeString(PARAMETER_POSITIVE_VALUE, "This parameter determines, which value of the binominal attributes is treated as positive. Attributes with that value are considered as part of a transaction. If left blank, the example set determines, which is value is used.", true); types.add(type); return types; } }