package ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.Item; import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.creators.AbstractionCreator; import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.patterns.Pattern; /** * This is an implementation of the candidate generation addressed in GSP algorithm. * This class is one of the two method continuously repeated by means of the GSP's main loop. * Here, from a set of frequent candidates k-sequences we generate a set of possible (k+1)-supersequences. * * Copyright Antonio Gomariz PeƱalver 2013 * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SPMF is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SPMF. If not, see <http://www.gnu.org/licenses/>. * * @author agomariz */ class CandidateGeneration { /** * Main method that creates, from frequent (k-1)-sequence set (aka L(k-1)) * the new set of (k)-sequences candidates. Before returning the * candidate set, the algorithm prunes those canidates that cannot be frequent * at all * @param frequentSet the frequent (k-1)-sequence set, L(k-1) * @param abstractionCreator the abstraction creator * @param indexationMap a map where the frequent sequences are indexed by * their first item * @param k the number that corresponds to the current level * @param minSupportAbsolute the absolute minimum support * @return the final k-candidate set */ public List<Pattern> generateCandidates(Set<Pattern> frequentSet, AbstractionCreator abstractionCreator, Map<Item, Set<Pattern>> indexationMap, int k, double minSupportAbsolute) { //Definition of the set of candidates List<Pattern> candidateSet = new ArrayList<Pattern>(); //copy of (k-1)-frequent sequence set List<Pattern> frequentList = new ArrayList<Pattern>(frequentSet); //Definition of the set of candidates already pruned List<Pattern> prunedCandidates = null; if (k > 2) { //If we are not in the base case, i.e. we are generating a level k>2 Item previousItem = null; Set<Pattern> matching = null; //For each frequent (k-1)-sequence for (Pattern frequentPattern1 : frequentList) { //For the second element of the frequent patterns Item currentItem = frequentPattern1.getIthElement(1).getItem(); //If we did not previously processed if (!currentItem.equals(previousItem)) { //We get all the patterns that starts with that item as first element matching = indexationMap.get(currentItem); //and assign that item to the previous one previousItem = currentItem; } //If matching is not empty if (matching != null) { //for each of its patterns for (Pattern frequentPattern2 : matching) { //we try combining both frequentPattern1 and frequentPattern2 Pattern candidate = abstractionCreator.generateCandidates(abstractionCreator, frequentPattern1, frequentPattern2, minSupportAbsolute); //And if we succeed, we add it to the candidate set if (candidate != null) { candidateSet.add(candidate); } } } } //Once the loop is over, if the candidate set is not empty if (!candidateSet.isEmpty()) { //We prune those candidates that have some infrequent subpatterns prunedCandidates = prunedSubset(candidateSet, frequentSet, abstractionCreator); } else { return null; } } else if (k == 2) { //base case, i.e. k=2 prunedCandidates = new ArrayList<Pattern>(); for (int i = 0; i < frequentList.size(); i++) { for (int j = i; j < frequentList.size(); j++) { //We create candidates with all the possible combinations of frequent 1-sequences prunedCandidates.addAll(abstractionCreator.generateSize2Candidates(abstractionCreator, frequentList.get(i), frequentList.get(j))); } } } if (prunedCandidates.isEmpty()) { return null; } return prunedCandidates; } /** * Return the pruned k-candidate set of candidates. * @param candidateSet the candidate k-sequence set * @param frequentSet the frequent (k-1)-sequence set * @param abstractionCreator * @return */ private List<Pattern> prunedSubset(List<Pattern> candidateSet, Set<Pattern> frequentSet, AbstractionCreator abstractionCreator) { List<Pattern> candidatePatterns = new ArrayList<Pattern>(); //for each candidate for (Pattern candidate : candidateSet) { boolean isInfrequent = false; //for each one of its element for (int i = 0; i < candidate.getElements().size() && !isInfrequent; i++) { //we obtain the subpattern resulting of removing the element chosen just above Pattern subpattern = abstractionCreator.getSubpattern(candidate, i); //and if this subpattern does not appear in the frequent (k-1)-sequence set, L(k-1) if (!frequentSet.contains(subpattern)) { //we mark it as infrequent isInfrequent = true; } } if (!isInfrequent) { //We only add in the output set those patterns that not have any infrequent subpattern candidatePatterns.add(candidate); } } return candidatePatterns; } }