package ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.candidatePatternsGeneration.CandidateGenerator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.patterns.Pattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.savers.Saver;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.savers.SaverIntoFile;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.savers.SaverIntoMemory;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the SPADE. SPADE was proposed by ZAKI in 2001.
*
* NOTE: This implementation saves the pattern to a file as soon as they are
* found or can keep the pattern into memory, depending on what the user choose.
*
* Copyright Antonio Gomariz PeƱalver 2013
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*
* @author agomariz
*/
public class AlgoSPADE {
public long joinCount; // PFV 2013
/**
* the minimum support threshold
*/
protected double minSup;
/**
* The minimum support relative threshold, i.e. the minimum number of
* sequences where the patterns have to be
*/
protected double minSupRelative;
/**
* Flag indicating if we want a depth-first search when true. Otherwise we
* say that we want a breadth-first search
*/
protected boolean dfs;
/**
* Saver variable to decide where the user want to save the results, if it
* the case
*/
Saver saver = null;
/**
* Start and end points in order to calculate the overall time taken by the
* algorithm
*/
public long start, end;
/**
* Equivalence class whose class identifier is a frequent item
*/
protected List<EquivalenceClass> frequentItems;
/**
* Abstraction creator
*/
private AbstractionCreator abstractionCreator;
/**
* Number of frequent patterns found by the algorithm
*/
private int numberOfFrequentPatterns;
/**
* Constructor of the class that calls SPADE algorithm.
*
* @param support Minimum support (from 0 up to 1)
* @param dfs Flag for indicating if we want a depth first search. If false,
* we indicate that we want a breath-first search.
* @param abstractionCreator An abstraction creator.
*/
public AlgoSPADE(double support, boolean dfs, AbstractionCreator abstractionCreator) {
this.minSup = support;
this.abstractionCreator = abstractionCreator;
this.dfs = dfs;
}
/**
* Actual call to SPADE algorithm. The output can be either kept or ignore.
* Whenever we choose to keep the patterns found, we can keep them in a file
* or in the main memory
*
* @param database Original database in where we want to search for the
* frequent patterns.
* @param candidateGenerator The candidate generator used by the algorithm
* SPADE
* @param keepPatterns Flag indicating if we want to keep the output or not
* @param verbose Flag for debugging purposes
* @param outputFilePath Path of the file in which we want to store the
* frequent patterns. If this value is null, we keep the patterns in the
* main memory. This argument is taken into account just when keepPatterns
* is activated.
* @param outputSequenceIdentifiers if true, sequence identifiers will be output for each pattern
* @throws IOException
*/
public void runAlgorithm(SequenceDatabase database, CandidateGenerator candidateGenerator, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
//If we do no have any file path
if (outputFilePath == null) {
//The user wants to save the results in memory
saver = new SaverIntoMemory(outputSequenceIdentifiers);
} else {
//Otherwise, the user wants to save them in the given file
saver = new SaverIntoFile(outputFilePath, outputSequenceIdentifiers);
}
//this.minSupRelative = minSup; // PFV 2013
this.minSupRelative = (int) Math.ceil(database.size() * minSup);
if (this.minSupRelative == 0) { // protection
this.minSupRelative = 1;
}
// reset the stats about memory usage
MemoryLogger.getInstance().reset();
//keeping the starting time
start = System.currentTimeMillis();
//We run SPADE algorithm
runSPADE(database, candidateGenerator, (long) minSupRelative, dfs, keepPatterns, verbose);
//keeping the ending time
end = System.currentTimeMillis();
//Search for frequent patterns: Finished
saver.finish();
}
/**
* Actual call to SPADE algorithm. The output can be either kept or ignore.
* Whenever we choose to keep the patterns found, we can keep them in a file
* or in the main memory. The algorithm SPADE is executed in a parallel way.
*
* @param database Original database in where we want to search for the
* frequent patterns.
* @param candidateGenerator The candidate generator used by the algorithm
* SPADE
* @param keepPatterns Flag indicating if we want to keep the output or not
* @param verbose Flag for debugging purposes
* @param outputFilePath Path of the file in which we want to store the
* frequent patterns. If this value is null, we keep the patterns in the
* main memory. This argument is taken into account just when keepPatterns
* is activated.
* @param outputSequenceIdentifiers if true, sequence identifiers will be output for each pattern
* @throws IOException
*/
public void runAlgorithmParallelized(SequenceDatabase database, CandidateGenerator candidateGenerator, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
//If we do no have any file path
if (outputFilePath == null) {
//The user wants to save the results in memory
saver = new SaverIntoMemory(outputSequenceIdentifiers);
} else {
//Otherwise, the user wants to save them in the given file
saver = new SaverIntoFile(outputFilePath,outputSequenceIdentifiers);
}
this.minSupRelative = (int) Math.ceil(minSup * database.size());
//this.minSupRelative = (int) (database.size() * minSup);
if (this.minSupRelative == 0) { // protection
this.minSupRelative = 1;
}
// reset the stats about memory usage
MemoryLogger.getInstance().reset();
//keeping the starting time
start = System.currentTimeMillis();
//We run SPADE algorithm
runSPADEFromSize2PatternsParallelized2(database, candidateGenerator, (long) minSupRelative, dfs, keepPatterns, verbose);
//keeping the ending time
end = System.currentTimeMillis();
//Search for frequent patterns: Finished
saver.finish();
}
/**
*
* The actual method for extracting frequent sequences.
*
* @param database The original database
* @param candidateGenerator The candidate generator used by the algorithm
* SPADE
* @param minSupportCount The minimum relative support
* @param dfs Flag for indicating if we want a depth first search. If false,
* we indicate that we want a breath-first search.
* @param keepPatterns flag indicating if we are interested in keeping the
* output of the algorithm
* @param verbose Flag for debugging purposes
*/
protected void runSPADE(SequenceDatabase database, CandidateGenerator candidateGenerator, long minSupportCount, boolean dfs, boolean keepPatterns, boolean verbose) {
//We get the equivalence classes formed by the frequent 1-patterns
frequentItems = database.frequentItems();
//We extract their patterns
Collection<Pattern> size1sequences = getPatterns(frequentItems);
//If we want to keep the output
if (keepPatterns) {
for (Pattern atom : size1sequences) {
//We keep all the frequent 1-patterns
saver.savePattern(atom);
}
}
// CREATE COOCURENCE MAP
database = null;
//We define the root class
EquivalenceClass rootClass = new EquivalenceClass(null);
/*And we insert the equivalence classes corresponding to the frequent
1-patterns as its members*/
for (EquivalenceClass atom : frequentItems) {
rootClass.addClassMember(atom);
}
//Inizialitation of the class that is in charge of find the frequent patterns
FrequentPatternEnumeration frequentPatternEnumeration = new FrequentPatternEnumeration(candidateGenerator, minSupRelative, saver);
//We set the number of frequent items to the number of frequent items
frequentPatternEnumeration.setFrequentPatterns(frequentItems.size());
//We execute the search
frequentPatternEnumeration.execute(rootClass, dfs, keepPatterns, verbose, null,null);
/* Once we had finished, we keep the number of frequent patterns that we
* finally found
*/
numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();
// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
joinCount = frequentPatternEnumeration.INTERSECTION_COUNTER;
}
/**
*
* The actual method for extracting frequent sequences. This method it starts
* with both the frequent 1-patterns and 2-patterns already found.
*
* @param database The original database
* @param candidateGenerator The candidate generator used by the algorithm
* SPADE
* @param minSupportCount The minimum relative support
* @param dfs Flag for indicating if we want a depth first search. If false,
* we indicate that we want a breath-first search.
* @param keepPatterns flag indicating if we are interested in keeping the
* output of the algorithm
* @param verbose Flag for debugging purposes
*/
protected void runSPADEFromSize2Sequences(SequenceDatabase database, CandidateGenerator candidateGenerator, long minSupportCount, boolean dfs, boolean keepPatterns, boolean verbose) {
frequentItems = database.frequentItems();
Collection<Pattern> size1Patterns = getPatterns(frequentItems);
saver.savePatterns(size1Patterns);
List<EquivalenceClass> size2Patterns = database.getSize2FrecuentSequences(minSupRelative);
Collection<Pattern> size2sequences = getPatterns(size2Patterns);
saver.savePatterns(size2sequences);
size2Patterns.clear();
database.clear();
size2Patterns = null;
database = null;
FrequentPatternEnumeration frequentPatternEnumeration = new FrequentPatternEnumeration(candidateGenerator, minSupRelative, saver);
frequentPatternEnumeration.setFrequentPatterns(size1Patterns.size() + size2sequences.size());
size1Patterns = null;
size2sequences = null;
while (frequentItems.size() > 0) {
EquivalenceClass frequentAtomClass = frequentItems.get(frequentItems.size() - 1);
if (verbose) {
System.out.println("Exploring... " + frequentAtomClass);
}
frequentPatternEnumeration.execute(frequentAtomClass, dfs, keepPatterns, verbose, null,null);
frequentItems.remove(frequentItems.size() - 1);
if (verbose) {
System.out.println("\tWe found " + frequentPatternEnumeration.getFrequentPatterns() + " frequent patterns so far.");
}
// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
}
numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();
}
/**
* It gets the patterns that are the identifiers of the given equivalence classes
* @param equivalenceClasses The set of equivalence classes from where we want
* to obtain their class identifiers
* @return
*/
private Collection<Pattern> getPatterns(List<EquivalenceClass> equivalenceClasses) {
ArrayList<Pattern> patterns = new ArrayList<Pattern>();
for (EquivalenceClass equivalenceClass : equivalenceClasses) {
Pattern frequentPattern = equivalenceClass.getClassIdentifier();
patterns.add(frequentPattern);
}
return patterns;
}
public String printStatistics() {
StringBuilder sb = new StringBuilder(200);
sb.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
sb.append(getRunningTime());
sb.append(" ms\n");
sb.append(" Frequent sequences count : ");
sb.append(numberOfFrequentPatterns);
sb.append('\n');
sb.append(" Join count : ");
sb.append(joinCount);
sb.append('\n');
sb.append(" Max memory (mb):");
sb.append(MemoryLogger.getInstance().getMaxMemory());
sb.append('\n');
sb.append(saver.print());
sb.append("\n===================================================\n");
return sb.toString();
}
public int getNumberOfFrequentPatterns() {
return numberOfFrequentPatterns;
}
/**
* It gets the time spent by the algoritm in its execution.
* @return
*/
public long getRunningTime() {
return (end - start);
}
/**
* It gets the minimum relative support, i.e. the minimum number of database
* sequences where a pattern has to appear
* @return
*/
public double getMinSupRelative() {
return minSupRelative;
}
/**
* It clears all the attributes of AlgoPrefixSpan class
*/
public void clear() {
frequentItems.clear();
abstractionCreator = null;
if (saver != null) {
saver.clear();
saver = null;
}
}
/**
*
* The actual method for extracting frequent sequences. This method it starts
* with both the frequent 1-patterns and 2-patterns already found. Besides, it
* resolves each equivalence class formed by the 1-patterns independently.
*
* @param database The original database
* @param candidateGenerator The candidate generator used by the algorithm
* SPADE
* @param minSupportCount The minimum relative support
* @param dfs Flag for indicating if we want a depth first search. If false,
* we indicate that we want a breath-first search.
* @param keepPatterns flag indicating if we are interested in keeping the
* output of the algorithm
* @param verbose Flag for debugging purposes
*/
protected void runSPADEFromSize2PatternsParallelized(SequenceDatabase database, CandidateGenerator candidateGenerator, long minSupportCount, boolean dfs, boolean keepPatterns, boolean verbose) {
frequentItems = database.frequentItems();
Collection<Pattern> size1Patterns = getPatterns(frequentItems);
saver.savePatterns(size1Patterns);
List<EquivalenceClass> size2EquivalenceClass = database.getSize2FrecuentSequences(minSupRelative);
Collection<Pattern> size2Sequences = getPatterns(size2EquivalenceClass);
saver.savePatterns(size2Sequences);
size2EquivalenceClass = null;
database = null;
FrequentPatternEnumeration frequentPatternEnumeration = new FrequentPatternEnumeration(candidateGenerator, minSupRelative, saver);
frequentPatternEnumeration.setFrequentPatterns(size1Patterns.size() + size2Sequences.size());
size1Patterns = null;
size2Sequences = null;
Runtime runtime = Runtime.getRuntime();
int numberOfAvailableProcessors = runtime.availableProcessors();
ExecutorService pool = Executors.newFixedThreadPool(numberOfAvailableProcessors);
ArrayList<Future<Void>> set = new ArrayList<Future<Void>>();
while (frequentItems.size() > 0) {
EquivalenceClass frequentItem = frequentItems.get(frequentItems.size() - 1);
if (verbose) {
System.out.println("Exploring " + frequentItem);
}
Callable<Void> callable = new FrequentPatternEnumerationFacade(frequentPatternEnumeration, frequentItem, dfs, keepPatterns, verbose, saver);
Future<Void> future = pool.submit(callable);
set.add(future);
frequentItems.remove(frequentItems.size() - 1);
// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
}
try {
int cont = 1;
System.err.println("There are " + set.size() + " equivalence classes and " + numberOfAvailableProcessors + " available processors");
while (!set.isEmpty()) {
for (int i = 0; i < set.size(); i++) {
Future<Void> future = set.get(i);
if (future.isDone()) {
System.err.println(cont++ + ":this thread is done.");
set.remove(i);
i--;
}
}
}
numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
pool.shutdown();
pool.awaitTermination(1, TimeUnit.DAYS);
} catch (Exception e) {
System.err.println("Problems with the concurrency!!");
e.printStackTrace();
}
}
/**
*
* The actual method for extracting frequent sequences. This method it starts
* with both the frequent 1-patterns and 2-patterns already found. Besides, it
* resolves each equivalence class formed by the 1-patterns independently.
*
* @param database The original database
* @param candidateGenerator The candidate generator used by the algorithm
* SPADE
* @param minSupportCount The minimum relative support
* @param dfs Flag for indicating if we want a depth first search. If false,
* we indicate that we want a breath-first search.
* @param keepPatterns flag indicating if we are interested in keeping the
* output of the algorithm
* @param verbose Flag for debugging purposes
*/
protected void runSPADEFromSize2PatternsParallelized2(SequenceDatabase database, CandidateGenerator candidateGenerator, long minSupportCount, boolean dfs, boolean keepPatterns, boolean verbose) {
frequentItems = database.frequentItems();
Collection<Pattern> size1Sequences = getPatterns(frequentItems);
saver.savePatterns(size1Sequences);
List<EquivalenceClass> size2EquivalenceClasses = database.getSize2FrecuentSequences(minSupRelative);
Collection<Pattern> size2Sequences = getPatterns(size2EquivalenceClasses);
saver.savePatterns(size2Sequences);
numberOfFrequentPatterns = size1Sequences.size() + size2Sequences.size();
size2EquivalenceClasses = null;
database = null;
Runtime runtime = Runtime.getRuntime();
ExecutorService pool = Executors.newFixedThreadPool(runtime.availableProcessors());
Set<Future<Void>> set = new LinkedHashSet<Future<Void>>();
ArrayList<FrequentPatternEnumeration> enumerates = new ArrayList<FrequentPatternEnumeration>();
while (frequentItems.size() > 0) {
EquivalenceClass frequentAtom = frequentItems.get(frequentItems.size() - 1);
if (verbose) {
System.out.println("Exploring " + frequentAtom);
}
FrequentPatternEnumeration frequentPatternEnumeration = new FrequentPatternEnumeration(candidateGenerator, minSupRelative, saver);
enumerates.add(frequentPatternEnumeration);
Callable<Void> callable = new FrequentPatternEnumerationFacade(frequentPatternEnumeration, frequentAtom, dfs, keepPatterns, verbose, saver);
Future<Void> future = pool.submit(callable);
set.add(future);
frequentItems.remove(frequentItems.size() - 1);
// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
}
try {
pool.shutdown();
pool.awaitTermination(1, TimeUnit.DAYS);
} catch (Exception e) {
System.err.println("Problems with the concurrency!!");
}
FrequentPatternEnumeration fpe = new FrequentPatternEnumeration(candidateGenerator, minSup, saver);
numberOfFrequentPatterns += fpe.getFrequentPatterns();
// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
}
}