package ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP;
/*
* Copyright Antonio Gomariz Peñalver 2013
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.patterns.Pattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.savers.Saver;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.savers.SaverIntoFile;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.savers.SaverIntoMemory;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the SPAM algorithm. SPAM was proposed by Ayres
* in 2002.
* <br/><br/>
*
* NOTE: This implementation saves each pattern to a file as soon as it is
* found, or keeps the patterns in memory if no output path is provided by the user.
*
*
* @author Antonio Gomariz Peñalver
*/
public class AlgoSPAM_AGP {

    /**
     * The relative minimum support threshold given to the constructor
     * (documented there as a value from 0 up to 1).
     */
    protected double minSupRelative;
    /**
     * The absolute minimum support threshold, i.e. the minimum number of
     * sequences in which a pattern has to appear. It is derived from
     * {@link #minSupRelative} and the database size in
     * {@link #runAlgorithm(SequenceDatabase, boolean, boolean, String, boolean)}.
     */
    protected double minSupAbsolute;
    /**
     * Saver that decides where the results are stored (file or memory). It is
     * null until the algorithm is run and after {@link #clear()}.
     */
    Saver saver = null;
    /**
     * Start and end timestamps (milliseconds) used to compute the overall time
     * taken by the algorithm.
     */
    protected long start, end;
    /**
     * Equivalence classes whose class identifier is a frequent item. Null
     * until the algorithm has been run.
     */
    protected List<EquivalenceClass> frequentItems;
    /**
     * Number of frequent patterns found by the algorithm.
     */
    private int numberOfFrequentPatterns;

    /**
     * Constructor of the class that calls the SPAM algorithm.
     *
     * @param minsupRelative Minimum support (from 0 up to 1)
     */
    public AlgoSPAM_AGP(double minsupRelative) {
        this.minSupRelative = minsupRelative;
    }

    /**
     * Actual call to the SPAM algorithm. The output can be either kept or
     * ignored. Whenever we choose to keep the patterns found, we can keep them
     * in a file or in the main memory.
     *
     * @param database Original database in which we want to search for the
     * frequent patterns.
     * @param keepPatterns Flag indicating if we want to keep the output or not
     * @param verbose Flag for debugging purposes
     * @param outputFilePath Path of the file in which we want to store the
     * frequent patterns. If this value is null, we keep the patterns in the
     * main memory. This argument is taken into account just when keepPatterns
     * is activated.
     * @param outputSequenceIdentifiers if true, sequence identifiers will be
     * output for each pattern found
     * @throws IOException declared for the savers; thrown if the output cannot
     * be set up or written
     */
    public void runAlgorithm(SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
        if (outputFilePath == null) {
            // No file path: the user wants to keep the results in memory
            saver = new SaverIntoMemory(outputSequenceIdentifiers);
        } else {
            // Otherwise, the results go to the given file
            saver = new SaverIntoFile(outputFilePath, outputSequenceIdentifiers);
        }
        try {
            this.minSupAbsolute = (int) Math.ceil(minSupRelative * database.size());
            // Protection: a pattern must appear in at least one sequence, so a
            // zero or negative threshold is raised to 1.
            if (this.minSupAbsolute <= 0) {
                this.minSupAbsolute = 1;
            }
            // reset the stats about memory usage
            MemoryLogger.getInstance().reset();
            // keep the starting time
            start = System.currentTimeMillis();
            // run the SPAM algorithm
            runSPAM(database, (long) minSupAbsolute, keepPatterns, verbose);
        } finally {
            // Always record the ending time and let the saver finish, even if
            // the search failed, so that the saver is not left unfinished.
            end = System.currentTimeMillis();
            saver.finish();
        }
    }

    /**
     * The actual method for extracting frequent sequences.
     *
     * @param database The original database
     * @param minSupportAbsolute the absolute minimum support
     * @param keepPatterns flag indicating if we are interested in keeping the
     * output of the algorithm
     * @param verbose Flag for debugging purposes
     */
    protected void runSPAM(SequenceDatabase database, long minSupportAbsolute, boolean keepPatterns, boolean verbose) {
        // We get the equivalence classes formed by the frequent 1-patterns
        frequentItems = database.frequentItems();
        // Only when the output is wanted do we extract and keep the frequent
        // 1-patterns
        if (keepPatterns) {
            for (Pattern atom : getPatterns(frequentItems)) {
                saver.savePattern(atom);
            }
        }
        // We define the root class and insert the equivalence classes
        // corresponding to the frequent 1-patterns as its members
        EquivalenceClass rootClass = new EquivalenceClass(null);
        for (EquivalenceClass atom : frequentItems) {
            rootClass.addClassMember(atom);
        }
        // Initialization of the class that is in charge of finding the
        // frequent patterns; it uses the threshold received as argument
        FrequentPatternEnumeration_SPAM frequentPatternEnumeration = new FrequentPatternEnumeration_SPAM((double) minSupportAbsolute, saver);
        // We execute the search
        frequentPatternEnumeration.execute(rootClass, keepPatterns, verbose);
        // Once finished, we keep the number of frequent patterns that we found
        numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();
        // check the memory usage for statistics
        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * It gets the patterns that are the identifiers of the given equivalence
     * classes.
     *
     * @param equivalenceClasses The set of equivalence classes from which we
     * want to obtain their class identifiers
     * @return the class identifiers, in the same order as the given classes
     */
    private Collection<Pattern> getPatterns(List<EquivalenceClass> equivalenceClasses) {
        List<Pattern> patterns = new ArrayList<Pattern>(equivalenceClasses.size());
        for (EquivalenceClass equivalenceClass : equivalenceClasses) {
            patterns.add(equivalenceClass.getClassIdentifier());
        }
        return patterns;
    }

    /**
     * It builds a textual summary of the last execution: running time, number
     * of frequent sequences, maximum memory reported by the memory logger and,
     * when a saver is available, whatever the saver prints.
     *
     * @return the statistics as a string
     */
    public String printStatistics() {
        StringBuilder sb = new StringBuilder(200);
        sb.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
        sb.append(getRunningTime());
        sb.append(" ms\n");
        sb.append(" Frequent sequences count : ");
        sb.append(numberOfFrequentPatterns);
        sb.append('\n');
        sb.append(" Max memory (mb):");
        sb.append(MemoryLogger.getInstance().getMaxMemory());
        sb.append('\n');
        // The saver is null before the first run and after clear()
        if (saver != null) {
            sb.append(saver.print());
        }
        sb.append("\n===================================================\n");
        return sb.toString();
    }

    /**
     * It gets the number of frequent patterns found by the last execution.
     *
     * @return the number of frequent patterns
     */
    public int getNumberOfFrequentPatterns() {
        return numberOfFrequentPatterns;
    }

    /**
     * It gets the time spent by the algorithm in its execution.
     *
     * @return the total time, in milliseconds
     */
    public long getRunningTime() {
        return (end - start);
    }

    /**
     * It gets the minimum support expressed as the minimum number of database
     * sequences where a pattern has to appear.
     * <p>
     * NOTE(review): despite the method name, the value returned is the
     * absolute threshold ({@link #minSupAbsolute}), not the ratio given to the
     * constructor. This is kept as is because existing callers may rely on it.
     *
     * @return the minimum support, as a number of sequences
     */
    public double getMinSupRelative() {
        return minSupAbsolute;
    }

    /**
     * It clears all the attributes of the AlgoSPAM_AGP class. It is safe to
     * call before the algorithm has been run.
     */
    public void clear() {
        if (frequentItems != null) {
            frequentItems.clear();
        }
        if (saver != null) {
            saver.clear();
            saver = null;
        }
    }
}