package ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP;
/*
* Copyright Antonio Gomariz Peñalver 2013
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.BitSet;
import java.util.Map;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.items.Item;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.items.PseudoSequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.items.PseudoSequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.items.Sequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.items.SequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.items.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.savers.Saver;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.savers.SaverIntoFile;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixSpan_AGP.savers.SaverIntoMemory;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the PrefixSpan algorithm by Antonio Gomariz Peñalver(AGP).
* PrefixSpan was proposed by Pei et al. 2001. This algorithm was inspired in the
* implementation of SPMF and, from it, I changed and optimized some things.
* <br/><br/>
*
* NOTE: This implementation saves the pattern to a file as soon
* as they are found or can keep the pattern into memory if no output path is provided by the user.
*
* @author Antonio Gomariz Peñalver
*/
public class AlgoPrefixSpan_AGP {
/**
* the minimum support threshold as a value in [0,1]
*/
protected double minSupRelative;
/**
* The minimum support relative threshold, i.e. the minimum number of sequences
* where the patterns have to be
*/
protected double minSupAbsolute;
/**
* original sequential database to be used for sequential patterns
* extraction
*/
protected SequenceDatabase originalDataset;
/**
* Saver variable to decide where the user want to save the results, if it the case
*/
Saver saver = null;
/**
* Start and end points in order to calculate the overall time taken by the algorithm
*/
protected long start, end;
/**
* The abstraction creator
*/
private AbstractionCreator abstractionCreator;
/**
* Number of frequent patterns found by the algorithm
*/
private int numberOfFrequentPatterns = 0;
/**
* Standard constructor. It takes the minimum support threshold (from 1 up to 0) and an abstraction creator
* @param minsupRelative the relative minimum support threshold
* @param creator the abstraction creator
*/
public AlgoPrefixSpan_AGP(double minsupRelative, AbstractionCreator creator) {
this.minSupRelative = minsupRelative;
this.abstractionCreator = creator;
}
/**
* Method that starts the execution of the algorithm.
* @param database The original database in which we apply PrefixSpan
* @param keepPatterns Flag indicating if the user want to keep the frequent
* patterns or he just want the amount of them
* @param verbose Flag for debugging purposes
* @param outputFilePath Path pointing out to the file where the output,
* composed of frequent patterns, has to be kept. If, conversely, this
* parameter is null, we understand that the user wants the output in the main memory
* @param outputSequenceIdentifiers if true, sequences ids will be output for each pattern
* @throws IOException
*/
public void runAlgorithm(SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
//calculation of the absolute minimum support
minSupAbsolute = (int) Math.ceil(minSupRelative * database.size());
if (this.minSupAbsolute == 0) { // protection
this.minSupAbsolute = 1;
}
// reset the stats about memory usage
MemoryLogger.getInstance().reset();
//keeping the starting time
start = System.currentTimeMillis();
//Starting PrefixSpanAlgorithm
prefixSpan(database, keepPatterns, verbose, outputFilePath, outputSequenceIdentifiers);
//keeping the ending time
end = System.currentTimeMillis();
//Search for frequent patterns: Finished
saver.finish();
}
/**
* Method that executes the first steps before calling the actual main
* method of PrefixSpan. In particular, the original database is fully
* converted to a pseudosequece database, removing the infrequent items
* that appeared in the original dabase
* @param database The original database
* @param keepPatterns Flag indicating if the user want to keep the frequent
* patterns or he just want the amount of them
* @param verbose Flag for debugging purposes
* @param outputFilePath Path pointing out to the file where the output,
* composed of frequent patterns, has to be kept. If, conversely, this
* parameter is null, we understand that the user wants the output in the main memory
* @param outputSequenceIdentifiers if true, sequences ids will be output for each pattern
* @throws IOException
*/
protected void prefixSpan(SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
//If we do no have any file path
if (outputFilePath == null) {
//The user wants to save the results in memory
saver = new SaverIntoMemory(outputSequenceIdentifiers);
} else {
//Otherwise, the user wants to save them in the given file
saver = new SaverIntoFile(outputFilePath,outputSequenceIdentifiers);
}
/*We get the map which relates the frequent items with their appearances
in the database*/
Map<Item, BitSet> mapSequenceID = database.getFrequentItems();
/*projection of the original database in order to obtain a pseudosequence
* database*/
PseudoSequenceDatabase pseudoDatabase = projectInitialDatabase(database, mapSequenceID, (int) minSupAbsolute);
//We initialize the class that is in charge of managing the main loop of PrefixSpan
RecursionPrefixSpan_AGP algorithm = new RecursionPrefixSpan_AGP(abstractionCreator, saver, (int) minSupAbsolute, pseudoDatabase, mapSequenceID);
//And we execute the actual algorithm
algorithm.execute(keepPatterns, verbose);
//Finally we update the number of frequent patterns that we found
numberOfFrequentPatterns = algorithm.numberOfFrequentPatterns();
// check the memory usage for statistics
MemoryLogger.getInstance().checkMemory();
}
/**
* Method to get the outlined information about the search for frequent
* sequences by means of PrefixSpan algorithm as a string.
* @return a string
*/
public String printStatistics() {
StringBuilder sb = new StringBuilder(200);
sb.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
sb.append(getRunningTime());
sb.append(" ms\n");
sb.append(" Frequent sequences count : ");
sb.append(numberOfFrequentPatterns);
sb.append('\n');
sb.append(" Max memory (mb):");
sb.append(MemoryLogger.getInstance().getMaxMemory());
sb.append('\n');
sb.append(saver.print());
sb.append('\n');
sb.append("===================================================\n");
return sb.toString();
}
/**
* Get the number of frequent patterns found.
* @return the number of frequent patterns.
*/
public int getNumberOfFrequentPatterns() {
return numberOfFrequentPatterns;
}
/**
* It gets the time spent by the algoritm in its execution.
* @return the time spent (long).
*/
public long getRunningTime() {
return (end - start);
}
/**
* It gets the absolute minimum support, i.e. the minimum number of database
* sequences where a pattern has to appear
* @return the minimum support (double)
*/
public double getAbsoluteMinSupport() {
return minSupAbsolute;
}
/**
* It projects the initial database converting each original sequence to
* pseudosequences in order to enable the later pseudoprojections in the
* main loop of PrefixSpan
* @param database The original Database
* @param mapSequenceID Map with all the items appearing in the original
* database, and a bitset pointing out in which sequences the items appear
* @param minSupportAbsolute The absolute minimum support
* @return
*/
private PseudoSequenceDatabase projectInitialDatabase(SequenceDatabase database, Map<Item, BitSet> mapSequenceID, long minSupportAbsolute) {
PseudoSequenceDatabase initialContext = new PseudoSequenceDatabase();
//For each database sequence
for (Sequence sequence : database.getSequences()) {
//The new pseudosequences are optimized, since do not have the infrequent items
Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minSupportAbsolute);
if (optimizedSequence.size() != 0) {
/*
* If after remove the infrequent items, we remove all the items
* of an original sequence, we insert an empty pseudosequence
* in order not to affect to the absolute minimum support
*/
PseudoSequence pseudoSequence = new PseudoSequence(0, optimizedSequence, 0, 0);
initialContext.addSequence(pseudoSequence);
}
}
return initialContext;
}
/**
* It clears all the attributes of AlgoPrefixSpan class
*/
public void clear() {
if (originalDataset != null) {
originalDataset.clear();
originalDataset = null;
}
if (saver != null) {
saver.clear();
saver = null;
}
abstractionCreator = null;
}
}