package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalsequentialpatterns.AlgoSeqDim;
/**
* This is an implementation of the PrefixSpan algorithm by Pei et al. (2001) that uses
* optimizations such as pseudo-projection and is designed specifically to be
* used with the multi-dimensional sequential pattern mining algorithm SeqDim. Another
* implementation of PrefixSpan, more optimized and intended for general use, can be found in the package:
* ca.pfv.spmf.sequential_patterns/bide_and_prefixspan
*
* @see Sequence
* @see SequenceDatabase
* @see AlgoSeqDim
* @author Philippe Fournier-Viger
*/
public class AlgoPrefixSpanMDSPM extends AbstractAlgoPrefixSpan {

    // Result container for the frequent sequential patterns; replaced on every run.
    private Sequences patterns = null;

    // Statistics: System.currentTimeMillis() readings taken right before and
    // right after the latest mining pass.
    private long startTime;
    private long endTime;

    // Minimum support exactly as supplied by the caller; runAlgorithm() multiplies
    // it by the database size to obtain an absolute threshold.
    private final double minsup;

    // Minimum support expressed as a number of sequences (a computed value of 0
    // is replaced by 1 in runAlgorithm()).
    private int minsuppRelative;

    /**
     * Creates the algorithm with the given support threshold.
     * @param minsup minimum support threshold; it is multiplied by the number of
     *               sequences of the database when the algorithm is run
     */
    public AlgoPrefixSpanMDSPM(double minsup) {
        this.minsup = minsup;
    }

    /**
     * Returns the minsup threshold that was passed to the constructor.
     * @return the threshold as a double value
     */
    public double getMinSupp() {
        return minsup;
    }

    /**
     * Runs the algorithm on a sequence database.
     * @param database a sequence database
     * @return the sequential patterns found (Sequences)
     */
    public Sequences runAlgorithm(SequenceDatabase database) {
        // start from an empty result container
        patterns = new Sequences("FREQUENT SEQUENTIAL PATTERNS");

        // translate the user threshold into a number of sequences;
        // a resulting threshold of 0 is replaced by 1
        int absoluteThreshold = (int) Math.ceil(minsup * database.size());
        this.minsuppRelative = (absoluteThreshold == 0) ? 1 : absoluteThreshold;

        // time the mining pass itself
        startTime = System.currentTimeMillis();
        prefixSpan(database);
        endTime = System.currentTimeMillis();

        return patterns;
    }

    /**
     * Main method of PrefixSpan: finds the frequent items, builds the initial
     * pseudo-sequence database without the infrequent items, and then grows
     * each frequent item recursively.
     * @param database a sequence database
     */
    private void prefixSpan(SequenceDatabase database) {
        // First scan: item -> IDs of the sequences that contain it.
        Map<ItemSimple, Set<Integer>> sequenceIdsByItem = calculateSupportOfItems(database);

        // Build the initial pseudo-database. Infrequent items are dropped from
        // every sequence (they cannot belong to a frequent pattern), and
        // sequences that end up empty are left out.
        PseudoSequenceDatabase initialDatabase = new PseudoSequenceDatabase();
        for (Sequence original : database.getSequences()) {
            Sequence pruned = original.cloneSequenceMinusItems(sequenceIdsByItem, minsuppRelative);
            if (pruned.size() == 0) {
                continue;
            }
            initialDatabase.addSequence(new PseudoSequence(0, pruned, 0, 0));
        }

        // Every frequent item is a pattern of one item; record it and try to
        // extend it.
        for (Entry<ItemSimple, Set<Integer>> itemEntry : sequenceIdsByItem.entrySet()) {
            Set<Integer> supportingIds = itemEntry.getValue();
            if (supportingIds.size() < minsuppRelative) {
                // infrequent item: nothing to do
                continue;
            }
            ItemSimple frequentItem = itemEntry.getKey();

            // project the initial database on this item
            PseudoSequenceDatabase projectedDatabase =
                    buildProjectedContext(frequentItem, initialDatabase, false);

            // prefix made of this single item, tagged with the IDs of the
            // sequences that contain it
            Sequence prefix = new Sequence(0);
            prefix.addItemset(new Itemset(frequentItem, 0));
            prefix.setSequencesID(supportingIds);

            // save the pattern, then look for larger ones starting with it
            patterns.addSequence(prefix, 1);
            recursion(prefix, 2, projectedDatabase);
        }
    }

    /**
     * Calculates, for each item of a sequence database, the set of IDs of the
     * sequences in which it appears.
     * @param database a sequence database
     * @return a map from each item (key) to the set of IDs of the sequences
     *         (value) containing that item
     */
    private Map<ItemSimple, Set<Integer>> calculateSupportOfItems(SequenceDatabase database) {
        Map<ItemSimple, Set<Integer>> sequenceIdsByItem = new HashMap<ItemSimple, Set<Integer>>();
        // IDs of the items already handled for the current sequence ID
        Set<Integer> seenItemIds = new HashSet<Integer>();
        // the sequence at which seenItemIds was last reset
        Sequence previous = null;

        for (Sequence current : database.getSequences()) {
            // consecutive sequences sharing the same ID share the "seen" set
            boolean sameIdAsPrevious = previous != null && previous.getId() == current.getId();
            if (!sameIdAsPrevious) {
                seenItemIds.clear();
                previous = current;
            }

            for (Itemset itemset : current.getItemsets()) {
                for (ItemSimple item : itemset.getItems()) {
                    // add() returns false when the item ID was already recorded
                    // for this sequence ID: skip it in that case
                    if (!seenItemIds.add(item.getId())) {
                        continue;
                    }
                    Set<Integer> ids = sequenceIdsByItem.get(item);
                    if (ids == null) {
                        // first time this item is met: create its ID set
                        ids = new HashSet<Integer>();
                        sequenceIdsByItem.put(item, ids);
                    }
                    ids.add(current.getId());
                }
            }
        }
        return sequenceIdsByItem;
    }

    /**
     * Creates a projected database by pseudo-projection.
     * @param item the item to use to make the pseudo-projection
     * @param database the current pseudo-sequence database
     * @param inSuffix only occurrences of the item in itemsets whose
     *        isCutAtLeft() value equals this flag are used
     * @return the projected database
     */
    private PseudoSequenceDatabase buildProjectedContext(ItemSimple item, PseudoSequenceDatabase database, boolean inSuffix) {
        PseudoSequenceDatabase projected = new PseudoSequenceDatabase();

        for (PseudoSequence sequence : database.getPseudoSequences()) {
            for (int i = 0; i < sequence.size(); i++) {
                // position of the item in itemset i, or -1 if it is absent
                int position = sequence.indexOf(i, item.getId());
                if (position == -1 || sequence.isCutAtLeft(i) != inSuffix) {
                    continue;
                }

                PseudoSequence remainder;
                if (position != sequence.getSizeOfItemsetAt(i) - 1) {
                    // the item is not the last one of its itemset:
                    // cut inside itemset i, right after the item
                    remainder = new PseudoSequence(sequence.getAbsoluteTimeStamp(i),
                            sequence, i, position + 1);
                } else if (i != sequence.size() - 1) {
                    // the item ends itemset i and another itemset follows:
                    // start at the beginning of the next itemset
                    remainder = new PseudoSequence(sequence.getAbsoluteTimeStamp(i),
                            sequence, i + 1, 0);
                } else {
                    // the item is the very last one of the sequence:
                    // nothing remains after it
                    continue;
                }

                // keep only non-empty projections
                if (remainder.size() > 0) {
                    projected.addSequence(remainder);
                }
            }
        }
        return projected;
    }

    /**
     * Recursive method that grows a prefix of size >= 1 to find larger
     * sequential patterns.
     * @param prefix the prefix
     * @param k the size of the current prefix + 1 (only passed on, incremented,
     *          to the next recursive call)
     * @param database the pseudo-sequence database projected on the prefix
     */
    private void recursion(Sequence prefix, int k, PseudoSequenceDatabase database) {
        // A pair is an item plus a boolean telling whether the item appears in
        // an itemset that is cut at its left.
        Set<Pair> candidates = findAlllPairsAndCountTheirSupport(database.getPseudoSequences());

        for (Pair candidate : candidates) {
            if (candidate.getCount() < minsuppRelative) {
                // infrequent pair: it cannot extend the prefix
                continue;
            }

            // extend the prefix: inside its last itemset for a postfix pair,
            // as a new itemset otherwise
            Sequence extended = candidate.isPostfix()
                    ? appendItemToPrefixOfSequence(prefix, candidate.getItem())
                    : appendItemToSequence(prefix, candidate.getItem(), candidate.getTimestamp());

            // database projected on the extended prefix
            PseudoSequenceDatabase projectedContext =
                    buildProjectedContext(candidate.getItem(), database, candidate.isPostfix());

            // the pattern that is stored is a copy, tagged with the IDs of the
            // sequences collected by the pair
            Sequence pattern = extended.cloneSequence();
            pattern.setSequencesID(candidate.getSequencesID());
            patterns.addSequence(pattern, pattern.size());

            // try to grow the new pattern further
            recursion(pattern, k + 1, projectedContext);
        }
    }

    /**
     * Collects all the pairs of a list of pseudo-sequences together with the
     * IDs of the sequences in which they appear. A pair is an item and a
     * boolean indicating if the item appears in an itemset cut at its left.
     * This is for k > 1.
     * @param sequences the pseudo-sequences to scan
     * @return the set of Pair objects found (the key set of the internal map)
     */
    protected Set<Pair> findAlllPairsAndCountTheirSupport(List<PseudoSequence> sequences) {
        // canonical instance of each distinct pair (key and value are the same object)
        Map<Pair, Pair> canonicalPairs = new HashMap<Pair, Pair>();
        // pairs already handled for the pseudo-sequence being scanned
        Set<Pair> seenInCurrent = new HashSet<Pair>();
        // the pseudo-sequence at which seenInCurrent was last reset
        PseudoSequence previous = null;

        for (PseudoSequence current : sequences) {
            // reset whenever the object differs from the previous one
            // (this compares references, not sequence IDs)
            if (previous != current) {
                seenInCurrent.clear();
                previous = current;
            }

            for (int i = 0; i < current.size(); i++) {
                for (int j = 0; j < current.getSizeOfItemsetAt(i); j++) {
                    ItemSimple item = current.getItemAtInItemsetAt(j, i);
                    Pair candidate = new Pair(false, current.isCutAtLeft(i), item);
                    if (seenInCurrent.contains(candidate)) {
                        // already handled for this pseudo-sequence
                        continue;
                    }

                    // reuse the instance already registered for this pair,
                    // or register the new one
                    Pair canonical = canonicalPairs.get(candidate);
                    if (canonical == null) {
                        canonical = candidate;
                        canonicalPairs.put(candidate, candidate);
                    }

                    seenInCurrent.add(canonical);
                    // remember that this sequence contains the pair
                    canonical.getSequencesID().add(current.getId());
                }
            }
        }
        return canonicalPairs.keySet();
    }

    /**
     * Creates a copy of a sequence and appends a given item to it as a new
     * itemset.
     * @param prefix the sequence
     * @param item the item
     * @param timestamp a timestamp (ignored: the new itemset is always created
     *        with the value 0)
     * @return the new sequence
     */
    private Sequence appendItemToSequence(Sequence prefix, ItemSimple item, long timestamp) {
        Sequence extended = prefix.cloneSequence();
        Itemset appended = new Itemset(item, 0);
        extended.addItemset(appended);
        return extended;
    }

    /**
     * Creates a copy of a sequence and adds a given item to the last itemset
     * of that copy.
     * @param prefix the sequence
     * @param item the item
     * @return the new sequence
     */
    private Sequence appendItemToPrefixOfSequence(Sequence prefix, ItemSimple item) {
        Sequence extended = prefix.cloneSequence();
        // the item goes into the last itemset of the copy
        int lastIndex = extended.size() - 1;
        extended.get(lastIndex).addItem(item);
        return extended;
    }

    /**
     * Prints statistics about the latest execution of this algorithm.
     * @param databaseSize the number of sequences in the original sequence database.
     */
    public void printStatistics(int databaseSize) {
        StringBuilder report = new StringBuilder(200);
        report.append("============= PREFIXSPAN - STATISTICS =============\n Total time ~ ")
              .append(endTime - startTime)
              .append(" ms\n")
              .append(" Frequent sequences count : ")
              .append(patterns.sequenceCount)
              .append('\n')
              .append(patterns.toString(databaseSize))
              .append("===================================================\n");
        String text = report.toString();
        System.out.println(text);
    }
}