package ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan_with_strings;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.input.sequence_database_list_strings.Sequence;
import ca.pfv.spmf.input.sequence_database_list_strings.SequenceDatabase;
import ca.pfv.spmf.tools.MemoryLogger;
/***
* This is an implementation of the BIDE+ algorithm that is optimized to take
* sequences of strings as input instead of sequences of integers.
* <br/><br/>
*
* In this file, I have tried to put some comments about how the algorithm works.
* But if one wants to understand the algorithm, he should read the paper
* by Wang et al. first because this algorithm is quite complex.
* <br/><br/>
*
* In a future version of SPMF, it is planned to remove this package and to provide a more general
* mechanism for handling strings in sequences that would work for all algorithms
* that take sequences as input. But this has not been done yet.
*
@see Sequence
@see SequenceDatabase
@see SequentialPattern
@see SequentialPatterns
@see PairBIDE
@see PseudoSequenceBIDE
* @author Philippe Fournier-Viger
*/
public class AlgoBIDEPlus_withStrings {

	// for statistics: start and end time of the last execution
	private long startTime;
	private long endTime;

	// the number of closed patterns found
	private int patternCount = 0;

	// absolute minimum support (a number of sequences)
	private int minsuppAbsolute;

	// object to write the output file
	BufferedWriter writer = null;

	// For BIDE+, we have to keep a pointer to the original database
	// because the backscan-pruning and backward-extension checks re-scan it.
	private List<PseudoSequenceBIDE> initialContext = null;

	/**
	 * Default constructor
	 */
	public AlgoBIDEPlus_withStrings() {
	}

	/**
	 * Run the algorithm.
	 * @param database a sequence database
	 * @param outputPath an output file path
	 * @param minsup a minimum support as an integer representing a number of sequences
	 * @throws IOException exception if error while writing the file
	 */
	public void runAlgorithm(SequenceDatabase database, String outputPath, int minsup) throws IOException {
		// save the minimum support
		this.minsuppAbsolute = minsup;
		// reset the number of patterns found
		patternCount = 0;
		// reset the utility to check the memory usage
		MemoryLogger.getInstance().reset();
		// object to write the output file
		writer = new BufferedWriter(new FileWriter(outputPath));
		try {
			// save start time for stats
			startTime = System.currentTimeMillis();
			// run the algorithm
			bide(database);
			// save end time for stats
			endTime = System.currentTimeMillis();
		} finally {
			// BUG FIX: close the writer in a finally block so that the file
			// handle is released and buffered output flushed even if bide() throws.
			writer.close();
		}
	}

	/**
	 * This is the main method for the BIDE+ algorithm.
	 * @param database a sequence database
	 * @throws IOException exception if some error occurs while writing the output file.
	 */
	private void bide(SequenceDatabase database) throws IOException {
		// The algorithm first scans the database to find all frequent items
		// and notes the sequences in which these items appear.
		// This is stored in a map: Key: item  Value: IDs of sequences containing the item
		Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database);

		// WE CONVERT THE DATABASE TO A PSEUDO-DATABASE, AND REMOVE
		// THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM
		// WILL NOT CONSIDER THEM ANYMORE. (OPTIMIZATION : OCTOBER-08 )

		// we create a pseudo-database
		initialContext = new ArrayList<PseudoSequenceBIDE>();
		// for each sequence in the original database
		for (Sequence sequence : database.getSequences()) {
			// make a copy of the sequence but remove infrequent items
			Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute);
			if (optimizedSequence.size() != 0) {
				// if the optimized sequence is not empty, add it to the new database
				initialContext.add(new PseudoSequenceBIDE(optimizedSequence, 0, 0));
			}
		}

		// For each item
		for (Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()) {
			// if the item is frequent
			if (entry.getValue().size() >= minsuppAbsolute) {
				// build the projected context with that item
				String item = entry.getKey();
				List<PseudoSequenceBIDE> projectedContext = buildProjectedContext(item, initialContext, false);

				// Create the prefix for the projected database with that item
				SequentialPattern prefix = new SequentialPattern(0);
				prefix.addItemset(new Itemset(item));
				prefix.setSequencesID(entry.getValue());

				// variable to store the largest support of patterns
				// that will be found starting with this prefix
				int supportSuccessors = 0;

				// We recursively try to extend the prefix
				// if it respects the backscan pruning condition (see BIDE paper for details).
				if (!checkBackScanPruning(prefix)) {
					// recursive call
					supportSuccessors = recursion(prefix, projectedContext);
				}

				// Finally, because this prefix has support > minsup
				// and passed the backscan pruning,
				// we check if it has no successor with the same support
				// (a forward extension)
				// IF no forward extension
				if (prefix.getAbsoluteSupport() != supportSuccessors) {
					// IF there is also no backward extension
					if (!checkBackwardExtension(prefix)) {
						// the pattern is closed and we save it
						savePattern(prefix);
					}
				}
			}
		}
		// check memory usage
		MemoryLogger.getInstance().checkMemory();
	}

	/**
	 * This is the "backscan-pruning" strategy described in the BIDE+
	 * paper to avoid extending some prefixes that are guaranteed to not
	 * generate a closed pattern (see the BIDE+ paper for details).
	 *
	 * @param prefix the current prefix
	 * @return boolean true if we should not extend the prefix
	 */
	private boolean checkBackScanPruning(SequentialPattern prefix) {
		// See the BIDE+ paper for details about this method.
		// For each item occurrence that can be generated with this prefix:
		for (int i = 0; i < prefix.getItemOccurencesTotalCount(); i++) {
			// (1) For each i, we construct the list of semi-maximum periods.
			List<PseudoSequenceBIDE> semimaximumPeriods = new ArrayList<PseudoSequenceBIDE>();
			for (PseudoSequenceBIDE sequence : initialContext) {
				if (prefix.getSequencesID().contains(sequence.getId())) {
					PseudoSequenceBIDE period = sequence.getIthSemiMaximumPeriodOfAPrefix(prefix.getItemsets(), i);
					if (period != null) {
						semimaximumPeriods.add(period);
					}
				}
			}
			// (2) check if an element of the semi-max periods has the same frequency as the prefix.
			Set<PairBIDE> pairs = findAllFrequentPairsForBackwardExtensionCheck(prefix, semimaximumPeriods, i);
			for (PairBIDE pair : pairs) {
				if (pair.getCount() == prefix.getAbsoluteSupport()) {
					return true;
				}
			}
		}
		return false;
	}

	/**
	 * Method to check if a prefix has a backward-extension (see BIDE+ article for full details).
	 * This method does it a little bit differently than the BIDE+ article since
	 * we iterate with i on elements of the prefix instead of iterating with
	 * an i on the itemsets of the prefix. But the idea is the same!
	 * @param prefix the current prefix
	 * @return boolean true, if there is a backward extension
	 */
	private boolean checkBackwardExtension(SequentialPattern prefix) {
		// We check for an S-extension
		for (int i = 0; i < prefix.getItemOccurencesTotalCount(); i++) {
			// (1) For each i, we build the list of maximum periods
			List<PseudoSequenceBIDE> maximumPeriods = new ArrayList<PseudoSequenceBIDE>();
			// for each sequence in the original database
			for (PseudoSequenceBIDE sequence : initialContext) {
				// if the prefix appears in this sequence
				if (prefix.getSequencesID().contains(sequence.getId())) {
					// get the ith maximum period
					PseudoSequenceBIDE period = sequence.getIthMaximumPeriodOfAPrefix(prefix.getItemsets(), i);
					// if the period is not null
					if (period != null) {
						// we add it to the list of maximum periods
						maximumPeriods.add(period);
					}
				}
			}
			// (2) check if an element from the maximum periods has the same support as the prefix.
			for (PairBIDE pair : findAllFrequentPairsForBackwardExtensionCheck(prefix, maximumPeriods, i)) {
				// if there is an extension with the same support
				if (pair.getCount() == prefix.getAbsoluteSupport()) {
					// the prefix will not be closed and we return true
					return true;
				}
			}
		}
		return false; // no backward extension
	}

	/**
	 * Method to find all frequent items in a list of maximum periods.
	 * @param prefix the current prefix
	 * @param maximumPeriods a list of maximum periods
	 * @param iPeriod the index of the item occurrence in the prefix that these periods correspond to
	 * @return a set of pairs indicating the support of items (note that a pair distinguishes
	 * between items in a postfix, prefix...).
	 */
	protected Set<PairBIDE> findAllFrequentPairsForBackwardExtensionCheck(
			SequentialPattern prefix, List<PseudoSequenceBIDE> maximumPeriods, int iPeriod) {
		// Create a Map of pairs to store the pairs
		Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>();
		// NEW CODE 2010-02-04
		String itemI = prefix.getIthItem(iPeriod); // item at position iPeriod
		String itemIm1 = null; // item at position iPeriod - 1
		if (iPeriod > 0) {
			itemIm1 = prefix.getIthItem(iPeriod - 1);
		}
		// END NEW

		// for each maximum period
		for (PseudoSequenceBIDE period : maximumPeriods) {
			// for each itemset in that period
			for (int i = 0; i < period.size(); i++) {
				// NEW
				boolean sawI = false;   // whether itemI was seen in the current itemset
				boolean sawIm1 = false; // whether itemI-1 was seen before the current position
				// END NEW

				// NEW march 20 2010 : check if itemI appears in the current itemset
				// (items are assumed sorted, so we can stop once past itemI)
				for (int j = 0; j < period.getSizeOfItemsetAt(i); j++) {
					String item = period.getItemAtInItemsetAt(j, i);
					if (item.equals(itemI)) {
						sawI = true;
					} else if (item.compareTo(itemI) > 0) {
						break;
					}
				}
				// END NEW

				for (int j = 0; j < period.getSizeOfItemsetAt(i); j++) {
					String item = period.getItemAtInItemsetAt(j, i);
					// NEW
					if (itemIm1 != null && item.equals(itemIm1)) {
						sawIm1 = true;
					}
					boolean isPrefix = period.isCutAtRight(i);
					boolean isPostfix = period.isPostfix(i);
					// END NEW

					// normal case
					PairBIDE paire = new PairBIDE(isPrefix, isPostfix, item);
					addPaire(mapPaires, period.getId(), paire);

					// NEW: special cases
					if (sawIm1) {
						PairBIDE paire2 = new PairBIDE(isPrefix, !isPostfix, item);
						addPaire(mapPaires, period.getId(), paire2);
					}
					if (sawI) {
						PairBIDE paire2 = new PairBIDE(!isPrefix, isPostfix, item);
						addPaire(mapPaires, period.getId(), paire2);
					}
					// END NEW
				}
			}
		}
		return mapPaires.keySet(); // return the set of pairs
	}

	/**
	 * Add a pair to the map of pairs and add a sequence ID to it.
	 * If the pair is already in the map, the id is added to the old pair.
	 * @param mapPaires the map of pairs
	 * @param seqID a sequence id
	 * @param paire a pair
	 */
	private void addPaire(Map<PairBIDE, PairBIDE> mapPaires, Integer seqID, PairBIDE paire) {
		// check if the pair is already in the map
		PairBIDE oldPaire = mapPaires.get(paire);
		// if not
		if (oldPaire == null) {
			// we add the new pair "paire" to the map
			mapPaires.put(paire, paire);
		} else {
			// otherwise we use the old one
			paire = oldPaire;
		}
		// we add the sequence ID to the pair
		paire.getSequencesID().add(seqID);
	}

	/**
	 * For each item, calculate the sequence ids of sequences containing that item.
	 * @param database the current sequence database
	 * @return Map of items to sequence IDs that contain each item
	 */
	private Map<String, Set<Integer>> findSequencesContainingItems(SequenceDatabase database) {
		// We use a map to store the sequence IDs where an item appears
		// Key : item   Value : a set of sequence IDs
		Map<String, Set<Integer>> mapSequenceID = new HashMap<String, Set<Integer>>();
		// for each sequence
		for (Sequence sequence : database.getSequences()) {
			// for each itemset
			for (List<String> itemset : sequence.getItemsets()) {
				// for each item
				for (String item : itemset) {
					// get the sequence IDs of this item until now
					Set<Integer> sequenceIDs = mapSequenceID.get(item);
					// if null, create a new set
					if (sequenceIDs == null) {
						sequenceIDs = new HashSet<Integer>();
						mapSequenceID.put(item, sequenceIDs);
					}
					// add the current sequence ID to the set
					sequenceIDs.add(sequence.getId());
				}
			}
		}
		return mapSequenceID; // return the map
	}

	/**
	 * Create a projected database by pseudo-projection.
	 * @param item The item to use to make the pseudo-projection
	 * @param database The current database.
	 * @param inSuffix This boolean indicates if the item "item" is part of a suffix or not.
	 * @return the projected database.
	 */
	private List<PseudoSequenceBIDE> buildProjectedContext(String item, List<PseudoSequenceBIDE> database, boolean inSuffix) {
		// The projected pseudo-database
		List<PseudoSequenceBIDE> sequenceDatabase = new ArrayList<PseudoSequenceBIDE>();
		// for each sequence
		for (PseudoSequenceBIDE sequence : database) {
			// for each itemset of the sequence
			for (int i = 0; i < sequence.size(); i++) {
				// check if the itemset contains the item that we use for the projection
				int index = sequence.indexOf(i, item);
				// if it does, and the current itemset's postfix status matches inSuffix
				if (index != -1 && sequence.isPostfix(i) == inSuffix) {
					if (index != sequence.getSizeOfItemsetAt(i) - 1) {
						// if this is not the last item of the itemset,
						// create a new pseudo sequence starting after the item
						PseudoSequenceBIDE newSequence = new PseudoSequenceBIDE(sequence, i, index + 1);
						if (newSequence.size() > 0) {
							// if the size of this pseudo sequence is greater than 0,
							// add it to the projected database.
							sequenceDatabase.add(newSequence);
						}
					} else if (i != sequence.size() - 1) {
						// if this is not the last itemset of the sequence,
						// create a new pseudo sequence starting at the next itemset
						PseudoSequenceBIDE newSequence = new PseudoSequenceBIDE(sequence, i + 1, 0);
						if (newSequence.size() > 0) {
							// if the size of this pseudo sequence is greater than 0,
							// add it to the projected database.
							sequenceDatabase.add(newSequence);
						}
					}
				}
			}
		}
		return sequenceDatabase; // the projected database
	}

	/**
	 * Method to recursively grow a given sequential pattern.
	 * @param prefix the current sequential pattern that we want to try to grow
	 * @param database the current projected sequence database
	 * @return the maximum support of patterns found by extending the prefix
	 *         (used by the caller for the forward-extension check)
	 * @throws IOException exception if there is an error writing to the output file
	 */
	private int recursion(SequentialPattern prefix, List<PseudoSequenceBIDE> database) throws IOException {
		// find frequent items of size 1 in the current projected database.
		Set<PairBIDE> pairs = findAllFrequentPairs(prefix, database);

		// we will keep track of the maximum support of patterns
		// that can be found with this prefix, to check
		// for forward extension when this method returns.
		int maxSupport = 0;

		// For each pair found (a pair is an item with a boolean indicating if it
		// appears in an itemset that is cut (a postfix) or not, and the sequence IDs
		// where it appears in the projected database).
		for (PairBIDE paire : pairs) {
			// if the item is frequent.
			if (paire.getCount() >= minsuppAbsolute) {
				// create the new postfix by appending this item to the prefix
				SequentialPattern newPrefix;
				if (paire.isPostfix()) {
					// we append it to the last itemset of the prefix (I-extension)
					newPrefix = appendItemToPrefixOfSequence(prefix, paire.getItem());
				} else {
					// else, we append it as a new itemset to the sequence (S-extension)
					newPrefix = appendItemToSequence(prefix, paire.getItem());
				}
				// build the projected database with this item
				List<PseudoSequenceBIDE> projectedContext = buildProjectedContext(paire.getItem(), database, paire.isPostfix());
				// set the sequence IDs of the new prefix
				newPrefix.setSequencesID(paire.getSequencesID());

				// variable to keep track of the maximum support of extensions
				// with this item and this prefix
				int maxSupportOfSuccessors = 0;

				// Apply the "backscan pruning" strategy (see BIDE+ paper)
				if (checkBackScanPruning(newPrefix) == false) {
					// make a recursive call to extend the prefix with this item
					// and generate other patterns starting with that prefix + item
					maxSupportOfSuccessors = recursion(newPrefix, projectedContext);
				}

				// check the forward extension for the prefix
				boolean noForwardSIExtension = newPrefix.getAbsoluteSupport() != maxSupportOfSuccessors;
				// if no forward extension
				if (noForwardSIExtension) {
					// check if there is a backward extension
					if (!checkBackwardExtension(newPrefix)) {
						// none, so we save the pattern
						savePattern(newPrefix);
					}
				}
				// record the largest support of patterns found starting
				// with this prefix until now
				if (newPrefix.getAbsoluteSupport() > maxSupport) {
					maxSupport = newPrefix.getAbsoluteSupport();
				}
			}
		}
		return maxSupport; // return the maximum support generated by extension of the prefix
	}

	/**
	 * Method to find all frequent items in a projected sequence database.
	 * @param prefix the current prefix
	 * @param sequences the set of sequences
	 * @return A set of pairs, where a pair is an item with (1) booleans indicating if it
	 * is in an itemset that is "cut" at left or right (prefix or postfix)
	 * and (2) the sequence IDs where it occurs.
	 */
	protected Set<PairBIDE> findAllFrequentPairs(SequentialPattern prefix, List<PseudoSequenceBIDE> sequences) {
		// We use a Map to store the pairs.
		Map<PairBIDE, PairBIDE> mapPairs = new HashMap<PairBIDE, PairBIDE>();
		// for each sequence
		for (PseudoSequenceBIDE sequence : sequences) {
			// for each itemset
			for (int i = 0; i < sequence.size(); i++) {
				// for each item
				for (int j = 0; j < sequence.getSizeOfItemsetAt(i); j++) {
					String item = sequence.getItemAtInItemsetAt(j, i);
					// create the pair corresponding to this item
					PairBIDE paire = new PairBIDE(sequence.isCutAtRight(i), sequence.isPostfix(i), item);
					// register this sequence ID for that pair.
					addPaire(mapPairs, sequence.getId(), paire);
				}
			}
		}
		// check the memory usage
		MemoryLogger.getInstance().checkMemory();
		return mapPairs.keySet(); // return the pairs.
	}

	/**
	 * This method creates a copy of the sequence and adds a given item
	 * as a new itemset to the sequence (S-extension).
	 * @param prefix the sequence
	 * @param item the item
	 * @return the new sequence
	 */
	private SequentialPattern appendItemToSequence(SequentialPattern prefix, String item) {
		SequentialPattern newPrefix = prefix.cloneSequence();
		newPrefix.addItemset(new Itemset(item)); // create a new itemset
		return newPrefix;
	}

	/**
	 * This method creates a copy of the sequence and adds a given item
	 * to the last itemset of the sequence (I-extension).
	 * @param prefix the sequence
	 * @param item the item
	 * @return the new sequence
	 */
	private SequentialPattern appendItemToPrefixOfSequence(SequentialPattern prefix, String item) {
		SequentialPattern newPrefix = prefix.cloneSequence();
		Itemset itemset = newPrefix.get(newPrefix.size() - 1); // add to the last itemset
		itemset.addItem(item);
		return newPrefix;
	}

	/**
	 * This method saves a sequential pattern to the output file.
	 * The format is: the items of each itemset separated by spaces, each itemset
	 * terminated by "-1", followed by " #SUP: " and the absolute support.
	 * @param prefix the pattern to be saved.
	 * @throws IOException exception if error while writing the output file.
	 */
	private void savePattern(SequentialPattern prefix) throws IOException {
		// increase the number of patterns found
		patternCount++;
		StringBuilder r = new StringBuilder();
		for (Itemset itemset : prefix.getItemsets()) {
			for (String item : itemset.getItems()) {
				r.append(item);
				r.append(' ');
			}
			r.append("-1 ");
		}
		r.append(" #SUP: ");
		r.append(prefix.getSequencesID().size());
		writer.write(r.toString());
		writer.newLine();
	}

	/**
	 * Print statistics about the algorithm execution to System.out.
	 * @param size the size of the database (currently unused, kept for interface compatibility)
	 */
	public void printStatistics(int size) {
		StringBuilder r = new StringBuilder(200);
		r.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
		r.append(endTime - startTime);
		r.append(" ms\n");
		r.append(" Closed sequential patterns count : ");
		r.append(patternCount);
		r.append('\n');
		r.append(" Max memory (mb):");
		r.append(MemoryLogger.getInstance().getMaxMemory());
		r.append('\n');
		r.append("===================================================\n");
		System.out.println(r.toString());
	}
}