package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.kmeans_for_fournier08.AlgoKMeansWithSupport;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.kmeans_for_fournier08.Cluster;
/**
* This is the original implementation of the Fournier-Viger algorithm (2008) for sequential
* pattern mining, which combines features from several algorithms and includes original features such
* as accepting items with double values. For details about this algorithm see:
* <br/><br/>
*
* Fournier-Viger, P., Nkambou, R & Mephu Nguifo, E. (2008), A Knowledge Discovery
* Framework for Learning Task Models from User Interactions in Intelligent Tutoring Systems.
* Proceedings of the 7th Mexican International Conference on Artificial Intelligence (MICAI 2008).
* LNAI 5317, Springer, pp. 765-778.
* <br/><br/>
*
* This implementation can either keep the result in memory and return it from
* runAlgorithm(SequenceDatabase), or save the result directly to a file if an output file path is provided.
*
* @see SequenceDatabase
* @see Sequence
* @see Sequences
* @see PseudoSequence
* @see PseudoSequenceDatabase
* @see Pair
* @see AlgoKMeansWithSupport
* @see AbstractAlgoPrefixSpan
* @author Philippe Fournier-Viger
*/
public class AlgoFournierViger08 extends AbstractAlgoPrefixSpan{
// The sequential patterns found when the result is kept in memory
// (set to null by runAlgorithm(database, outputFilePath), which writes to a file instead)
private Sequences patterns = null;
// number of sequential patterns found (reset at the start of each run,
// incremented each time a pattern is saved)
int patternCount =0;
// start time of latest execution (milliseconds, from System.currentTimeMillis())
private long startTime;
// end time of latest execution (milliseconds, from System.currentTimeMillis())
private long endTime;
// parameters of the algorithm, received by the constructor
private final double minInterval; // min time interval between two itemsets (c1)
private final double maxInterval; // max time interval between two itemsets (c2)
private final double minWholeInterval; // min time length of a seq. pattern (c3)
private final double maxWholeInterval; // max time length of a seq. pattern (c4)
private final double minsupp; // minimum support threshold (multiplied by the database size in runAlgorithm)
private final boolean findClosedPatterns; // find closed patterns or not
private int minsuppRelative; // minimum support as an integer (a number of sequences), computed in runAlgorithm
private boolean enableBackscanPruning; // use backscan pruning or not
// For performing the clustering, this algorithm needs an instance
// of KMeans with support.
private final AlgoKMeansWithSupport algoClustering;
// For BIDE+, we have to keep a pointer to the original database
// (rebuilt by isdb() without the infrequent items)
private PseudoSequenceDatabase initialDatabase = null;
// object to write the output file if the user wishes to write to a file;
// null when the result is kept in memory
BufferedWriter writer = null;
/**
 * Creates an instance of the algorithm and stores its mining parameters.
 *
 * @param minsupp minimum support; it is multiplied by the number of sequences of the
 *        database in runAlgorithm() to obtain the minimum number of sequences
 * @param minInterval minimum item interval between two adjacent items. (C1)
 * @param maxInterval maximum item interval between two adjacent items. (C2)
 * @param minWholeInterval minimum item interval between the head and tail of a sequence. (C3)
 * @param maxWholeInterval maximum item interval between the head and tail of a sequence (C4)
 * @param algoClustering algorithm for clustering
 * @param findClosedPatterns to mine only closed sequences
 * @param enableBackscanPruning true to activate the backscan pruning
 *        (only consulted when closed patterns are mined)
 * @throws IllegalArgumentException if minInterval > maxInterval,
 *         minWholeInterval > maxWholeInterval, minInterval > maxWholeInterval
 *         or maxInterval > maxWholeInterval
 */
public AlgoFournierViger08(double minsupp,
        double minInterval, double maxInterval,
        double minWholeInterval, double maxWholeInterval,
        AlgoKMeansWithSupport algoClustering,
        boolean findClosedPatterns, boolean enableBackscanPruning){
    // Reject inconsistent time constraints. IllegalArgumentException is a
    // RuntimeException, so callers catching RuntimeException are unaffected.
    if((minInterval > maxInterval) ||
            (minWholeInterval > maxWholeInterval) ||
            (minInterval > maxWholeInterval) ||
            (maxInterval > maxWholeInterval)){
        throw new IllegalArgumentException("Parameters are not valid!!!");
    }
    // Save the parameters in the fields of this class
    this.minInterval = minInterval;
    this.maxInterval = maxInterval;
    this.minWholeInterval = minWholeInterval;
    this.maxWholeInterval = maxWholeInterval;
    this.algoClustering = algoClustering;
    this.minsupp = minsupp;
    this.findClosedPatterns = findClosedPatterns;
    this.enableBackscanPruning = enableBackscanPruning;
}
/**
 * Run the algorithm and save the result to a file.
 * The output file is always closed and the writer field is always reset to null,
 * even if the mining fails, so that a later call to
 * runAlgorithm(SequenceDatabase) keeps its result in memory as expected.
 * @param database a sequence database
 * @param outputFilePath an output file
 * @throws IOException throw exception if error creating, writing or closing the output file
 */
public void runAlgorithm(SequenceDatabase database, String outputFilePath) throws IOException {
    // the user wants to save the result to a file: create the output file
    writer = new BufferedWriter(new FileWriter(outputFilePath));
    // no pattern is kept in memory in this mode
    patterns = null;
    try {
        // run the algorithm (patterns are written by savePattern())
        runAlgorithm(database);
    } finally {
        try {
            // close output file, whether or not the mining succeeded
            writer.close();
        } finally {
            // never leave a closed writer behind, even if close() itself fails
            writer = null;
        }
    }
}
/**
 * Run the algorithm. If no output writer is currently set, the patterns are
 * collected in memory; otherwise they are written through the writer.
 * @param database a sequence database
 * @return the set of sequential patterns (Sequences) kept in memory, or the current
 *         value of the patterns field (null when called by the file-based variant)
 * @throws IOException if an error occurs while writing a pattern to the output file
 */
public Sequences runAlgorithm(SequenceDatabase database) throws IOException {
    // no writer means that the result has to be kept in memory
    final boolean keepResultInMemory = (writer == null);
    if (keepResultInMemory) {
        patterns = new Sequences("FREQUENT SEQUENCES WITH TIME + CLUSTERING");
    }
    // restart the pattern counter for this execution
    patternCount = 0;

    // convert the minimum support into a number of sequences;
    // a threshold of 0 is replaced by 1
    final int sequenceCountThreshold = (int) Math.ceil(minsupp * database.size());
    this.minsuppRelative = (sequenceCountThreshold == 0) ? 1 : sequenceCountThreshold;

    // time the main method
    startTime = System.currentTimeMillis();
    isdb(database);
    endTime = System.currentTimeMillis();

    return patterns;
}
/**
 * The main method. According to the original author, it is inspired by
 * the method ISDB described in the article of Hirate & Yamana, with
 * additional modifications for clustering and for finding closed seq. patterns.
 * <br/>
 * Steps: (1) find the sequences containing each item, (2) rebuild the database as
 * pseudo-sequences without the infrequent items, (3) for each frequent item,
 * project the database, create the single-item prefix(es), try to grow them
 * recursively and save those that qualify.
 * @param originalDatabase The initial context.
 * @throws IOException exception if error writing to output file
 */
private void isdb(SequenceDatabase originalDatabase) throws IOException{
    // Key: item   Value: IDs of the sequences containing the item
    Map<ItemSimple, Set<Integer>> mapSequenceID = findSequencesContainingItems(originalDatabase);

    // Convert the database into a pseudo-database while dropping the infrequent
    // items, so that the algorithm does not consider them anymore.
    initialDatabase = new PseudoSequenceDatabase();
    for(Sequence original : originalDatabase.getSequences()){
        Sequence pruned = original.cloneSequenceMinusItems(mapSequenceID, minsuppRelative);
        // sequences that became empty are left out
        if(pruned.size() == 0){
            continue;
        }
        initialDatabase.addSequence(new PseudoSequence(0, pruned, 0, 0));
    }

    for(Entry<ItemSimple, Set<Integer>> entry : mapSequenceID.entrySet()){
        // only frequent items are used as starting prefixes
        if(entry.getValue().size() < minsuppRelative){
            continue;
        }
        ItemSimple item = entry.getKey();
        // valued items have their own projection method; others use the regular one
        PseudoSequenceDatabase[] projectedContexts = (item instanceof ItemValued)
                ? buildProjectedContextItemValued((ItemValued)item, initialDatabase, false, -1)
                : buildProjectedDatabase(item, initialDatabase, false, -1);

        // there may be several projected databases because of the clustering
        for(PseudoSequenceDatabase projectedDatabase : projectedContexts){
            Sequence prefix = new Sequence(0);
            Cluster cluster = projectedDatabase.getCluster();
            if(cluster != null){
                // the projected database is attached to a cluster:
                // the prefix holds an item describing that cluster
                ItemValued item2 = new ItemValued(entry.getKey().getId(),
                        cluster.getaverage(),
                        cluster.getLower(),
                        cluster.getHigher());
                prefix.addItemset(new Itemset(item2, 0));
                prefix.setSequencesID(cluster.getSequenceIDs());
            }else{
                // no cluster: the prefix holds the item itself
                prefix.addItemset(new Itemset(item, 0));
                prefix.setSequencesID(entry.getValue());
            }

            // largest support returned by the recursive extension of this prefix
            int maxSuccessorSupport = 0;
            // When mining closed patterns, the prefix is not extended if the
            // backscan pruning applies (see BIDE paper for details).
            boolean prunedByBackscan = findClosedPatterns && checkBackScanPruning(prefix);
            if(!prunedByBackscan){
                maxSuccessorSupport = projection(prefix, 2, projectedDatabase);
            }

            // the prefix itself is a pattern only if its time length is long enough
            if(!isMinWholeIntervalRespected(prefix)){
                continue;
            }
            // no forward extension: no successor has the same support as the prefix
            boolean noForwardSIExtension = !findClosedPatterns
                    || prefix.getAbsoluteSupport() != maxSuccessorSupport;
            // no backward extension
            boolean noBackwardExtension = !findClosedPatterns || !checkBackwardExtension(prefix);
            if(noForwardSIExtension && noBackwardExtension){
                // we found a sequence, so save it!
                savePattern(prefix);
            }
        }
    }
}
/**
 * Saves a sequential pattern, either in memory (when there is no output writer)
 * or as one line of the output file.
 * A line has the form: for each itemset, its timestamp between '<' and '>',
 * its items separated by single spaces, then "-1 "; the line ends with
 * " #SUP: " followed by the number of sequence IDs of the pattern.
 * @param prefix the pattern to be saved.
 * @throws IOException exception if error while writing the output file.
 */
private void savePattern(Sequence prefix) throws IOException {
    // one more pattern found (for statistics)
    patternCount++;

    // no writer: the result is kept in memory
    if(writer == null){
        patterns.addSequence(prefix, prefix.size());
        return;
    }

    // otherwise build the line describing the pattern
    StringBuilder line = new StringBuilder();
    for(Itemset itemset : prefix.getItemsets()){
        // timestamp of the itemset
        line.append('<').append(itemset.getTimestamp()).append("> ");
        // items of the itemset
        for(ItemSimple item : itemset.getItems()){
            line.append(item.toString()).append(' ');
        }
        // itemset separator
        line.append("-1 ");
    }
    // support of the pattern (the sequence IDs themselves are not written)
    line.append(" #SUP: ").append(prefix.getSequencesID().size());

    // write the line to the file and start a new one
    writer.write(line.toString());
    writer.newLine();
}
/**
 * Method to check if a prefix has a backward-extension (see the BIDE+ article for full details).
 * For each item occurrence position i of the prefix, the i-th maximum periods of the
 * sequences containing the prefix are collected, and the prefix has a backward
 * extension if a pair found in these periods has the same support as the prefix.
 * According to the original author, all i-th maximum periods are used (instead of one
 * as in BIDE) so that the max gap constraint works with timestamps (fix of nov 2009).
 * @param prefix the current prefix
 * @return boolean true, if there is a backward extension
 */
private boolean checkBackwardExtension(Sequence prefix) {
    // for each item occurrence position of the prefix
    for(int position = 0; position < prefix.getItemOccurencesTotalCount(); position++){
        // (1) gather the maximum periods for this position
        List<PseudoSequence> maximumPeriods = new ArrayList<PseudoSequence>();
        for(PseudoSequence candidate : initialDatabase.getPseudoSequences()){
            // only the sequences containing the prefix matter
            if(!prefix.getSequencesID().contains(candidate.getId())){
                continue;
            }
            for(PseudoSequence period : candidate.getAllIthMaxPeriodOfAPrefix(prefix, position, true)){
                if(period != null){
                    maximumPeriods.add(period);
                }
            }
        }
        // (2) look for an element of the maximum periods having the same support as the prefix
        Set<Pair> pairs = findAllFrequentPairsSatisfyingC1andC2ForBackwardExtensionCheck(
                prefix, maximumPeriods, position);
        for(Pair pair : pairs){
            if(pair.getCount() == prefix.getAbsoluteSupport()){
                return true;
            }
        }
    }
    // no backward extension
    return false;
}
/**
 * This is the "backscan-pruning" strategy described in the BIDE+
 * paper to avoid extending some prefixes that are guaranteed to not
 * generate a closed pattern (see the BIDE+ paper for details).
 * As noted by the original author, it iterates with i on the elements of the
 * prefix instead of on its itemsets, but the idea is the same.
 * Always returns false when the backscan pruning is disabled.
 *
 * @param prefix the current prefix
 * @return boolean true if we should not extend the prefix
 */
private boolean checkBackScanPruning(Sequence prefix) {
    //*********************************************************************
    // VERY IMPORTANT : The backscan pruning cannot work correctly if
    // the maximum whole interval constraint (constraint C4 in Hirate & Yamana)
    // is not equal to infinity.
    // *********************************************************************
    // nothing to prune if the strategy is disabled
    if(!enableBackscanPruning){
        return false;
    }
    // for each item occurrence position of the prefix
    for(int position = 0; position < prefix.getItemOccurencesTotalCount(); position++){
        // (1) gather the semi-maximum periods for this position
        List<PseudoSequence> semimaximumPeriods = new ArrayList<PseudoSequence>();
        for(PseudoSequence candidate : initialDatabase.getPseudoSequences()){
            // skip the sequences that do not contain the prefix
            if(!prefix.getSequencesID().contains(candidate.getId())){
                continue;
            }
            PseudoSequence period = candidate.getIthSemiMaximumPeriodOfAPrefix(prefix, position, true);
            if(period != null){
                semimaximumPeriods.add(period);
            }
        }
        // (2) look for an element of the semi-maximum periods having the same support as the prefix
        for(Pair pair : findAllFrequentPairsSatisfyingC1andC2ForBackwardExtensionCheck(
                prefix, semimaximumPeriods, position)){
            if(pair.getCount() == prefix.getAbsoluteSupport()){
                // the prefix will not be closed: it should not be extended
                return true;
            }
        }
    }
    // no pruning should be done
    return false;
}
/**
 * Method to find all the pairs (items) in a list of periods that would satisfy the
 * C1, C2, C3 and C4 time constraints if they were inserted in the current prefix.
 * It is used by the backward-extension check and by the backscan pruning.
 * Note that the pairs are NOT filtered by support here: callers compare
 * the count of each pair with the support of the prefix.
 * @param prefix the current prefix
 * @param maximumPeriods a list of i-th (semi-)maximum periods; periods of the same
 *        sequence are expected to be consecutive in the list
 * @param iPeriod the variable i
 * @return A set of pairs, where a pair is an item with (1) booleans indicating if it
 * is in an itemset that is "cut" at left or right (prefix or postfix)
 * , (2) the sequence IDs where it occurs and (3) a time interval.
 */
protected Set<Pair> findAllFrequentPairsSatisfyingC1andC2ForBackwardExtensionCheck(
        Sequence prefix, List<PseudoSequence> maximumPeriods, int iPeriod) {
    // We use a Map to store the pairs (each pair is mapped to itself so that
    // the instance accumulating the sequence IDs can be retrieved).
    Map<Pair, Pair> mapPaires = new HashMap<Pair, Pair>();
    // the set of pairs that we have already seen for the current sequence ID
    // (to count each item only one time for each sequence ID)
    Set<Pair> alreadyCountedForSequenceID = new HashSet<Pair>();
    // the first period scanned for the current sequence ID
    PseudoSequence lastPeriod = null;
    // for each period
    for(PseudoSequence period : maximumPeriods){
        // If the period belongs to a sequence with a different ID, we empty the set
        // of pairs already seen. The IDs are compared (as done in
        // findAllFrequentPairsSatisfyingC1andC2) rather than the object references,
        // so that the set is kept across several periods of the same sequence.
        if(lastPeriod == null || period.getId() != lastPeriod.getId()){
            alreadyCountedForSequenceID.clear();
            lastPeriod = period;
        }
        // for each itemset in the period
        for(int i=0; i< period.size(); i++){
            // for each item
            for(int j=0; j < period.getSizeOfItemsetAt(i); j++){
                // get the item
                ItemSimple item = period.getItemAtInItemsetAt(j, i);
                // Reminder: a maximum period is a subsequence of a sequence.
                // successorInterval: the time interval between (1) the itemset of the maximum
                // period that contains the item and (2) the successor of the period
                // (period.getTimeSucessor()).
                long successorInterval = period.getTimeSucessor() - period.getAbsoluteTimeStamp(i);
                // totalTime: total time length of the prefix if we add the current item to the prefix.
                long totalTime = prefix.getTimeLength() + successorInterval;
                // predecessorInterval: the time interval between (1) the predecessor of the
                // period (period.getTimePredecessor()) and (2) the itemset of the maximum
                // period that contains the item.
                long predecessorInterval = period.getAbsoluteTimeStamp(i) - period.getTimePredecessor();
                // C1 and C2 with the successor; an interval of 0 is always accepted
                boolean checkGapSucessor
                        = (successorInterval >= minInterval && successorInterval <= maxInterval)
                        || successorInterval == 0;
                // C1 and C2 with the predecessor; an interval of 0 is always accepted.
                // If the "i" of this ith max period is 0, we don't need to check because it is
                // the case of a backward extension where the item would be added before the prefix.
                boolean checkGapPredecesseur
                        = (predecessorInterval >= minInterval && predecessorInterval <= maxInterval)
                        || iPeriod == 0 || predecessorInterval == 0;
                // C3 and C4 on the whole time length. They are only checked when the "i" of
                // this ith max period is 0; otherwise, per the original author's note, the
                // item would be added INSIDE the prefix and the check is skipped.
                boolean checkWholeInterval
                        = (totalTime <= maxWholeInterval && totalTime >= minWholeInterval)
                        || iPeriod != 0;
                // If all the time constraints (C1, C2, C3 and C4) are met
                if(checkGapSucessor && checkGapPredecesseur && checkWholeInterval){
                    // create a new pair with the current item and indicate if
                    // the item would be part of an itemset that is cut at right or at left,
                    // and the time interval with the successor
                    Pair pair = new Pair(successorInterval, period.isCutAtRight(i), period.isCutAtLeft(i), item);
                    // if the pair was not already counted for that sequence
                    if(!alreadyCountedForSequenceID.contains(pair)){
                        // check if there is already a pair for that item
                        Pair oldpair = mapPaires.get(pair);
                        if(oldpair == null){
                            // if there was no pair already, put the new one
                            mapPaires.put(pair, pair);
                        }else{
                            // otherwise use the old one
                            pair = oldpair;
                        }
                        // remember that we have seen this pair for that sequence
                        alreadyCountedForSequenceID.add(pair);
                        // add the sequence ID to the pair
                        pair.getSequencesID().add(period.getId());
                    }
                }
            }
        }
    }
    // return the pairs
    return mapPaires.keySet();
}
/**
 * For each item, calculate the IDs of the sequences containing that item.
 * Within consecutive sequences sharing the same ID, an item ID is only
 * recorded the first time it is met.
 * @param contexte the current sequence database
 * @return Map of items to the IDs of the sequences that contain each item
 */
private Map<ItemSimple, Set<Integer>> findSequencesContainingItems(SequenceDatabase contexte) {
    // Key : item   Value : the set of IDs of the sequences where the item appears
    Map<ItemSimple, Set<Integer>> sequenceIdsByItem = new HashMap<ItemSimple, Set<Integer>>();
    // the item IDs already seen for the current sequence ID
    Set<Integer> itemIdsSeen = new HashSet<Integer>();
    // the first sequence scanned with the current sequence ID
    Sequence previous = null;

    for(Sequence current : contexte.getSequences()){
        // when the sequence ID changes, forget the items seen so far
        boolean sameIdAsPrevious = previous != null && previous.getId() == current.getId();
        if(!sameIdAsPrevious){
            itemIdsSeen.clear();
            previous = current;
        }
        for(Itemset itemset : current.getItemsets()){
            for(ItemSimple item : itemset.getItems()){
                // add() returns false if this item ID was already seen for this sequence
                if(!itemIdsSeen.add(item.getId())){
                    continue;
                }
                // fetch (or create) the set of sequence IDs of the item
                Set<Integer> ids = sequenceIdsByItem.get(item);
                if(ids == null){
                    ids = new HashSet<Integer>();
                    sequenceIdsByItem.put(item, ids);
                }
                // record the current sequence ID
                ids.add(current.getId());
            }
        }
    }
    return sequenceIdsByItem;
}
/**
 * Method to recursively grow a given sequential pattern.
 * Each frequent pair of the projected database is appended to the prefix, and the
 * resulting prefix is processed by projectionPair() if it respects the C4 constraint.
 * @param prefix the current sequential pattern that we want to try to grow
 * @param k the size of the prefix in terms of item count.
 * @param database the current projected sequence database
 * @return the largest value returned by projectionPair() for the extensions
 *         of this prefix, or 0 if there was none
 * @throws IOException exception if there is an error writing to the output file
 */
private int projection(Sequence prefix, int k, PseudoSequenceDatabase database) throws IOException {
    int best = 0;
    // A pair is an item with booleans indicating if it appears in an itemset that
    // is cut, a timestamp, and the sequence IDs where it appears.
    Set<Pair> candidates = findAllFrequentPairsSatisfyingC1andC2(prefix, database.getPseudoSequences());
    for(Pair pair : candidates){
        // infrequent pairs are ignored
        if(pair.getCount() < minsuppRelative){
            continue;
        }
        // An item of a postfix (an itemset cut at right) joins the last itemset of
        // the prefix; otherwise the item is appended as a new itemset.
        Sequence newPrefix = pair.isPostfix()
                ? appendItemToPrefixOfSequence(prefix, pair.getItem())
                : appendItemToSequence(prefix, pair.getItem(), pair.getTimestamp());
        // C4 check
        if(!isMaxWholeIntervalRespected(newPrefix)){
            continue;
        }
        // recursive call generating the patterns starting with prefix + item,
        // while keeping track of the largest support found
        best = Math.max(best, projectionPair(newPrefix, pair, prefix, database, k));
    }
    return best;
}
/**
 * Check if the constraints C1 (minimum interval) and C2 (maximum interval)
 * are respected by a time interval.
 * @param timeInterval a time interval
 * @return true if minInterval <= timeInterval <= maxInterval, false otherwise.
 */
private boolean isTheMinAndMaxIntervalRespected(long timeInterval){
    final boolean atLeastMin = timeInterval >= minInterval; // C1
    final boolean atMostMax = timeInterval <= maxInterval;  // C2
    return atLeastMin && atMostMax;
}
/**
 * Check if the constraint C4 (maximum whole interval) is respected by a seq. pattern,
 * i.e. if the timestamp of its last itemset does not exceed maxWholeInterval.
 * @param sequence a sequential pattern
 * @return true, if yes. false, if no.
 */
private boolean isMaxWholeIntervalRespected(Sequence sequence){
    final int lastIndex = sequence.size() - 1;
    return sequence.get(lastIndex).getTimestamp() <= maxWholeInterval;
}
/**
 * Check if the constraint C3 (minimum whole interval) is respected by a seq. pattern,
 * i.e. if the timestamp of its last itemset is at least minWholeInterval.
 * @param sequence a sequential pattern
 * @return true, if yes. false, if no.
 */
private boolean isMinWholeIntervalRespected(Sequence sequence){
    final int lastIndex = sequence.size() - 1;
    return sequence.get(lastIndex).getTimestamp() >= minWholeInterval;
}
/**
 * Do a database projection of a sequence database with a pair, then try to grow
 * and save the prefix(es) obtained for each projected database.
 * @param newPrefix the new prefix obtained by appending the pair to the current prefix
 * @param paire the pair
 * @param oldPrefix the current prefix
 * @param database the database to project
 * @param k the value received by projection() for the current prefix (k+1 is passed on)
 * @return the largest support of the prefixes built here that respect
 *         the C3 constraint, or 0 if there is none
 * @throws IOException if error writing to output file
 */
private int projectionPair(Sequence newPrefix, Pair paire, Sequence oldPrefix, PseudoSequenceDatabase database, int k) throws IOException {
    // Create the projected databases (because of the clustering, there can be more
    // than one, unlike the regular PrefixSpan algorithm). A valued item is projected
    // with the method that uses clustering, other items with the simple projection.
    final ItemSimple projectionItem = paire.getItem();
    PseudoSequenceDatabase[] projectedContexts = (projectionItem instanceof ItemValued)
            ? buildProjectedContextItemValued((ItemValued)projectionItem, database, paire.isPostfix(), paire.getTimestamp())
            : buildProjectedDatabase(projectionItem, database, paire.isPostfix(), paire.getTimestamp());

    // largest support of the patterns obtained by growing newPrefix
    int maxSupport = 0;
    for(PseudoSequenceDatabase projectedContext : projectedContexts){
        Cluster cluster = projectedContext.getCluster();
        Sequence prefix;
        if(cluster == null){
            // no cluster: clone the new prefix and give it the sequence IDs of the pair
            prefix = newPrefix.cloneSequence();
            prefix.setSequencesID(paire.getSequencesID());
        }else{
            // create the item corresponding to this cluster
            ItemValued item2 = new ItemValued(cluster.getItemId(),
                    cluster.getaverage(),
                    cluster.getLower(),
                    cluster.getHigher());
            // append it to the old prefix, in the last itemset if the item used
            // for the projection was found in a postfix, as a new itemset otherwise
            prefix = paire.isPostfix()
                    ? appendItemToPrefixOfSequence(oldPrefix, item2)
                    : appendItemToSequence(oldPrefix, item2, paire.getTimestamp());
            // the prefix appears in the sequences of this cluster
            prefix.setSequencesID(cluster.getSequenceIDs());
        }

        // largest support returned by the recursive extension of this prefix
        int maxSuccessor = 0;
        // When mining closed patterns, the prefix is not extended if the
        // backscan pruning applies (see BIDE paper for details).
        boolean prunedByBackscan = findClosedPatterns && checkBackScanPruning(prefix);
        if(!prunedByBackscan){
            maxSuccessor = projection(prefix, k+1, projectedContext);
        }

        // the prefix counts only if the C3 constraint is respected
        if(!isMinWholeIntervalRespected(prefix)){
            continue;
        }
        // if the user wants closed patterns, check the forward extension...
        boolean noForwardSIExtension = !findClosedPatterns
                || prefix.getAbsoluteSupport() != maxSuccessor;
        // ... and the backward extension of the current prefix
        boolean noBackwardExtension = !findClosedPatterns || !checkBackwardExtension(prefix);
        if(noForwardSIExtension && noBackwardExtension){
            // add the sequential pattern to the set of patterns found
            savePattern(prefix);
        }
        // record the highest support found
        maxSupport = Math.max(maxSupport, prefix.getAbsoluteSupport());
    }
    return maxSupport;
}
/**
 * Method to find all the pairs that could extend the current prefix in a sequence database
 * and such that the resulting prefix would respect the C1 and C2 constraints.
 * For k>1. The pairs are not filtered by support here: each returned pair holds
 * the IDs of the sequences where it was found.
 * @param prefixe the current prefix (not read by this method)
 * @param database the current sequence database
 * @return the set of pairs
 */
protected Set<Pair> findAllFrequentPairsSatisfyingC1andC2(Sequence prefixe, List<PseudoSequence> database) {
    // each pair is mapped to itself, to retrieve the instance holding the sequence IDs
    Map<Pair, Pair> pairIndex = new HashMap<Pair, Pair>();
    // the pairs already counted for the current sequence ID
    // (so that the same element is not counted twice)
    Set<Pair> countedForCurrentId = new HashSet<Pair>();
    // the first sequence scanned with the current sequence ID
    PseudoSequence previous = null;

    for(PseudoSequence sequence : database){
        // a sequence with another ID than the previous one: start counting afresh
        if(previous == null || sequence.getId() != previous.getId()){
            countedForCurrentId.clear();
            previous = sequence;
        }
        // for each item of each itemset
        for(int i = 0; i < sequence.size(); i++){
            for(int j = 0; j < sequence.getSizeOfItemsetAt(i); j++){
                ItemSimple item = sequence.getItemAtInItemsetAt(j, i);
                // C1 and C2 are checked on the timestamp of the itemset,
                // unless the itemset is cut at left
                boolean acceptable = isTheMinAndMaxIntervalRespected(sequence.getTimeStamp(i))
                        || sequence.isCutAtLeft(i);
                if(!acceptable){
                    continue;
                }
                // the pair corresponding to this item
                Pair candidate = new Pair(sequence.getTimeStamp(i), sequence.isCutAtRight(i),
                        sequence.isCutAtLeft(i), item);
                // already processed for this sequence ID
                if(countedForCurrentId.contains(candidate)){
                    continue;
                }
                // reuse the pair stored in the map, or store the new one
                Pair canonical = pairIndex.get(candidate);
                if(canonical == null){
                    pairIndex.put(candidate, candidate);
                    canonical = candidate;
                }
                // remember that the pair is processed, and where it was found
                countedForCurrentId.add(canonical);
                canonical.getSequencesID().add(sequence.getId());
            }
        }
    }
    return pairIndex.keySet();
}
/**
 * Do a database projection on a sequence database with an item with a given timestamp. This
 * method is for the case of an item that is not a valued item.
 * Contrarily to PrefixSpan, a projected sequence is created for each occurrence
 * of the item (because of the timestamps), and each projected sequence is created
 * with the absolute timestamp of the itemset where the occurrence was found.
 * @param item the item
 * @param contexte the database
 * @param inSuffix the value that isCutAtLeft() must have for the itemsets considered
 * @param timestamp the timestamp that the itemsets must have, or -1 to accept any timestamp
 * @return an array containing a single projected sequence database
 */
private PseudoSequenceDatabase[] buildProjectedDatabase(ItemSimple item,
        PseudoSequenceDatabase contexte, boolean inSuffix, long timestamp) {
    // the projected database
    PseudoSequenceDatabase projectedDatabase = new PseudoSequenceDatabase();

    for(PseudoSequence sequence : contexte.getPseudoSequences()){
        for(int i = 0; i < sequence.size(); i++){
            // skip the itemsets whose timestamp does not match
            if(timestamp != -1 && timestamp != sequence.getTimeStamp(i)){
                continue;
            }
            // position of the item in the itemset (-1 if absent)
            int index = sequence.indexOf(i, item.getId());
            // the itemset must contain the item, and be cut at left
            // if and only if the item is in a suffix
            if(index == -1 || sequence.isCutAtLeft(i) != inSuffix){
                continue;
            }

            PseudoSequence projected;
            if(index != sequence.getSizeOfItemsetAt(i) - 1){
                // the item is not the last of its itemset:
                // the projection starts right after it, in the same itemset
                projected = new PseudoSequence(sequence.getAbsoluteTimeStamp(i),
                        sequence, i, index + 1);
            }else if(i != sequence.size() - 1){
                // the item ends its itemset, which is not the last one:
                // the projection starts with the next itemset
                projected = new PseudoSequence(sequence.getAbsoluteTimeStamp(i),
                        sequence, i + 1, 0);
            }else{
                // the item is the very last one of the sequence: nothing remains
                continue;
            }
            // only non-empty projected sequences are kept
            if(projected.size() > 0){
                projectedDatabase.addSequence(projected);
            }
        }
    }
    return new PseudoSequenceDatabase[]{projectedDatabase};
}
/**
 * Projects a pseudo-sequence database on a valued item (an item having a value) and
 * then splits the projection according to clusters of the item's values.
 * <p>
 * An itemset is a match when {@code timestamp} is -1 or equals the itemset's
 * timestamp, the itemset contains an item with the same id as {@code item}, and the
 * itemset's "cut at left" flag equals {@code inSuffix}. The matched occurrence of the
 * item is recorded in exactly one of two lists:
 * <ul>
 * <li><b>removed items</b>: occurrences whose projected pseudo-sequence was added to
 *     the projected database. This list is kept index-aligned with the projected
 *     database, because {@code createSequenceDatabasesByClusters} pairs the i-th
 *     item with the i-th pseudo-sequence.</li>
 * <li><b>destroyed items</b>: occurrences for which nothing was added, either because
 *     the item is the last item of the last itemset or because the projected
 *     pseudo-sequence reports a size of 0.</li>
 * </ul>
 * Both lists and the projected database are then passed to {@code breakInClusters}.
 *
 * @param item the item used for the projection
 * @param database the database to project
 * @param inSuffix the value the "cut at left" flag of a matching itemset must have
 * @param timestamp the timestamp a matching itemset must have, or -1 to accept any
 * @return the pseudo-sequence database(s) returned by {@code breakInClusters}
 */
private PseudoSequenceDatabase[] buildProjectedContextItemValued(ItemValued item,
        PseudoSequenceDatabase database, boolean inSuffix, long timestamp) {
    // The projected database.
    PseudoSequenceDatabase sequenceDatabase = new PseudoSequenceDatabase();
    // Occurrences whose projection was added; entry i belongs to projected sequence i.
    List<ItemValued> removedItems = new ArrayList<ItemValued>();
    // Occurrences whose projection was empty and therefore not added.
    List<ItemValued> removedItemsDestroyed = new ArrayList<ItemValued>();

    for (PseudoSequence sequence : database.getPseudoSequences()) {
        for (int i = 0; i < sequence.size(); i++) {
            // Timestamp filter (-1 disables it).
            if (timestamp != -1 && timestamp != sequence.getTimeStamp(i)) {
                continue;
            }
            // The itemset must contain the item and have the expected cut state.
            int index = sequence.indexOf(i, item.getId());
            if (index == -1 || sequence.isCutAtLeft(i) != inSuffix) {
                continue;
            }

            ItemValued occurrence = (ItemValued) sequence.getItemAtInItemsetAt(index, i);
            boolean lastItemOfItemset = (index == sequence.getSizeOfItemsetAt(i) - 1);

            if (lastItemOfItemset && i == sequence.size() - 1) {
                // Last item of the last itemset: the projection is empty, but the
                // occurrence must still be remembered for the clustering step.
                removedItemsDestroyed.add(occurrence);
                continue;
            }

            // Continue inside the same itemset when items remain in it, otherwise
            // start at the next itemset.
            PseudoSequence newSequence;
            if (lastItemOfItemset) {
                newSequence = new PseudoSequence(sequence.getAbsoluteTimeStamp(i),
                        sequence, i + 1, 0);
            } else {
                newSequence = new PseudoSequence(sequence.getAbsoluteTimeStamp(i),
                        sequence, i, index + 1);
            }

            if (newSequence.size() > 0) {
                sequenceDatabase.addSequence(newSequence);
                removedItems.add(occurrence);
            } else {
                // The sequence is not added, so the occurrence must not enter
                // removedItems either: that would shift every later item against
                // its pseudo-sequence. It is an empty projection, i.e. "destroyed".
                removedItemsDestroyed.add(occurrence);
            }
        }
    }

    // All projected sequences exist now; try to separate them into several
    // databases according to the values attached to the item occurrences.
    return breakInClusters(item, database, sequenceDatabase, removedItems, removedItemsDestroyed);
}
/**
 * Separates a projected pseudo-sequence database into one or several databases
 * according to the values of the item occurrences used for the projection, and
 * attaches to each resulting database the cluster describing those values.
 * <p>
 * Three outcomes are possible:
 * <ul>
 * <li>both occurrence lists are empty: the projected database is returned alone and
 *     no cluster is attached to it;</li>
 * <li>the projected database holds at least {@code 2 * minsuppRelative} distinct
 *     sequence ids: {@code createSequenceDatabasesByClusters} decides the split;</li>
 * <li>otherwise: the projected database is returned alone, with a single cluster
 *     built from both occurrence lists.</li>
 * </ul>
 * In the last two cases the sequence ids of every cluster are recomputed from
 * {@code database} (the database before projection) rather than from the projected
 * database.
 *
 * @param item the item used for the pseudo-projection
 * @param database the sequence database before projection
 * @param sequenceDatabase the projected sequence database
 * @param removedItems the item occurrences whose projection is in {@code sequenceDatabase}
 * @param removedItemsDestroyed the item occurrences whose projection was empty
 * @return the resulting database(s); never {@code null}
 */
private PseudoSequenceDatabase[] breakInClusters(ItemValued item,
        PseudoSequenceDatabase database, PseudoSequenceDatabase sequenceDatabase,
        List<ItemValued> removedItems, List<ItemValued> removedItemsDestroyed) {
    // The item never occurred: nothing to cluster.
    if (removedItems.isEmpty() && removedItemsDestroyed.isEmpty()) {
        return new PseudoSequenceDatabase[] { sequenceDatabase };
    }

    final PseudoSequenceDatabase[] result;
    // A split can only yield two frequent databases if there are at least
    // twice as many sequences as the minimum support.
    boolean worthSplitting =
            sequenceDatabase.getSequenceIDs().size() >= (minsuppRelative * 2);
    if (!worthSplitting) {
        // Keep a single database described by one cluster of all occurrences.
        result = new PseudoSequenceDatabase[] { sequenceDatabase };
        Cluster single = new Cluster(removedItems, removedItemsDestroyed);
        single.addItems(removedItemsDestroyed);
        single.computeHigherAndLower();
        sequenceDatabase.setCluster(single);
    } else {
        result = createSequenceDatabasesByClusters(sequenceDatabase, removedItems);
    }

    // Extra step: the support of each cluster is counted on the sequences of the
    // initial database (using the projected one would be wrong). This could
    // probably be merged with the projection loop as an optimisation.
    findSequencesContainingClusters(database, result, item);
    return result;
}
/**
 * Recomputes the sequence ids attached to the cluster of each given database.
 * <p>
 * The sequence-id set of every cluster is first replaced by a new empty set. Then
 * each pseudo-sequence of {@code database} is scanned; for every item whose id equals
 * the id of {@code item}, the first cluster for which {@code containsItem} is true
 * (see {@code findClusterContainingItem}) receives the id of that pseudo-sequence.
 *
 * @param database the database whose pseudo-sequences are scanned
 * @param sequenceDatabases the databases whose clusters are updated; every element
 *        and its cluster are dereferenced without a null check
 * @param item the valued item whose occurrences are looked up
 */
private void findSequencesContainingClusters(PseudoSequenceDatabase database, PseudoSequenceDatabase[] sequenceDatabases, ItemValued item) {
    // Collect the cluster of each database and reset its sequence ids.
    final int databaseCount = sequenceDatabases.length;
    Cluster[] clusters = new Cluster[databaseCount];
    for (int d = 0; d < databaseCount; d++) {
        clusters[d] = sequenceDatabases[d].getCluster();
        clusters[d].setSequenceIDs(new HashSet<Integer>());
    }

    // Clusters already credited for the current sequence id. The set is cleared
    // only when the id differs from the id of the previously scanned sequence,
    // so it only helps when sequences sharing an id are adjacent.
    Set<Cluster> creditedClusters = new HashSet<Cluster>();
    PseudoSequence previous = null;

    for (PseudoSequence sequence : database.getPseudoSequences()) {
        boolean sameIdAsPrevious = previous != null && previous.getId() == sequence.getId();
        if (!sameIdAsPrevious) {
            creditedClusters.clear();
            previous = sequence;
        }

        for (int itemsetPos = 0; itemsetPos < sequence.size(); itemsetPos++) {
            for (int itemPos = 0; itemPos < sequence.getSizeOfItemsetAt(itemsetPos); itemPos++) {
                ItemSimple candidate = sequence.getItemAtInItemsetAt(itemPos, itemsetPos);
                // Only occurrences of the projection item are of interest.
                if (candidate.getId() != item.getId()) {
                    continue;
                }
                Cluster owner = findClusterContainingItem(clusters, (ItemValued) candidate);
                if (owner == null || creditedClusters.contains(owner)) {
                    continue;
                }
                // Credit the cluster with this sequence id, once per id.
                owner.getSequenceIDs().add(sequence.getId());
                creditedClusters.add(owner);
            }
        }
    }
}
/**
 * Looks up, in array order, the first cluster that reports containing a given item.
 *
 * @param clusters the clusters to search; elements are dereferenced without a null check
 * @param item the item to look for
 * @return the first cluster whose {@code containsItem(item)} is true, or
 *         {@code null} if there is none
 */
private Cluster findClusterContainingItem(Cluster[] clusters, ItemValued item) {
    Cluster match = null;
    for (int c = 0; c < clusters.length; c++) {
        if (clusters[c].containsItem(item)) {
            match = clusters[c];
            break;
        }
    }
    return match;
}
/**
 * Distributes the pseudo-sequences of a database over several databases, one per
 * cluster found by clustering the valued items.
 * <p>
 * {@code items} and the pseudo-sequences of {@code database} are paired by position:
 * the i-th item is the occurrence that produced the i-th pseudo-sequence, so both
 * must have the same length. Each item first receives the id of its pseudo-sequence,
 * then the clustering algorithm is run on the items. A pseudo-sequence whose item
 * refers to a cluster that is not in the list returned by the clustering algorithm
 * is dropped.
 * <p>
 * Only databases that received at least one pseudo-sequence are returned, so the
 * result never contains {@code null} elements (callers dereference every element).
 *
 * @param database the projected sequence database to split
 * @param items the valued items to cluster, position-aligned with the
 *        pseudo-sequences of {@code database}
 * @return one database per cluster that received at least one pseudo-sequence,
 *         each with its cluster attached; possibly an empty array
 */
private PseudoSequenceDatabase[] createSequenceDatabasesByClusters(
        PseudoSequenceDatabase database, List<ItemValued> items) {
    // Attach to each item the id of its sequence, so that the clustering
    // algorithm can check that clusters are frequent and that items of the same
    // sequence are not counted more than once for one sequence id.
    for (int i = 0; i < items.size(); i++) {
        items.get(i).setSequenceID(database.getPseudoSequences().get(i).getId());
    }

    // Cluster the items.
    List<Cluster> clusters = algoClustering.runAlgorithm(items);

    // One slot per cluster, filled lazily while assigning the sequences.
    PseudoSequenceDatabase[] byCluster = new PseudoSequenceDatabase[clusters.size()];
    for (int i = 0; i < database.size(); i++) {
        ItemValued item = items.get(i);
        int clusterIndex = clusters.indexOf(item.getCluster());
        if (clusterIndex == -1) {
            // The item's cluster is not among the returned clusters.
            continue;
        }
        if (byCluster[clusterIndex] == null) {
            byCluster[clusterIndex] = new PseudoSequenceDatabase();
            byCluster[clusterIndex].setCluster(clusters.get(clusterIndex));
        }
        byCluster[clusterIndex].addSequence(database.getPseudoSequences().get(i));
    }

    // A cluster that received no sequence leaves its slot null; returning such a
    // slot would make callers fail with a NullPointerException, so keep only the
    // populated databases.
    List<PseudoSequenceDatabase> populated =
            new ArrayList<PseudoSequenceDatabase>(byCluster.length);
    for (PseudoSequenceDatabase candidate : byCluster) {
        if (candidate != null) {
            populated.add(candidate);
        }
    }
    return populated.toArray(new PseudoSequenceDatabase[populated.size()]);
}
/**
 * Returns a copy of a sequence extended with a new itemset that contains a single
 * item. The given sequence itself is not modified; only its clone is.
 * <p>
 * The timestamp of the new itemset is {@code timestamp} plus the timestamp of the
 * last itemset of the sequence. The sequence must contain at least one itemset.
 *
 * @param prefix the sequence to extend
 * @param item the item placed in the new itemset
 * @param timestamp the timestamp of the item, relative to the last itemset of {@code prefix}
 * @return the extended copy
 */
private Sequence appendItemToSequence(Sequence prefix, ItemSimple item, long timestamp) {
    Sequence extended = prefix.cloneSequence();
    // Offset: the timestamp of the itemset that is currently the last one.
    Itemset lastItemset = extended.get(extended.size() - 1);
    long offset = lastItemset.getTimestamp();
    // Create the new itemset, shifted by the offset.
    extended.addItemset(new Itemset(item, timestamp + offset));
    return extended;
}
/**
 * Returns a copy of a sequence in which a given item has been added to the last
 * itemset. The given sequence itself is not modified; only its clone is. The
 * sequence must contain at least one itemset.
 *
 * @param prefix the sequence to extend
 * @param item the item added to the last itemset
 * @return the extended copy
 */
private Sequence appendItemToPrefixOfSequence(Sequence prefix, ItemSimple item) {
    Sequence extended = prefix.cloneSequence();
    int lastPosition = extended.size() - 1;
    extended.get(lastPosition).addItem(item);
    return extended;
}
/**
 * Prints statistics about the algorithm execution to System.out: the elapsed time
 * ({@code endTime - startTime}) and the number of patterns found.
 */
public void printStatistics() {
    StringBuilder report = new StringBuilder(200)
            .append("============= Algorithm - STATISTICS =============\n Total time ~ ")
            .append(endTime - startTime)
            .append(" ms\n")
            .append(" Frequent sequences count : ")
            .append(patternCount)
            .append('\n')
            .append("===================================================\n");
    System.out.println(report.toString());
}
/**
 * Prints to System.out the same statistics as {@code printStatistics}, followed by
 * the sequential patterns found, as rendered by {@code patterns.toString(databaseSize)}.
 *
 * @param databaseSize the size of the database (a number of sequences)
 */
public void printResult(int databaseSize) {
    StringBuilder report = new StringBuilder(200)
            .append("============= Algorithm - STATISTICS =============\n Total time ~ ")
            .append(endTime - startTime)
            .append(" ms\n")
            .append(" Frequent sequences count : ")
            .append(patternCount)
            .append('\n')
            .append(patterns.toString(databaseSize))
            .append("===================================================\n");
    System.out.println(report.toString());
}
/**
 * Returns the minimum support threshold in its {@code double} form
 * (presumably the percentage supplied by the user; confirm where the field is set).
 *
 * @return the value of the {@code minsupp} field
 */
public double getMinSupp() {
    return this.minsupp;
}
/**
 * Returns the minimum support threshold in its integer form, i.e. the value that
 * sequence counts are compared against.
 *
 * @return the value of the {@code minsuppRelative} field
 */
public int getMinsuppRelative() {
    return this.minsuppRelative;
}
}