package ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan_with_strings; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import ca.pfv.spmf.input.sequence_database_list_strings.Sequence; import ca.pfv.spmf.input.sequence_database_list_strings.SequenceDatabase; import ca.pfv.spmf.tools.MemoryLogger; /*** * This is an implementation of the PrefixSpan algorithm by Pei et al. 2001 modfied to take * sequences of strings as input instead of sequences of integers. * <br/><br/> * * NOTE: This implementation saves the pattern to a file as soon * as they are found or can keep the pattern into memory if no output path * is provided to the runAlgorithm() method. <br/><br/> * In future a version of SPMF, it is planned to remove this package and to provide a more general * mechanism for handling strings in sequences that would work for all algorithms * that take sequences as input. But this has not been done yet. * @see Sequence @see SequenceDatabase @see SequentialPattern @see SequentialPatterns @see Pair @see PseudoSequence * @author Philippe Fournier-Viger */ public class AlgoPrefixSpan_with_Strings{ // for statistics private long startTime; private long endTime; // the number of pattern found private int patternCount; // absolute minimum support private int minsuppAbsolute; // writer to write output file BufferedWriter writer = null; // The sequential patterns that are found // (if the user want to keep them into memory) private SequentialPatterns patterns = null; /** * Default constructor */ public AlgoPrefixSpan_with_Strings(){ } /** * Run the algorithm * @param database : a sequence database * @param minsup : the minimum support as an integer * @param outputFilePath : the path of the output file to save the result * or null if you want the result to be saved into memory * @return return the result, if saved into memory, otherwise null * @throws IOException exception if error while writing the file */ public SequentialPatterns runAlgorithm(SequenceDatabase database, String outputFilePath, int minsup) throws IOException { // initialize variables for statistics patternCount =0; MemoryLogger.getInstance().reset(); // to check the memory usage // keep the minimum support because we will need it this.minsuppAbsolute = minsup; // save the start time startTime = System.currentTimeMillis(); // run the algorithm prefixSpan(database, outputFilePath); // save the end time endTime = System.currentTimeMillis(); // close the output file if the result was saved to a file if(writer != null){ writer.close(); } return patterns; } /** * This is the main method for the PrefixSpan algorithm that is called * to start the algorithm * @param outputFilePath an output file path if the result should be saved to a file * or null if the result should be saved to memory. * @param database a sequence database * @throws IOException exception if an error while writing the output file */ private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException{ // if the user want to keep the result into memory if(outputFilePath == null){ writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); }else{ // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // We have to scan the database to find all frequent patterns of size 1. // We note the sequences in which these patterns appear. Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE ITON A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. (OPTIMIZATION : OCTOBER-08 ) // Create a list of pseudosequence List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>(); // for each sequence in the database for(Sequence sequence : database.getSequences()){ // remove infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if(optimizedSequence.size() != 0){ // if the size is > 0, create a pseudo sequence with this sequence initialContext.add(new PseudoSequence(optimizedSequence, 0, 0)); } } // For each item for(Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()){ // if the item is frequent (has a support >= minsup) if(entry.getValue().size() >= minsuppAbsolute){ // if the item is frequent // build the projected context String item = entry.getKey(); List<PseudoSequence> projectedContext = buildProjectedContext(item, initialContext, false); // Create the prefix for the projected context. SequentialPattern prefix = new SequentialPattern(0); prefix.addItemset(new Itemset(item)); prefix.setSequencesID(entry.getValue()); // The prefix is a frequent sequential pattern. // We save it in the result. savePattern(prefix); // we found a sequence. // Recursive call ! recursion(prefix, projectedContext); } } } /** * This method saves a sequential pattern to the output file or * in memory, depending on if the user provided an output file path or not * when he launched the algorithm * @param prefix the pattern to be saved. * @throws IOException exception if error while writing the output file. */ private void savePattern(SequentialPattern prefix) throws IOException { // increase the number of pattern found for statistics purposes patternCount++; // if the result should be saved to a file if(writer != null){ StringBuilder r = new StringBuilder(""); for(Itemset itemset : prefix.getItemsets()){ // r.append('('); for(String item : itemset.getItems()){ String string = item.toString(); r.append(string); r.append(' '); } r.append("-1 "); } // // // print the list of Pattern IDs that contains this pattern. // if(prefix.getSequencesID() != null){ // r.append("SID: "); // for(Integer id : prefix.getSequencesID()){ // r.append(id); // r.append(' '); // } // } r.append(" #SUP: "); r.append(prefix.getSequencesID().size()); writer.write(r.toString()); writer.newLine(); }// otherwise the result is kept into memory else{ patterns.addSequence(prefix, prefix.size()); } } /** * For each item, calculate the sequence id of sequences containing that item * @param database the current sequence database * @return Map of items to sequence IDs that contains each item */ private Map<String, Set<Integer>> findSequencesContainingItems(SequenceDatabase contexte) { // We use a map to store the sequence IDs where an item appear // Key : item Value : a set of sequence IDs Map<String, Set<Integer>> mapSequenceID = new HashMap<String, Set<Integer>>(); // pour conserver les ID des s�quences: <Id Item, Set d'id de s�quences> // for each sequence in the current database for(Sequence sequence : contexte.getSequences()){ // for each itemset in this sequence for(List<String> itemset : sequence.getItemsets()){ // for each item for(String item : itemset){ // get the set of sequence IDs for this item until now Set<Integer> sequenceIDs = mapSequenceID.get(item); if(sequenceIDs == null){ // if the set does not exist, create one sequenceIDs = new HashSet<Integer>(); mapSequenceID.put(item, sequenceIDs); } // add the sequence ID of the current sequence to the // set of sequences IDs of this item sequenceIDs.add(sequence.getId()); // } } } } return mapSequenceID; } /** * Create a projected database by pseudo-projection * @param item The item to use to make the pseudo-projection * @param context The current database. * @param inSuffix This boolean indicates if the item "item" is part of a suffix or not. * @return the projected database. */ private List<PseudoSequence> buildProjectedContext(String item, List<PseudoSequence> database, boolean inSuffix) { // We create a new projected database List<PseudoSequence> sequenceDatabase = new ArrayList<PseudoSequence>(); // for each sequence in the database received as parameter for(PseudoSequence sequence : database){ // for each sequence for(int i =0; i< sequence.size(); i++){ // for each item of the sequence // check if the itemset contains the item that we use for the projection int index = sequence.indexOf(i, item); // if it does not, and the current item is part of a suffix if inSuffix is true // and vice-versa if(index != -1 && sequence.isPostfix(i) == inSuffix){ // if this is not the last item of the itemset of this sequence if(index != sequence.getSizeOfItemsetAt(i)-1){ // if this is not the last item of the itemset // create a new pseudo sequence PseudoSequence newSequence = new PseudoSequence( sequence, i, index+1); if(newSequence.size() >0){ sequenceDatabase.add(newSequence); } }else if ((i != sequence.size()-1)){// if this is not the last itemset of the sequence // create a new pseudo sequence PseudoSequence newSequence = new PseudoSequence( sequence, i+1, 0); if(newSequence.size() >0){ // if the size of this pseudo sequence is greater than 0 // add it to the projected database. sequenceDatabase.add(newSequence); } } } } } return sequenceDatabase; // return the projected database } /** * Method to recursively grow a given sequential pattern. * @param prefix the current sequential pattern that we want to try to grow * @param database the current projected sequence database * @throws IOException exception if there is an error writing to the output file */ private void recursion(SequentialPattern prefix, List<PseudoSequence> database) throws IOException { // find frequent items of size 1 in the current projected database. Set<Pair> pairs = findAllFrequentPairs(prefix, database); // For each pair found (a pair is an item with a boolean indicating if it // appears in an itemset that is cut (a postfix) or not, and the sequence IDs // where it appears in the projected database). for(Pair pair : pairs){ // if the item is frequent in the current projected database if(pair.getCount() >= minsuppAbsolute){ // create the new postfix by appending this item to the prefix SequentialPattern newPrefix; // if the item is part of a postfix if(pair.isPostfix()){ // we append it to the last itemset of the prefix newPrefix = appendItemToPrefixOfSequence(prefix, pair.getItem()); }else{ // else, we append it as a new itemset to the sequence newPrefix = appendItemToSequence(prefix, pair.getItem()); } // build the projected database with this item List<PseudoSequence> projectedDB = buildProjectedContext(pair.getItem(), database, pair.isPostfix()); newPrefix.setSequencesID(pair.getSequencesID()); // save the pattern savePattern(newPrefix); // make a recursive call recursion(newPrefix, projectedDB); } } MemoryLogger.getInstance().checkMemory(); } /** * Method to find all frequent items in a projected sequence database * @param sequences the set of sequences * @return A list of pairs, where a pair is an item with (1) a boolean indicating if it * is in an itemset that is "cut" and (2) the sequence IDs where it occurs. */ protected Set<Pair> findAllFrequentPairs(SequentialPattern prefix, List<PseudoSequence> sequences){ // We use a Map the store the pairs. Map<Pair, Pair> mapPairs = new HashMap<Pair, Pair>(); // for each sequence for(PseudoSequence sequence : sequences){ // for each itemset for(int i=0; i< sequence.size(); i++){ // for each item for(int j=0; j < sequence.getSizeOfItemsetAt(i); j++){ String item = sequence.getItemAtInItemsetAt(j, i); // create the pair corresponding to this item Pair paire = new Pair(sequence.isPostfix(i), item); // false is ok? // get the pair object store in the map if there is one already Pair oldPaire = mapPairs.get(paire); // if there is no pair object yet if(oldPaire == null){ // store the pair object that we created mapPairs.put(paire, paire); }else{ // otherwise use the old one paire = oldPaire; } // record the current sequence id for that pair paire.getSequencesID().add(sequence.getId()); } } } MemoryLogger.getInstance().checkMemory(); // check the memory for statistics. // return the map of pairs return mapPairs.keySet(); } /** * This method creates a copy of the sequence and add a given item * as a new itemset to the sequence. * It sets the support of the sequence as the support of the item. * @param prefix the sequence * @param item the item * @return the new sequence */ private SequentialPattern appendItemToSequence(SequentialPattern prefix, String item) { SequentialPattern newPrefix = prefix.cloneSequence(); newPrefix.addItemset(new Itemset(item)); return newPrefix; } /** * This method creates a copy of the sequence and add a given item * to the last itemset of the sequence. * It sets the support of the sequence as the support of the item. * @param prefix the sequence * @param item the item * @return the new sequence */ private SequentialPattern appendItemToPrefixOfSequence(SequentialPattern prefix, String item) { SequentialPattern newPrefix = prefix.cloneSequence(); Itemset itemset = newPrefix.get(newPrefix.size()-1); itemset.addItem(item); return newPrefix; } /** * Print statistics about the algorithm execution to System.out. * @param size the size of the database */ public void printStatistics(int size) { StringBuilder r = new StringBuilder(200); r.append("============= PREFIXSPAN - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Frequent sequences count : " + patternCount); r.append('\n'); r.append(" Max memory (mb) : " ); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append(patternCount); r.append('\n'); r.append("===================================================\n"); System.out.println(r.toString()); } }