package ca.pfv.spmf.algorithms.sequentialpatterns.spam;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import ca.pfv.spmf.patterns.itemset_list_integers_without_support.Itemset;
import ca.pfv.spmf.tools.MemoryLogger;

/**
 * This is an implementation of the SPAM algorithm.
 * <br/><br/>
 * The SPAM algorithm was originally described in this paper:
 * <br/><br/>
 *
 * Jay Ayres, Johannes Gehrke, Tomi Yiu, and Jason Flannick. Sequential PAttern Mining Using Bitmaps.
 * In Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining.
 * Edmonton, Alberta, Canada, July 2002.
 * <br/><br/>
 *
 * I tried to do what is indicated in that paper, but some optimizations are not described
 * in enough detail in the paper. So this implementation does not include these optimizations,
 * for example:
 * - lookup tables for bitmaps
 * - compression of bitmaps.
 *
 * @see Bitmap
 * @see Prefix
 * @author Philippe Fournier-Viger
 */
public class AlgoSPAM{

    // for statistics
    private long startTime;
    private long endTime;
    private int patternCount;

    // minsup (absolute: a number of sequences)
    private int minsup = 0;

    // object to write to a file
    BufferedWriter writer = null;

    // Vertical database (key: an item, value: the bitmap of that item)
    Map<Integer, Bitmap> verticalDB = new HashMap<Integer, Bitmap>();

    // List giving, for each sequence, the position of its first bit in the bitmaps
    List<Integer> sequencesSize = null;
    int lastBitIndex = 0;  // the last bit position that is used in bitmaps

    // maximum pattern length in terms of item count
    private int maximumPatternLength = Integer.MAX_VALUE;

    // the max gap between two itemsets of a pattern.
    // It is an optional parameter that the user can set.
    private int maxGap = Integer.MAX_VALUE;

    /**
     * Default constructor
     */
    public AlgoSPAM(){
    }

    /**
     * Method to run the algorithm
     * @param input  path to an input file
     * @param outputFilePath path for writing the output file
     * @param minsupRel the minimum support as a relative value
     * @throws IOException exception if error while writing the file or reading
     */
    public void runAlgorithm(String input, String outputFilePath, double minsupRel) throws IOException {
        // create an object to write the file
        writer = new BufferedWriter(new FileWriter(outputFilePath));
        try {
            // initialize the number of patterns found
            patternCount = 0;
            // to log the memory used
            MemoryLogger.getInstance().reset();
            // record start time
            startTime = System.currentTimeMillis();
            // RUN THE ALGORITHM
            spam(input, minsupRel);
            // record end time
            endTime = System.currentTimeMillis();
        } finally {
            // close the file even if the mining failed, so the handle is not leaked
            writer.close();
        }
    }

    /**
     * This is the main method for the SPAM algorithm
     * @param input path to an input file
     * @param minsupRel the minimum support as a relative value
     * @throws IOException if the input file cannot be read, contains an item that is
     *         not an integer, or if a pattern cannot be written to the output file
     */
    private void spam(String input, double minsupRel) throws IOException{
        // the structure to store the vertical database
        // key: an item    value : bitmap
        verticalDB = new HashMap<Integer, Bitmap>();

        // STEP 0: SCAN THE DATABASE TO STORE THE FIRST BIT POSITION OF EACH SEQUENCE
        // AND CALCULATE THE TOTAL NUMBER OF BIT FOR EACH BITMAP
        scanSequencePositions(input);

        // Calculate the absolute minimum support
        // by multiplying the percentage with the number of
        // sequences in this database
        minsup = (int)Math.ceil((minsupRel * sequencesSize.size()));
        if(minsup == 0){
            minsup = 1;
        }

        // STEP1: SCAN THE DATABASE TO CREATE THE BITMAP VERTICAL DATABASE REPRESENTATION
        buildVerticalDatabase(input);

        // STEP2: REMOVE INFREQUENT ITEMS FROM THE DATABASE BECAUSE THEY WILL NOT APPEAR
        // IN ANY FREQUENT SEQUENTIAL PATTERNS
        List<Integer> frequentItems = new ArrayList<Integer>();
        Iterator<Entry<Integer, Bitmap>> iter = verticalDB.entrySet().iterator();
        // we iterate over items from the vertical database that we have in memory
        while (iter.hasNext()) {
            // we get the bitmap for this item
            Entry<Integer, Bitmap> entry = iter.next();
            // if the support of this bitmap is lower than minsup
            if(entry.getValue().getSupport() < minsup){
                // we remove this item from the database.
                iter.remove();
            }else{
                // otherwise, we save this item as a frequent
                // sequential pattern of size 1
                savePattern(entry.getKey(), entry.getValue());
                // and we add this item to a list of frequent items
                // that we will use later.
                frequentItems.add(entry.getKey());
            }
        }

        // STEP3: WE PERFORM THE RECURSIVE DEPTH FIRST SEARCH
        // to find longer sequential patterns recursively
        if(maximumPatternLength == 1){
            return;
        }
        // for each frequent item
        for(Entry<Integer, Bitmap> entry: verticalDB.entrySet()){
            // We create a prefix with that item
            Prefix prefix = new Prefix();
            prefix.addItemset(new Itemset(entry.getKey()));
            // We call the depth first search method with that prefix
            // and the list of frequent items to try to find
            // larger sequential patterns by appending some of these
            // items.
            dfsPruning(prefix, entry.getValue(), frequentItems, frequentItems, entry.getKey(), 2);
        }
    }

    /**
     * Tell whether a line of the input file carries no sequence, i.e. it is empty
     * or starts with '#', '%' or '@' (comment or metadata).
     * Both database scans must use the same rule, otherwise the sequence numbering
     * of the second scan would not match the positions recorded by the first one.
     * @param line a line read from the input file
     * @return true if the line must be ignored
     */
    private static boolean isCommentOrMetadata(String line) {
        if (line.isEmpty()) {
            return true;
        }
        char first = line.charAt(0);
        return first == '#' || first == '%' || first == '@';
    }

    /**
     * First database scan: record in {@code sequencesSize} the first bit position of
     * each sequence and set {@code lastBitIndex} to the last bit position used in bitmaps.
     * @param input path to an input file
     * @throws IOException if the input file cannot be read
     */
    private void scanSequencePositions(String input) throws IOException {
        sequencesSize = new ArrayList<Integer>();
        lastBitIndex = 0;
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(input))));
        try {
            String thisLine;
            int bitIndex = 0;
            // for each line (sequence) in the file until the end
            while ((thisLine = reader.readLine()) != null) {
                if (isCommentOrMetadata(thisLine)) {
                    continue;
                }
                // record the position of the first bit of the current sequence
                sequencesSize.add(bitIndex);
                // split the sequence according to spaces into tokens
                for (String token : thisLine.split(" ")) {
                    // "-1" marks the end of an itemset: one more bit is needed
                    if (token.equals("-1")) {
                        bitIndex++;
                    }
                }
            }
            // record the last bit position for the bitmaps
            lastBitIndex = bitIndex - 1;
        } finally {
            reader.close(); // close the input file
        }
    }

    /**
     * Second database scan: create the bitmap of each item in {@code verticalDB}.
     * Must be called after {@link #scanSequencePositions(String)}.
     * @param input path to an input file
     * @throws IOException if the input file cannot be read or if an item is not an integer
     */
    private void buildVerticalDatabase(String input) throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(input))));
        try {
            String thisLine;
            int sid = 0; // to know which sequence we are scanning
            int tid = 0; // to know which itemset we are scanning
            // for each line (sequence) from the input file
            while ((thisLine = reader.readLine()) != null) {
                // skip exactly the lines that the first scan skipped
                if (isCommentOrMetadata(thisLine)) {
                    continue;
                }
                // split the sequence according to spaces into tokens
                for (String token : thisLine.split(" ")) {
                    if (token.isEmpty()) {
                        // produced by consecutive or leading spaces: nothing to read
                        continue;
                    }
                    if (token.equals("-1")) { // indicate the end of an itemset
                        tid++;
                    } else if (token.equals("-2")) { // indicate the end of a sequence
                        sid++;
                        tid = 0;
                    } else { // indicate an item
                        Integer item;
                        try {
                            item = Integer.parseInt(token);
                        } catch (NumberFormatException e) {
                            throw new IOException("Invalid item '" + token
                                    + "' in sequence database file " + input, e);
                        }
                        // Get the bitmap for this item. If none, create one.
                        Bitmap bitmapItem = verticalDB.get(item);
                        if (bitmapItem == null) {
                            bitmapItem = new Bitmap(lastBitIndex);
                            verticalDB.put(item, bitmapItem);
                        }
                        // Register the bit in the bitmap for this item
                        bitmapItem.registerBit(sid, tid, sequencesSize);
                    }
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * This is the dfsPruning method as described in the SPAM paper.
     * @param prefix the current prefix
     * @param prefixBitmap  the bitmap corresponding to the current prefix
     * @param sn  a list of items to be considered for s-steps
     * @param in  a list of items to be considered for i-steps
     * @param hasToBeGreaterThanForIStep only items strictly greater than this value
     *        are tried for i-steps (callers pass the item appended last to the prefix)
     * @param m size, in terms of items, of the patterns generated by this call
     *        (the prefix extended by one item)
     * @throws IOException  if there is an error writing a pattern to the output file
     */
    private void dfsPruning(Prefix prefix, Bitmap prefixBitmap, List<Integer> sn, List<Integer> in,
            int hasToBeGreaterThanForIStep, int m) throws IOException {
        //  ======  S-STEPS ======
        // Temporary variables (as described in the paper)
        List<Integer> sTemp = new ArrayList<Integer>();
        List<Bitmap> sTempBitmaps = new ArrayList<Bitmap>();

        // for each item in sn
        for(Integer i : sn){
            // perform the S-STEP with that item to get a new bitmap
            Bitmap newBitmap = prefixBitmap.createNewBitmapSStep(verticalDB.get(i), sequencesSize,
                    lastBitIndex, maxGap);
            // if the support is no less than minsup
            if(newBitmap.getSupport() >= minsup){
                // record that item and pattern in temporary variables
                sTemp.add(i);
                sTempBitmaps.add(newBitmap);
            }
        }
        // for each pattern recorded for the s-step
        for(int k = 0; k < sTemp.size(); k++){
            int item = sTemp.get(k);
            // create the new prefix
            Prefix prefixSStep = prefix.cloneSequence();
            prefixSStep.addItemset(new Itemset(item));
            // create the new bitmap
            Bitmap newBitmap = sTempBitmaps.get(k);

            // save the pattern to the file
            savePattern(prefixSStep, newBitmap);
            // recursively try to extend that pattern
            if(maximumPatternLength > m){
                dfsPruning(prefixSStep, newBitmap, sTemp, sTemp, item, m+1);
            }
        }

        // ========  I STEPS =======
        // Temporary variables
        List<Integer> iTemp = new ArrayList<Integer>();
        List<Bitmap> iTempBitmaps = new ArrayList<Bitmap>();

        // for each item in in
        for(Integer i : in){
            // the item has to be greater than the largest item
            // already in the last itemset of prefix.
            if(i > hasToBeGreaterThanForIStep){
                // Perform an i-step with this item and the current prefix.
                // This creates a new bitmap
                Bitmap newBitmap = prefixBitmap.createNewBitmapIStep(verticalDB.get(i), sequencesSize,
                        lastBitIndex);
                // If the support is no less than minsup
                if(newBitmap.getSupport() >= minsup){
                    // record that item and pattern in temporary variables
                    iTemp.add(i);
                    iTempBitmaps.add(newBitmap);
                }
            }
        }
        // for each pattern recorded for the i-step
        for(int k = 0; k < iTemp.size(); k++){
            int item = iTemp.get(k);
            // create the new prefix
            Prefix prefixIStep = prefix.cloneSequence();
            prefixIStep.getItemsets().get(prefixIStep.size()-1).addItem(item);
            // create the new bitmap
            Bitmap newBitmap = iTempBitmaps.get(k);

            // save the pattern
            savePattern(prefixIStep, newBitmap);
            // recursively try to extend that pattern
            if(maximumPatternLength > m){
                dfsPruning(prefixIStep, newBitmap, sTemp, iTemp, item, m+1);
            }
        }
        // check the memory usage
        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * Save a pattern of size 1 to the output file
     * @param item the item
     * @param bitmap its bitmap
     * @throws IOException exception if error while writing to the file
     */
    private void savePattern(Integer item, Bitmap bitmap) throws IOException {
        patternCount++; // increase the pattern count
        StringBuilder r = new StringBuilder("");
        r.append(item);
        r.append(" -1 ");
        r.append("SUP: ");
        r.append(bitmap.getSupport());
        writer.write(r.toString());
        writer.newLine();
    }

    /**
     * Save a pattern of size > 1 to the output file.
     * @param prefix the prefix
     * @param bitmap its bitmap
     * @throws IOException exception if error while writing to the file
     */
    private void savePattern(Prefix prefix, Bitmap bitmap) throws IOException {
        patternCount++;

        StringBuilder r = new StringBuilder("");
        for(Itemset itemset : prefix.getItemsets()){
            for(Integer item : itemset.getItems()){
                String string = item.toString();
                r.append(string);
                r.append(' ');
            }
            // end of the itemset
            r.append("-1 ");
        }

        r.append("SUP: ");
        r.append(bitmap.getSupport());

        writer.write(r.toString());
        writer.newLine();
    }

    /**
     * Print the statistics of the algorithm execution to System.out.
     */
    public void printStatistics() {
        StringBuilder r = new StringBuilder(200);
        r.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
        r.append(endTime - startTime);
        r.append(" ms\n");
        r.append(" Frequent sequences count : " + patternCount);
        r.append('\n');
        r.append(" Max memory (mb) : " );
        // only the memory value goes on this line; appending anything else right
        // after it would be read as extra digits of the number
        r.append(MemoryLogger.getInstance().getMaxMemory());
        r.append('\n');
        r.append("minsup " + minsup);
        r.append('\n');
        r.append("===================================================\n");
        System.out.println(r.toString());
    }

    /**
     * Get the maximum length of patterns to be found (in terms of item count)
     * @return the maximumPatternLength
     */
    public int getMaximumPatternLength() {
        return maximumPatternLength;
    }

    /**
     * Set the maximum length of patterns to be found (in terms of item count)
     * @param maximumPatternLength the maximumPatternLength to set
     */
    public void setMaximumPatternLength(int maximumPatternLength) {
        this.maximumPatternLength = maximumPatternLength;
    }

    /**
     * This method allows to specify the maximum gap
     * between itemsets of patterns found by the algorithm.
     * If set to 1, only patterns of contiguous itemsets
     * will be found (no gap).
     * @param maxGap the maximum gap (an integer)
     */
    public void setMaxGap(int maxGap) {
        this.maxGap = maxGap;
    }
}