package ca.pfv.spmf.algorithms.sequentialpatterns.lapin; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import ca.pfv.spmf.datastructures.triangularmatrix.AbstractTriangularMatrix; import ca.pfv.spmf.datastructures.triangularmatrix.SparseTriangularMatrix; import ca.pfv.spmf.input.sequence_database_array_integers.SequenceDatabase; import ca.pfv.spmf.tools.MemoryLogger; /*** * This is an implementation of the LAPIN algorithm (a.k.a LAPIN-SPAM or LAPIN-LCI). * This implementation tries to be faithful to the original technical report. There is only a minor difference in * how the I-Step is performed. When an I-step is performed such thats * the resulting last itemset of the prefix would have 3 or more items, position lists are scanned * to ensure that only positions where the full itemset appear are considered. In the original LAPIN-SPAM, * position-lists are instead updated. But because this would be consume too much memory, we took * the design decision of doing it differently. * * The LAPIN-SPAM algorithm was originally described in this paper: * * Zhenlu Yang and Masrau Kitsuregawa. LAPIN-SPAM: An improved algorithm for mining sequential pattern * In Proc. of Int'l Special Workshop on Databases For Next Generation Researchers (SWOD'05) * in conjunction with ICDE'05, pp. 8-11, Tokyo, Japan, Apr. 2005. * * Copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SPMF is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SPMF. If not, see <http://www.gnu.org/licenses/>. */ public class AlgoLAPIN_LCI{ // for statistics private long startTime; private long endTime; private int patternCount; // minsup private int minsup = 0; BufferedWriter writer = null; // Item-is-exist-table (one for each sequence) Table [] tables = null; // The set of Position Lists (one for each sequence) SEPositionList[] sePositionList; // SE position lists IEPositionList[] iePositionList; // 2-itemsets IE position lists // To activate "debug" mode final boolean DEBUG = false; SequenceDatabase seqDB = null; // for DEBUGGINGs // Used to count the support of 2-itemsets private AbstractTriangularMatrix matrixPairCount; // input file path String input; /** * Default constructor */ public AlgoLAPIN_LCI(){ } /** * Main method to run the algorithm * @param input an input file path * @param outputFilePath an output file path * @param minsupRel the minimum support threshold as a percentage * @throws IOException exception when writting result to a file */ public void runAlgorithm(String input, String outputFilePath, double minsupRel) throws IOException { this.input = input; // prepare file writer for saving result to file writer = new BufferedWriter(new FileWriter(outputFilePath)); patternCount =0; // reset tool to calculate max. memory usage MemoryLogger.getInstance().reset(); startTime = System.currentTimeMillis(); // launch the algorithm! lapin(input, minsupRel); endTime = System.currentTimeMillis(); writer.close(); } /** * Run the LAPIN algorithm * @param input the input file path * @param minsupRel the minsup threshold as a percentage */ private void lapin(String input, double minsupRel) throws IOException{ if(DEBUG) { System.out.println("=== First database scan to count number of sequences and support of single items ==="); } // FIRST DATABASE SCAN: SCAN THE DATABASE TO COUNT // - THE NUMBER OF SEQUENCES // - THE SUPPORT OF EACH SINGLE ITEM // - THE LARGEST ITEM ID int sequenceCount = 0; int largestItemID = 0; // This map will store for each item (key) the first position where the item appears in each // sequence where it appears (value) Map<Integer, List<Position>> mapItemFirstOccurrences = new HashMap<Integer,List<Position>>(); try { // Read the input file BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); String thisLine; // for each sequence of the input fiel while ((thisLine = reader.readLine()) != null) { // we use a set to remember which item have been seen already Set<Integer> itemsAlreadySeen = new HashSet<Integer>(); // to know the itemset number short itemsetID = 0; // for each token in this line for(String integer: thisLine.split(" ")){ // if it is the end of an itemset if("-1".equals(integer)){ itemsetID++; }else if("-2".equals(integer)){ // if it is the end of line // nothing to do here }else{ // otherwise, it is an item Integer item = Integer.valueOf(integer); // if this item was not seen already in that sequence if(itemsAlreadySeen.contains(item) == false) { // Get the list of positions of that item List<Position> list = mapItemFirstOccurrences.get(item); // if that list is null, create a new list if(list == null){ list = new ArrayList<Position>(); mapItemFirstOccurrences.put(item, list); } // Add the position of the item in that sequence to the list of first positions // of that item Position position = new Position(sequenceCount, itemsetID); list.add(position); // Remember that we have seen this item itemsAlreadySeen.add(item); // Check if the item is the largest item until now if(item > largestItemID) { largestItemID = item; } } } } // Increase the count of sequences from the input file sequenceCount++; } reader.close(); }catch (Exception e) { e.printStackTrace(); }; // Initialize the list of tables tables = new Table[sequenceCount]; // Calculate absolute minimum support as a number of sequences minsup = (int) Math.ceil(minsupRel * sequenceCount); if(minsup == 0){ minsup = 1; } if(DEBUG) { System.out.println( "Number of items: " + mapItemFirstOccurrences.size()); System.out.println( "Sequence count: " + sequenceCount); System.out.println( "Abs. minsup: " + minsup + " sequences"); System.out.println( "Rel. minsup: " + minsupRel + " %"); System.out.println("=== Determining the frequent items ==="); } // // For each frequent item, save it and add it to the list of frequent items List<Integer> frequentItems = new ArrayList<Integer>(); for(Entry<Integer, List<Position>> entry : mapItemFirstOccurrences.entrySet()){ // Get the border created by this item List<Position> itemBorder = entry.getValue(); // if the item is frequent if(itemBorder.size() >= minsup){ // Output the item and add it to the list of frequent items Integer item = entry.getKey(); savePattern(item, itemBorder.size()); frequentItems.add(item); if(DEBUG) { System.out.println(" Item " + item + " is frequent with support = " + itemBorder.size()); } } } if(DEBUG) { System.out.println("=== Second database scan to construct item-is-exist tables ==="); } // sort the frequent items (useful when generating 2-IE-sequences, later on). Collections.sort(frequentItems); // SECOND DATABASE SCAN: // Now we will read the database again to create the Item-is-exist-table // and SE-position-lists and count support of 2-IE-sequences matrixPairCount = new SparseTriangularMatrix(largestItemID+1); // Initialise the IE position lists and SE position lists sePositionList = new SEPositionList[sequenceCount]; iePositionList = new IEPositionList[sequenceCount]; try { // Prepare to read the file BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); String thisLine; // For each sequence in the file int currentSequenceID = 0; while ((thisLine = reader.readLine()) != null) { // (1) ------- PARSE THE SEQUENCE BACKWARD TO CREATE THE ITEM-IS-EXIST TABLE FOR THATS SEQUENCE // AND COUNT THE SUPPORT OF 2-IE-Sequences // We will also use a structure to remember in which sequence we have seen each pair of items // Note that in this structure, we will add +1 to the sid because by default the matrix is filled with 0 // and we don't want to think that the first sequence was already seen for all pairs. AbstractTriangularMatrix matrixPairLastSeenInSID = new SparseTriangularMatrix(largestItemID+1); // We count the number of positions (number of itemsets). // To do that we count the number of "-" symbols in the file. // We need to subtract 1 because the end of line "-2" contains "-". int positionCount = -1; for(char caracter : thisLine.toCharArray()) { if(caracter == '-') { positionCount++; } } // Now we will scan the sequence again. // This time we will remember which item were seen already Set<Integer> itemsAlreadySeen = new HashSet<Integer>(); // During this scan, we will create the table for this sequence Table table = new Table(); // To do that, we first create an initial position vector for that table BitSet currentBitset = new BitSet(mapItemFirstOccurrences.size()); // OK ? // This variable will be used to remember if a new item appeared in the current itemset boolean seenNewItem = false; // We will scan the sequence backward, starting from the end because // we should not create a bit vector for all positions but for only // the positions that are different from the previous one. String[] tokens = thisLine.split(" "); // This is the number of itemsets int currentPosition = positionCount; // to keep the current itemset in memory List<Integer> currentItemset = new ArrayList<Integer>(); // For each token in that sequence for(int i = tokens.length-1; i >=0 ; i--){ // get the token String token = tokens[i]; // if we reached the end of an itemset if("-1".equals(token)){ // update the triangular matrix for counting 2-IE-sequences // by comparing each pairs of items in the current itemset for(int k=0; k < currentItemset.size(); k++) { Integer item1 = currentItemset.get(k); for(int m=k+1; m < currentItemset.size(); m++) { Integer item2 = currentItemset.get(m); // if that pair is frequent int sid = matrixPairLastSeenInSID.getSupportForItems(item1, item2); // and if we have not seen this sequence yet if(sid != currentSequenceID+1){ // increment support count of this pair matrixPairCount.incrementCount(item1, item2); // remember that we have seen this pair so that we don't count it again matrixPairLastSeenInSID.setSupport(item1, item2, currentSequenceID+1); } } } currentItemset.clear(); // Decrease the current index of the position (itemset) in the sequence currentPosition--; // if the bit vector has changed since previous position, then // we need to add a new bit vector to the table if(seenNewItem) { // create the position vector and add it to the item-is-exist table PositionVector vector = new PositionVector(currentPosition, (BitSet)currentBitset.clone()); table.add(vector); } }else if("-2".equals(token)){ // if end of sequence, nothing to do }else{ // otherwise, it is an item Integer item = Integer.valueOf(token); if(mapItemFirstOccurrences.get(item).size() >= minsup) { // only for frequent items // if first time that we see this item if(itemsAlreadySeen.contains(item) == false) { // remember that we have seen a new item seenNewItem = true; // remember that we have seen this item itemsAlreadySeen.add(item); // add this item to the current bit vector currentBitset.set(item); } // add this item to the current itemset currentItemset.add(item); } } } // Lastly, // update the triangular matrix for counting 2-IE-sequences one more time // for the case where the pair is in first position of the sequence // by considering each pair of items in the last itemset. // This is done like it was done above, so I will not comment this part of the code again. for(int k=0; k < currentItemset.size(); k++) { Integer item1 = currentItemset.get(k); for(int m=k+1; m < currentItemset.size(); m++) { Integer item2 = currentItemset.get(m); // if th int sid = matrixPairLastSeenInSID.getSupportForItems(item1, item2); if(sid != currentSequenceID+1){ matrixPairCount.incrementCount(item1, item2); matrixPairLastSeenInSID.setSupport(item1, item2, currentSequenceID+1); } } } // If a new item was seen // Add an extra row to the item-is-exist table that will be called -1 with all items in this sequence if(seenNewItem) { PositionVector vector = new PositionVector(-1, (BitSet)currentBitset.clone()); table.add(vector); } // // // // Initialize the IE lists and SE lists for that sequence // which will be filled with the next database scan. sePositionList[currentSequenceID] = new SEPositionList(itemsAlreadySeen); iePositionList[currentSequenceID] = new IEPositionList(); if(DEBUG) { System.out.println("Table for sequence " + currentSequenceID + " : " + thisLine); System.out.println(table.toString()); } // put the current table in the array of item-is-exist-tables tables[currentSequenceID] = table; // we will process the next sequence id currentSequenceID++; } reader.close(); }catch (Exception e) { e.printStackTrace(); } // THIRD SCAN TO // PARSE THE SEQUENCE FORWARD TO CREATE THE SE-POSITION LIST OF THAT SEQUENCE // AND IEPositionList for frequent 2-IE-SEQUENCES try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); String thisLine; // For each sequence int currentSequenceID = 0; while ((thisLine = reader.readLine()) != null) { // We will scan the sequence backward, starting from the end. String[] tokens = thisLine.split(" "); // to keep the current itemset in memory List<Integer> currentItemset = new ArrayList<Integer>(); // this variable will be used to remember which itemset we are visiting short itemsetID = 0; // empty the object to track the current itemset (if it was used for the previous sequence) currentItemset.clear(); // for each token of the current sequence for(int i = 0; i < tokens.length; i++){ String token = tokens[i]; // if we reached the end of an itemset if("-1".equals(token)){ // if the current itemset contains more than one item if(currentItemset.size() >1) { // update the position list for 2-IE-sequences for(int k=0; k < currentItemset.size(); k++) { Integer item1 = currentItemset.get(k); for(int m=k+1; m < currentItemset.size(); m++) { Integer item2 = currentItemset.get(m); // if the pair is frequent int support = matrixPairCount.getSupportForItems(item1, item2); if(support >= minsup){ iePositionList[currentSequenceID].register(item1, item2, itemsetID); } } } } // increase itemsetID itemsetID++; // clear itemset currentItemset.clear(); }else if("-2".equals(token)){ // if the end of a sequence, nothing special to do }else{ // otherwise, the current token is an item Integer item = Integer.valueOf(token); // if the item is frequent if(mapItemFirstOccurrences.get(item).size() >= minsup) { // we add the current position to the item SE-position list sePositionList[currentSequenceID].register(item, itemsetID); // we add the item to the current itemset currentItemset.add(item); } } } if(DEBUG) { System.out.println("SE Position list for sequence " + currentSequenceID ); System.out.println(sePositionList[currentSequenceID]); System.out.println("IE Position list for sequence " + currentSequenceID ); System.out.println(iePositionList[currentSequenceID]); } iePositionList[currentSequenceID].sort(); // sort the IE-position list // update the sequence id for the next sequence currentSequenceID++; } reader.close(); }catch (Exception e) { e.printStackTrace(); } if(DEBUG) { System.out.println("=== Starting sequential pattern generation ==="); } // For each frequent item, call the recursive method to explore larger patterns for(int i=0; i < frequentItems.size(); i++){ // Get the item int item1 = frequentItems.get(i); // Get the border for that item List<Position> item1Border = mapItemFirstOccurrences.get(item1); if(DEBUG) { System.out.println("=== Considering item " + item1); System.out.println(" Border of " + item1); for(Position pos : item1Border) { System.out.println(" seq: " + pos.sid + " itemset: " + pos.position); } } // if the border contains at least minsup sequence (if the item is frequent) if(item1Border.size() >= minsup){ // Create an object prefix to represent the sequential pattern containing the item Prefix prefix = new Prefix(); List<Integer> itemset = new ArrayList<Integer>(1); itemset.add(item1); prefix.itemsets.add(itemset); // make a recursive call to find s-extensions of this prefix genPatterns(prefix, item1Border, frequentItems, frequentItems, item1, true); // true, to disallow I-extension because we explore 2-IE sequences separately } // For each frequent 2-IE sequences stating with item1, we will explore 2-IE sequences // by considering each frequent item larger than item1 for(int k=i+1; k < frequentItems.size(); k++){ // We consider item2 int item2 = frequentItems.get(k); // Get the support of item1, item2 int support = matrixPairCount.getSupportForItems(item1, item2); // if the pair {item1, item2} is frequent if(support >= minsup){ // get the list of position of item2 List<Position> item2Border = mapItemFirstOccurrences.get(item2); // Create the border by using the 2-IE position list List<Position> ie12Border = new ArrayList<Position>(); // We will loop over the border of item1 or item2 (the smallest one) List<Position> borderToUse; if(item2Border.size() < item1Border.size()) { borderToUse = item2Border; }else { borderToUse = item1Border; } // For each sequence of the border that we consider for(Position sequenceToUse : borderToUse) { // Get the sequence id int sid = sequenceToUse.sid; // For this sequence, we will get the position list of each item List<Short> listPosition1 = sePositionList[sid].getListForItem(item1); List<Short> listPosition2 = sePositionList[sid].getListForItem(item2); // if one of them is null, that means that both item1 and item2 do not appear in that sequence // so we continue to the next sequence if(listPosition1 == null || listPosition2 == null) { continue; } // otherwise // find the first common position of item1 and item2 in the sequence int index1 = 0; int index2 = 0; // we do that by the following while loop while(index1 < listPosition1.size() && index2 < listPosition2.size()) { short position1 = listPosition1.get(index1); short position2 = listPosition2.get(index2); if(position1 < position2) { index1++; }else if(position1 > position2) { index2++; }else { // we have found the position, so we add it to the new border and // then stop because we do not want to add more than one position for // the same sequence in the new border ie12Border.add(new Position(sid, position1)); break; } } } if(DEBUG) { System.out.println("=== Considering the 2-IE sequence {" + item1 + "," + item2 + "} with support " + support); System.out.println(" Border of {" + item1 + "," + item2 + "}"); for(Position pos : ie12Border) { System.out.println(" seq: " + pos.sid + " itemset: " + pos.position); } } // finally, we create the prefix for the pattern {item1, item2} Prefix prefix = new Prefix(); List<Integer> itemset = new ArrayList<Integer>(2); itemset.add(item1); itemset.add(item2); prefix.itemsets.add(itemset); // save the pattern savePattern(prefix, support); // perform recursive call to extend that pattern genPatterns(prefix, ie12Border, frequentItems, frequentItems, item2, false); // false, to allow I-extension } } } // Record the maximum memory usage MemoryLogger.getInstance().checkMemory(); writer.close(); } /** * The main recursive method of LAPIN * @param prefix the current prefix * @param prefix the prefix * @param prefixBorder a list of position that is the prefix border * @param in items that could be appended by i-extension * @param sn items that could be appended by s-extension * @param hasToBeGreaterThanForIStep * @throws IOException if error while writing to file */ private void genPatterns(Prefix prefix, List<Position> prefixBorder, List<Integer> sn, List<Integer> in, int hasToBeGreaterThanForIStep, boolean doNotPerformIExtensions) throws IOException { // if(DEBUG) { // if(seqDB == null) { // seqDB = new SequenceDatabase(); // seqDB.loadFile(input); // } // // FOR DEBUGGING = WORK ONLY FOR SEQUENCE WITH SINGLE ITEMS IN EACH ITEMSET // System.out.println("Checking if the border of " + prefix + " is correct"); // for(Position pos : prefixBorder) { // int sid = pos.sid; // Sequence seq = seqDB.getSequences().get(sid); // int calculatedPosition = 0; // // int prefixItemsetID =0; // for(; calculatedPosition< seq.size(); calculatedPosition++ ) { // Integer[] itemset = seq.get(calculatedPosition); // Integer itemToMatch = prefix.itemsets.get(prefixItemsetID).get(0); // if(itemset[0].equals(itemToMatch)) { // prefixItemsetID++; // if(prefixItemsetID == prefix.size()) { // if(pos.position != calculatedPosition) { // System.out.println("THE BORDER IS WRONG FOR PREFIX " + prefix + " AND SEQUENCE :" + sid + " " + seq); // System.out.println(); // }else { // System.out.println("THE BORDER IS OK"); // break; // } // } // } // } // } //// } // ====== S-STEPS ====== // // Temporary variables (as described in the paper) List<Integer> sTemp = new ArrayList<Integer>(); List<Integer> sTempSupport = new ArrayList<Integer>(); // // // for each item in sn for(Integer item : sn){ // perform the S-STEP int support = calculateSupportSStep(item, prefixBorder); // if the support is higher than minsup if(support >= minsup){ // // record that item and pattern in temporary variables sTemp.add(item); sTempSupport.add(support); } } // for each pattern recorded for the s-step for(int k=0; k < sTemp.size(); k++){ int item = sTemp.get(k); // create the new prefix Prefix prefixSStep = prefix.cloneSequence(); List<Integer> itemset = new ArrayList<Integer>(1); itemset.add(item); prefixSStep.itemsets.add(itemset); // save the pattern to the file savePattern(prefixSStep, sTempSupport.get(k)); // recursively try to extend that pattern List<Position> newBorder = recalculateBorderForSExtension(prefixBorder, item); // Recursive call genPatterns(prefixSStep, newBorder, sTemp, sTemp, item, false); } if(doNotPerformIExtensions) { return; } // ======== I STEPS ======= // Temporary variables List<Integer> iTemp = new ArrayList<Integer>(); List<List<Position> > iTempBorder= new ArrayList<List<Position>>(); // // // for each item in in // the item has to be greater than the largest item // already in the last itemset of prefix. int index = Collections.binarySearch(in, hasToBeGreaterThanForIStep); for(int i = index; i< in.size(); i++) { Integer item = in.get(i); List<Integer> lastItemset = prefix.itemsets.get(prefix.itemsets.size() -1); // Integer lastItem = lastItemset.get(lastItemset.size()-1); boolean willAddSecondItem = lastItemset.size() == 1; // AN OPTIMIZATION // perform the I-STEP int support = estimateSupportIStep(item, prefixBorder); // if the estimated support is higher than minsup if(support >= minsup){ // recalculate the border // in this case, the method takes the prefix border as input List<Position> newBorder = recalculateBorderForIExtension(lastItemset, prefixBorder, hasToBeGreaterThanForIStep, item, willAddSecondItem); // record that item and pattern in temporary variables if(newBorder.size() >= minsup) { iTemp.add(item); iTempBorder.add(newBorder); } } } // for each pattern recorded for the i-step for(int k=0; k < iTemp.size(); k++){ int item = iTemp.get(k); // create the new prefix Prefix prefixIStep = prefix.cloneSequence(); prefixIStep.itemsets.get(prefixIStep.size()-1).add(item); // save the pattern List<Position> newBorder = iTempBorder.get(k); savePattern(prefixIStep, newBorder.size()); // recursively try to extend that pattern genPatterns(prefixIStep, newBorder, sTemp, iTemp, item, false); } // check the memory usage MemoryLogger.getInstance().checkMemory(); } /** * Recalculate the prefix border following an i-extension * @param prefixLastItemset last itemset of the previous prefix * @param prefixBorder the previous prefix border * @param item1 the last item * @param item2 the item that will be appended * @param willAddSecondItem if the item will be added to an itemset containing a single item * @return the updated border */ private List<Position> recalculateBorderForIExtension( List<Integer> prefixLastItemset, List<Position> prefixBorder, int item1, int item2, boolean willAddSecondItem) { // Create the new border (a list of position) List<Position> newBorder = new ArrayList<Position>(); // for each sequence where the prefix appeared for(Position previousPosition : prefixBorder) { int sid = previousPosition.sid; // get where the last two items of the prefix appeared int previousItemsetID = previousPosition.position; IEPositionList positionLists = iePositionList[sid]; // find the position that is immediately larger or equal than the current one // by checking each position in the list of positions for the pair List<Short> listPositions = positionLists.getListForPair(item1, item2); if(listPositions != null) { // for each position loop: for(short pos : listPositions) { // if the position is larger or equal to the current one if(pos >= previousItemsetID){ // IMPORTANT: // if the prefix has two items in its last itemset, // then we also need to check that the full last itemset of prefix is at the current position // This will not be done very optimally but it is it difficult to do a better solution. if(willAddSecondItem == false) { // We take the SE position list of the current sequence SEPositionList plists = sePositionList[sid]; // For each item of the last itemset of the prefix for(int i=0; i< prefixLastItemset.size()-1; i++) { // We check if that item appears at that position Integer itemX = prefixLastItemset.get(i); List<Short> plistX = plists.getListForItem(itemX); int index = Collections.binarySearch(plistX, pos); // if not, then we stop considering this position if(index <0) { continue loop; } } // If the loop has finished, that means that all items from the last itemset of // the prefix have appeared at the position pos } // Then we add the position to the new border Position newPosition = new Position(sid, pos); newBorder.add(newPosition); // After that we will continue to the next sequence to continue creating the new border break; } } } } return newBorder; } /** * Estimate support of appending an item to the current prefix by I-extension * @param item the item * @param itemBorder the prefix border * @return the estimated support (an upper bound) */ private int estimateSupportIStep(Integer item, List<Position> itemBorder) { // First we need to take the two last items int support = 0; for(Position pos : itemBorder) { Table table = tables[pos.sid]; int numberOfVectors = table.positionVectors.size(); // Scan from last position to first position (they are ordered backward in the table) for(int j = 0; j < numberOfVectors; j++) { PositionVector vector = table.positionVectors.get(j); if(vector.position < pos.position) { if(vector.bitset.get(item)) { support += 1; } break; } } } return support; } /** * Calculate the support of the new prefix resulting from appending an item to the * prefix by S-extension * @param item the item * @param itemBorder the prefix border * @return the support */ private int calculateSupportSStep(Integer item, List<Position> itemBorder) { // Initialize a variable to count the support int support = 0; // For each sequence where there is a position in the border for(Position pos : itemBorder) { // get the Item-is-exist table corresponding to the sequence Table table = tables[pos.sid]; // Get the number of vectors in that table int numberOfVectors = table.positionVectors.size(); // We will scan the vectors to determine if the item appears after the corresponding // position. If yes, we will increase the support by 1. // IMPORTANT: We scan the table starting from the last vectors because // vectors have been inserted in reverse order in the table. // Also note that we will skip the first vector that has the position -1 because // this vector was added for i-extension only and should not be considered for s-extension // Thus, for each vector starting from the second-last one for(int j = numberOfVectors-2; j>=0; j--) { // Get the vector PositionVector vector = table.positionVectors.get(j); // if the position of this vector is larger or equal to the position that // we are searching for if(vector.position >= pos.position) { // check if the bit corresponding to the item is set to 1 if(vector.bitset.get(item)) { // if yes, we increase the support by 1 support += 1; } // and we don't need to continue looking at the vectors for that sequence break; } } } // return the calculated support return support; } /** * Method to recalculate the border of a prefix after an S-extension with an item * @param prefixBorder the border of the prefix * @param item the item used to extend the prefix * @return the new border */ private List<Position> recalculateBorderForSExtension( List<Position> prefixBorder, int item) { // Create a list of position that will be used to store the new bordre List<Position> newBorder = new ArrayList<Position>(); // for each sequence where the prefix appeared for(Position previousPosition : prefixBorder) { // get the sequence id int sid = previousPosition.sid; // get the index of the itemset where the last item of the prefix appeared int previousItemsetID = previousPosition.position; // Get the SE position list for the sequence SEPositionList positionLists = sePositionList[sid]; // Get the list of position for the item for that sequence List<Short> listPositions = positionLists.getListForItem(item); // if the item appears in that sequence if(listPositions != null) { // We check if there is a position where the item appears // that is after the border. // For each position for(short pos : listPositions) { // if the position is larger if(pos > previousItemsetID){ // add the position to the new border Position newPosition = new Position(sid, pos); newBorder.add(newPosition); // and stop because we don't want to add more than one position for the same sequence. break; } } } } // return the new border return newBorder; } /** * Save a pattern that has been found * @param item the item * @param support the support of the item * @throws IOException if error while writing to file */ private void savePattern(Integer item, int support) throws IOException { // increase the number of patterns found patternCount++; // create a string buffer to store the string reprensentation of this pattern StringBuilder r = new StringBuilder(""); // append the item r.append(item); // append -1 to indicate the end of the itemset r.append(" -1 "); // append its support r.append("#SUP: "); r.append(support); // write the buffer to the output file writer.write(r.toString()); // create a new line to be ready for writing the next pattern writer.newLine(); if(DEBUG) { System.out.println(r.toString()); } } /** * Save a prefix (pattern) to file * @param prefix the prefix * @param support the prefix support * @throws IOException if error ocurrs when writing to file */ private void savePattern(Prefix prefix, int support) throws IOException { // increase the number of patterns found patternCount++; // Create a string buffer to store the pattern and its support as a string StringBuilder r = new StringBuilder(""); // for each itemset for(List<Integer> itemset : prefix.itemsets){ // for each item for(Integer item : itemset){ // append it String string = item.toString(); r.append(string); r.append(' '); } // at the end of an itemset we put a -1 r.append("-1 "); } // then, append the support of the pattern: r.append("#SUP: "); r.append(support); // write the buffer to the output file writer.write(r.toString()); if(DEBUG) { System.out.println(r.toString()); } // create a new line to be ready for the next pattern writer.newLine(); } /** * Print statistics about the algorithm execution time */ public void printStatistics() { StringBuilder r = new StringBuilder(200); r.append("============= LAPIN - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Frequent sequences count : " + patternCount); r.append('\n'); r.append(" Max memory (mb) : " ); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append(patternCount); r.append('\n'); r.append("==================================================="); System.out.println(r.toString()); } /** * A inner class to store a position (a sequence id + an itemset id). This will be use * to represent the border of a prefix. */ class Position{ /** a sequence id */ int sid; /** an itemset position in the sequence */ short position; /** * Default constructor * @param sid the sequence id * @param position the position as a short (itemset number) */ public Position(int sid, short position) { this.sid = sid; this.position = position; } } }