package ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan_with_strings;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
* 
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
* 
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
* 
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/

import java.util.ArrayList;
import java.util.List;

import ca.pfv.spmf.input.sequence_database_list_strings.Sequence;

/**
 * This represents a sequence from a projected database (as used by the BIDE+ algorithm).
 * Since it is a projected sequence, it makes reference to the original Sequence.
 * <br/><br/>
 *
 * This class is a subclass of "PseudoSequence" used by the PrefixSpan algorithm.
 * This class also includes several methods for calculating the maximum periods,
 * semi-maximum periods, etc. as required by the BIDE+ algorithm.
 * These methods are quite complex so if you want to understand them, it is
 * recommended to read the BIDE+ paper carefully before reading the code.
 *
 * @see AlgoBIDEPlus_withStrings
 * @see Itemset
 * @see Sequence
 * @see PseudoSequence
 * @author Philippe Fournier-Viger
 */
// note that this class extends PseudoSequence used by PrefixSpan:
class PseudoSequenceBIDE extends PseudoSequence {

    // variables to indicate if the pseudo sequence is cut at right (a prefix)
    /** the position where the cut was done in terms of itemset in the original sequence */
    int lastItemset;
    /** the position where the cut was done in terms of item in the original sequence */
    int lastItem;

    /**
     * Constructor of a pseudo-sequence based on a pseudo-sequence (overloaded).
     * The new pseudo-sequence ends at the same place as the given pseudo-sequence.
     *
     * @param sequence the original pseudo-sequence
     * @param indexItemset the position of the itemset where the pseudo sequence starts (if it is cut at left)
     * @param indexItem the position of the item where the pseudo sequence starts (if it is cut at left)
     */
    protected PseudoSequenceBIDE(PseudoSequenceBIDE sequence, int indexItemset, int indexItem) {
        // same left cut as the 5-argument constructor; the right cut is inherited from "sequence"
        this(sequence, indexItemset, indexItem, sequence.lastItemset, sequence.lastItem);
    }

    /**
     * Constructor of a pseudo-sequence based on a pseudo-sequence (overloaded).
     *
     * @param sequence the original pseudo-sequence
     * @param indexItemset the position of the itemset where the pseudo sequence starts (!=0 if it is cut at left)
     * @param indexItem the position of the item where the pseudo sequence starts (!=0 if it is cut at left)
     * @param lastItemset the position of the itemset where this pseudo sequence ends
     * @param lastItem the position of the item where this pseudo sequence ends
     */
    protected PseudoSequenceBIDE(PseudoSequenceBIDE sequence, int indexItemset, int indexItem,
            int lastItemset, int lastItem) {
        // record the original sequence
        this.sequence = sequence.sequence;

        // record the position where the sequence starts if it is cut at left
        // (the itemset offset is relative to the start of the given pseudo-sequence)
        this.firstItemset = indexItemset + sequence.firstItemset;
        if (this.firstItemset == sequence.firstItemset) {
            // still in the first itemset of the given pseudo-sequence:
            // the item offset is relative to its first item
            this.firstItem = indexItem + sequence.firstItem;
        } else {
            // NOTE(review): the original author questioned whether this branch is necessary
            this.firstItem = indexItem;
        }

        // record the position where the sequence ends if it is cut at right
        this.lastItemset = lastItemset;
        this.lastItem = lastItem;
    }

    /**
     * Constructor of a pseudo-sequence based on an original sequence (overloaded).
     *
     * @param sequence the original sequence
     * @param indexItemset the position of the itemset where the pseudo sequence starts (!=0 if it is cut at left)
     * @param indexItem the position of the item where the pseudo sequence starts (!=0 if it is cut at left)
     */
    protected PseudoSequenceBIDE(Sequence sequence, int indexItemset, int indexItem) {
        this.sequence = sequence;

        // record the position where the sequence starts if it is cut at left
        this.firstItemset = indexItemset;
        this.firstItem = indexItem;

        // We assume that the sequence is not cut at right.
        // So we set the last itemset to the last itemset in the original sequence
        // and the last item to the last item in the last itemset of the original
        // sequence.
        this.lastItemset = sequence.size() - 1;
        this.lastItem = sequence.getItemsets().get(lastItemset).size() - 1;
    }

    /******************************************
     * All the following methods are specific for BIDE+.
     * I provide a brief description of them.
     * Please refer to the original BIDE+ paper for more
     * details about the BIDE+ algorithm and how it works because
     * it is quite complex.
     *
     * Remember that a sequential pattern is not closed if there exists
     * a forward-extension or a backward-extension.
     ******************************************/

    /**
     * Structure that contains a sequence and a list of positions of elements in the sequence.
     */
    protected static class PseudoSequencePair {
        /** a pseudo sequence */
        final PseudoSequenceBIDE pseudoSequence;
        /** a list of positions */
        final List<Position> list;

        /**
         * A simple constructor.
         * @param pseudoSequence the pseudo sequence
         * @param list the list of positions
         */
        public PseudoSequencePair(PseudoSequenceBIDE pseudoSequence, List<Position> list) {
            this.pseudoSequence = pseudoSequence;
            this.list = list;
        }
    }

    /**
     * Internal class representing a position in a pseudo-sequence
     * (position of an itemset + position of an item).
     */
    protected static class Position {
        final int itemset;
        final int item;

        /**
         * A simple constructor.
         * @param itemset the position of the itemset
         * @param item the position of the item
         */
        public Position(int itemset, int item) {
            this.itemset = itemset;
            this.item = item;
        }
    }

    /**
     * Get the position of the last item, computed as lastItem - firstItem - 1.
     * @return the position.
     */
    protected int getLastItemPosition() {
        return lastItem - firstItem - 1;
    }

    /**
     * Check if a given itemset is the last itemset (overloaded).
     * @param index the position of the itemset in this pseudo-sequence
     * @return true if yes.
     */
    protected boolean isLastItemset(int index) {
        return (index + firstItemset) == lastItemset;
    }

    /**
     * Get the number of items of the itemset at a given position in this
     * pseudo-sequence, not counting the items removed by the cut at right
     * (for the last itemset) and by the cut at left (for the first itemset).
     * @param index the position of the itemset in this pseudo-sequence
     * @return the size of the itemset.
     */
    protected int getSizeOfItemsetAt(int index) {
        int size = sequence.getItemsets().get(index + firstItemset).size();
        if (isLastItemset(index)) {
            // remove the items located after lastItem
            size -= ((size - 1) - lastItem);
        }
        if (isFirstItemset(index)) {
            // remove the items located before firstItem
            size -= firstItem;
        }
        return size;
    }

    /**
     * Get a string representation of this sequence.
     * @return the string representation.
     */
    @Override
    public String toString() {
        StringBuilder r = new StringBuilder();
        for (int i = 0; i < size(); i++) {
            r.append("{");
            for (int j = 0; j < getSizeOfItemsetAt(i); j++) {
                if (!isLastItemset(i) || (j <= lastItem)) {
                    r.append(getItemAtInItemsetAt(j, i).toString());
                    if (isPostfix(i)) {
                        r.append('*');
                    }
                    r.append(' ');
                }
            }
            r.append("}");
        }
        r.append(" ");
        return r.toString();
    }

    /**
     * Get the number of itemsets of this pseudo-sequence, taking into account
     * the cut at left (firstItemset) and the cut at right (lastItemset).
     * @return the number of itemsets, or 0 if a single itemset remains and the
     *         itemset at position firstItemset of the original sequence is empty.
     */
    public int size() {
        int size = sequence.size() - firstItemset - ((sequence.size() - 1) - lastItemset);
        if (size == 1 && sequence.getItemsets().get(firstItemset).size() == 0) {
            return 0;
        }
        return size;
    }

    /**
     * Return true if this itemset is cut at right (a prefix).
     * @param index the position of the given itemset.
     * @return true if it is cut at right.
     */
    protected boolean isCutAtRight(int index) {
        if (!isLastItemset(index)) {
            return false;
        }
        return (sequence.getItemsets().get(index + firstItemset).size() - 1) != lastItem;
    }

    /**
     * Method that finds all instances of a prefix in a sequence S.
     * The meaning of instance is the one from the BIDE+ article when they talk about
     * "first instance", "last instance". Here instead of finding only
     * the "first instance" or "last instance", we find all instances and
     * they also respect the timestamps!
     *
     * @param prefix the prefix
     * @param i the number of items of the prefix to keep: each returned pseudo-sequence
     *          ends at the position matched for the item of index i-1 of the prefix
     * @return a list of pairs, each made of the cut pseudo-sequence and of the complete
     *         list of positions of the instance
     */
    protected List<PseudoSequencePair> getAllInstancesOfPrefix(List<Itemset> prefix, int i) {
        List<List<Position>> listInstances = getAllInstancesOfPrefixHelper(prefix, 0,
                new ArrayList<List<Position>>(), new ArrayList<Position>(), 0);

        // we cut the instances found according to the maximum size
        // of the prefix that we are searching for.
        List<PseudoSequencePair> allPairs = new ArrayList<PseudoSequencePair>();
        for (List<Position> listPositions : listInstances) {
            Position cut = listPositions.get(i - 1);
            PseudoSequenceBIDE newSequence = new PseudoSequenceBIDE(this,
                    this.firstItemset, this.firstItem, cut.itemset, cut.item);
            allPairs.add(new PseudoSequencePair(newSequence, listPositions));
        }
        return allPairs;
    }

    /**
     * Recursive helper for getAllInstancesOfPrefix().
     *
     * @param prefix the prefix
     * @param indexItemset the position of the itemset of the prefix that is currently searched
     * @param allInstances the list where complete instances are accumulated (also the return value)
     * @param listPositionsTotal the positions matched for the previous itemsets of the prefix
     * @param decalageItemset the position of the first itemset of this sequence to consider
     * @return allInstances
     */
    protected List<List<Position>> getAllInstancesOfPrefixHelper(List<Itemset> prefix,
            int indexItemset, List<List<Position>> allInstances,
            List<Position> listPositionsTotal, int decalageItemset) {
        for (int i = decalageItemset; i < size(); i++) {
            int indexItem = 0;
            // positions matched in itemset i for the current itemset of the prefix
            List<Position> listPositions = new ArrayList<Position>();
            // the item of the prefix that we are currently looking for
            String currentId = prefix.get(indexItemset).get(indexItem);

            for (int j = 0; j < getSizeOfItemsetAt(i); j++) {
                String id = getItemAtInItemsetAt(j, i);
                if (id.equals(currentId)) { // the item matches
                    listPositions.add(new Position(i, j));
                    if (listPositions.size() + listPositionsTotal.size()
                            == getItemOccurencesTotalCount(prefix)) {
                        // if the whole prefix has been found
                        List<Position> newList = new ArrayList<Position>(listPositionsTotal);
                        newList.addAll(listPositions);
                        allInstances.add(newList);
                    } else if (indexItem + 1 >= prefix.get(indexItemset).size()) {
                        // if we have found the itemset
                        List<Position> newList = new ArrayList<Position>(listPositionsTotal);
                        newList.addAll(listPositions);
                        if (indexItemset + 1 < prefix.size()) {
                            // search the next itemset of the prefix in the following itemsets
                            getAllInstancesOfPrefixHelper(prefix, indexItemset + 1,
                                    allInstances, newList, i + 1);
                        }
                    } else {
                        // look for the next item of the same itemset of the prefix
                        indexItem++;
                        currentId = prefix.get(indexItemset).get(indexItem);
                    }
                }
            }
        }
        return allInstances;
    }

    /**
     * Last instance of a prefix sequence X in a sequence S.
     * For example, the last instance of AB in ABBCA is ABB.
     * Additional difficulty : this sequence must respect timestamps!
     *
     * @param prefix the prefix
     * @param i the number of items of the prefix at which instances are cut
     *          (see getAllInstancesOfPrefix())
     * @return the pair whose pseudo-sequence ends the latest
     * @throws IndexOutOfBoundsException if no instance of the prefix is found
     */
    protected PseudoSequencePair getLastInstanceOfPrefixSequence(List<Itemset> prefix, int i) {
        List<PseudoSequencePair> list = getAllInstancesOfPrefix(prefix, i);

        // Return the last one
        PseudoSequencePair best = list.get(0);
        for (PseudoSequencePair candidatePair : list) {
            PseudoSequenceBIDE candidate = candidatePair.pseudoSequence;
            PseudoSequenceBIDE current = best.pseudoSequence;
            if ((candidate.lastItemset > current.lastItemset)
                    || (current.lastItemset == candidate.lastItemset
                            && candidate.lastItem > current.lastItem)) {
                best = candidatePair;
            }
        }
        return best;
    }

    /**
     * First Instance of the prefix X in a sequence S.
     * Method to find the first Instance of the prefix sequence X= e1, e2... ei+1 in a sequence S.
     * Example: first instance of AB in the sequence CAABC = CAAB.
     * Additional difficulty : this sequence must respect timestamps!
     *
     * @param prefix the prefix
     * @param i the number of items of the prefix at which instances are cut
     *          (see getAllInstancesOfPrefix())
     * @return the pair whose pseudo-sequence ends the earliest
     * @throws IndexOutOfBoundsException if no instance of the prefix is found
     */
    protected PseudoSequencePair getFirstInstanceOfPrefixSequence(List<Itemset> prefix, int i) {
        List<PseudoSequencePair> list = getAllInstancesOfPrefix(prefix, i);

        // Return the first one
        PseudoSequencePair best = list.get(0);
        for (PseudoSequencePair candidatePair : list) {
            PseudoSequenceBIDE candidate = candidatePair.pseudoSequence;
            PseudoSequenceBIDE current = best.pseudoSequence;
            if ((candidate.lastItemset < current.lastItemset)
                    || (current.lastItemset == candidate.lastItemset
                            && candidate.lastItem < current.lastItem)) {
                best = candidatePair;
            }
        }
        return best;
    }

    /**
     * Get the ith last-in-last appearance with respect to a prefix sequence Sp.
     * n = number of items in Sp
     * If i == n-1, it is the last appearance of ei in the last instance of Sp.
     * If 0 <= i < n-1, it is the last appearance of ei in the last instance of Sp, and LLi must appear
     * before LLi+1.
     * Example : If S= CAABC and SP = AB then LL0 = second A in CAABC
     *           If S= CACAC and SP = CAC then LL0 = second C in S,
     *              LL1 = second A in S and
     *              LL2 = third C in S
     * @param prefix : the prefix
     * @param i : the index of the element of the prefix.
     * @return the position, or null if none is found (should not happen)
     */
    protected Position getIthLastInLastApearanceWithRespectToPrefix(List<Itemset> prefix, int i) {
        // we obtain the last instance:
        // The last instance is a PseudoSequencePair object.
        // It consists of
        //  - the pseudo sequence that is the last instance
        //  - the list of positions for each element of prefix in that last instance.
        int prefixItemCount = getItemOccurencesTotalCount(prefix);
        PseudoSequencePair lastInstancePair = getLastInstanceOfPrefixSequence(prefix, prefixItemCount);
        PseudoSequenceBIDE lastInstance = lastInstancePair.pseudoSequence;

        // ith item of prefix id is :
        String iditem = getIthItem(prefix, i);

        if (i == prefixItemCount - 1) {
            // return the last occurrence of that item:
            for (int j = lastInstance.size() - 1; j >= 0; j--) {
                for (int k = lastInstance.getItemset(j).size() - 1; k >= 0; k--) {
                    if (lastInstance.getItemAtInItemsetAt(k, j).equals(iditem)) {
                        return new Position(j, k);
                    }
                }
            }
        } else {
            // return the last before LLi+1
            Position LLiplus1 = getIthLastInLastApearanceWithRespectToPrefix(prefix, i + 1);
            if (LLiplus1 == null) {
                // fail with an explicit message instead of an anonymous NullPointerException
                throw new NullPointerException(
                        "no last-in-last appearance found for prefix item " + (i + 1));
            }
            for (int j = LLiplus1.itemset; j >= 0; j--) {
                for (int k = lastInstance.getItemset(j).size() - 1; k >= 0; k--) {
                    // skip the positions that are not strictly before LLi+1
                    if (j == LLiplus1.itemset && k >= LLiplus1.item) {
                        continue;
                    }
                    if (lastInstance.getItemAtInItemsetAt(k, j).equals(iditem)) {
                        return new Position(j, k);
                    }
                }
            }
        }
        return null; // should not happen!
    }

    /**
     * Get the ith maximum period of a prefix sequence for this sequence S.
     * The ith maximum period of the prefix Sp in S is :
     *
     *   if 0 < i <= n, it is the piece of sequence between the end of the first instance of prefix e1... ei-1 in S
     *       and the ith last-in-last appearance with respect to prefix Sp
     *
     *   if i = 0, it is the piece of sequence in S located before the first last-in-last appearance with respect to prefix Sp.
     * Example1: if S = ABCB and Sp = AB
     *           the 1th semi-period of Sp in S is empty
     *           the 2th semi-period of Sp in S is BC
     * Example2: if S = ABBB and Sp = BB
     *           the 1th semi-period of Sp in S is AB
     *           the 2th semi-period of Sp in S is B
     *
     * @param prefix the prefix
     * @param i the index of the period
     * @return the period, or null if it is empty (see trimBeginingAndEnd())
     */
    protected PseudoSequenceBIDE getIthMaximumPeriodOfAPrefix(List<Itemset> prefix, int i) {
        if (i == 0) {
            // it is the piece of sequence in S located before the first last-in-last
            // appearance with respect to prefix Sp.
            Position ithlastlast = getIthLastInLastApearanceWithRespectToPrefix(prefix, 0);
            return trimBeginingAndEnd(null, ithlastlast);
        }

        // ELSE it is the piece of sequence between the end of the first instance of prefix e1... ei-1 in S
        // and the ith last-in-last appearance with respect to prefix Sp

        // Important : We thus have to cut the prefix at ei-1 (short prefix = e1 ... ei-1).
        // It is because the parameter i is used by getLastInstanceOfPrefixSequence(...). (???)
        PseudoSequencePair firstInstance = this.getFirstInstanceOfPrefixSequence(prefix, i);
        Position lastOfFirstInstance = firstInstance.list.get(i - 1);

        Position ithlastlast = this.getIthLastInLastApearanceWithRespectToPrefix(prefix, i);
        return trimBeginingAndEnd(lastOfFirstInstance, ithlastlast);
    }

    /**
     * Method that is used by the one above.
     * This method cuts a sequence by removing some part at the beginning and at the end.
     * IMPORTANT : this method assumes that the sequence has never been cut.
     * This simplifies what this method has to do to handle time.
     *
     * @param positionStart the position after which the result starts (exclusive),
     *                      or null to start at the beginning
     * @param positionEnd the position before which the result ends (exclusive),
     *                    or null to keep the current end of this sequence
     * @return null If the result is an empty sequence.!
     */
    protected PseudoSequenceBIDE trimBeginingAndEnd(Position positionStart, Position positionEnd) {
        int itemsetStart = 0;
        int itemStart = 0;
        int itemsetEnd = lastItemset;
        int itemEnd = lastItem;

        if (positionStart != null) {
            // where the cut starts: just after positionStart
            itemsetStart = positionStart.itemset;
            itemStart = positionStart.item + 1;
            if (itemStart == getSizeOfItemsetAt(itemsetStart)) {
                // positionStart was the last item of its itemset: start at the next itemset
                itemsetStart++;
                itemStart = 0;
            }
            if (itemsetStart == size()) { // the resulting sequence is empty!
                return null;
            }
        }

        if (positionEnd != null) {
            // We cut the right part: the result ends just before positionEnd
            itemsetEnd = positionEnd.itemset;
            itemEnd = positionEnd.item - 1;
            if (itemEnd < 0) {
                // positionEnd was the first item of its itemset: end at the previous itemset
                itemsetEnd--;
                if (itemsetEnd < itemsetStart) {
                    return null;
                }
                itemEnd = getSizeOfItemsetAt(itemsetEnd) - 1;
            }
        }

        // Check if the end is not before the beginning of the sequence!
        if (itemsetEnd == itemsetStart && itemEnd < itemStart) {
            return null;
        }

        return new PseudoSequenceBIDE(this, itemsetStart, itemStart, itemsetEnd, itemEnd);
    }

    /**
     * Get the ith semi-maximum period of a prefix sequence for this sequence S.
     * The ith semi-maximum period of the prefix Sp in S is :
     *
     *   if 0 < i <= n, it is the piece of sequence between the end of the first instance of prefix e1... ei-1 in S
     *       and the ith last-in-first appearance with respect to prefix Sp
     *
     *   if i = 0, it is the piece of sequence in S located before the first last-in-first appearance with respect to prefix Sp.
     *
     * @param prefix the prefix
     * @param i the index of the period
     * @return the period, or null if it is empty (see trimBeginingAndEnd())
     */
    protected PseudoSequenceBIDE getIthSemiMaximumPeriodOfAPrefix(List<Itemset> prefix, int i) {
        if (i == 0) {
            // it is the piece of sequence in S located before the first last-in-first
            // appearance with respect to prefix Sp.
            Position ithlastfirst = getIthLastInFirstApearanceWithRespectToPrefix(prefix, 0);
            return trimBeginingAndEnd(null, ithlastfirst);
        }

        // ELSE it is the piece of sequence between the end of the first instance of prefix e1... ei-1 in S
        // and the ith last-in-first appearance with respect to prefix Sp

        // Important: we have to cut the prefix at ei-1 (short prefix = e1 ... ei-1).
        // since the parameter i is used by getLastInstanceOfPrefixSequence(...) and.....(???)

        // THIS IS DONE AS FOLLOWS:
        // We get the first instance of prefix e1... ei-1
        PseudoSequencePair firstInstance = this.getFirstInstanceOfPrefixSequence(prefix, i);
        // we get the position of the last item of that first instance
        Position endOfFirstInstance = firstInstance.list.get(i - 1);

        // we get the ith last-in-first appearance with respect to prefix Sp
        Position ithlastfirst = this.getIthLastInFirstApearanceWithRespectToPrefix(prefix, i);

        // we return the piece of sequence between the end of the first instance of prefix e1... ei-1 in S
        // and the ith last-in-first appearance with respect to prefix Sp
        return trimBeginingAndEnd(endOfFirstInstance, ithlastfirst);
    }

    /**
     * Return the sum of the sizes of all the given itemsets.
     * Note: used by the BIDE algorithm
     *
     * @param itemsets a list of itemsets
     * @return the total number of items
     */
    protected int getItemOccurencesTotalCount(List<Itemset> itemsets) {
        int count = 0;
        for (Itemset itemset : itemsets) {
            count += itemset.size();
        }
        return count;
    }

    /**
     * Get the ith item in a list of itemsets, counting items across itemsets.
     * @param itemsets a list of itemsets
     * @param i the position of an item
     * @return the item, or null if i is not smaller than the total number of items.
     */
    protected String getIthItem(List<Itemset> itemsets, int i) {
        for (int j = 0; j < itemsets.size(); j++) {
            if (i < itemsets.get(j).size()) {
                return itemsets.get(j).get(i);
            }
            // skip this itemset: make i relative to the next one
            i = i - itemsets.get(j).size();
        }
        return null;
    }

    //----------------------- Backscan search space pruning sept. 2009 updated

    /**
     * Get the ith last-in-first appearance with respect to a prefix sequence Sp.
     * n = number of items in Sp
     * If i == n-1, it is the last appearance of ei in the first instance of Sp.
     * If 0 <= i < n-1, it is the last appearance of ei in the first instance of Sp, and LFi must appear
     * before LFi+1.
     * @param prefix : the prefix
     * @param i : the index of the element of the prefix.
     * @return the position, or null if none is found (should not happen)
     */
    protected Position getIthLastInFirstApearanceWithRespectToPrefix(List<Itemset> prefix, int i) {
        // First, we get the first instance.
        // The first instance is a PseudoSequencePair object.
        // It consists of
        //  - the pseudosequence that is the first instance
        //  - the list of positions for each element of prefix in that first instance.
        int prefixItemCount = getItemOccurencesTotalCount(prefix);
        PseudoSequencePair firstInstancePair = getFirstInstanceOfPrefixSequence(prefix, prefixItemCount);
        PseudoSequenceBIDE firstInstance = firstInstancePair.pseudoSequence;

        // IF WE DON'T USE TIMESTAMP THE "ITH LAST IN LAST" IS A LITTLE BIT COMPLICATED TO GET :

        // ith item of prefix id is :
        String iditem = getIthItem(prefix, i);

        if (i == prefixItemCount - 1) {
            // return the last occurrence of that item:
            for (int j = firstInstance.size() - 1; j >= 0; j--) {
                for (int k = firstInstance.getItemset(j).size() - 1; k >= 0; k--) {
                    if (firstInstance.getItemAtInItemsetAt(k, j).equals(iditem)) {
                        return new Position(j, k);
                    }
                }
            }
        } else {
            // return the last before LFi+1
            Position LFiplus1 = getIthLastInFirstApearanceWithRespectToPrefix(prefix, i + 1);
            if (LFiplus1 == null) {
                // fail with an explicit message instead of printing a debug trace
                // and then hitting an anonymous NullPointerException
                throw new NullPointerException(
                        "no last-in-first appearance found for prefix item " + (i + 1));
            }
            for (int j = LFiplus1.itemset; j >= 0; j--) {
                for (int k = firstInstance.getItemset(j).size() - 1; k >= 0; k--) {
                    // skip the positions that are not strictly before LFi+1
                    if (j == LFiplus1.itemset && k >= LFiplus1.item) {
                        continue;
                    }
                    if (firstInstance.getItemAtInItemsetAt(k, j).equals(iditem)) {
                        return new Position(j, k);
                    }
                }
            }
        }
        return null; // should not happen!
    }
}