package ca.pfv.spmf.algorithms.sequential_rules.cmrules;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import ca.pfv.spmf.datastructures.triangularmatrix.TriangularMatrix;
import ca.pfv.spmf.input.sequence_database_list_integers.Sequence;
import ca.pfv.spmf.input.sequence_database_list_integers.SequenceDatabase;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;

/**
 * The CMRules algorithm for mining sequential rules common to several sequences.
 * <br/><br/>
 *
 * This algorithm is described in:
 * <br/><br/>
 *
 * Fournier-Viger, P., Faghihi, U., Nkambou, R. & Mephu Nguifo, E. (2010).
 * CMRules: An Efficient Algorithm for Mining Sequential Rules Common to Several Sequences.
 * Proceedings of the 23rd International Florida Artificial Intelligence Research Society Conference
 * (FLAIRS 2010). AAAI Press.
 * <br/><br/>
 *
 * This implementation uses a modified AprioriTID algorithm for generating association rules
 * in phase 1.
 *
 * @see Rule
 * @see Rules
 * @see Itemset
 * @see Itemsets
 * @see TransactionDatabase
 * @see SequenceDatabase
 * @see Sequence
 * @see TriangularMatrix
 * @see AlgoAprioriTID_forCMRules
 *
 * @author Philippe Fournier-Viger
 */
public class AlgoCMRules {

    // *** statistics about the latest execution ***
    int associationRulesCount = 0; // number of association rules
    int ruleCount; // the number of sequential rules generated
    long timeStart = 0; // start time
    long timeEnd = 0; // end time
    long timeEndConvert = 0; // end time for conversion to transaction database
    long timeEndApriori = 0; // end time for calculating frequent itemsets
    long timeEndSequentialMeasures = 0; // end time for calculating measures for sequential rules
    long timeBeginCalculateSequentialMeasures = 0; // start time for calculating measures for sequential rules
    long timeEndPreprocessing = 0; // end time for pre-processing

    // *** parameters ***
    public int minCSupRelative = 0; // min. seq. support
    public double minSeqConfidence; // min. seq. confidence
    SequenceDatabase sequences; // the sequence database

    // Special parameters to set the size of rules to be discovered
    int minLeftSize = 0; // min size of the left part of the rule
    int maxLeftSize = 500; // max size of the left part of the rule
    int minRightSize = 0; // min size of the right part of the rule
    int maxRightSize = 500; // max size of the right part of the rule

    // *** internal variables ***
    // this is the largest item ID in the database
    int maxItemId = 0;
    // this map indicates the tidset (value) for each item (key)
    Map<Integer, Set<Integer>> mapItemCount = new HashMap<Integer, Set<Integer>>();
    // list of frequent items
    List<Integer> listFrequentsSize1 = new ArrayList<Integer>();
    // the set of frequent itemsets found by AprioriTID
    private Itemsets patterns;
    // a triangular matrix for efficiently counting the support of pairs of items
    private TriangularMatrix matrix;
    // object to write the output file
    BufferedWriter writer = null;

    /**
     * Default constructor.
     */
    public AlgoCMRules() {
    }

    /**
     * Run the algorithm with the minsup parameter as a percentage value (double).
     * @param input input file containing a sequence database.
     * @param output the file path for writing the result
     * @param absoluteMinSupport the minsup as a percentage value (e.g.: 0.05 = 5% of all sequences in the database)
     * @param minConfidence the minimum confidence threshold
     * @throws IOException exception if error while writing the output file.
     */
    public void runAlgorithm(String input, String output, double absoluteMinSupport, double minConfidence) throws IOException {
        // load the sequence database from the input file
        sequences = new SequenceDatabase();
        sequences.loadFile(input);
        // convert the percentage minimum support into a minimum support expressed
        // as a number of sequences, by multiplying by the database size
        this.minCSupRelative = (int) Math.ceil(absoluteMinSupport * sequences.size());
        runAlgorithm(minCSupRelative, minConfidence, input, output);
    }
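    /*
     * A minimal usage sketch (the file names below are hypothetical):
     *
     *   AlgoCMRules algo = new AlgoCMRules();
     *   // mine rules appearing in at least 50% of the sequences,
     *   // with a minimum sequential confidence of 0.60
     *   algo.runAlgorithm("input.txt", "output.txt", 0.50, 0.60);
     *   algo.printStats();
     */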
    /**
     * Run the algorithm with the minsup parameter as a number of sequences (integer).
     * @param relativeSupport the minsup as a number of sequences (e.g.: 5 = 5 sequences of the database)
     * @param minConfidence the minimum confidence threshold
     * @param input input file containing a sequence database.
     * @param output the file path for writing the result
     * @throws IOException exception if error while writing the output file.
     */
    public void runAlgorithm(int relativeSupport, double minConfidence, String input, String output) throws IOException {
        // reset the utility for recording memory usage
        MemoryLogger.getInstance().reset();

        // save the parameters
        this.minSeqConfidence = minConfidence;
        this.minCSupRelative = relativeSupport;
        // if set to 0, then set it to 1
        if (this.minCSupRelative == 0) { // protection
            this.minCSupRelative = 1;
        }

        // If the sequence database has not been loaded yet, then load it from
        // the input file
        if (sequences == null) {
            sequences = new SequenceDatabase();
            sequences.loadFile(input);
        }

        // Create the writer object for writing the output file
        writer = new BufferedWriter(new FileWriter(output));

        // record the start time
        timeStart = System.currentTimeMillis(); // for stats

        // remove items that are infrequent from the database and
        // at the same time calculate the support of each item
        // as well as the largest item ID in the database
        System.out.println("STEP 0");
        removeItemsThatAreNotFrequent(sequences);

        // Put items that are frequent in a list that is lexically ordered
        for (int i = 0; i <= maxItemId; i++) {
            // if it is frequent (tidset with size > 0)
            if (mapItemCount.get(i) != null && mapItemCount.get(i).size() >= minCSupRelative) {
                // add to the list
                listFrequentsSize1.add(i);
            }
        }
        // sort the list by lexical order
        Collections.sort(listFrequentsSize1);

        // record end time for pre-processing
        timeEndPreprocessing = System.currentTimeMillis(); // for stats

        // STEP 1: Transform the sequence database into a binary context
        // by calling the following method:
        TransactionDatabase context = convert(sequences);

        // (1.b) create the triangular matrix for counting the support of itemsets of size 2
        // for optimization purposes.
        // matrix = new TriangularMatrix(maxItemId + 1);
        // // for each transaction, take each itemset of size 2,
        // // and update the triangular matrix.
        // for (Itemset itemset : context.getObjects()) {
        //     Object[] array = itemset.getItems().toArray();
        //     for (int i = 0; i < itemset.size(); i++) {
        //         Integer itemI = (Integer) array[i];
        //         for (int j = i + 1; j < itemset.size(); j++) {
        //             Integer itemJ = (Integer) array[j];
        //             // update the matrix
        //             matrix.incrementCount(itemI, itemJ);
        //         }
        //     }
        // }

        // record the end time for converting the database
        timeEndConvert = System.currentTimeMillis();

        // STEP 2: Apply the APRIORI-TID algorithm to find frequent itemsets
        System.out.println("STEP 2");
        AlgoAprioriTID_forCMRules apriori = new AlgoAprioriTID_forCMRules(context, matrix);
        // we don't want itemsets having more items than the maximum desired size
        // for a sequential rule in terms of items
        apriori.setMaxItemsetSize(maxLeftSize + maxRightSize);
        // apply apriori
        patterns = apriori.runAlgorithm(minCSupRelative, listFrequentsSize1, mapItemCount);

        // check memory usage
        MemoryLogger.getInstance().checkMemory();

        // record end time for the Apriori algorithm
        timeEndApriori = System.currentTimeMillis();

        // STEP 3: Generate all rules from the set of frequent itemsets.
        // This is based on the algorithm for association rules by Agrawal & Srikant, 94,
        // except that the sequential measures are calculated for each rule
        // to see if it is a valid sequential rule.
        // System.out.println("STEP 3 " + patterns.getItemsetsCount());
        generateRules(patterns);

        // check memory usage
        MemoryLogger.getInstance().checkMemory();

        // record end time for rule generation
        timeEnd = System.currentTimeMillis();

        // we don't need the sequence database anymore
        sequences = null;

        // close the output file
        writer.close();
    }
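    /*
     * For reference (an assumption based on the standard SPMF sequence format;
     * check the SPMF documentation): the input file contains one sequence per
     * line, items as positive integers, -1 marking the end of an itemset and
     * -2 marking the end of the sequence. For example (illustrative data), the
     * line
     *
     *   1 2 -1 3 -1 -2
     *
     * encodes the sequence <{1, 2}, {3}>.
     */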
    /**
     * Remove items that are not frequent from a sequence database
     * @param sequences a sequence database
     * @return a map where each item (key) is associated to its tidset (value)
     */
    private Map<Integer, Set<Integer>> removeItemsThatAreNotFrequent(SequenceDatabase sequences) {
        // (1) count the support of each item in the database in one database pass.
        // Store the information in a map where each item (key) is associated to
        // a tidset (value).
        mapItemCount = new HashMap<Integer, Set<Integer>>();
        // for each sequence
        for (Sequence sequence : sequences.getSequences()) {
            // for each itemset in that sequence
            for (List<Integer> itemset : sequence.getItemsets()) {
                // for each item in that itemset
                for (int i = 0; i < itemset.size(); i++) {
                    // get its tidset
                    Set<Integer> ids = mapItemCount.get(itemset.get(i));
                    if (ids == null) {
                        // if there is no tidset, create one
                        ids = new HashSet<Integer>();
                        mapItemCount.put(itemset.get(i), ids);
                        // if it is the largest item seen until now, then remember that
                        if (itemset.get(i) > maxItemId) {
                            maxItemId = itemset.get(i);
                        }
                    }
                    // add the sequence ID to the tidset
                    ids.add(sequence.getId());
                }
            }
        }
        System.out.println("NUMBER OF DIFFERENT ITEMS : " + mapItemCount.size());

        // (2) remove all items that are not frequent from the database
        // for each sequence
        for (Sequence sequence : sequences.getSequences()) {
            int i = 0;
            // for each itemset in that sequence
            while (i < sequence.getItemsets().size()) {
                List<Integer> itemset = sequence.getItemsets().get(i);
                int j = 0;
                // for each item in that itemset
                while (j < itemset.size()) {
                    // if the item is not frequent
                    double count = mapItemCount.get(itemset.get(j)).size();
                    if (count < minCSupRelative) {
                        // then remove it
                        itemset.remove(j);
                    } else {
                        // otherwise go to the next item
                        j++;
                    }
                }
                // if an itemset becomes empty because of removed items, then remove it
                if (itemset.size() == 0) {
                    sequence.getItemsets().remove(i);
                } else {
                    // otherwise go to the next itemset
                    i++;
                }
            }
        }
        // return the map of items - tidsets
        return mapItemCount;
    }

    /**
     * This method updates the interestingness measures of a given rule based
     * on a sequence
     * @param rule the rule
     * @param sequence the sequence
     */
    private void calculateSequentialMeasures(Rule rule, Sequence sequence) {
        // This method will pass through the sequence and try to see if the left part
        // matches, and then if the right part matches.

        // This is a set to remember the items previously seen from the left or right part of the rule
        Set<Integer> setAlreadySeen = new HashSet<Integer>(rule.getItemset1().size() * 3); // could be replaced with a flag on items

        // First we will try to match the left part of the rule
        int i = 0;
        firstpass: {
            // for each itemset in the sequence
            for (; i < sequence.getItemsets().size(); i++) {
                List<Integer> itemset = sequence.get(i);
                // for each item of this itemset
                for (int j = 0; j < itemset.size(); j++) {
                    int item = itemset.get(j);
                    // if the left part of the rule contains item J
                    if (rule.getItemset1().contains(item)) { // left part of rule
                        // if we have found all items from the left part,
                        // we will break and try to find the right part next
                        setAlreadySeen.add(item);
                        if (setAlreadySeen.size() == rule.getItemset1().size()) {
                            break firstpass;
                        }
                    }
                }
            }
        }
        i++; // i++ because we will try to find the right part of the rule starting from the next itemset
        setAlreadySeen.clear(); // clear the set of items seen because we will reuse this variable, but here for the right side of the rule

        // for each itemset in the sequence starting at i
        for (; i < sequence.getItemsets().size(); i++) {
            List<Integer> itemset = sequence.get(i);
            // for each item j in that itemset
            for (int j = 0; j < itemset.size(); j++) {
                int item = itemset.get(j);
                // if the right part of the rule contains item J
                if (rule.getItemset2().contains(item)) {
                    setAlreadySeen.add(item);
                    // if we have found all items from the right part,
                    if (setAlreadySeen.size() == rule.getItemset2().size()) {
                        // update the support of the rule because we have found
                        // the whole rule
                        rule.sequentialTransactionCount++;
                        // we can stop scanning the sequence
                        return;
                    }
                }
            }
        }
    }
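    /*
     * For example (illustrative data), consider the rule {1, 2} ==> {3} and the
     * sequence <{1}, {2, 4}, {3}>. The first pass finds item 1 in the first
     * itemset and item 2 in the second itemset, which completes the left part.
     * The second pass then searches from the third itemset onward, finds item 3,
     * and increments sequentialTransactionCount. If the sequence were
     * <{3}, {1}, {2}>, the right part would not be found after the left part,
     * so the count would not be incremented.
     */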
    /**
     * This method converts a sequence database into a transaction database
     * @param sequences a sequence database
     * @return a transaction database
     */
    private TransactionDatabase convert(SequenceDatabase sequences) {
        // create a new transaction database
        TransactionDatabase transactionDatabase = new TransactionDatabase();
        // for each sequence in the seq. database
        for (Sequence sequence : sequences.getSequences()) {
            // create an empty itemset that will be the transaction corresponding
            // to this sequence
            List<Integer> transaction = new ArrayList<Integer>();
            // for each itemset in the sequence
            for (List<Integer> itemset : sequence.getItemsets()) {
                // add all items to the transaction
                transaction.addAll(itemset);
            }
            // add the transaction to the transaction database
            transactionDatabase.addTransaction(transaction);
        }
        // transactionDatabase.printDatabase(); // DEBUG
        return transactionDatabase;
    }
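    /*
     * For example (illustrative data), the sequence <{1}, {2, 3}> is flattened
     * into the transaction {1, 2, 3}: the ordering information is dropped, which
     * is why the sequential measures must be re-checked later for each candidate
     * association rule (see checkRule). Note that an item occurring in several
     * itemsets of the same sequence is added more than once to the transaction
     * list.
     */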
    /**
     * Print statistics about the latest algorithm execution.
     */
    public void printStats() {
        System.out.println("=============  SEQUENTIAL RULES - STATS =============");
        System.out.println("Association rules count: " + associationRulesCount);
        System.out.println("Sequential rules count: " + ruleCount);
        System.out.println("Total time: " + (timeEnd - timeStart) + " ms");
        System.out.println("Max memory: " + MemoryLogger.getInstance().getMaxMemory());
        System.out.println("===================================================");
    }

    /**
     * Calculate the interestingness measures of a rule, and then, if it is a valid rule,
     * save it to the output file.
     * @param rule an association rule that may be a valid sequential rule
     * @throws IOException exception if error writing the output file.
     */
    void checkRule(Rule rule) throws IOException {
        // increase the number of association rules
        associationRulesCount++;

        // calculate the interestingness measures by scanning the sequences
        // from the tidset of the rule antecedent to update its measures
        for (Integer seqId : rule.getItemset1().getTransactionsIds()) {
            calculateSequentialMeasures(rule, sequences.getSequences().get(seqId));
        }

        // if the rule meets the minconf and minsup criteria
        if (rule.sequentialTransactionCount >= minCSupRelative
                && rule.getSequentialConfidence() >= minSeqConfidence) {
            // save the rule to the file
            saveRule(rule);
        }
    }
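    /*
     * A worked example (illustrative data, and assuming getSequentialConfidence()
     * divides sequentialTransactionCount by the size of the antecedent's tidset):
     * if the antecedent {1, 2} occurs in sequences {s1, s2, s3, s4} and the whole
     * rule is matched in order in s1 and s3, then the sequential support is 2 and
     * the sequential confidence is 2 / 4 = 0.5.
     */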
    /**
     * Run the rule-generation procedure of Agrawal & Srikant, modified
     * to also find sequential rules
     * @param patterns a set of frequent itemsets
     * @throws IOException exception if error writing to the output file
     */
    void generateRules(Itemsets patterns) throws IOException {
        // For each frequent itemset of size >= 2
        for (int k = 2; k < patterns.getLevels().size(); k++) {
            for (Itemset lk : patterns.getLevels().get(k)) {
                // create H1
                Set<Itemset> H1 = new HashSet<Itemset>();
                for (Itemset itemsetSize1 : patterns.getLevels().get(1)) {
                    if (lk.contains(itemsetSize1.getItems()[0])) {
                        H1.add(itemsetSize1);
                    }
                }
                // lk.print(); // DEBUG
                // System.out.println(); // DEBUG

                // ================ I ADDED THIS BECAUSE THE ALGORITHM AS DESCRIBED BY AGRAWAL94
                // ================ DID NOT GENERATE ALL THE ASSOCIATION RULES
                Set<Itemset> H1_for_recursion = new HashSet<Itemset>();
                // for each itemset in H1
                for (Itemset hm_P_1 : H1) {
                    // make a copy of lk, but remove the items from hm_P_1
                    Itemset itemset_Lk_minus_hm_P_1 = (Itemset) lk.cloneItemSetMinusAnItemset(hm_P_1);

                    // This is the definition of confidence:
                    // conf = supp(lk) / supp(lk - hm+1)
                    // To calculate the confidence, we need
                    // the support of: itemset_Lk_minus_hm_P_1
                    calculateSupport(itemset_Lk_minus_hm_P_1); // THIS COULD BE OPTIMIZED? OR DONE ANOTHER WAY?
                    // calculate the confidence
                    double conf = ((double) lk.getAbsoluteSupport())
                            / ((double) itemset_Lk_minus_hm_P_1.getAbsoluteSupport());

                    // if the confidence is high enough
                    if (conf >= minSeqConfidence) {
                        // check if it respects the size constraints
                        int leftsize = lk.size() - 1;
                        if (leftsize <= maxLeftSize && leftsize >= minLeftSize
                                && 1 >= minRightSize && 1 <= maxRightSize) {
                            // if yes, create the rule
                            Rule rule = new Rule(itemset_Lk_minus_hm_P_1, hm_P_1, lk.getAbsoluteSupport(), conf);
                            // then check if this association rule is also
                            // a sequential rule
                            checkRule(rule);
                        }
                        // if the size constraints allow further expansion of the rule, then
                        // record Hm+1 for the recursion
                        if (1 != maxRightSize && leftsize != minLeftSize) {
                            H1_for_recursion.add(hm_P_1); // for recursion
                        }
                    }
                }
                // ================ END OF WHAT I HAVE ADDED

                // If it is still possible to expand the rule
                if (1 != maxRightSize && lk.size() - 1 != minLeftSize) {
                    // then call the apGenRules procedure for further expansion
                    apGenrules(k, 1, lk, H1_for_recursion);
                }
            }
        }
    }

    /**
     * Save a rule to the output file
     * @param rule the rule to be saved
     * @throws IOException exception if error writing the file
     */
    private void saveRule(Rule rule) throws IOException {
        // increase the number of valid rules found
        ruleCount++;

        // create a string buffer
        StringBuilder buffer = new StringBuilder();
        // write the left itemset
        for (int i = 0; i < rule.getItemset1().size(); i++) {
            buffer.append(rule.getItemset1().get(i));
            if (i != rule.getItemset1().size() - 1) {
                buffer.append(",");
            }
        }
        // write the separator
        buffer.append(" ==> ");
        // write the right itemset
        for (int i = 0; i < rule.getItemset2().size(); i++) {
            buffer.append(rule.getItemset2().get(i));
            if (i != rule.getItemset2().size() - 1) {
                buffer.append(",");
            }
        }
        // write the support
        buffer.append(" #SUP: ");
        buffer.append(rule.getSequentialAbsoluteSeqSupport());
        // write the confidence
        buffer.append(" #CONF: ");
        buffer.append(rule.getConfidence());
        // write the buffer to the file and write a new line
        writer.write(buffer.toString());
        writer.newLine();
    }
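    /*
     * For example (illustrative values), a rule with antecedent {1, 2},
     * consequent {3}, sequential support 5 and confidence 0.75 is written to
     * the output file as:
     *
     *   1,2 ==> 3 #SUP: 5 #CONF: 0.75
     */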
    /**
     * The apGenRules procedure, as described on p. 14 of the paper by Agrawal
     * (see the Agrawal paper for more details).
     * @param k the size of the itemset used to generate rules
     * @param m the size of the consequents of the rules generated so far
     * @param lk the itemset that is used to generate rules
     * @param Hm the set of rule consequents of size m
     * @throws IOException exception if error while writing the output file
     */
    private void apGenrules(int k, int m, Itemset lk, Set<Itemset> Hm) throws IOException {
        // System.out.println(" " + lk.toString() + "  " + Hm.toString());
        if (k > m + 1) {
            int leftsize = lk.size() - (1 + m);
            Set<Itemset> Hm_plus_1 = generateCandidateSizeK(Hm);
            Set<Itemset> Hm_plus_1_for_recursion = new HashSet<Itemset>();
            // for each itemset Hm+1
            for (Itemset hm_P_1 : Hm_plus_1) {
                // Generate the itemset Lk / Hm+1
                Itemset itemset_Lk_minus_hm_P_1 = (Itemset) lk.cloneItemSetMinusAnItemset(hm_P_1);
                // Calculate the support of Lk / Hm+1
                calculateSupport(itemset_Lk_minus_hm_P_1);
                // calculate the confidence of the rule Lk / Hm+1 ==> Hm+1
                double conf = ((double) lk.getAbsoluteSupport())
                        / ((double) itemset_Lk_minus_hm_P_1.getAbsoluteSupport());

                // if this association rule has enough confidence
                if (conf >= minSeqConfidence) {
                    // if it respects the size constraints
                    if (leftsize <= maxLeftSize && leftsize >= minLeftSize
                            && m + 1 >= minRightSize && m + 1 <= maxRightSize) {
                        // Create the rule Lk / Hm+1 ==> Hm+1
                        Rule rule = new Rule(itemset_Lk_minus_hm_P_1, hm_P_1, lk.getAbsoluteSupport(), conf);
                        // Then check if it is also a valid sequential rule
                        checkRule(rule);
                    }
                    // if the size constraints allow further expansion of the rule
                    if (1 + m != maxRightSize && leftsize != minLeftSize) {
                        // add Hm+1 to the set for the recursion
                        Hm_plus_1_for_recursion.add(hm_P_1);
                    }
                }
            }
            // if the size constraints allow further expansion of the rule
            if (1 + m != maxRightSize && leftsize != minLeftSize) {
                // recursive call to apGenRules
                apGenrules(k, m + 1, lk, Hm_plus_1_for_recursion);
            }
        }
        // check the memory usage
        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * Calculate the support of an itemset by looking at the frequent patterns
     * of the same size.
     *
     * @param itemset_Lk_minus_hm_P_1 the itemset.
     */
    private void calculateSupport(Itemset itemset_Lk_minus_hm_P_1) {
        // loop over all the patterns of the same size
        for (Itemset itemset : patterns.getLevels().get(itemset_Lk_minus_hm_P_1.size())) {
            // If the pattern is found
            if (itemset.isEqualTo(itemset_Lk_minus_hm_P_1)) {
                // set its support to the same value
                itemset_Lk_minus_hm_P_1.setTIDs(itemset.getTransactionsIds());
                return;
            }
        }
    }

    /**
     * Generate candidate itemsets of size k from frequent itemsets of size
     * k-1. This is called "apriori-gen" in the paper by Agrawal. This method is
     * also used by the Apriori algorithm for generating candidates.
     *
     * @param levelK_1 a set of itemsets of size k-1
     * @return a set of candidates
     */
    protected Set<Itemset> generateCandidateSizeK(Set<Itemset> levelK_1) {
        // Initialize the set of candidates
        Set<Itemset> candidates = new HashSet<Itemset>();

        // For each pair of itemsets I1 and I2 of level k-1
        for (Itemset itemset1 : levelK_1) {
            for (Itemset itemset2 : levelK_1) {
                // If I1 is smaller than I2 according to the lexical order and
                // they share all the same items except the last one
                Integer missing = itemset1.allTheSameExceptLastItem(itemset2);
                if (missing != null) {
                    // Create a new candidate by combining itemset1 and itemset2
                    int newItemset[] = new int[itemset1.size() + 1];
                    System.arraycopy(itemset1.itemset, 0, newItemset, 0, itemset1.size());
                    newItemset[itemset1.size()] = missing;
                    Itemset candidate = new Itemset(newItemset);

                    // The candidate is tested to see if its subsets of size k-1
                    // are included in level k-1 (i.e. they are frequent)
                    if (allSubsetsOfSizeK_1AreFrequent(candidate, levelK_1)) {
                        candidates.add(candidate);
                    }
                }
            }
        }
        return candidates;
    }
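    /*
     * For example (illustrative data), combining the frequent itemsets {1, 2}
     * and {1, 3} yields the candidate {1, 2, 3}. The candidate is kept only if
     * all of its subsets of size 2 ({1, 2}, {1, 3} and {2, 3}) are also
     * frequent, following the Apriori pruning principle.
     */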
    /**
     * This method checks if all the subsets of size k-1 of the itemset
     * "candidate" are frequent. It is similar to what is used in the
     * Apriori algorithm for generating frequent itemsets.
     *
     * @param candidate an itemset of size k
     * @param levelK_1 the frequent itemsets of size k-1
     * @return true if all subsets are frequent
     */
    protected boolean allSubsetsOfSizeK_1AreFrequent(Itemset candidate, Set<Itemset> levelK_1) {
        // To generate all the subsets of size k-1, we will proceed
        // by removing each item, one by one.
        if (candidate.size() == 1) {
            return true;
        }
        // for each item of the candidate, we will consider that this item is removed
        for (Integer item : candidate.getItems()) {
            // create the subset without this item
            Itemset subset = (Itemset) candidate.cloneItemSetMinusOneItem(item);
            // we will search the itemsets of size k-1 to see if this subset appears
            boolean found = false;
            // for each itemset of size k-1
            for (Itemset itemset : levelK_1) {
                // if the itemset is equal to "subset", we found it and stop the loop
                if (itemset.isEqualTo(subset)) {
                    found = true;
                    break;
                }
            }
            // if not found, return false
            if (found == false) {
                return false;
            }
        }
        // otherwise, all the subsets were found, so we return true
        return true;
    }

    /**
     * Set the minimum antecedent size constraint for rules to be found.
     * @param minLeftSize an integer
     */
    public void setMinLeftSize(int minLeftSize) {
        this.minLeftSize = minLeftSize;
    }

    /**
     * Set the maximum antecedent size constraint for rules to be found.
     * @param maxLeftSize an integer
     */
    public void setMaxLeftSize(int maxLeftSize) {
        this.maxLeftSize = maxLeftSize;
    }

    /**
     * Set the minimum consequent size constraint for rules to be found.
     * @param minRightSize an integer
     */
    public void setMinRightSize(int minRightSize) {
        this.minRightSize = minRightSize;
    }

    /**
     * Set the maximum consequent size constraint for rules to be found.
     * @param maxRightSize an integer
     */
    public void setMaxRightSize(int maxRightSize) {
        this.maxRightSize = maxRightSize;
    }
}