package ca.pfv.spmf.algorithms.frequentpatterns.zart;

/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.tools.MemoryLogger;

/**
 * This is an implementation of Zart, an algorithm for mining frequent closed itemsets
 * and their associated generators at the same time. The Zart algorithm is described in the article:
 * <br/><br/>
 *
 * "Zart: a Multifunctional Itemset Mining Algorithm" by Laszlo Szathmary et al.
 * Zart finds all the frequent closed itemsets in a binary context, their associated
 * minimal generator(s) and their support.
 * <br/><br/>
 *
 * This algorithm could be optimized in various ways described in the article by Szathmary,
 * for example by using a Trie data structure and by removing infrequent items, but this was not done here.
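 * <br/><br/>
 *
 * A minimal usage sketch is shown below. It assumes that the transaction database is
 * loaded with TransactionDatabase.loadFile() and that the file names are placeholders
 * chosen for illustration:
 * <pre>
 * // load a transaction database from a file (assumed loading method and path)
 * TransactionDatabase database = new TransactionDatabase();
 * database.loadFile("contextZart.txt");
 *
 * // run Zart with a minimum support of 40% of the transactions
 * AlgoZart algo = new AlgoZart();
 * TZTableClosed results = algo.runAlgorithm(database, 0.4);
 * algo.printStatistics();
 * algo.saveResultsToFile("output.txt");
 * </pre>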
 *
 * @see TransactionDatabase
 * @see Itemset
 * @author Philippe Fournier-Viger
 */
public class AlgoZart {

	// start time of the latest execution
	long startTimestamp;
	// end time of the latest execution
	long endTimestamp;

	// relative minimum support threshold
	private int minsupRelative = 0;

	// the input database
	private TransactionDatabase context = null;

	// the TZ, TF and TC structures as described in the paper
	private TZTableClosed tableClosed = null;       // table of closed itemsets and their generators
	private TFTableFrequent tableFrequent = null;   // table of frequent itemsets
	private TCTableCandidate tableCandidate = null; // table of candidates

	// The list of frequent generators FG
	private List<Itemset> frequentGeneratorsFG = null; // 2

	/**
	 * Default constructor
	 */
	public AlgoZart() {
	}

	/**
	 * Run the algorithm.
	 * @param database a transaction database
	 * @param minsupp the minimum support threshold as a percentage of the database size
	 * @return a set of closed itemsets and their associated generator(s)
	 */
	public TZTableClosed runAlgorithm(TransactionDatabase database, double minsupp) {
		// record the start time
		startTimestamp = System.currentTimeMillis();

		// reset the utility for recording the memory usage
		MemoryLogger.getInstance().reset();

		// save the database received as parameter
		this.context = database;

		// Initialize the FG, TZ, TF and TC structures
		// used by the algorithm (as described in the paper)
		frequentGeneratorsFG = new ArrayList<Itemset>(); // 2
		tableClosed = new TZTableClosed();       // table of closed itemsets
		tableFrequent = new TFTableFrequent();   // table of frequent itemsets
		tableCandidate = new TCTableCandidate(); // table of candidates

		// convert the minimum support from a percentage to an absolute support count
		// by multiplying by the database size and rounding up
		minsupRelative = (int) Math.ceil(minsupp * database.size());

		// (1) Scan the database and count the support of each item (in a map)
		// for this map : key = item, value = support
		Map<Integer, Integer> mapItemSupport = new HashMap<Integer, Integer>();
		// for each transaction
		for (List<Integer> transaction : database.getTransactions()) {
			// for each item in the transaction
			for (Integer item : transaction) {
				// increase the support count of the item
				Integer count = mapItemSupport.get(item);
				if (count == null) {
					// if the item is seen for the first time, then put 1
					mapItemSupport.put(item, 1);
				} else {
					// otherwise increase the count by 1
					mapItemSupport.put(item, ++count);
				}
			}
		}

		// (0) Remove infrequent items from each transaction.
		// For each transaction
		for (List<Integer> transaction : database.getTransactions()) {
			// for each item
			Iterator<Integer> it = transaction.iterator();
			while (it.hasNext()) {
				// get the item
				Integer item = it.next();
				// if the item is infrequent, then remove it
				if (mapItemSupport.get(item) < minsupRelative) {
					it.remove();
				}
			}
		}

		// (1) fill the candidates with 1-itemsets (single items)
		tableCandidate.levels.add(new ArrayList<Itemset>());
		for (Integer item : mapItemSupport.keySet()) {
			// create an itemset for the item and set its support
			Itemset itemset = new Itemset(item);
			itemset.setAbsoluteSupport(mapItemSupport.get(item));
			// if the support is at least the minimum support
			if (mapItemSupport.get(item) >= minsupRelative) {
				// add it to the table of frequent itemsets and to the table of candidates
				tableFrequent.addFrequentItemset(itemset);
				tableCandidate.levels.get(0).add(itemset);
			}
		}

//		// sort candidates
//		Collections.sort(tableCandidate.levels.get(0), new Comparator<Itemset>() {
//			public int compare(Itemset i1, Itemset i2) {
//				return i1.getItems().get(0) - i2.getItems().get(0);
//			}
//		});

		// if there are frequent items
		if (tableFrequent.levels.size() != 0) {
			// This variable will be used to indicate if a full column is set to
			// 1 in the binary context, which means that a non-empty itemset is shared
			// by all transactions
			boolean fullColumn = false; // 1

			// 6 : loop over the frequent itemsets of size 1
			for (Itemset l : tableFrequent.getLevelForZart(0)) {
				// assign the value true to l in the map for closed itemsets
				tableFrequent.mapClosed.put(l, true); // 8
				// if l has a support equal to the number of transactions in the database
				if (l.getAbsoluteSupport() == database.getTransactions().size()) { // 9
					// 10 the empty set is its generator (IMPORTANT)
					tableFrequent.mapKey.put(l, false);
					// there is an itemset shared by all transactions
					fullColumn = true; // 11
				} else {
					// otherwise, mark l as a key itemset in the table of frequent itemsets
					tableFrequent.mapKey.put(l, true); // 13
				}
			}

			// create the empty set
			Itemset emptyset = new Itemset(new int[]{});
			// 15 if there is an itemset shared by all transactions
			if (fullColumn) {
				// add the empty set as a generator
				frequentGeneratorsFG.add(emptyset);
			} else {
				// Otherwise, the empty set is closed and it is its own generator.
				// So we add it to the tables accordingly.
				tableFrequent.addFrequentItemset(emptyset);  // add to the table of frequent itemsets
				tableFrequent.mapClosed.put(emptyset, true); // mark it as closed
				tableFrequent.mapPredSupp.put(emptyset, database.size());
				tableClosed.addClosedItemset(emptyset);
				tableClosed.mapGenerators.put(emptyset, new ArrayList<Itemset>());
				// we set its support to the database size
				emptyset.setAbsoluteSupport(database.size());
			}

			// Now, Zart will iteratively generate candidates of larger size i+1
			// by using itemsets of size i to discover all frequent itemsets, closed itemsets
			// and their generators.
			// This process is based on the Apriori algorithm, but modified.
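			// Overview of one iteration of the main loop below (the numbered comments
			// refer to lines of the pseudo-code in the paper):
			//  - zartGen(i) builds the next level of candidates (Ci+1 = ZartGen(Fi))
			//    and predicts their supports;
			//  - one pass over the database counts the support of the candidates whose
			//    key value is true;
			//  - frequent candidates are copied into the table of frequent itemsets TF;
			//  - frequent itemsets of the previous level that have a superset with the
			//    same support are marked as not closed;
			//  - findGenerators() associates the frequent generators found so far with
			//    the closed itemsets of the previous level.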
			int i = 1;
			for (; true; i++) { // 16
				zartGen(i); // 18   Ci+1 = ZartGen(Fi)

				// if there is no candidate, then the algorithm stops
				if (tableCandidate.levels.get(i).size() == 0) { // 19
					break;
				}

				// if there is an itemset of size i with its key value set to true
				if (tableCandidate.thereisARowKeyValueIsTrue(i)) { // 20
					// 22 for each transaction
					for (List<Integer> o : database.getTransactions()) { // 22
						// for each candidate that is a subset of the transaction
						for (Itemset s : subset(tableCandidate.levels.get(i), o)) { // 23, 24
							if (tableCandidate.mapKey.get(s)) {
								// increase its support count
								s.increaseTransactionCount(); // 25
							}
						}
					}
				}

				// for each candidate itemset of size i
				for (Itemset c : tableCandidate.levels.get(i)) { // 28
					// if it is a frequent itemset
					if (c.getAbsoluteSupport() >= minsupRelative) { // 31
						// if the key of c is set to true in mapKey and its support is
						// equal to its predicted support
						if (tableCandidate.mapKey.get(c) == true
								&& c.getAbsoluteSupport() == tableCandidate.mapPredSupp.get(c)) {
							// set its key to false!
							tableCandidate.mapKey.put(c, false); // 32
						}
						// add the itemset to the list of frequent itemsets
						tableFrequent.addFrequentItemset(c); // 33
						// put c in the maps of TF
						// Note that this step was not explicit in the original algorithm.
						tableFrequent.mapKey.put(c, tableCandidate.mapKey.get(c));
						tableFrequent.mapPredSupp.put(c, tableCandidate.mapPredSupp.get(c));
					}
				}

				// for each frequent itemset of size i
				for (Itemset l : tableFrequent.getLevelForZart(i)) { // 36
					// assume that it is closed until proven otherwise
					tableFrequent.mapClosed.put(l, true); // 37
					// for each subset s of l of size i-1
					for (Itemset s : subset(tableFrequent.getLevelForZart(i - 1), l)) { // 38, 39
						// if s has the same support as l, then s is not closed
						// (it has a superset with the same support), so we mark it as such
						if (s.getAbsoluteSupport() == l.getAbsoluteSupport()) { // 40
							tableFrequent.mapClosed.put(s, false);
						}
					}
				}

				// 42
				tableClosed.levels.add(new ArrayList<Itemset>());
				// for each frequent itemset of size i-1
				for (Itemset l : tableFrequent.getLevelForZart(i - 1)) {
					// if it is marked as closed, then we add it to
					// the table of closed itemsets
					if (tableFrequent.mapClosed.get(l) == true) {
						tableClosed.getLevelForZart(i - 1).add(l);
					}
				}

				// find the generators for the closed itemsets of size i-1
				findGenerators(tableClosed.getLevelForZart(i - 1), i); // 43

				// check the memory usage
				MemoryLogger.getInstance().checkMemory();
			}

			// .... 45
			tableClosed.levels.add(new ArrayList<Itemset>());
			for (Itemset l : tableFrequent.getLevelForZart(i - 1)) {
				tableClosed.getLevelForZart(i - 1).add(l);
			}

			// Call the findGenerators() method to find the generators.
			// This is line 46 in the pseudo-code of Zart.
			findGenerators(tableClosed.getLevelForZart(i - 1), i);
		}

		// check the memory usage
		MemoryLogger.getInstance().checkMemory();

		// record the end time
		endTimestamp = System.currentTimeMillis();

		// return the table containing the closed itemsets and their associated generator(s)
		return tableClosed;
	}

	/**
	 * Find the generators for each itemset of a list of closed itemsets.
	 * @param zi a list of closed itemsets Zi of size i
	 * @param i the size i
	 */
	private void findGenerators(List<Itemset> zi, int i) {
		// for each closed itemset in the list
		for (Itemset z : zi) { // 1
			// get the list of all frequent generators contained in z
			List<Itemset> s = subset(frequentGeneratorsFG, z); // 3
			// register them in the map associating closed itemsets to their generators
			tableClosed.mapGenerators.put(z, s); // 4
			// remove these generators from the list of generators
			// because a generator is a member of only one equivalence class
			frequentGeneratorsFG.removeAll(s); // 5
		}
		// for each frequent itemset of size i-1
		for (Itemset l : tableFrequent.getLevelForZart(i - 1)) {
			// if its key value is set to true and it is not closed
			if (tableFrequent.mapKey.get(l) == true && tableFrequent.mapClosed.get(l) == false) {
				// then add it to the list of generators
				frequentGeneratorsFG.add(l);
			}
		}
	}

	/**
	 * This returns the list of itemsets from a list of itemsets S that
	 * are included in a given itemset L.
	 * @param s a list of itemsets S of the same size
	 * @param l an itemset L
	 * @return the list of itemsets from S that are contained in L
	 */
	private List<Itemset> subset(List<Itemset> s, Itemset l) {
		// Initialize the list of subsets
		List<Itemset> retour = new ArrayList<Itemset>();
		// for each itemset in S
		for (Itemset itemsetS : s) {
			boolean allIncluded = true;
			// for each item of this itemset
			for (int i = 0; i < itemsetS.size(); i++) {
				// if that item is not contained in the itemset l,
				// then itemsetS is not included in l and we note it
				if (!l.contains(itemsetS.get(i))) {
					allIncluded = false;
				}
			}
			// if itemsetS is included in l
			if (allIncluded) {
				// then add it to the list of subsets
				retour.add(itemsetS);
			}
		}
		// return the list
		return retour;
	}

	/**
	 * This returns the list of itemsets from a list of itemsets S that
	 * are included in a given transaction L.
	 * @param s a list of itemsets S of the same size
	 * @param l a transaction L (a list of items)
	 * @return the list of itemsets from S that are contained in L
	 */
	private List<Itemset> subset(List<Itemset> s, List<Integer> l) {
		// Initialize the list of subsets
		List<Itemset> subset = new ArrayList<Itemset>();
		// for each itemset in S
		for (Itemset itemsetS : s) {
			boolean allIncluded = true;
			// for each item of this itemset
			for (int i = 0; i < itemsetS.size(); i++) {
				// if that item is not contained in the transaction l,
				// then itemsetS is not included in l and we note it
				if (!l.contains(itemsetS.get(i))) {
					allIncluded = false;
				}
			}
			// if itemsetS is included in l
			if (allIncluded) {
				// then add it to the list of subsets
				subset.add(itemsetS);
			}
		}
		// return the list
		return subset;
	}

	/**
	 * Method for the generation of candidates of size i.
	 * @param i the size i
	 */
	private void zartGen(int i) {
		// This method generates the candidates of size i
		// (similar to apriori-gen).
		prepareCandidateSizeI(i);

		// Then, for each candidate found in the previous step,
		// we check if all its subsets of size i-1 (also named k-1 here) are frequent.
		// If one subset is infrequent, then the candidate is infrequent
		// and we don't need to consider it anymore.
		// for each candidate (iterate over a copy because candidates may be removed below)
		for (Itemset c : new ArrayList<Itemset>(tableCandidate.levels.get(i))) { // 2
			// set the key of the candidate to true
			tableCandidate.mapKey.put(c, true); // 4
			// set its predicted support to the database size + 1
			tableCandidate.mapPredSupp.put(c, context.getTransactions().size() + 1); // 7

			// To generate all subsets S of size k-1, we proceed
			// by removing each element one by one.
			// for each element
			for (int j = 0; j < c.size(); j++) {
				// we copy the itemset without the current item
				Itemset s = (Itemset) c.cloneItemSetMinusOneItem(c.get(j));
				boolean found = false;
				// now for each frequent itemset of size i-1
				for (Itemset itemset2 : tableFrequent.getLevelForZart(i - 1)) {
					// if we have found the subset, then we stop this loop
					// and set the variable to true to remember that we found it
					if (itemset2.isEqualTo(s)) {
						found = true;
						break;
					}
				}
				// if the current subset s is not frequent
				if (found == false) {
					// then we remove the candidate from the candidates
					tableCandidate.levels.get(i).remove(c);
				} else {
					// if the current subset is frequent,
					// then get its previous occurrence in the table of candidates
					Itemset occurenceS = getPreviousOccurenceOfItemset(s, tableCandidate.levels.get(i - 1)); // necessary addition
					// if the support of that occurrence is lower
					if (occurenceS.getAbsoluteSupport() < tableCandidate.mapPredSupp.get(c)) { // 11
						// then we will use that support as the predicted support of c
						tableCandidate.mapPredSupp.put(c, occurenceS.getAbsoluteSupport());
					} else {
						// otherwise, we keep the current predicted support of c
						tableCandidate.mapPredSupp.put(c, tableCandidate.mapPredSupp.get(c));
					}
					// After that, if the previous occurrence has its key set to false
					if (tableFrequent.mapKey.get(occurenceS) == false) { // 12
						// we also set it to false in the table of candidates
						tableCandidate.mapKey.put(c, false);
					}
				}
			}
			// 15
			// finally, if the key of the candidate c has been set to false,
			// then we set its support to the predicted support stored in the
			// table of candidates
			if (tableCandidate.mapKey.get(c) == false) {
				c.setAbsoluteSupport(tableCandidate.mapPredSupp.get(c));
			}
		}
	}

	/**
	 * Get the previous occurrence of an itemset in a list of itemsets.
	 * @param itemset the given itemset
	 * @param list the list of itemsets
	 * @return the previous occurrence or null if there is no such previous occurrence
	 */
	private Itemset getPreviousOccurenceOfItemset(Itemset itemset, List<Itemset> list) {
		// for each itemset in the list
		for (Itemset itemset2 : list) {
			// if it is equal to the itemset that is searched for, then return it
			if (itemset2.isEqualTo(itemset)) {
				return itemset2;
			}
		}
		// otherwise, it was not found, so return null
		return null;
	}

	/**
	 * This is the method to generate candidate itemsets of size i.
	 * It is similar to the Apriori candidate generation.
	 * @param size the size i
	 */
	protected void prepareCandidateSizeI(int size) {
		// add a new list in the table of candidates to store the candidates of size i
		tableCandidate.levels.add(new ArrayList<Itemset>());

		// For each pair of frequent itemsets I1 and I2 of size i-1
		for (Itemset itemset1 : tableFrequent.getLevelForZart(size - 1)) {
			for (Itemset itemset2 : tableFrequent.getLevelForZart(size - 1)) {
				// If I1 is smaller than I2 according to the lexicographic order
				// and they differ only by their last item
//				Integer missing = itemset2.haveOneItemDifferent(itemset1);
				Integer missing = itemset2.allTheSameExceptLastItem(itemset1);
				if (missing != null) {
					// Create a new candidate by combining itemset1 and itemset2
					int[] union = new int[itemset1.size() + 1];
					System.arraycopy(itemset2.itemset, 0, union, 0, itemset2.size());
					union[itemset2.size()] = missing;
					// add the resulting itemset
					// to the table of candidates of size i
					tableCandidate.levels.get(size).add(new Itemset(union));
				}
			}
		}
	}

	/**
	 * Get the table of frequent itemsets.
	 * @return the table of frequent itemsets
	 */
	public TFTableFrequent getTableFrequent() {
		return tableFrequent;
	}

	/**
	 * Print statistics about the latest execution of the algorithm.
	 */
	public void printStatistics() {
		System.out.println("========== ZART - STATS ============");
		System.out.println(" Total time ~: " + (endTimestamp - startTimestamp) + " ms");
		System.out.println(" Max memory: " + MemoryLogger.getInstance().getMaxMemory());
		System.out.println("=====================================");
	}

	/**
	 * Save the results found to a file.
	 * @param output the path of the output file
	 * @throws IOException exception if an error occurs while writing to the file
	 */
	public void saveResultsToFile(String output) throws IOException {
		// prepare the output file
		BufferedWriter writer = new BufferedWriter(new FileWriter(output));
		writer.write("======= List of closed itemsets and their generators ============");
		writer.newLine();

		// for each level in the table of closed itemsets
		// (level i contains the closed itemsets of size i)
		for (int i = 0; i < tableClosed.levels.size(); i++) {
			// for each closed itemset of size i
			for (Itemset closed : tableClosed.levels.get(i)) {
				// write the closed itemset
				writer.write(" CLOSED : \n " + closed.toString() + " #SUP: " + closed.getAbsoluteSupport());
				writer.newLine();
				// write the generators
				writer.write(" GENERATOR(S) :");
				writer.newLine();
				// for each generator of that closed itemset
				List<Itemset> generators = tableClosed.mapGenerators.get(closed);
				// if there are no generators, it means that the closed itemset is its own generator
				if (generators.size() == 0) {
					writer.write(" " + closed.toString());
					writer.newLine();
				} else {
					// otherwise we write the generators
					for (Itemset generator : generators) {
						// write the generator
						writer.write(" " + generator.toString());
						writer.newLine();
					}
				}
			}
		}

		// We then print the list of frequent itemsets
		writer.write("======= List of frequent itemsets ============");
		writer.newLine();
		// for itemsets of size i from 0 to the largest size
		for (int i = 0; i < tableFrequent.levels.size(); i++) {
			// for each frequent itemset of size i
			for (Itemset itemset : tableFrequent.levels.get(i)) {
				// write the itemset
				writer.write(" ITEMSET : " + itemset.toString() + " #SUP: " + itemset.getAbsoluteSupport());
				writer.newLine();
			}
		}

		// Finally, the output file is closed
		writer.close();
	}
}