package ca.pfv.spmf.algorithms.frequentpatterns.charm; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.util.Iterator; import java.util.List; import ca.pfv.spmf.datastructures.triangularmatrix.TriangularMatrix; import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase; import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset; import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemsets; /** * This is an implementation of the CHARM-MFI algorithm (thesis of L. Szathmary, 2006) * that is a simple extension that takes the output of CHARM as input and * do post-processing to keep only maximal itemsets. <br/><br/> * * But it was found that the Charm-MFI algorithm is actually incorrect so * this implementation was modified to fix the original algorithm and * generate the correct result. <br/><br/> * * Note that this algorithm is not very efficient because the maximal itemsets * are generated by post-processing.<br/><br/> * * Also, note that this version can save the result to a file * or keep it into memory if no output path is provided * by the user to the runAlgorithm() method. * * @see TriangularMatrix * @see TransactionDatabase * @see Itemset * @see Itemsets * @see HashTable * @see AlgoCharm_Bitset * @author Philippe Fournier-Viger */ public class AlgoCharmMFI { /** start time of the last execution */ private long startTimestamp; /** end time of the last execution */ private long endTimestamp; /** The patterns that are found (if the user want to keep them into memory) */ protected Itemsets maximalItemsets; /** object to write the output file */ BufferedWriter writer = null; /** * Default constructor */ public AlgoCharmMFI() { } /** * Run the CHARM-MFI algorithm. * @param output an output file path or null if the user want to keep the result in memory. * @param frequentClosed a set of frequent closed itemsets * @return the set of maximal itemsets (if the user chose to keep the result in memory. * @throws IOException An exception if an error occurs while writting the output to a file. */ public Itemsets runAlgorithm(String output, Itemsets frequentClosed) throws IOException { // if the user want to keep the result into memory if(output == null){ writer = null; }else{ // if the user want to save the result to a file writer = new BufferedWriter(new FileWriter(output)); } // Initialize the structure to store maximal itemsets maximalItemsets = frequentClosed; maximalItemsets.setName("FREQUENT MAXIMAL ITEMSETS"); // record the start time of the algorithm startTimestamp = System.currentTimeMillis(); // get the size of the largest closed itemset. int maxItemsetLength = frequentClosed.getLevels().size(); // IMPORTANT NOTE : THE ORIGINAL ALGORITHM IS INCORRECT (IT // DOES NOT PRODUCE THE SET OF MAXIMAL ITEMSETS IN SOME CASES BECAUSE // IT ONLY COMPARES CLOSED ITEMSETS OF SIZE I WITH THOSE OF SIZE I+1. // HOWEVER, IT IS POSSIBLE THAT AN ITEMSET IS NOT MAXIMAL BECAUSE // OF A CLOSED ITEMSET OF A SIZE LARGER THAN I+1. ): // TO FIX IT THE ALGORITHM, WE HAVE MODIFIED IT AS FOLLOWS: // For closed itemsets of size i=1 to the largest size for (int i = 1; i < maxItemsetLength - 1; i++) { // Get the itemsets of size i List<Itemset> ti = frequentClosed.getLevels().get(i); // For closed itemsets of size j = i+1 to the largest size for (int j = i+1; j < maxItemsetLength; j++) { // get itemsets of size j List<Itemset> tip1 = frequentClosed.getLevels().get(j); // Check which itemsets are maximals by comparing itemsets // of size i and i+1 findMaximal(ti, tip1, frequentClosed); } } // If the user chose to save the output to a file if(writer != null){ // For itemsets of size i = 1 to the maximum itemset length for(List<Itemset> level : maximalItemsets.getLevels()){ // For each itemset of length i for(int i=0; i < level.size(); i++){ Itemset itemset = level.get(i); // save the itemset an its support writer.write(itemset.toString() + " #SUP: " + itemset.getAbsoluteSupport()); writer.newLine(); } } writer.close(); } endTimestamp = System.currentTimeMillis(); // Return all frequent maximal itemsets found! return maximalItemsets; } /** * Check if itemsets of size i are closed by comparing them with * itemsets of size j where j > i. * @param ti itemsets of size i * @param tip1 itemsets of size j * @param maximalItemsets the current set of maximal itemsets */ private void findMaximal(List<Itemset> ti, List<Itemset> tip1, Itemsets maximalItemsets) { // for each itemset of j for (Itemset itemsetJ : tip1) { // iterates over the itemsets of size i Iterator<Itemset> iter = ti.iterator(); while (iter.hasNext()) { Itemset itemsetI = (Itemset) iter.next(); // if the current itemset of size i is contained // in the current itemset of size J if (itemsetJ.containsAll(itemsetI) ) { // Then, it means that the itemset of size I is not maximal so we remove it iter.remove(); // We decrease the current number of maximal itemsets. maximalItemsets.decreaseItemsetCount(); // NOTE: IF WE WOULD LIKE TO MAKE THIS MORE EFFICIENT // WE COULD USE LINKED-LIST TO STORE ITEMSETS // INSTEAD OF ARRAY LISTS..... // THE COST FOR THE REMOVE OPERATION MAY BE SMALLER... } } } } /** * Print statistics about the algorithm execution to System.out. */ public void printStats(int transactionCount) { System.out.println("============= CHARM-MFI - STATS ============="); long temps = endTimestamp - startTimestamp; System.out.println(" Transactions count from database : " + transactionCount); System.out.println(" Frequent maximal itemsets count : " + maximalItemsets.getItemsetsCount()); System.out.println(" Total time ~ " + temps + " ms"); System.out.println("==================================================="); } /** * Get the set of maximal itemsets found by Charm-MFI * @return the set of maximal itemsets */ public Itemsets getItemsets() { return maximalItemsets; } }