package ca.pfv.spmf.algorithms.frequentpatterns.charm;

/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Map;

import ca.pfv.spmf.algorithms.ArraysAlgos;
import ca.pfv.spmf.algorithms.frequentpatterns.charm.AlgoCharm_Bitset.BitSetSupport;
import ca.pfv.spmf.datastructures.triangularmatrix.TriangularMatrix;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;

/**
 * This is an implementation of the dCharm algorithm. The difference between dCharm
 * and CHARM is that dCharm utilizes diffsets instead of tidsets.
 * In this implementation, diffsets are represented as bitsets.
 * Note that this class is a subclass of the Charm algorithm because a lot of
 * code is the same and we wanted to avoid redundancy.
 * <br/><br/>
 *
 * IMPORTANT: when the user chooses to keep the result in memory, the itemsets
 * are stored with their support only. The transaction-set argument is passed as
 * {@code null} in {@link #save(int[], int[], BitSetSupport)} because it is not
 * meaningful to keep the diffset.
 * <br/><br/>
 *
 * dCharm was proposed by ZAKI (2000).
 * <br/><br/>
 *
 * See this article for details about dCharm:
 * <br/><br/>
 *
 * Mohammed Javeed Zaki, Ching-Jiu Hsiao: CHARM: An Efficient Algorithm for
 * Closed Itemset Mining. SDM 2002.
 * <br/><br/>
 *
 * and diffsets have been proposed in: <br/><br/>
 *
 * M. J. Zaki and K. Gouda. Fast vertical mining using Diffsets. Technical Report
 * 01-1, Computer Science Dept., Rensselaer Polytechnic Institute, March 2001.
 * <br/><br/>
 *
 * This version saves the result to a file, or keeps it in memory if no output
 * path is provided by the user to the runAlgorithm() method.
 *
 * @see TriangularMatrix
 * @see TransactionDatabase
 * @see Itemset
 * @see Itemsets
 * @author Philippe Fournier-Viger
 */
public class AlgoDCharm_Bitset extends AlgoCharm_Bitset {

    /**
     * Print statistics about the last execution of the algorithm to System.out:
     * database size, number of itemsets found, elapsed time and peak memory.
     */
    public void printStats() {
        System.out.println("============= dCharm vALTERNATE-Bitset v96r6 - STATS =============");
        // elapsed time between the two timestamps recorded by the parent class
        long elapsedMs = endTime - startTimestamp;
        System.out.println(" Transactions count from database : " + database.size());
        System.out.println(" Frequent itemsets count : " + itemsetCount);
        System.out.println(" Total time ~ " + elapsedMs + " ms");
        System.out.println(" Maximum memory usage : "
                + MemoryLogger.getInstance().getMaxMemory() + " mb");
        System.out.println("===================================================");
    }

    /**
     * Scan the database once to build the diffset and the support of each single item.
     * The diffset of an item has one bit per transaction; a bit is set when the
     * transaction does NOT contain the item.
     *
     * @param database    the transaction database
     * @param mapItemTIDS a map filled by this method with the diffset (not the tidset,
     *                    despite the parameter name) and the support of each item
     * @return the largest item id appearing in this database (0 if no item is larger)
     */
    int calculateSupportSingleItems(TransactionDatabase database,
            final Map<Integer, BitSetSupport> mapItemTIDS) {
        int largestItem = 0;

        // visit every transaction, identified by its position in the database
        for (int tid = 0; tid < database.size(); tid++) {
            for (Integer item : database.getTransactions().get(tid)) {
                BitSetSupport diffset = mapItemTIDS.get(item);

                if (diffset == null) {
                    // First occurrence of this item: start from a diffset containing
                    // every transaction; bits are cleared as occurrences are found.
                    diffset = new BitSetSupport();
                    diffset.bitset.set(0, database.size());
                    mapItemTIDS.put(item, diffset);
                    // an item can only raise the maximum the first time it is seen
                    largestItem = Math.max(largestItem, item);
                }

                // this transaction contains the item, so it leaves the diffset
                diffset.bitset.clear(tid);
                // the support is incremented once per occurrence of the item
                diffset.support++;
            }
        }
        return largestItem;
    }

    /**
     * Combine two diffsets for itemsets containing more than one item.
     * The resulting diffset is diffset(J) minus diffset(I), and the resulting
     * support is support(I) minus the size of that diffset.
     *
     * @param tidsetI the first diffset
     * @param tidsetJ the second diffset
     * @return the resulting diffset and its support
     */
    BitSetSupport performAND(BitSetSupport tidsetI, BitSetSupport tidsetJ) {
        return subtractDiffsets(tidsetI, tidsetJ);
    }

    /**
     * Combine two diffsets representing single items.
     * The computation is the same as in {@link #performAND(BitSetSupport, BitSetSupport)}.
     *
     * @param tidsetI   the first diffset
     * @param tidsetJ   the second diffset
     * @param supportIJ the support of the combined itemset as known by the caller;
     *                  this implementation does not use it and derives the support
     *                  from the size of the resulting diffset instead
     * @return the resulting diffset and its support
     */
    BitSetSupport performANDFirstTime(BitSetSupport tidsetI, BitSetSupport tidsetJ,
            int supportIJ) {
        return subtractDiffsets(tidsetI, tidsetJ);
    }

    /**
     * Build the diffset of "IJ" as the bits of J's diffset that are not in I's diffset,
     * and derive the support from I's support.
     *
     * @param diffsetI the diffset and support of I (left unmodified)
     * @param diffsetJ the diffset of J (left unmodified, a copy is used)
     * @return a new object holding the diffset of IJ and its support
     */
    private BitSetSupport subtractDiffsets(BitSetSupport diffsetI, BitSetSupport diffsetJ) {
        BitSetSupport combined = new BitSetSupport();
        // work on a copy so that the diffset of J is not altered
        BitSet remaining = (BitSet) diffsetJ.bitset.clone();
        remaining.andNot(diffsetI.bitset);
        combined.bitset = remaining;
        // each transaction left in the diffset is one where I appears without J
        combined.support = diffsetI.support - remaining.cardinality();
        return combined;
    }

    /**
     * Save an itemset (as described in the paper) unless the hash table already
     * holds a superset of it for the same hash code.
     *
     * @param prefix the prefix part of this itemset (may be null)
     * @param suffix the suffix part of this itemset; when prefix is null this
     *               array is sorted in place
     * @param tidset the diffset and support of this itemset
     * @throws IOException if an error occurs when writing to file
     */
    void save(int[] prefix, int[] suffix, BitSetSupport tidset) throws IOException {
        // The full itemset is the prefix followed by the suffix, then sorted.
        int[] items = (prefix == null) ? suffix : ArraysAlgos.concatenate(prefix, suffix);
        Arrays.sort(items);

        // Representation of the itemset that is stored in the hash table
        ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset candidate =
                new ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset(items);
        candidate.setAbsoluteSupport(tidset.support);

        // The hash code is computed from the diffset of the itemset
        int hashcode = hash.hashCode(tidset.bitset);

        // If a superset is already in the hash table, the itemset is not closed
        // and there is nothing to output.
        if (hash.containsSupersetOf(candidate, hashcode)) {
            return;
        }

        itemsetCount++;

        if (writer == null) {
            // Result kept in memory. The second argument (the "tidset") is null for
            // dCharm because it is not meaningful to keep the diffset.
            Itemset stored = new Itemset(items, null, tidset.support);
            closedItemsets.addItemset(stored, candidate.size());
        } else {
            // Result written to the output file, one itemset per line
            writer.write(candidate + " #SUP: " + candidate.support);
            writer.newLine();
        }

        // remember the itemset for the superset checks of later calls
        hash.put(candidate, hashcode);
    }
}