package ca.pfv.spmf.algorithms.frequentpatterns.charm;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.algorithms.ArraysAlgos;
import ca.pfv.spmf.datastructures.triangularmatrix.TriangularMatrix;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is a new implementation of the CHARM algorithm (2014) that relies on bitsets to implement
* tidsets.
*
* Charm was proposed by ZAKI (2001).
* <br/><br/>
*
* See this article for details about CHARM:
* <br/><br/>
*
* Zaki, M. J., & Hsiao, C. J. (2002). CHARM: An Efficient Algorithm for Closed Itemset Mining. In SDM (Vol. 2, pp. 457-473).
* <br/><br/>
*
* This version saves the result to a file,
* or keeps it in memory if no output path is provided
* by the user to the runAlgorithm() method.
*
* @see TriangularMatrix
* @see TransactionDatabase
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoCharm_Bitset {
/**
* Minimum support expressed as a number of transactions.
* It is computed by runAlgorithm() as ceil(minsup * database size).
*/
private int minsupRelative;
/** The transaction database given to the last call of runAlgorithm(). */
protected TransactionDatabase database;
/** Start time of the last execution (from System.currentTimeMillis()). */
protected long startTimestamp;
/** End time of the last execution (from System.currentTimeMillis()). */
protected long endTime;
/**
* The closed itemsets that are found, when the result is kept in memory.
* It is null when the result is written to an output file instead.
*/
protected Itemsets closedItemsets;
/** Object to write the output file; null when the result is kept in memory. */
BufferedWriter writer = null;
/** The number of itemsets output by save() since the last call to runAlgorithm(). */
protected int itemsetCount;
/**
* Triangular matrix used as an optimization for counting the support of
* itemsets of size 2. It is only created when that optimization is requested.
*/
private TriangularMatrix matrix;
/**
* The hash table storing the itemsets already output; save() queries it to
* check closedness (an optimization).
*/
protected HashTable hash;
/**
* Default constructor.
* It does nothing: the state used by a run (output, hash table, counters,
* timestamps) is set up by runAlgorithm().
*/
public AlgoCharm_Bitset() {
}
/**
 * Run the algorithm and save the output to a file or keep it into memory.
 * <p>
 * When an output file is used, it is closed before this method returns, including
 * when mining fails with an exception.
 *
 * @param output an output file path for writing the result, or null to keep the result
 *        in memory and return it
 * @param database a transaction database
 * @param minsup the minimum support as a ratio; it is multiplied by the number of
 *        transactions (and rounded up) to obtain the minimum number of transactions
 * @param useTriangularMatrixOptimization if true, the support of each pair of items is
 *        pre-counted in a triangular matrix so infrequent pairs are skipped without a join
 * @param hashTableSize the size of the hash table used for closedness checking (e.g. 10,000)
 * @return the set of closed itemsets found if the result is kept into memory, or null otherwise
 * @throws IOException if an error occurs while opening, writing or closing the output file
 */
public Itemsets runAlgorithm(String output, TransactionDatabase database, double minsup,
        boolean useTriangularMatrixOptimization, int hashTableSize) throws IOException {
    // Reset the tool to assess the maximum memory usage (for statistics)
    MemoryLogger.getInstance().reset();

    if (output == null) {
        // the user wants to keep the result in memory
        writer = null;
        closedItemsets = new Itemsets("FREQUENT CLOSED ITEMSETS");
    } else {
        // the user wants to save the result to a file
        closedItemsets = null;
        writer = new BufferedWriter(new FileWriter(output));
    }

    boolean completed = false;
    try {
        mineClosedItemsets(database, minsup, useTriangularMatrixOptimization, hashTableSize);
        completed = true;
    } finally {
        // Always release the output file, even if mining failed.
        if (writer != null) {
            if (completed) {
                writer.close();
            } else {
                try {
                    writer.close();
                } catch (IOException ignored) {
                    // keep the exception that made mining fail; a secondary
                    // failure while closing must not hide it
                }
            }
        }
    }

    // we check the memory usage
    MemoryLogger.getInstance().checkMemory();
    // record the end time for statistics
    endTime = System.currentTimeMillis();
    // Return the closed itemsets found (null if they were written to a file)
    return closedItemsets;
}

/**
 * Perform the actual mining: build the tidsets of single items, then combine the frequent
 * items pairwise using the four CHARM properties and explore each equivalence class.
 */
private void mineClosedItemsets(TransactionDatabase database, double minsup,
        boolean useTriangularMatrixOptimization, int hashTableSize) throws IOException {
    // Create the hash table to store itemsets for closedness checking
    this.hash = new HashTable(hashTableSize);
    itemsetCount = 0;
    this.database = database;
    startTimestamp = System.currentTimeMillis();

    // convert the minsup ratio to a number of transactions
    this.minsupRelative = (int) Math.ceil(minsup * database.size());

    // (1) First database pass: calculate the tidset of each item (key: item, value: tidset).
    final Map<Integer, BitSetSupport> mapItemTIDS = new HashMap<Integer, BitSetSupport>();
    int maxItemId = calculateSupportSingleItems(database, mapItemTIDS);

    if (useTriangularMatrixOptimization) {
        matrix = buildPairSupportMatrix(database, maxItemId);
    } else {
        // do not keep a matrix built by a previous run alive; it is never read in this mode
        matrix = null;
    }

    // (2) Frequent single items, by increasing support (order suggested by Zaki).
    List<Integer> frequentItems = collectFrequentItems(mapItemTIDS);

    // (3) Combine each pair of frequent items to build equivalence classes of 2-itemsets.
    for (int i = 0; i < frequentItems.size(); i++) {
        Integer itemX = frequentItems.get(i);
        // a null entry means the item was removed by a CHARM property
        if (itemX == null) {
            continue;
        }
        BitSetSupport tidsetX = mapItemTIDS.get(itemX);
        // X starts as the single item and may be extended by Properties 1 and 2
        int[] itemsetX = new int[] { itemX };

        // The equivalence class of prefix X is stored as two parallel lists:
        // the suffixes (only the item J, since X is kept separately) and their tidsets.
        List<int[]> equivalenceClassIitemsets = new ArrayList<int[]>();
        List<BitSetSupport> equivalenceClassItidsets = new ArrayList<BitSetSupport>();

        for (int j = i + 1; j < frequentItems.size(); j++) {
            Integer itemJ = frequentItems.get(j);
            if (itemJ == null) {
                continue;
            }

            // While X is still a single item, the matrix gives the support of {X, J}
            // directly, so infrequent pairs are discarded without intersecting tidsets.
            boolean supportKnownFromMatrix =
                    useTriangularMatrixOptimization && itemsetX.length == 1;
            int supportIJ = -1;
            if (supportKnownFromMatrix) {
                supportIJ = matrix.getSupportForItems(itemX, itemJ);
                if (supportIJ < minsupRelative) {
                    continue;
                }
            }

            // Tidset of X + J = intersection of the tidsets of X and J.
            BitSetSupport tidsetJ = mapItemTIDS.get(itemJ);
            BitSetSupport bitsetSupportUnion;
            if (supportKnownFromMatrix) {
                // the support is already known, only the intersection is computed
                bitsetSupportUnion = performANDFirstTime(tidsetX, tidsetJ, supportIJ);
            } else {
                // the support is the cardinality of the intersection
                bitsetSupportUnion = performAND(tidsetX, tidsetJ);
            }

            // if X + J is infrequent, we don't need to consider it further
            if (bitsetSupportUnion.support < minsupRelative) {
                continue;
            }

            // Check which of the four CHARM properties holds.
            if (tidsetX.support == tidsetJ.support
                    && bitsetSupportUnion.support == tidsetX.support) {
                // Property 1: remove J and replace X by X + J
                frequentItems.set(j, null);
                itemsetX = appendItem(itemsetX, itemJ);
            } else if (tidsetX.support < tidsetJ.support
                    && bitsetSupportUnion.support == tidsetX.support) {
                // Property 2: replace X by X + J
                itemsetX = appendItem(itemsetX, itemJ);
            } else {
                if (tidsetX.support > tidsetJ.support
                        && bitsetSupportUnion.support == tidsetJ.support) {
                    // Property 3: remove J (and add X + J to the class, as for Property 4)
                    frequentItems.set(j, null);
                }
                // Properties 3 and 4: add X + J to the equivalence class being built.
                // Only J is stored because the prefix X is kept for the whole class.
                equivalenceClassIitemsets.add(new int[] { itemJ });
                equivalenceClassItidsets.add(bitsetSupportUnion);
            }
        }

        // Explore the equivalence class of prefix X, if it is not empty.
        if (equivalenceClassIitemsets.size() > 0) {
            processEquivalenceClass(itemsetX, equivalenceClassIitemsets, equivalenceClassItidsets);
        }

        // Save the itemset X with its support (obtained from its tidset).
        save(null, itemsetX, tidsetX);
    }
}

/**
 * Build the triangular matrix counting, for each pair of items appearing together
 * in a transaction, the number of such co-occurrences.
 */
private TriangularMatrix buildPairSupportMatrix(TransactionDatabase database, int maxItemId) {
    TriangularMatrix pairSupports = new TriangularMatrix(maxItemId + 1);
    for (List<Integer> transaction : database.getTransactions()) {
        Integer[] items = transaction.toArray(new Integer[transaction.size()]);
        for (int i = 0; i < items.length; i++) {
            for (int j = i + 1; j < items.length; j++) {
                // update the matrix count by 1 for the pair i, j
                pairSupports.incrementCount(items[i], items[j]);
            }
        }
    }
    return pairSupports;
}

/**
 * Collect the items whose support reaches the minimum support, sorted by increasing support.
 */
private List<Integer> collectFrequentItems(final Map<Integer, BitSetSupport> mapItemTIDS) {
    List<Integer> frequentItems = new ArrayList<Integer>();
    for (Entry<Integer, BitSetSupport> entry : mapItemTIDS.entrySet()) {
        if (entry.getValue().support >= minsupRelative) {
            frequentItems.add(entry.getKey());
        }
    }
    Collections.sort(frequentItems, new Comparator<Integer>() {
        @Override
        public int compare(Integer arg0, Integer arg1) {
            // supports are counts starting at 0, so the subtraction cannot overflow
            return mapItemTIDS.get(arg0).support - mapItemTIDS.get(arg1).support;
        }
    });
    return frequentItems;
}

/**
 * Return a new array containing the given itemset followed by the given item.
 */
private static int[] appendItem(int[] itemset, int item) {
    int[] extended = new int[itemset.length + 1];
    System.arraycopy(itemset, 0, extended, 0, itemset.length);
    extended[itemset.length] = item;
    return extended;
}
/**
 * Scan the database once to build the tidset and the support of every item.
 *
 * @param database the transaction database to scan
 * @param mapItemTIDS the map to fill (key: item, value: its tidset and support)
 * @return the largest item found in the database (0 if no larger item is seen)
 */
int calculateSupportSingleItems(TransactionDatabase database,
        final Map<Integer, BitSetSupport> mapItemTIDS) {
    int largestItem = 0;
    int tid = 0;
    while (tid < database.size()) {
        List<Integer> transaction = database.getTransactions().get(tid);
        for (Integer item : transaction) {
            BitSetSupport itemTids = mapItemTIDS.get(item);
            if (itemTids == null) {
                // first occurrence of this item: register an empty tidset for it
                itemTids = new BitSetSupport();
                mapItemTIDS.put(item, itemTids);
                // the maximum only needs updating when a new item shows up
                largestItem = Math.max(largestItem, item);
            }
            // record that the current transaction contains the item
            itemTids.bitset.set(tid);
            itemTids.support++;
        }
        tid++;
    }
    return largestItem;
}
/**
 * Intersect two tidsets when the support of the result is already known
 * (the caller passes it in, so the cardinality is not recomputed).
 * Neither input tidset is modified.
 *
 * @param tidsetI the first tidset
 * @param tidsetJ the second tidset
 * @param supportIJ the support to store in the result
 * @return a new tidset holding the intersection, with support set to supportIJ
 */
BitSetSupport performANDFirstTime(BitSetSupport tidsetI,
        BitSetSupport tidsetJ, int supportIJ) {
    // work on a copy so that the tidset of I is left untouched
    BitSet common = (BitSet) tidsetI.bitset.clone();
    common.and(tidsetJ.bitset);

    BitSetSupport result = new BitSetSupport();
    result.bitset = common;
    result.support = supportIJ;
    return result;
}
/**
 * Intersect two tidsets and compute the support of the result as its cardinality.
 * Neither input tidset is modified.
 *
 * @param tidsetI the first tidset
 * @param tidsetJ the second tidset
 * @return a new tidset holding the intersection and its support
 */
BitSetSupport performAND(BitSetSupport tidsetI, BitSetSupport tidsetJ) {
    // work on a copy so that the tidset of I is left untouched
    BitSet common = (BitSet) tidsetI.bitset.clone();
    common.and(tidsetJ.bitset);

    BitSetSupport result = new BitSetSupport();
    result.bitset = common;
    // the support is the number of transactions in the intersection
    result.support = common.cardinality();
    return result;
}
/**
 * Process all itemsets of an equivalence class to generate larger itemsets, and
 * attempt to save the itemsets that may be closed.
 * <p>
 * The two lists are parallel (same index = same itemset). Entries removed by a
 * CHARM property are set to null in both lists, so the lists passed by the caller
 * are modified.
 *
 * @param prefix the prefix shared by all itemsets of the current equivalence class
 * @param equivalenceClassItemsets the suffixes of the itemsets of the current equivalence class
 * @param equivalenceClassTidsets the tidsets of the itemsets of the current equivalence class
 * @throws IOException if an error occurs while writing an itemset to the output file
 */
void processEquivalenceClass(int[] prefix, List<int[]> equivalenceClassItemsets,
        List<BitSetSupport> equivalenceClassTidsets) throws IOException {

    // Case 1: a single itemset. We just attempt to save prefix + I and stop.
    if (equivalenceClassItemsets.size() == 1) {
        save(prefix, equivalenceClassItemsets.get(0), equivalenceClassTidsets.get(0));
        return;
    }

    // Case 2: exactly two itemsets I and J.
    if (equivalenceClassItemsets.size() == 2) {
        int[] itemsetI = equivalenceClassItemsets.get(0);
        BitSetSupport tidsetI = equivalenceClassTidsets.get(0);
        int[] itemsetJ = equivalenceClassItemsets.get(1);
        BitSetSupport tidsetJ = equivalenceClassTidsets.get(1);

        // tidset of prefix + I + J
        BitSetSupport bitsetSupportIJ = performAND(tidsetI, tidsetJ);

        // if prefix + I + J is frequent, we attempt to save it
        if (bitsetSupportIJ.support >= minsupRelative) {
            int[] suffixIJ = ArraysAlgos.concatenate(itemsetI, itemsetJ);
            save(prefix, suffixIJ, bitsetSupportIJ);
        }
        // prefix + I may be closed only if its support differs from that of prefix + I + J
        if (bitsetSupportIJ.support != tidsetI.support) {
            save(prefix, itemsetI, tidsetI);
        }
        // same reasoning for prefix + J
        if (bitsetSupportIJ.support != tidsetJ.support) {
            save(prefix, itemsetJ, tidsetJ);
        }
        return;
    }

    // General case: combine each pair of itemsets of the class to form larger itemsets.
    for (int i = 0; i < equivalenceClassItemsets.size(); i++) {
        int[] itemsetX = equivalenceClassItemsets.get(i);
        // a null entry means the itemset was removed by a CHARM property
        if (itemsetX == null) {
            continue;
        }
        BitSetSupport tidsetX = equivalenceClassTidsets.get(i);

        // Equivalence class of prefix + X: suffixes and their tidsets (parallel lists).
        List<int[]> equivalenceClassIitemsets = new ArrayList<int[]>();
        List<BitSetSupport> equivalenceClassItidsets = new ArrayList<BitSetSupport>();

        for (int j = i + 1; j < equivalenceClassItemsets.size(); j++) {
            int[] itemsetJ = equivalenceClassItemsets.get(j);
            if (itemsetJ == null) {
                continue;
            }
            BitSetSupport tidsetJ = equivalenceClassTidsets.get(j);

            // tidset of prefix + X + J
            BitSetSupport bitsetSupportUnion = performAND(tidsetX, tidsetJ);

            // an infrequent prefix + X + J does not need to be considered
            if (bitsetSupportUnion.support < minsupRelative) {
                continue;
            }

            // Check which of the four CHARM properties holds.
            if (tidsetX.support == tidsetJ.support
                    && bitsetSupportUnion.support == tidsetX.support) {
                // Property 1: remove prefix + J and replace X by X + J
                equivalenceClassItemsets.set(j, null);
                equivalenceClassTidsets.set(j, null);
                itemsetX = ArraysAlgos.concatenate(itemsetX, itemsetJ);
            } else if (tidsetX.support < tidsetJ.support
                    && bitsetSupportUnion.support == tidsetX.support) {
                // Property 2: replace X by X + J
                itemsetX = ArraysAlgos.concatenate(itemsetX, itemsetJ);
            } else {
                if (tidsetX.support > tidsetJ.support
                        && bitsetSupportUnion.support == tidsetJ.support) {
                    // Property 3: remove prefix + J (then add it to the class, as for Property 4)
                    equivalenceClassItemsets.set(j, null);
                    equivalenceClassTidsets.set(j, null);
                }
                // Properties 3 and 4: add prefix + X + J to the class being built.
                // Only J is stored because prefix + X is kept for the whole class.
                equivalenceClassIitemsets.add(itemsetJ);
                equivalenceClassItidsets.add(bitsetSupportUnion);
            }
        }

        // Explore the equivalence class of prefix + X, if it is not empty.
        if (equivalenceClassIitemsets.size() > 0) {
            int[] newPrefix = ArraysAlgos.concatenate(prefix, itemsetX);
            processEquivalenceClass(newPrefix, equivalenceClassIitemsets, equivalenceClassItidsets);
        }

        // Finally, we attempt to save prefix + X since it may be a closed itemset.
        save(prefix, itemsetX, tidsetX);
    }

    // we check the memory usage
    MemoryLogger.getInstance().checkMemory();
}
/**
 * Print statistics about the last execution of the algorithm to System.out:
 * number of transactions, number of itemsets found, elapsed time and
 * the maximum memory usage reported by the MemoryLogger.
 */
public void printStats() {
    final long elapsedMillis = endTime - startTimestamp;
    System.out.println("============= CHARM v96r6 Bitset - STATS =============");
    System.out.println(" Transactions count from database : " + database.size());
    System.out.println(" Frequent closed itemsets count : " + itemsetCount);
    System.out.println(" Total time ~ " + elapsedMillis + " ms");
    System.out.println(" Maximum memory usage : "
            + MemoryLogger.getInstance().getMaxMemory() + " mb");
    System.out.println("===================================================");
}
/**
* Get the set of closed itemsets kept in memory by the last execution.
* @return the closed itemsets (Itemsets), or null if the last call to
* runAlgorithm() wrote the result to an output file.
*/
public Itemsets getClosedItemsets() {
return closedItemsets;
}
/**
* Inner class to store a tidset as a bitset together with its support.
* Storing the support alongside the bitset avoids calling the cardinality()
* method of the bitset each time the support is needed.
*/
public class BitSetSupport{
// the tidset: bit i is set if transaction i contains the itemset
BitSet bitset = new BitSet();
// the support associated with this tidset
int support;
}
/**
 * Output an itemset unless the hash table already holds a superset of it
 * (closedness check, as described in the paper).
 * <p>
 * The itemset is prefix + suffix, sorted. Note that the sort is done in place:
 * when prefix is null, the array passed as suffix is itself reordered.
 *
 * @param prefix the prefix part of this itemset (may be null)
 * @param suffix the suffix part of this itemset
 * @param tidset the tidset of this itemset
 * @throws IOException if an error occurs when writing to file
 */
void save(int[] prefix, int[] suffix, BitSetSupport tidset) throws IOException {
    // Assemble and sort the complete itemset.
    int[] items = (prefix == null) ? suffix : ArraysAlgos.concatenate(prefix, suffix);
    Arrays.sort(items);

    // Representation of the itemset that is stored in the hash table.
    ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset candidate =
            new ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset(items);
    candidate.setAbsoluteSupport(tidset.support);

    // The hash code is derived from the tidset of the itemset.
    int hashcode = hash.hashCode(tidset.bitset);

    // If a superset is already registered, there is nothing to output.
    if (hash.containsSupersetOf(candidate, hashcode)) {
        return;
    }

    itemsetCount++;
    if (writer != null) {
        // the result is written to the output file
        writer.write(candidate.toString() + " #SUP: " + candidate.support);
        writer.newLine();
    } else {
        // the result is kept in memory, together with its tidset
        Itemset itemsetWithTidset = new Itemset(items, tidset.bitset, tidset.support);
        closedItemsets.addItemset(itemsetWithTidset, candidate.size());
    }

    // remember the itemset for the closedness checks of later itemsets
    hash.put(candidate, hashcode);
}
}