package ca.pfv.spmf.algorithms.frequentpatterns.defme;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is a recent implementation of the DefMe algorithm that uses bitsets to represent
* tidsets, and is implemented to mine itemsets.
*
* Defme was proposed by Soulet et al (2014).
* <br/><br/>
*
* See this article for details about DefMe:
* <br/><br/>
*
* Soulet, A., Rioult, F. (2014). Efficiently Depth-First Minimal Pattern Mining, PAKDD 2014.
* <br/><br/>
*
* This version saves the result to a file
* or keep it into memory if no output path is provided
* by the user to the runAlgorithm method().
*
* @see TransactionDatabase
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoDefMe {
/** relative minimum support **/
private int minsupRelative;
/** the transaction database **/
private TransactionDatabase database;
/** start time of the last execution */
private long startTimestamp;
/** end time of the last execution */
private long endTime;
/**
The patterns that are found
(if the user want to keep them into memory) */
protected Itemsets generators;
/** object to write the output file */
BufferedWriter writer = null;
/** the number of patterns found */
private int itemsetCount;
/** A map containing the tidset (i.e. cover) of each item represented as a bitset */
private Map<Integer, BitSetSupport> mapItemTIDS;
/**
* Default constructor
*/
public AlgoDefMe() {
}
/**
* Run the algorithm.
* @param database a transaction database
* @param output an output file path for writing the result or if null the result is saved into memory and returned
* @param minsup the minimum support
* @return the set of generators if the user chose to save the result to memory. Otherwise, null.
* @throws IOException exception if error while writing the file.
*/
public Itemsets runAlgorithm(String output, TransactionDatabase database, double minsup) throws IOException {
// Reset the tool to assess the maximum memory usage (for statistics)
MemoryLogger.getInstance().reset();
// if the user want to keep the result into memory
if(output == null){
writer = null;
generators = new Itemsets("FREQUENT ITEMSETS");
}else{ // if the user want to save the result to a file
generators = null;
writer = new BufferedWriter(new FileWriter(output));
}
// reset the number of itemset found to 0
itemsetCount = 0;
this.database = database;
// record the start time
startTimestamp = System.currentTimeMillis();
// convert from an absolute minsup to a relative minsup by multiplying
// by the database size
this.minsupRelative = (int) Math.ceil(minsup * database.size());
// Calculate the tidset of each single item (what is called COV() in the paper)
mapItemTIDS = new HashMap<Integer, BitSetSupport>();
// for each transaction
for (int i = 0; i < database.size(); i++) {
// Add the transaction id to the set of all transaction ids
// for each item in that transaction
// For each item
for (Integer item : database.getTransactions().get(i)) {
// Get the current tidset of that item
BitSetSupport tids = mapItemTIDS.get(item);
// If none, then we create one
if(tids == null){
tids = new BitSetSupport();
mapItemTIDS.put(item, tids);
}
// we add the current transaction id to the tidset of the item
tids.bitset.set(i);
// we increase the support of that item
tids.support++;
}
}
// (2) create the list of single frequent items
List<Integer> frequentItems = new ArrayList<Integer>();
// for each item
for(Entry<Integer, BitSetSupport> entry : mapItemTIDS.entrySet()) {
// get the support and tidset of that item
BitSetSupport tidset = entry.getValue();
int support = tidset.support;
int item = entry.getKey();
// if the item is frequent
if(support >= minsupRelative) {
// add the item to the list of frequent items
frequentItems.add(item);
}
}
// Sort the list of items by the total order of increasing support.
// This total order is suggested in the article by Zaki.
Collections.sort(frequentItems, new Comparator<Integer>() {
@Override
public int compare(Integer arg0, Integer arg1) {
return mapItemTIDS.get(arg0).support - mapItemTIDS.get(arg1).support;
}});
// Create the tidset of the empty set
BitSet tidsetEmptySet = new BitSet(database.size());
tidsetEmptySet.set(0, database.size());
// Initial call of the defme procedure
defme(new int[] {}, tidsetEmptySet, database.size(), frequentItems, 0, new BitSet[0]);
// we check the memory usage
MemoryLogger.getInstance().checkMemory();
// close the output file if the result was saved to a file
if(writer != null){
writer.close();
}
// record the end time for statistics
endTime = System.currentTimeMillis();
// Return all frequent itemsets found!
return generators;
}
/**
* This is the main procedure of DefMe, which is called recursively to grow patterns
* @param itemsetX The itemset X.
* @param tidsetX The tidset (cover) of X.
* @param supportX The support of X
* @param frequentItems The set of frequent items
* @param posTail The set "tail" is defined as the interval [postail, frequentItems.size()-1] in "frequentItems".
* @param critItemsetX The critical objects of each item from the itemset X, stored in an array.
* @throws IOException if an error occured while writing result to disk
*/
private void defme(int[] itemsetX, BitSet tidsetX, int supportX,
List<Integer> frequentItems, int posTail, BitSet[] critItemsetX) throws IOException {
// If not the empty set
if(itemsetX.length != 0) {
// check if for all e in X, COV*(X, e) != emptyset
for(BitSet covStarXe : critItemsetX) {
if(covStarXe.cardinality() ==0) {
// if the critical object (COV*) is empty, return..
return;
}
}
}
// save the itemset
save(itemsetX, tidsetX, supportX);
// for all e in tail
for(int i=posTail; i< frequentItems.size(); i++) {
// Calculate e
Integer e = frequentItems.get(i);
// Calculate Cov(e), i.e. the tidset of e
BitSetSupport tidsetE = mapItemTIDS.get(e);
// Calculate Xe, i.e. X U {e}
int[] xe = new int[itemsetX.length+1];
System.arraycopy(itemsetX, 0, xe, 0, itemsetX.length);
xe[itemsetX.length] = e;
// Calculate cov(Xe), i.e. tidset(X U {e})
BitSet tidsetXe = (BitSet)tidsetX.clone();
tidsetXe.and(tidsetE.bitset);
// The support of XU{e} is the cardinality of its tidset
int supportXe = tidsetXe.cardinality();
// If XU{e} is infrequent, we don't need to consider it anymore
if(supportXe < minsupRelative) {
continue;
}
// == Calculate critical objects (cov*(Y, e)) for each item e in Y = XU{e} ==
BitSet[] critItemsetY = new BitSet[xe.length];
// For the item e
BitSet critE = (BitSet)tidsetX.clone();
critE.andNot(tidsetE.bitset);
critItemsetY[critItemsetY.length-1] = critE;
// For any other item e' in X
for(int j=0; j< itemsetX.length; j++) {
// calculate cov* as follows:
critItemsetY[j] = (BitSet)critItemsetX[j].clone();
critItemsetY[j].and(tidsetE.bitset);
}
// recursive call to explore patterns by extending XU{e} with items from "tail"
defme(xe, tidsetXe, supportXe, frequentItems, i+1, critItemsetY);
}
}
/**
* Save an itemset to disk or memory (depending on what the user chose).
* @param itemsetArray the itemset to be saved
* @param tidset the tidset and support of this itemset
* @param support the support of that itemset
* @throws IOException if an error occurrs when writing to disk.
*/
private void save(int[] itemsetArray, BitSet tidset, int support) throws IOException {
// increase the itemset count
itemsetCount++;
// if the result should be saved to memory
if(writer == null){
// Create an object "Itemset" and add it to the set of frequent itemsets
Itemset itemset = new Itemset(itemsetArray);
itemset.setTIDs(tidset, support);
generators.addItemset(itemset, itemset.size());
}else{
// if the result should be saved to a file
// write it to the output file
StringBuilder buffer = new StringBuilder();
for(int item: itemsetArray) {
buffer.append(item);
buffer.append(" ");
}
// as well as its support
buffer.append("#SUP: ");
buffer.append(support);
writer.write(buffer.toString());
writer.newLine();
}
}
/**
* Print statistics about the algorithm execution to System.out.
*/
public void printStats() {
System.out.println("============= DefMe - STATS =============");
long temps = endTime - startTimestamp;
System.out.println(" Transactions count from database : " + database.size());
System.out.println(" Generator itemsets count : " + itemsetCount);
System.out.println(" Total time ~ " + temps + " ms");
System.out.println(" Maximum memory usage : "
+ MemoryLogger.getInstance().getMaxMemory() + " mb");
System.out.println("===================================================");
}
/**
* Get the set of frequent itemsets.
* @return the frequent itemsets (Itemsets).
*/
public Itemsets getItemsets() {
return generators;
}
/**
* Anonymous inner class to store a bitset and its cardinality
* (an itemset's tidset and its support).
* Storing the cardinality is useful because the cardinality() method
* of a bitset in Java is very expensive, so it should not be called
* more than once.
*/
public class BitSetSupport{
BitSet bitset = new BitSet();
int support;
}
}