package ca.pfv.spmf.algorithms.frequentpatterns.eclat;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.datastructures.triangularmatrix.TriangularMatrix;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is a recent version of the ECLAT algorithm. It uses sets of integers to represent tidsets.
*
* Eclat was proposed by ZAKI (2000).
* <br/><br/>
*
* See this article for details about ECLAT:
* <br/><br/>
*
* Zaki, M. J. (2000). Scalable algorithms for association mining. Knowledge and Data Engineering, IEEE Transactions on, 12(3), 372-390.
* <br/><br/>
*
* This version saves the result to a file,
* or keeps it in memory if no output path is provided
* by the user to the runAlgorithm() method.
*
* @see TriangularMatrix
* @see TransactionDatabase
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoEclat {
/** minimum support expressed as a number of transactions (minsup fraction times database size, rounded up) **/
private int minsupRelative;
/** the transaction database **/
protected TransactionDatabase database;
/** start time of the last execution */
protected long startTimestamp;
/** end time of the last execution */
protected long endTime;
/**
The patterns that are found
(if the user wants to keep them in memory) */
protected Itemsets frequentItemsets;
/** object to write the output file */
BufferedWriter writer = null;
/** the number of patterns found */
protected int itemsetCount;
/** Triangular matrix used as an optimization for counting
 * the support of itemsets of size 2. */
private TriangularMatrix matrix;
/**
 * Default constructor. It performs no initialization; the database, the
 * output path and the minimum support are all supplied to
 * {@code runAlgorithm}.
 */
public AlgoEclat() {
}
/**
 * Run the ECLAT algorithm on a transaction database.
 * <p>
 * The frequent itemsets are either written to the file {@code output} or,
 * when {@code output} is {@code null}, collected in memory and returned.
 * When a file is used, it is closed before this method returns, whether
 * the mining completes normally or fails with an exception.
 *
 * @param output an output file path for writing the result, or {@code null}
 *               to keep the result in memory and return it
 * @param database a transaction database
 * @param minsupp the minimum support as a fraction of the number of
 *                transactions; it is multiplied by the database size and
 *                rounded up to obtain the minimum number of transactions
 *                in which an itemset must appear
 * @param useTriangularMatrixOptimization if true, a triangular matrix is
 *                used to count the support of itemsets of size 2
 * @return the frequent itemsets found, or {@code null} if the result was
 *         written to a file
 * @throws IOException if an error occurs while writing the output file
 */
public Itemsets runAlgorithm(String output, TransactionDatabase database, double minsupp,
        boolean useTriangularMatrixOptimization) throws IOException {
    MemoryLogger.getInstance().reset();

    // Decide where the result goes: kept in memory (no path given) or written to a file.
    if (output == null) {
        writer = null;
        frequentItemsets = new Itemsets("FREQUENT ITEMSETS");
    } else {
        frequentItemsets = null;
        writer = new BufferedWriter(new FileWriter(output));
    }

    // Everything that happens while the writer is open runs inside try/finally,
    // so the output file is closed even if the mining fails with an exception.
    try {
        // reset the number of itemsets found to 0
        itemsetCount = 0;
        this.database = database;
        // record the start time
        startTimestamp = System.currentTimeMillis();

        // Convert the minimum support given as a fraction of the database
        // into a minimum number of transactions, rounding up.
        this.minsupRelative = (int) Math.ceil(minsupp * database.size());

        // (1) First database pass: calculate the tidset of each item.
        // Key: item   Value: tidset
        final Map<Integer, Set<Integer>> mapItemCount = new HashMap<Integer, Set<Integer>>();
        int maxItemId = calculateSupportSingleItems(database, mapItemCount);

        // If requested, count the support of every pair of items appearing
        // together in a transaction using a triangular matrix.
        if (useTriangularMatrixOptimization) {
            matrix = new TriangularMatrix(maxItemId + 1);
            for (List<Integer> transaction : database.getTransactions()) {
                Integer[] items = transaction.toArray(new Integer[transaction.size()]);
                for (int i = 0; i < items.length; i++) {
                    for (int j = i + 1; j < items.length; j++) {
                        // update the matrix count by 1 for the pair i, j
                        matrix.incrementCount(items[i], items[j]);
                    }
                }
            }
        }

        // (2) Create the list of frequent single items and output each of them.
        List<Integer> frequentItems = new ArrayList<Integer>();
        for (Entry<Integer, Set<Integer>> entry : mapItemCount.entrySet()) {
            Set<Integer> tidset = entry.getValue();
            // the support of an item is the cardinality of its tidset
            int support = tidset.size();
            int item = entry.getKey();
            if (support >= minsupRelative) {
                frequentItems.add(item);
                saveSingleItem(item, tidset, support);
            }
        }

        // Sort the list of items by the total order of increasing support.
        // This total order is suggested in the article by Zaki.
        Collections.sort(frequentItems, new Comparator<Integer>() {
            @Override
            public int compare(Integer arg0, Integer arg1) {
                // Both sizes are non-negative, so the subtraction cannot overflow.
                return mapItemCount.get(arg0).size() - mapItemCount.get(arg1).size();
            }
        });

        // (3) Combine each pair of frequent items to build the equivalence
        // classes of 2-itemsets, then explore each class recursively.
        for (int i = 0; i < frequentItems.size(); i++) {
            Integer itemI = frequentItems.get(i);
            Set<Integer> tidsetI = mapItemCount.get(itemI);
            int supportI = tidsetI.size();

            // The equivalence class of prefix "i" is stored as two parallel lists:
            // the suffix item of each 2-itemset (only "j" is kept instead of "ij"
            // to save memory) and the tidset of each 2-itemset.
            List<Integer> equivalenceClassIitems = new ArrayList<Integer>();
            List<Set<Integer>> equivalenceClassItidsets = new ArrayList<Set<Integer>>();

            // For each item j that follows i in the total order
            for (int j = i + 1; j < frequentItems.size(); j++) {
                int itemJ = frequentItems.get(j);

                // With the triangular matrix, the support of "ij" is known
                // without a join, so infrequent pairs are skipped immediately.
                if (useTriangularMatrixOptimization) {
                    int support = matrix.getSupportForItems(itemI, itemJ);
                    if (support < minsupRelative) {
                        continue;
                    }
                }

                Set<Integer> tidsetJ = mapItemCount.get(itemJ);
                int supportJ = tidsetJ.size();

                // The tidset of "ij" is the intersection of the tidsets of i and j.
                Set<Integer> tidsetIJ = performANDFirstTime(tidsetI, supportI, tidsetJ, supportJ);

                // When the matrix is used, frequency was already checked above;
                // otherwise it is checked here from the joined tidset.
                if (useTriangularMatrixOptimization
                        || calculateSupport(2, supportI, tidsetIJ) >= minsupRelative) {
                    equivalenceClassIitems.add(itemJ);
                    equivalenceClassItidsets.add(tidsetIJ);
                }
            }

            // Explore the class to find larger itemsets, passing item i as the prefix.
            if (equivalenceClassIitems.size() > 0) {
                processEquivalenceClass(new int[]{itemI}, supportI, equivalenceClassIitems,
                        equivalenceClassItidsets);
            }
        }

        // we check the memory usage
        MemoryLogger.getInstance().checkMemory();
    } finally {
        // Close the output file if the result was saved to a file.
        if (writer != null) {
            writer.close();
        }
    }

    // record the end time for statistics
    endTime = System.currentTimeMillis();
    // Return all frequent itemsets found, or null if the result was saved to a file.
    return frequentItemsets;
}
/**
 * Scan the database once and record, for every item, the set of ids of the
 * transactions that contain it. The id of a transaction is its position in
 * the database.
 *
 * @param database the transaction database
 * @param mapItemCount the map to fill: key = item, value = tidset of that item
 * @return the largest item id appearing in the database (0 if no item is larger)
 */
private int calculateSupportSingleItems(TransactionDatabase database,
        final Map<Integer, Set<Integer>> mapItemCount) {
    int largestItemId = 0;
    for (int tid = 0; tid < database.size(); tid++) {
        for (Integer item : database.getTransactions().get(tid)) {
            Set<Integer> tids = mapItemCount.get(item);
            if (tids == null) {
                // first occurrence of this item: register an empty tidset for it
                tids = new HashSet<Integer>();
                mapItemCount.put(item, tids);
                // a new largest item can only show up on a first occurrence
                largestItemId = Math.max(largestItemId, item);
            }
            // the current transaction contains the item
            tids.add(tid);
        }
    }
    return largestItemId;
}
//
// /**
// * Implementation of insertion sort for sorting two lists (items and their tidsets) at the same time.
// * Insertion sort has an average performance of O(n^2).
// * @param a array of integers
// */
// public static void insertionSort(List<Integer> listItems, List<Set<Integer>> listTids){
// for(int j=1; j< listItems.size(); j++){
// Set<Integer> keyTids = listTids.get(j);
// Integer keyItem = listItems.get(j);
// int i = j - 1;
// for(; i>=0 && (listTids.get(i).size() > keyTids.size()); i--){
//
// listTids.set(i+1, listTids.get(i));
// listItems.set(i+1, listItems.get(i));
//// a[i+1] = a[i];
// }
//// a[i+1] = key;
// listTids.set(i+1, keyTids);
// listItems.set(i+1, keyItem);
// }
// }
/**
 * Process all itemsets of an equivalence class: output each of them and
 * combine them pairwise to generate larger itemsets, recursively.
 *
 * @param prefix the prefix shared by all itemsets of the equivalence class
 * @param supportPrefix the support of the prefix; it is only forwarded to
 *        {@code calculateSupport} (the original note says it is not used by
 *        Eclat but is used by dEclat)
 * @param equivalenceClassItems the suffix item of each itemset of the class
 * @param equivalenceClassTidsets the tidset of each itemset of the class, in
 *        the same order as {@code equivalenceClassItems}
 * @throws IOException if an error occurs while writing the output file
 */
private void processEquivalenceClass(int[] prefix, int supportPrefix, List<Integer> equivalenceClassItems,
        List<Set<Integer>> equivalenceClassTidsets) throws IOException {
    // every itemset of this class is the prefix plus one suffix item
    final int itemsetLength = prefix.length + 1;
    final int classSize = equivalenceClassItems.size();

    // Classes with one or two members are handled directly, without recursion.
    if (classSize == 1 || classSize == 2) {
        // output the first itemset: prefix + first suffix
        int firstItem = equivalenceClassItems.get(0);
        Set<Integer> firstTidset = equivalenceClassTidsets.get(0);
        int firstSupport = calculateSupport(itemsetLength, supportPrefix, firstTidset);
        save(prefix, firstItem, firstTidset, firstSupport);
        if (classSize == 1) {
            return;
        }

        // output the second itemset: prefix + second suffix
        int secondItem = equivalenceClassItems.get(1);
        Set<Integer> secondTidset = equivalenceClassTidsets.get(1);
        int secondSupport = calculateSupport(itemsetLength, supportPrefix, secondTidset);
        save(prefix, secondItem, secondTidset, secondSupport);

        // join the two itemsets and output the result if it is frequent
        Set<Integer> jointTidset = this.performAND(firstTidset, firstTidset.size(),
                secondTidset, secondTidset.size());
        int jointSupport = calculateSupport(itemsetLength, firstSupport, jointTidset);
        if (jointSupport >= minsupRelative) {
            save(extendPrefix(prefix, firstItem), secondItem, jointTidset, jointSupport);
        }
        return;
    }

    // General case: combine each pair of itemsets of the class.
    for (int i = 0; i < classSize; i++) {
        int headItem = equivalenceClassItems.get(i);
        Set<Integer> headTidset = equivalenceClassTidsets.get(i);
        int headSupport = calculateSupport(itemsetLength, supportPrefix, headTidset);
        // every member of the class is frequent, so it is output
        save(prefix, headItem, headTidset, headSupport);

        // Equivalence class of the prefix "prefix + head": only the suffix item
        // of each member is stored, together with the member's tidset.
        List<Integer> childItems = new ArrayList<Integer>();
        List<Set<Integer>> childTidsets = new ArrayList<Set<Integer>>();

        for (int j = i + 1; j < classSize; j++) {
            int tailItem = equivalenceClassItems.get(j);
            Set<Integer> tailTidset = equivalenceClassTidsets.get(j);
            int tailSupport = calculateSupport(itemsetLength, supportPrefix, tailTidset);

            // tidset of prefix + head + tail
            Set<Integer> jointTidset = performAND(headTidset, headSupport, tailTidset, tailSupport);
            int jointSupport = calculateSupport(itemsetLength, headSupport, jointTidset);

            if (jointSupport < minsupRelative) {
                // infrequent: not part of the child class
                continue;
            }
            childItems.add(tailItem);
            childTidsets.add(jointTidset);
        }

        // explore the child class, if it has at least one member
        if (!childItems.isEmpty()) {
            processEquivalenceClass(extendPrefix(prefix, headItem), headSupport, childItems, childTidsets);
        }
    }

    // we check the memory usage
    MemoryLogger.getInstance().checkMemory();
}

/**
 * Build a new array made of the given prefix followed by one item.
 *
 * @param prefix the items to copy
 * @param item the item to append
 * @return a new array of length {@code prefix.length + 1}
 */
private static int[] extendPrefix(int[] prefix, int item) {
    int[] extended = Arrays.copyOf(prefix, prefix.length + 1);
    extended[prefix.length] = item;
    return extended;
}
/**
 * Calculate the support of an itemset X from its tidset. In this class the
 * support is simply the number of transaction ids in the tidset; the two
 * other parameters are ignored (the original note says they are used by
 * dEclat).
 *
 * @param lengthOfX a length value supplied by the caller; not read here
 * @param supportPrefix the support of the prefix of X; not read here
 * @param tidsetI the tidset of X
 * @return the cardinality of {@code tidsetI}
 */
int calculateSupport(int lengthOfX, int supportPrefix, Set<Integer> tidsetI) {
    final int cardinality = tidsetI.size();
    return cardinality;
}
/**
 * Compute the intersection of two tidsets as a new set; neither input is
 * modified.
 *
 * @param tidsetI the first tidset
 * @param supportI the cardinality of the first tidset, as given by the caller
 * @param tidsetJ the second tidset
 * @param supportJ the cardinality of the second tidset, as given by the caller
 * @return a new set containing the tids present in both tidsets
 */
Set<Integer> performAND(Set<Integer> tidsetI, int supportI,
        Set<Integer> tidsetJ, int supportJ) {
    // To limit the number of lookups, iterate over the tidset reported as
    // smaller and probe the other one. On a tie, the tidset of I is scanned.
    final boolean scanJ = supportI > supportJ;
    final Set<Integer> scanned = scanJ ? tidsetJ : tidsetI;
    final Set<Integer> probed = scanJ ? tidsetI : tidsetJ;

    Set<Integer> common = new HashSet<Integer>();
    for (Integer tid : scanned) {
        // keep the tid only if the other tidset contains it too
        if (probed.contains(tid)) {
            common.add(tid);
        }
    }
    return common;
}
/**
 * Compute the intersection of the tidsets of two single items. This
 * implementation simply delegates to {@code performAND}; it is the method
 * used by {@code runAlgorithm} when joining single items into 2-itemsets.
 * NOTE(review): presumably kept separate so that a variant of the algorithm
 * can treat the first join differently - confirm against subclasses.
 *
 * @param tidsetI the first tidset
 * @param supportI the cardinality of the first tidset
 * @param tidsetJ the second tidset
 * @param supportJ the cardinality of the second tidset
 * @return the resulting tidset
 */
Set<Integer> performANDFirstTime(Set<Integer> tidsetI, int supportI,
        Set<Integer> tidsetJ, int supportJ) {
    final Set<Integer> intersection = performAND(tidsetI, supportI, tidsetJ, supportJ);
    return intersection;
}
/**
 * Record a frequent itemset, either in the output file or in memory
 * (depending on whether a writer is open), and count it.
 * In the file, an itemset is written as its items separated by single
 * spaces, followed by " #SUP: " and the support.
 *
 * @param prefix the prefix of the itemset to be saved
 * @param suffixItem the last item to be appended to the prefix
 * @param tidset the tidset of this itemset (not read by this method)
 * @param support the support of this itemset
 * @throws IOException if an error occurs when writing to disk
 */
private void save(int[] prefix, int suffixItem, Set<Integer> tidset, int support) throws IOException {
    // one more pattern found
    itemsetCount++;

    // Case 1: the result goes to the output file.
    if (writer != null) {
        StringBuilder line = new StringBuilder();
        for (int k = 0; k < prefix.length; k++) {
            line.append(prefix[k]).append(" ");
        }
        line.append(suffixItem).append(" #SUP: ").append(support);
        writer.write(line.toString());
        writer.newLine();
        return;
    }

    // Case 2: the result is kept in memory.
    // The itemset is the prefix followed by the suffix item.
    int[] items = Arrays.copyOf(prefix, prefix.length + 1);
    items[prefix.length] = suffixItem;
    Itemset found = new Itemset(items);
    found.setAbsoluteSupport(support);
    frequentItemsets.addItemset(found, found.size());
}
/**
 * Record a frequent itemset made of a single item, either in the output
 * file or in memory (depending on whether a writer is open), and count it.
 * In the file, the item is written followed by " #SUP: " and the support.
 *
 * @param item the item to be saved
 * @param tidset the tidset of this item (not read by this method)
 * @param support the support of this item
 * @throws IOException if an error occurs when writing to disk
 */
private void saveSingleItem(int item, Set<Integer> tidset, int support) throws IOException {
    // one more pattern found
    itemsetCount++;

    // Case 1: the result goes to the output file.
    if (writer != null) {
        writer.write(item + " #SUP: " + support);
        writer.newLine();
        return;
    }

    // Case 2: the result is kept in memory.
    Itemset singleton = new Itemset(new int[] {item});
    singleton.setAbsoluteSupport(support);
    frequentItemsets.addItemset(singleton, singleton.size());
}
/**
 * Print statistics about the last execution to System.out: the number of
 * transactions, the number of frequent itemsets found, the elapsed time
 * and the maximum memory usage reported by the MemoryLogger.
 */
public void printStats() {
    System.out.println("============= ECLAT v0.96r6 - STATS =============");
    // elapsed time between the recorded start and end of the last run
    final long elapsedMillis = endTime - startTimestamp;
    final String transactionsLine = " Transactions count from database : " + database.size();
    System.out.println(transactionsLine);
    final String itemsetsLine = " Frequent itemsets count : " + itemsetCount;
    System.out.println(itemsetsLine);
    final String timeLine = " Total time ~ " + elapsedMillis + " ms";
    System.out.println(timeLine);
    final String memoryLine = " Maximum memory usage : "
            + MemoryLogger.getInstance().getMaxMemory() + " mb";
    System.out.println(memoryLine);
    System.out.println("===================================================");
}
/**
 * Get the set of frequent itemsets found by the algorithm.
 * This returns the in-memory result created by runAlgorithm when no output
 * path was given. It is null when the last run wrote its result to a file,
 * and also before any run has set it.
 * @return the frequent itemsets (Itemsets), or null as described above.
 */
public Itemsets getItemsets() {
return frequentItemsets;
}
}