package ca.pfv.spmf.algorithms.frequentpatterns.lcm;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/* This file is copyright (c) 2012-2014 Alan Souza
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* This is an implementation of the LCM algorithm for
* mining frequent closed itemsets from a transaction database.
* More information on the LCM algorithm can be found in papers by
* T. Uno, such as: <br/><br/>
*
* T. Uno, M. Kiyomi, and H. Arimura. Lcm ver. 2:
* Efficient mining algorithms for
* frequent/closed/maximal itemsets. In FIMI, 2004
*
* This implementation of LCM was made by Alan Souza and was
* modified by Philippe Fournier-Viger to add optimizations and
* support for LCMFreq/LCMMax (note LCMMax has been temporarily removed in the
* current version of SPMF). <br/>
*
* The implementation is similar to LCM version 2 with some differences.
* For example, transaction merging is not performed yet and
* items in transactions are not sorted in descending order of frequency.
*
* @author Alan Souza <apsouza@inf.ufrgs.br>
*/
public class AlgoLCM {

    /**
     * Itemsets found by the last run when results are kept in memory;
     * null when the last run wrote its results to a file.
     */
    private Itemsets closedFrequentItemsets;

    /** Object used to write the output file, or null when saving to memory. */
    BufferedWriter writer = null;

    /** The number of itemsets output by the last run (for statistics). */
    private int frequentCount;

    /** Start time of the last algorithm execution (milliseconds). */
    long startTimestamp;

    /** End time of the last algorithm execution (milliseconds). */
    long endTimestamp;

    /**
     * Minimum support expressed as a number of transactions
     * (the name is historical: it is computed as ceil(minsup ratio * database size)).
     */
    int minsupRelative;

    /** If true, mine all frequent itemsets (LCMFreq). */
    boolean mineAllFrequentItemsets;

    /** If true (and not mining all frequent itemsets), mine maximal itemsets (LCMMax). */
    private boolean mineAllMaximalItemsets;

    /**
     * Buckets for occurrence delivery.
     * Each bucket corresponds to an item (the item is the array index)
     * and contains the transactions where the item appears.
     */
    private List<Transaction>[] buckets;

    /**
     * Default constructor.
     */
    public AlgoLCM() {
    }

    /**
     * Run the algorithm.
     *
     * @param minimumSupport the minimum support threshold as a ratio between 0 and 1
     * @param dataset the dataset
     * @param outputPath the output file path to save the result, or null if the result
     *                   should be kept in memory
     * @param mineAllFrequentItemsets mine all frequent itemsets
     * @param mineAllMaximalItemsets mine only maximal itemsets (only considered when
     *                   mineAllFrequentItemsets is false; currently unavailable)
     * @return the itemsets, or null if the user chose to save to file
     * @throws IOException if exception while reading/writing to file
     */
    public Itemsets runAlgorithm(double minimumSupport, Dataset dataset, String outputPath, boolean mineAllFrequentItemsets, boolean mineAllMaximalItemsets) throws IOException {
        // record the start time
        startTimestamp = System.currentTimeMillis();

        this.mineAllFrequentItemsets = mineAllFrequentItemsets;
        this.mineAllMaximalItemsets = mineAllMaximalItemsets;

        if (outputPath != null) {
            // Save to file: forget any result kept from a previous in-memory run,
            // so that this run returns null as documented.
            this.closedFrequentItemsets = null;
            writer = new BufferedWriter(new FileWriter(outputPath));
        } else {
            // Save to memory
            writer = null;
            this.closedFrequentItemsets = new Itemsets("Itemsets");
        }

        try {
            // reset the number of itemsets found
            frequentCount = 0;

            // reset the memory usage checking utility
            MemoryLogger.getInstance().reset();

            // convert the minimum support given as a ratio into a number of
            // transactions by multiplying by the database size
            this.minsupRelative = (int) Math.ceil(minimumSupport * dataset.getTransactions().size());

            // Create the initial occurrence array (the buckets) for the dataset
            performFirstOccurenceDelivery(dataset);

            // Remove infrequent items from transactions, using the support
            // given by the size of each item's bucket.
            for (Transaction transaction : dataset.getTransactions()) {
                transaction.removeInfrequentItems(buckets, minsupRelative);
            }

            // Create the list of all frequent items, in ascending order.
            // The recursive methods rely on this order.
            List<Integer> allItems = new ArrayList<Integer>();
            for (Integer item : dataset.getUniqueItems()) {
                if (buckets[item].size() >= minsupRelative) {
                    allItems.add(item);
                }
            }
            Collections.sort(allItems);

            // Call the recursive method with the empty set as prefix (null).
            // Since it is the empty set, all transactions are passed.
            if (mineAllFrequentItemsets) {
                backtrackingLCMFreq(null, dataset.getTransactions(), allItems);
            } else if (mineAllMaximalItemsets) {
                backtrackingLCMMax(null, dataset.getTransactions(), allItems, -1, -1);
            } else {
                backtrackingLCM(null, dataset.getTransactions(), allItems, -1);
            }

            // record the end time
            endTimestamp = System.currentTimeMillis();
        } finally {
            // close the output file even if the mining failed,
            // so that the file handle is not leaked
            if (writer != null) {
                writer.close();
            }
        }

        MemoryLogger.getInstance().checkMemory();

        return closedFrequentItemsets;
    }

    /**
     * Recursive method to find closed itemsets.
     *
     * @param p a prefix itemset P (null for the empty set)
     * @param transactionsOfP the transactions containing P
     * @param frequentItems the list of frequent items in the p-projected database
     *                      (in ascending order)
     * @param tailPosInP the tail item position in itemset P
     * @throws IOException if error writing to output file
     */
    private void backtrackingLCM(List<Integer> p, List<Transaction> transactionsOfP,
            List<Integer> frequentItems, int tailPosInP)
            throws IOException {

        // ======== for each frequent item e =============
        for (int j = 0; j < frequentItems.size(); j++) {
            Integer e = frequentItems.get(j);

            // skip e if it is already in p after the current tail position
            if (p != null && containsByBinarySearch(p, e, tailPosInP)) {
                continue;
            }

            // Calculate the transactions containing P U e.
            // Each kept transaction remembers the position of "e" as its offset.
            List<Transaction> transactionsPe = intersectTransactions(transactionsOfP, e);

            // ====== Check if P U {e...} is a ppc extension ======
            if (!isPPCExtension(p, transactionsPe, e)) {
                continue;
            }

            // ======= Create a closed itemset using P U {e...} =====
            List<Integer> itemset = new ArrayList<Integer>();
            if (p != null) {
                // add the leading items i of p such that i < e
                for (int m = 0; m < p.size() && p.get(m) < e; m++) {
                    itemset.add(p.get(m));
                }
            }
            itemset.add(e);
            int tailPositionInPe = itemset.size() - 1;

            // add every later frequent item that is in all transactions of T(P U e)
            for (int k = j + 1; k < frequentItems.size(); k++) {
                Integer itemK = frequentItems.get(k);
                if (isItemInAllTransactions(transactionsPe, itemK)) {
                    itemset.add(itemK);
                }
            }

            // ===== save the frequent closed itemset
            output(itemset, transactionsPe.size());

            // ==== perform database reduction (refills the buckets of later items) ====
            anyTimeDatabaseReductionClosed(transactionsPe, j, frequentItems, p, e);

            // ==== frequent items of the projected database: this must be done
            // after the reduction, since it reads the refreshed buckets
            List<Integer> newFrequentItems = frequentItemsAfter(frequentItems, j);

            // === recursive call
            backtrackingLCM(itemset, transactionsPe, newFrequentItems, tailPositionInPe);
        }

        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * Recursive method to find maximal itemsets.
     * This mode is currently disabled: the method always throws.
     * The previous work-in-progress implementation is kept below as a reference.
     *
     * @param p a prefix itemset P
     * @param transactionsOfP the transactions containing P
     * @param frequentItems the list of frequent items in the p-projected database
     * @param tailPosInP the tail item position in itemset P
     * @param itemELastAddedToP the item that was last added to P
     * @return (when enabled) true if at least one maximal itemset was found
     * @throws IOException if error writing to output file
     * @throws RuntimeException always, in the current version
     */
    private boolean backtrackingLCMMax(List<Integer> p, List<Transaction> transactionsOfP,
            List<Integer> frequentItems, int tailPosInP, Integer itemELastAddedToP) throws IOException {
        throw new RuntimeException("This algorithm is unavailable in the current version of SPMF. \n");
        /*
        boolean foundOneMaxItemset = false;
        // ======== for each frequent item e =============
        for (int j = 0; j < frequentItems.size(); j++) {
        Integer e = frequentItems.get(j);
        System.out.println(" p : " + p + " \te: " + e);
        // if the item is not already in p before the current tail position
        // we will consider it to form a new itemset
        if(e <= itemELastAddedToP || (p != null && containsByBinarySearch(p, e, tailPosInP))) { // #DIFF
        continue;
        }
        // Calculate transactions containing P U e
        // At the same time truncate the transactions to keep what appears after "e"
        List<Transaction> transactionsPe = intersectTransactions(transactionsOfP, e); //ok
        //====== Check if PU{e...} is a ppc extension ======
        if (isPPCExtension(p, transactionsPe, e)) {
        // ======= Create a closed itemset using PU{e...} =====
        // First add all items from PU{e}
        List<Integer> itemset = new ArrayList<Integer>();
        if(p != null) {
        //add every item i of p such that i < e to the itemset
        for (int m = 0; m < p.size() && p.get(m) < e; m++) {
        itemset.add(p.get(m));
        }
        }
        itemset.add(e);
        int tailPositionInPe = itemset.size()-1;
        for (int k = j+1; k < frequentItems.size(); k++) {
        Integer itemk = frequentItems.get(k);
        // for every item i > e add if it is in all transactions of T(P U e)
        // ### FIRST DIFFERENCE WITH LCM ###
        if(isItemInAllTransactions(transactionsPe, itemk)) {
        itemset.add(itemk);
        }
        }
        /// #### SECOND DIFFERENCE ####
        // int itemsAddedAfterECount = itemset.size() - (tailPositionInPe + 1);
        // ===== save the frequent closed itemset
        int supportPe = transactionsPe.size();
        //==== perform database reduction ====
        anyTimeDatabaseReductionMax(transactionsPe, j, frequentItems, p, e); // ##DIF
        // ================ Find frequent items in transactions containing P ============
        // Get all frequent items e such that e > tailOfP
        // (i.e. "e" appears after the position of the tail item in the list of all items)
        List<Integer> newFrequentItems = new ArrayList<Integer>();
        for (int k = 0; k < frequentItems.size(); k++) { /// ##### ANOTHER DIFFERENCE START FROM K = 0
        Integer itemK = frequentItems.get(k);
        int supportK = buckets[itemK].size();
        if(supportK >= minsupRelative) {
        newFrequentItems.add(itemK);
        }
        }
        // ===== ### SECOND DIFFERENCE WITH LCM #### =====
        // int itemsAddedAfterEcount = (itemset.size() - p.size());
        System.out.println(itemset);
        if(newFrequentItems.size() == itemset.size()) {
        output(itemset, supportPe);
        System.out.println("OUTPUT " + itemset);
        foundOneMaxItemset = true;
        }else { // #DIFF : ELSE
        // === recursive call
        boolean found = backtrackingLCMMax(itemset, transactionsPe, newFrequentItems, tailPositionInPe, e); // ### DIFF : PASS E
        if(found == false) { // ## DIFF
        System.out.println("OUTPUT2 " + itemset);
        output(itemset, supportPe);
        }
        }
        }
        }
        MemoryLogger.getInstance().checkMemory();
        return foundOneMaxItemset; // ## DIFF
        */
    }

    /**
     * Recursive method to find all frequent itemsets.
     *
     * @param p a prefix itemset P (null for the empty set)
     * @param transactionsOfP the transactions containing P
     * @param frequentItems the list of frequent items in the p-projected database
     *                      (in ascending order)
     * @throws IOException if error writing to output file
     */
    private void backtrackingLCMFreq(List<Integer> p, List<Transaction> transactionsOfP,
            List<Integer> frequentItems) throws IOException {

        // ======== for each frequent item e =============
        for (int j = 0; j < frequentItems.size(); j++) {
            Integer e = frequentItems.get(j);

            // Calculate the transactions containing P U e.
            // Each kept transaction remembers the position of "e" as its offset.
            List<Transaction> transactionsPe = intersectTransactions(transactionsOfP, e);

            // ======= Create the itemset P U {e} =====
            List<Integer> itemset = new ArrayList<Integer>();
            if (p != null) {
                // all items of p are kept
                itemset.addAll(p);
            }
            itemset.add(e);

            // ===== save the frequent itemset
            output(itemset, transactionsPe.size());

            // ==== perform database reduction (refills the buckets of later items) ====
            anyTimeDatabaseReductionFreq(transactionsPe, j, frequentItems, p, e);

            // ==== frequent items of the projected database: this must be done
            // after the reduction, since it reads the refreshed buckets
            List<Integer> newFrequentItems = frequentItemsAfter(frequentItems, j);

            // === recursive call
            backtrackingLCMFreq(itemset, transactionsPe, newFrequentItems);
        }

        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * Get the items located after a given position in a list of frequent items
     * whose bucket currently contains at least minsup transactions.
     * The relative order of the items is preserved.
     *
     * @param frequentItems the list of frequent items
     * @param j a position in that list; only items at positions greater than j are considered
     * @return a new list containing the items that are still frequent
     */
    private List<Integer> frequentItemsAfter(List<Integer> frequentItems, int j) {
        List<Integer> stillFrequent = new ArrayList<Integer>();
        for (int k = j + 1; k < frequentItems.size(); k++) {
            Integer itemK = frequentItems.get(k);
            if (buckets[itemK].size() >= minsupRelative) {
                stillFrequent.add(itemK);
            }
        }
        return stillFrequent;
    }

    /**
     * Perform the initial occurrence delivery with the original dataset
     * containing all items: a bucket is created for each unique item and each
     * transaction is added to the bucket of every item that it contains.
     *
     * @param dataset the dataset
     */
    @SuppressWarnings("unchecked") // generic array creation: only List<Transaction> is ever stored
    public void performFirstOccurenceDelivery(Dataset dataset) {
        buckets = new List[dataset.getMaxItem() + 1];

        for (Integer item : dataset.uniqueItems) {
            buckets[item] = new ArrayList<Transaction>();
        }

        for (Transaction transaction : dataset.getTransactions()) {
            for (Integer item : transaction.getItems()) {
                // for each item get its bucket and add the current transaction
                buckets[item].add(transaction);
            }
        }
    }

    /**
     * Perform the anytime database reduction for an itemset P U {e}
     * (version used for mining all frequent itemsets).
     *
     * @param transactionsPe the transactions containing P U {e}
     * @param j the position of e in the list of frequent items
     * @param frequentItems the list of frequent items (in ascending order)
     * @param itemset the itemset P (not used)
     * @param e the item e (not used)
     */
    private void anyTimeDatabaseReductionFreq(List<Transaction> transactionsPe, int j, List<Integer> frequentItems, List<Integer> itemset, Integer e) {
        // We just reset the buckets for the items after e in the list
        // instead of all buckets
        for (int i = j + 1; i < frequentItems.size(); i++) {
            Integer item = frequentItems.get(i);
            buckets[item] = new ArrayList<Transaction>();
        }

        // for each transaction
        for (Transaction transaction : transactionsPe) {
            Integer[] items = transaction.getItems();
            // we consider each item located after the offset of the transaction
            for (int i = items.length - 1; i > transaction.offset; i--) {
                Integer item = items[i];
                // frequentItems is in ascending order, hence the binary search
                if (containsByBinarySearch(frequentItems, item)) {
                    // we add the transaction to the bucket of the item
                    buckets[item].add(transaction);
                }
            }
        }
    }

    /**
     * Perform the anytime database reduction for an itemset P U {e}
     * (version used for mining closed itemsets).
     *
     * @param transactionsPe the transactions containing P U {e}
     * @param j the position of e in the list of frequent items
     * @param frequentItems the list of frequent items (in ascending order)
     * @param itemset the itemset P (not used)
     * @param e the item e
     */
    private void anyTimeDatabaseReductionClosed(List<Transaction> transactionsPe, int j, List<Integer> frequentItems, List<Integer> itemset, Integer e) {
        // We just reset the buckets for the items after e in the list
        // instead of all buckets
        for (int i = j + 1; i < frequentItems.size(); i++) {
            Integer item = frequentItems.get(i);
            buckets[item] = new ArrayList<Transaction>();
        }

        // for each transaction
        for (Transaction transaction : transactionsPe) {
            Integer[] items = transaction.getItems();
            // we consider each item located after the offset of the transaction
            for (int i = items.length - 1; i > transaction.offset; i--) {
                Integer item = items[i];
                // only items greater than e had their bucket reset above;
                // frequentItems is in ascending order, hence the binary search
                if (item > e && containsByBinarySearch(frequentItems, item)) {
                    // we add the transaction to the bucket of the item
                    buckets[item].add(transaction);
                }
            }
        }
    }

    /**
     * Perform the anytime database reduction for an itemset P U {e}
     * (version written for mining maximal itemsets, which is currently disabled).
     * Unlike the other versions, the buckets of all the frequent items are
     * reset, and all the items of each transaction are considered.
     *
     * @param transactionsPe the transactions containing P U {e}
     * @param j the position of e in the list of frequent items (not used)
     * @param frequentItems the list of frequent items (in ascending order)
     * @param itemset the itemset P (not used)
     * @param e the item e (not used)
     */
    private void anyTimeDatabaseReductionMax(List<Transaction> transactionsPe, int j, List<Integer> frequentItems, List<Integer> itemset, Integer e) {
        // reset the buckets of all frequent items (instead of starting from j+1)
        for (int i = 0; i < frequentItems.size(); i++) {
            Integer item = frequentItems.get(i);
            buckets[item] = new ArrayList<Transaction>();
        }

        // for each transaction
        for (Transaction transaction : transactionsPe) {
            Integer[] items = transaction.getItems();
            // we consider every item of the transaction (down to position 0)
            for (int i = items.length - 1; i >= 0; i--) {
                Integer item = items[i];
                // frequentItems is in ascending order, hence the binary search
                if (containsByBinarySearch(frequentItems, item)) {
                    // we add the transaction to the bucket of the item
                    buckets[item].add(transaction);
                }
            }
        }
    }

    /**
     * Check if an item appears in a list of items sorted in ascending order,
     * looking only at the positions after a given position.
     *
     * @param items the list of items, sorted in ascending order
     * @param item the item
     * @param searchAfterPosition the search starts at position searchAfterPosition + 1
     * @return true if it appears. Otherwise, false.
     */
    public boolean containsByBinarySearch(List<Integer> items, Integer item, int searchAfterPosition) {
        // quick exit: empty list, or item greater than the largest item
        if (items.size() == 0 || item > items.get(items.size() - 1)) {
            return false;
        }

        int low = searchAfterPosition + 1;
        int high = items.size() - 1;

        while (high >= low) {
            int middle = (low + high) >>> 1; // divide by 2
            int candidate = items.get(middle);
            if (candidate == item) {
                return true;
            }
            if (candidate < item) {
                low = middle + 1;
            } else {
                high = middle - 1;
            }
        }
        return false;
    }

    /**
     * Check if an item appears in a list of items sorted in ascending order.
     *
     * @param items the list of items, sorted in ascending order
     * @param item the item
     * @return true if it appears. Otherwise, false.
     */
    public boolean containsByBinarySearch(List<Integer> items, Integer item) {
        // same search as above, starting from the first position
        return containsByBinarySearch(items, item, -1);
    }

    /**
     * Calculate the transactions of the union of an itemset "P" with an item "e".
     *
     * @param transactionsOfP the transactions containing P
     * @param e the item "e"
     * @return the transactions containing P U "e"
     */
    public List<Transaction> intersectTransactions(List<Transaction> transactionsOfP, Integer e) {
        List<Transaction> transactionsPe = new ArrayList<Transaction>();

        for (Transaction transaction : transactionsOfP) {
            // we remember the position where e appears.
            // we will call this position an "offset"
            int posE = transaction.containsByBinarySearch(e);
            if (posE != -1) { // the transaction belongs to T(P U e)
                transactionsPe.add(new Transaction(transaction, posE));
            }
        }
        return transactionsPe;
    }

    /**
     * Check if a given itemset PUe is a PPC extension according to
     * the set of transactions containing PUe.
     *
     * @param p the itemset p (null for the empty set)
     * @param transactionsPe the transactions containing P U e
     * @param e the item e
     * @return true if it is a PPC extension; false otherwise, including when
     *         no transaction contains P U e
     */
    private boolean isPPCExtension(List<Integer> p, List<Transaction> transactionsPe, Integer e) {
        // This case happens when the minimum support is 0: an item that does not
        // appear in the projected database is still considered as frequent.
        // There is then no first transaction to look at and nothing to extend.
        if (transactionsPe.isEmpty()) {
            return false;
        }

        // We do a loop on each item i of the first transaction located before its offset
        Transaction firstTrans = transactionsPe.get(0);
        Integer[] firstTransaction = firstTrans.getItems();
        for (int i = 0; i < firstTrans.offset; i++) {
            Integer item = firstTransaction[i];

            // if p does not contain item i < e and item i is present in all
            // the other transactions, then PUe is not a ppc extension
            if (item < e && (p == null || !containsByBinarySearch(p, item))
                    && isItemInAllTransactionsExceptFirst(transactionsPe, item)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if a given itemset PUe is a PPC Max extension according to
     * the set of transactions containing PUe.
     * (Written for the maximal itemset mining mode, which is currently disabled.)
     *
     * @param p the itemset p (null for the empty set)
     * @param e the item e
     * @param transactionsPe the transactions containing P U e (must not be empty)
     * @return true if it is a PPC extension
     */
    private boolean isPPCMaxExtension(List<Integer> p, Integer e, List<Transaction> transactionsPe) {
        // We do a loop on each item i of the first transaction such that i < e
        Transaction firstTrans = transactionsPe.get(0);
        Integer[] firstTransaction = firstTrans.getItems();
        for (int i = 0; i < firstTransaction.length; i++) {
            Integer item = firstTransaction[i];

            // stop at the first item that is not smaller than e
            if (item >= e) {
                break;
            }
            // if p does not contain item i and item i is frequent in the
            // transactions, then PUe is not a ppc extension
            if ((p == null || !containsByBinarySearch(p, item))
                    && isItemInAtLeastMinsupTransactionsWithoutFirst(transactionsPe, item)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if an item appears in at least minsup transactions, where the first
     * transaction of the list is counted as containing the item without being checked.
     * Note that the count is only compared with minsup when a further transaction
     * containing the item is found.
     *
     * @param transactions a list of transactions
     * @param item an item
     * @return true if the count reaches minsup while scanning the transactions
     *         after the first one
     */
    private boolean isItemInAtLeastMinsupTransactionsWithoutFirst(List<Transaction> transactions, Integer item) {
        int supCount = 1; // the first transaction is counted without being checked
        for (int i = 1; i < transactions.size(); i++) {
            if (transactions.get(i).containsByBinarySearchOriginalTransaction(item)) {
                supCount++;
                if (supCount == minsupRelative) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Check if an item appears in at least minsup transactions of a list of transactions.
     * The scan stops as soon as the count reaches minsup.
     *
     * @param transactions a list of transactions
     * @param item an item
     * @return true if the count of transactions containing the item reaches minsup
     */
    private boolean isItemInAtLeastMinsupTransactions(List<Transaction> transactions, Integer item) {
        int supCount = 0;
        for (Transaction transaction : transactions) {
            if (transaction.containsByBinarySearch(item) != -1) {
                supCount++;
                if (supCount == minsupRelative) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Check if an item appears in all transactions after the first one in a list of transactions.
     *
     * @param transactions a list of transactions
     * @param item an item
     * @return true if the item appears in all transactions after the first one
     */
    private boolean isItemInAllTransactionsExceptFirst(List<Transaction> transactions, Integer item) {
        for (int i = 1; i < transactions.size(); i++) {
            if (!transactions.get(i).containsByBinarySearchOriginalTransaction(item)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if an item appears in all transactions of a list of transactions.
     *
     * @param transactions a list of transactions
     * @param item an item
     * @return true if the item appears in all transactions
     */
    private boolean isItemInAllTransactions(List<Transaction> transactions, Integer item) {
        for (Transaction transaction : transactions) {
            if (transaction.containsByBinarySearch(item) == -1) {
                return false;
            }
        }
        return true;
    }

    /**
     * Save an itemset to file or memory depending on what the user chose.
     * The empty set is never saved.
     *
     * @param itemset the itemset
     * @param support the support of the itemset
     * @throws IOException if error while writing to output file
     */
    private void output(List<Integer> itemset, int support) throws IOException {
        // the empty set is ignored
        if (itemset.isEmpty()) {
            return;
        }

        frequentCount++;

        if (writer == null) {
            // Save to memory.
            // This is not too optimized since the itemset is converted from a
            // List<Integer>, but this cost is still quite small.
            closedFrequentItemsets.addItemset(new Itemset(itemset, support), itemset.size());
            return;
        }

        // Save to file: items separated by single spaces, followed by the support
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < itemset.size(); i++) {
            if (i > 0) {
                buffer.append(' ');
            }
            buffer.append(itemset.get(i));
        }
        buffer.append(" #SUP: ");
        buffer.append(support);

        // write the line and start a new one, ready for the next itemset
        writer.write(buffer.toString());
        writer.newLine();
    }

    /**
     * Print statistics about the latest execution of the algorithm.
     */
    public void printStats() {
        if (mineAllFrequentItemsets) {
            System.out.println("========== LCMFreq - STATS ============");
            System.out.println(" Freq. itemsets count: " + frequentCount);
        } else if (mineAllMaximalItemsets) {
            System.out.println("========== LCMMax - STATS ============");
            System.out.println(" Freq. maximal itemsets count: " + frequentCount);
        } else {
            System.out.println("========== LCM - STATS ============");
            System.out.println(" Freq. closed itemsets count: " + frequentCount);
        }
        System.out.println(" Total time ~: " + (endTimestamp - startTimestamp)
                + " ms");
        System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory());
        System.out.println("=====================================");
    }
}