package ca.pfv.spmf.algorithms.frequentpatterns.aprioriTIDClose;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the AprioriTID algorithm transformed to mine
* only frequent closed itemsets as proposed by Pasquier (1999), rather than all
* frequent itemsets.<br/><br/>
*
* AprioriTID was originally proposed in :<br/><br/>
*
* Agrawal R, Srikant R. "Fast Algorithms for Mining Association Rules", VLDB.
* Sep 12-15 1994, Chile, 487-499.<br/><br/>
*
* Modifying Apriori to mine closed itemsets was proposed in: <br/><br/>
*
* Pasquier, N., Bastide, Y., Taouil, R., & Lakhal, L. (1999).
* Discovering frequent closed itemsets for association rules.
* In Database Theory - ICDT'99 (pp. 398-416). Springer Berlin Heidelberg.<br/><br/>
*
* This implementation can save the result to a file or keep
* it into memory if no output path is provided to the runAlgorithm() method.
*
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoAprioriTIDClose {

    /** writer used if the user chose to save the result to a file (null otherwise) */
    BufferedWriter writer = null;

    /** structure storing the result if the user chose to keep it in memory (null otherwise) */
    protected Itemsets patterns = null;

    /** the number of transactions in the last database that was mined */
    private int databaseSize = 0;

    /** the size (item count) of the itemsets currently being generated */
    protected int k;

    /** map of each item to its tidset (key: item, value: ids of transactions containing it) */
    Map<Integer, Set<Integer>> mapItemTIDS = new HashMap<Integer, Set<Integer>>();

    /** the minimum support threshold, as an absolute number of transactions */
    int minSuppRelative;

    /** special parameter to set the maximum size of itemsets to be discovered */
    int maxItemsetSize = Integer.MAX_VALUE;

    /** start time of the latest execution, in milliseconds */
    long startTimestamp = 0;

    /** end time of the latest execution, in milliseconds */
    long endTimestamp = 0;

    /** number of frequent closed itemsets found by the latest execution */
    int itemsetCount = 0;

    /**
     * Default constructor
     */
    public AlgoAprioriTIDClose() {
    }

    /**
     * Run the algorithm.
     *
     * @param database the transaction database to be mined
     * @param minsupp the minimum support threshold as a fraction of the database size
     * @param outputFile an output file path if the result should be saved to a file;
     *        otherwise leave it null and this method will keep the result in memory
     *        and return it
     * @return the set of itemsets found if the user chose to save the result to memory,
     *         null if it was written to the output file
     * @throws IOException exception if error writing the output file
     */
    public Itemsets runAlgorithm(TransactionDatabase database, double minsupp,
            String outputFile) throws IOException {
        // record start time
        startTimestamp = System.currentTimeMillis();
        // reset the number of itemsets found
        itemsetCount = 0;

        // prepare the output destination chosen by the user
        if (outputFile == null) {
            // the user wants to keep the result in memory
            writer = null;
            patterns = new Itemsets("FREQUENT CLOSED ITEMSETS");
        } else {
            // the user wants to save the result to a file
            patterns = null;
            writer = new BufferedWriter(new FileWriter(outputFile));
        }

        try {
            // convert the fractional threshold to an absolute number of transactions
            minSuppRelative = (int) Math.ceil(minsupp * database.size());
            if (minSuppRelative == 0) {
                // protection: an itemset must appear in at least one transaction
                minSuppRelative = 1;
            }

            // (1) build the tidset of each item in one database pass
            // (key: item, value: tidset of the item)
            mapItemTIDS = new HashMap<Integer, Set<Integer>>();
            // for each transaction
            for (int j = 0; j < database.getTransactions().size(); j++) {
                List<Integer> transaction = database.getTransactions().get(j);
                // for each item in the transaction, add this tid to its tidset
                for (int i = 0; i < transaction.size(); i++) {
                    Integer item = transaction.get(i);
                    Set<Integer> tids = mapItemTIDS.get(item);
                    if (tids == null) {
                        tids = new HashSet<Integer>();
                        mapItemTIDS.put(item, tids);
                    }
                    tids.add(j);
                }
            }
            // save the database size for the statistics
            databaseSize = database.getTransactions().size();

            // (2) build level 1 by keeping only the frequent items
            k = 1;
            List<Itemset> level = new ArrayList<Itemset>();
            Iterator<Entry<Integer, Set<Integer>>> iterator = mapItemTIDS
                    .entrySet().iterator();
            while (iterator.hasNext()) {
                // check memory usage
                MemoryLogger.getInstance().checkMemory();
                Entry<Integer, Set<Integer>> entry = iterator.next();
                if (entry.getValue().size() >= minSuppRelative) {
                    // the item is frequent: create an itemset of size 1 for it
                    Itemset itemset = new Itemset(entry.getKey());
                    itemset.setTIDs(entry.getValue());
                    level.add(itemset);
                } else {
                    // the item is not frequent, so we don't need to keep it in memory
                    iterator.remove();
                }
            }

            // sort itemsets of size 1 according to lexicographical order,
            // as required by the candidate generation of the next step
            Collections.sort(level, new Comparator<Itemset>() {
                public int compare(Itemset o1, Itemset o2) {
                    // explicit comparison instead of subtraction, to rule out overflow
                    int item1 = o1.get(0);
                    int item2 = o2.get(0);
                    return (item1 < item2) ? -1 : ((item1 == item2) ? 0 : 1);
                }
            });

            // (3) generate the levels k = 2, 3, ... while candidates remain
            k = 2;
            while (!level.isEmpty() && k <= maxItemsetSize) {
                // build level k with all candidates of size k having a support
                // higher than the minsup threshold
                List<Itemset> levelK = generateCandidateSizeK(level);
                // an itemset of size k-1 is closed iff no superset of size k has
                // the same support; check level k-1 and save the closed itemsets
                checkIfItemsetsK_1AreClosed(level, levelK);
                // we only need to keep the last level
                level = levelK;
                k++;
            }

            // save end time
            endTimestamp = System.currentTimeMillis();
        } finally {
            // close the output file even if an exception occurred while mining
            if (writer != null) {
                writer.close();
            }
        }
        // return all frequent closed itemsets found (null if saved to a file)
        return patterns;
    }

    /**
     * Remove items that are not frequent from the transaction database.
     * NOTE(review): this helper is currently not called by runAlgorithm();
     * it is kept unchanged for reference.
     *
     * @param database a transaction database
     * @return a map indicating the tidset of each item (key: item, value: tidset)
     */
    private Map<Integer, Set<Integer>> removeItemsThatAreNotFrequent(
            TransactionDatabase database) {
        // (1) count the tidset of each item in the database in one database pass
        // (key: item, value: tidset)
        mapItemTIDS = new HashMap<Integer, Set<Integer>>();
        // for each transaction
        for (int j = 0; j < database.getTransactions().size(); j++) {
            List<Integer> transaction = database.getTransactions().get(j);
            // for each item, update its tidset
            for (int i = 0; i < transaction.size(); i++) {
                Integer item = transaction.get(i);
                Set<Integer> tids = mapItemTIDS.get(item);
                if (tids == null) {
                    tids = new HashSet<Integer>();
                    mapItemTIDS.put(item, tids);
                }
                tids.add(j);
            }
        }
        System.out.println("NUMBER OF DIFFERENT ITEMS : " + mapItemTIDS.size());
        // (2) remove all items that are not frequent from the transactions
        for (int j = 0; j < database.getTransactions().size(); j++) {
            List<Integer> transaction = database.getTransactions().get(j);
            // iterate so that infrequent items can be removed in place
            Iterator<Integer> iter = transaction.iterator();
            while (iter.hasNext()) {
                Integer nextItem = iter.next();
                Set<Integer> tids = mapItemTIDS.get(nextItem);
                // if the item is not frequent, remove it from the transaction
                if (tids.size() < minSuppRelative) {
                    iter.remove();
                }
            }
        }
        return mapItemTIDS;
    }

    /**
     * Check if the itemsets of size k-1 are closed by comparing them with the
     * itemsets of size k, and save the closed ones.
     *
     * @param levelKm1 itemsets of size k-1
     * @param levelK itemsets of size k
     * @throws IOException exception if error writing the output file
     */
    private void checkIfItemsetsK_1AreClosed(Collection<Itemset> levelKm1,
            List<Itemset> levelK) throws IOException {
        // for each itemset of size k-1
        for (Itemset itemset : levelKm1) {
            // assume that it is closed until a counter-example is found
            boolean isClosed = true;
            // compare this itemset with all itemsets of size k
            for (Itemset itemsetK : levelK) {
                // if an itemset of size k has the same support and contains the
                // itemset of size k-1, then the latter is not closed
                if (itemsetK.getAbsoluteSupport() == itemset.getAbsoluteSupport()
                        && itemsetK.containsAll(itemset)) {
                    isClosed = false;
                    break;
                }
            }
            // the itemset of size k-1 is closed, so save it
            if (isClosed) {
                saveItemset(itemset);
            }
        }
    }

    /**
     * Save a frequent closed itemset to the output file or to memory,
     * depending on what the user chose.
     *
     * @param itemset the itemset to be saved
     * @throws IOException exception if error writing the output file
     */
    void saveItemset(Itemset itemset) throws IOException {
        itemsetCount++;
        if (writer != null) {
            // the result is saved to a file
            writer.write(itemset.toString() + " #SUP: "
                    + itemset.getTransactionsIds().size());
            writer.newLine();
        } else {
            // otherwise the result is kept in memory
            patterns.addItemset(itemset, itemset.size());
        }
    }

    /**
     * Generate the frequent candidate itemsets of size k from the frequent
     * itemsets of size k-1, which must be in lexicographical order.
     *
     * @param levelK_1 frequent itemsets of size k-1
     * @return the frequent itemsets of size k
     */
    protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1) {
        // the candidates found so far
        List<Itemset> candidates = new ArrayList<Itemset>();
        // For each pair of itemsets I1, I2 of level k-1 (I2 after I1)
        loop1: for (int i = 0; i < levelK_1.size(); i++) {
            Itemset itemset1 = levelK_1.get(i);
            loop2: for (int j = i + 1; j < levelK_1.size(); j++) {
                Itemset itemset2 = levelK_1.get(j);
                // Compare the items of itemset1 and itemset2 position by position.
                // They are combined only if they share the same k-2 first items and
                // the last item of itemset1 is smaller than the last item of itemset2.
                // (the loop variable is named "pos" so it does not shadow the field k)
                for (int pos = 0; pos < itemset1.size(); pos++) {
                    if (pos == itemset1.size() - 1) {
                        // last position: the item of itemset1 must be strictly smaller
                        // (lexical order) than the one of itemset2
                        if (itemset1.getItems()[pos] >= itemset2.get(pos)) {
                            continue loop1;
                        }
                    } else if (itemset1.getItems()[pos] < itemset2.get(pos)) {
                        // prefixes diverge: try the next itemset2
                        continue loop2;
                    } else if (itemset1.getItems()[pos] > itemset2.get(pos)) {
                        // because of the lexical order, no later itemset2 can match
                        continue loop1;
                    }
                }

                // NOW COMBINE ITEMSET1 AND ITEMSET2:
                // compute the set of tids common to both itemsets
                Set<Integer> commonTids = new HashSet<Integer>();
                for (Integer tid : itemset1.getTransactionsIds()) {
                    if (itemset2.getTransactionsIds().contains(tid)) {
                        commonTids.add(tid);
                    }
                }
                // if the combination of itemset1 and itemset2 is frequent
                if (commonTids.size() >= minSuppRelative) {
                    // create the candidate: the items of itemset1 plus the last
                    // item of itemset2
                    int newItemset[] = new int[itemset1.size() + 1];
                    System.arraycopy(itemset1.itemset, 0, newItemset, 0,
                            itemset1.size());
                    newItemset[itemset1.size()] = itemset2.getItems()[itemset2.size() - 1];
                    Itemset candidate = new Itemset(newItemset);
                    candidate.setTIDs(commonTids);
                    candidates.add(candidate);
                }
            }
        }
        return candidates;
    }

    /**
     * Get the frequent closed itemsets found by the latest execution
     * (null if the result was saved to a file).
     *
     * @return Itemsets
     */
    public Itemsets getFrequentClosed() {
        return patterns;
    }

    /**
     * Set the maximum size of the itemsets to be found.
     *
     * @param maxItemsetSize maximum itemset size
     */
    public void setMaxItemsetSize(int maxItemsetSize) {
        this.maxItemsetSize = maxItemsetSize;
    }

    /**
     * Print statistics about the latest algorithm execution to System.out.
     */
    public void printStats() {
        System.out
                .println("============= APRIORI-CLOSE - STATS =============");
        long temps = endTimestamp - startTimestamp;
        System.out.println(" Transactions count from database : "
                + databaseSize);
        System.out.println(" The algorithm stopped at size " + (k - 1)
                + ", because there is no candidate");
        System.out.println(" Frequent closed itemsets count : "
                + itemsetCount);
        System.out.println(" Maximum memory usage : " +
                MemoryLogger.getInstance().getMaxMemory() + " mb");
        System.out.println(" Total time ~ " + temps + " ms");
        System.out
                .println("===================================================");
    }
}