package ca.pfv.spmf.algorithms.frequentpatterns.aprioriTID;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the AprioriTID algorithm.<br/><br/>
*
* The AprioriTID algorithm finds all the frequent itemsets and their support
* in a binary context (transaction database).<br/><br/>
*
* AprioriTID can be faster than Apriori and produce the same result.
* <br/><br/>
*
* AprioriTID was originally proposed in :<br/><br/>
*
* Agrawal R, Srikant R. "Fast Algorithms for Mining Association Rules", VLDB.
* Sep 12-15 1994, Chile, 487-99,<br/><br/>
*
* This implementation can save the result to a file or keep
* it into memory if no output path is provided to the runAlgorithm() method.
* <br/><br/>
*
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoAprioriTID {

    /** The current level k of the level-wise search. */
    protected int k;

    /** Maps each item to the set of ids (tids) of the transactions containing it. */
    Map<Integer, Set<Integer>> mapItemTIDS = new HashMap<Integer, Set<Integer>>();

    /** The minimum support threshold as an absolute number of transactions. */
    int minSuppRelative;

    /** Special parameter to set the maximum size of itemsets to be discovered. */
    int maxItemsetSize = Integer.MAX_VALUE;

    /** Start time of the latest execution (milliseconds). */
    long startTimestamp = 0;

    /** End time of the latest execution (milliseconds). */
    long endTimeStamp = 0;

    /** Writer used if the user chose to save the result to a file (null otherwise). */
    BufferedWriter writer = null;

    /** Result container used if the user chose to keep the result in memory (null otherwise). */
    protected Itemsets patterns = null;

    /** The number of frequent itemsets found. */
    private int itemsetCount = 0;

    /** The number of transactions in the database. */
    private int databaseSize = 0;

    /**
     * The current transaction database, if the user has provided one
     * instead of an input file.
     */
    private TransactionDatabase database = null;

    /** Indicates if the empty set should be added to the results. */
    private boolean emptySetIsRequired = false;

    /**
     * Default constructor.
     */
    public AlgoAprioriTID() {
    }

    /**
     * Run the algorithm on a transaction database already in memory.
     *
     * @param database the transaction database
     * @param minsup   the minimum support threshold as a percentage (double)
     * @return the frequent itemsets found
     * @throws IOException if an error occurs while writing the output file
     */
    public Itemsets runAlgorithm(TransactionDatabase database, double minsup)
            throws NumberFormatException, IOException {
        // remember the transaction database received as parameter
        this.database = database;
        try {
            // delegate to the main runAlgorithm() method
            return runAlgorithm(null, null, minsup);
        } finally {
            // forget the database so it can be garbage-collected,
            // even if an exception was thrown during the run
            this.database = null;
        }
    }

    /**
     * Run the algorithm.
     *
     * @param input  the file path of an input file; if null, the in-memory
     *               database previously provided is used instead
     * @param output the output file path; if null, the result is kept in
     *               memory and returned by this method
     * @param minsup the minimum support threshold as a percentage (double)
     * @return the frequent itemsets if no output file path was provided,
     *         otherwise null
     * @throws IOException if an error occurs while reading/writing a file
     */
    public Itemsets runAlgorithm(String input, String output, double minsup)
            throws NumberFormatException, IOException {
        // record start time
        startTimestamp = System.currentTimeMillis();
        // reset number of itemsets found
        itemsetCount = 0;

        if (output == null) {
            // the user wants to keep the result in memory
            writer = null;
            patterns = new Itemsets("FREQUENT ITEMSETS");
        } else {
            // the user wants to save the result to a file
            patterns = null;
            writer = new BufferedWriter(new FileWriter(output));
        }

        // (1) count the tidset of each item in one database pass
        mapItemTIDS = new HashMap<Integer, Set<Integer>>(); // item -> tids
        databaseSize = 0;
        if (database != null) {
            scanInMemoryDatabase();
        } else {
            scanInputFile(input);
        }

        // if the user wants the empty set in the results
        if (emptySetIsRequired) {
            if (patterns != null) {
                // add the empty set to the set of patterns kept in memory
                patterns.addItemset(new Itemset(new int[] {}), 0);
            } else {
                // FIX: previously this path dereferenced the null "patterns"
                // field (NullPointerException) when saving to a file.
                // The empty set occurs in every transaction, so its support
                // is the number of transactions.
                writer.write(" #SUP: " + databaseSize);
                writer.newLine();
            }
        }

        // convert the minimum support from a relative value (%) to an
        // absolute number of transactions
        this.minSuppRelative = (int) Math.ceil(minsup * databaseSize);

        // (2) build level 1 by keeping only the frequent items
        k = 1;
        List<Itemset> level = buildLevel1();

        // (3) generate levels k = 2, 3, ... while candidates survive
        k = 2;
        while (!level.isEmpty() && k <= maxItemsetSize) {
            // build level k with all the candidates having a support
            // no less than the minsup threshold
            level = generateCandidateSizeK(level);
            k++;
        }

        // close the output file if the result was saved to a file
        if (writer != null) {
            writer.close();
        }

        // save the end time
        endTimeStamp = System.currentTimeMillis();
        // return frequent itemsets (null if they were written to a file)
        return patterns;
    }

    /**
     * Scan the in-memory transaction database to fill mapItemTIDS with the
     * tidset of each item, and count the transactions into databaseSize.
     */
    private void scanInMemoryDatabase() {
        // for each transaction
        for (List<Integer> transaction : database.getTransactions()) {
            // for each item of the transaction
            for (int item : transaction) {
                addTidForItem(item, databaseSize);
            }
            databaseSize++; // increment the tid number
        }
    }

    /**
     * Scan the input file to fill mapItemTIDS with the tidset of each item,
     * and count the transactions into databaseSize.
     * Each line of the file is a transaction: items separated by spaces.
     *
     * @param input the input file path
     * @throws IOException if an error occurs while reading the file
     */
    private void scanInputFile(String input) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try {
            String line;
            while ((line = reader.readLine()) != null) { // for each transaction
                // skip empty lines, comments and metadata lines
                if (line.isEmpty() || line.charAt(0) == '#'
                        || line.charAt(0) == '%' || line.charAt(0) == '@') {
                    continue;
                }
                // split the line into tokens according to spaces
                for (String token : line.split(" ")) {
                    addTidForItem(Integer.parseInt(token), databaseSize);
                }
                databaseSize++; // increment the tid number
            }
        } finally {
            // FIX: close the reader even if an exception (e.g. a
            // NumberFormatException on a malformed token) is thrown
            reader.close();
        }
    }

    /**
     * Record that the transaction with the given tid contains the given item.
     *
     * @param item the item
     * @param tid  the transaction id
     */
    private void addTidForItem(int item, int tid) {
        // get the set of tids of this item seen until now
        Set<Integer> tids = mapItemTIDS.get(item);
        // if none yet, create a new set
        if (tids == null) {
            tids = new HashSet<Integer>();
            mapItemTIDS.put(item, tids);
        }
        tids.add(tid);
    }

    /**
     * Build the list of frequent itemsets of size 1 from mapItemTIDS, save
     * each of them, drop infrequent items from mapItemTIDS, and sort the
     * result in lexicographical order.
     *
     * @return the frequent itemsets of size 1, sorted
     * @throws IOException if an error occurs while writing the output file
     */
    private List<Itemset> buildLevel1() throws IOException {
        List<Itemset> level = new ArrayList<Itemset>();
        // for each item
        Iterator<Entry<Integer, Set<Integer>>> iterator = mapItemTIDS.entrySet().iterator();
        while (iterator.hasNext()) {
            // check memory usage
            MemoryLogger.getInstance().checkMemory();
            Entry<Integer, Set<Integer>> entry = iterator.next();
            // the support of an item is the size of its tidset
            if (entry.getValue().size() >= minSuppRelative) {
                // the item is frequent: keep it as an itemset of size 1
                Itemset itemset = new Itemset(entry.getKey());
                itemset.setTIDs(entry.getValue());
                level.add(itemset);
                // save the itemset
                saveItemset(itemset);
            } else {
                // the item is not frequent: remove it so that
                // it does not stay in memory
                iterator.remove();
            }
        }
        // sort itemsets of size 1 according to lexicographical order
        Collections.sort(level, new Comparator<Itemset>() {
            public int compare(Itemset o1, Itemset o2) {
                // overflow-safe comparison (avoids the "o1 - o2" idiom)
                int a = o1.get(0);
                int b = o2.get(0);
                return a < b ? -1 : (a == b ? 0 : 1);
            }
        });
        return level;
    }

    /**
     * Generate candidate itemsets of size k from the frequent itemsets of
     * size k-1, keeping (and saving) only the frequent candidates.
     *
     * @param levelK_1 frequent itemsets of size k-1
     * @return the frequent itemsets of size k
     * @throws IOException if an error occurs while writing the output file
     */
    protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1)
            throws IOException {
        // a variable to store the candidates
        List<Itemset> candidates = new ArrayList<Itemset>();

        // For each pair of itemsets I1, I2 of level k-1
        loop1: for (int i = 0; i < levelK_1.size(); i++) {
            Itemset itemset1 = levelK_1.get(i);
            loop2: for (int j = i + 1; j < levelK_1.size(); j++) {
                Itemset itemset2 = levelK_1.get(j);

                // Compare the items of itemset1 and itemset2: they are
                // combined into a candidate only if they share their first
                // k-1 items and the last item of itemset1 is smaller than
                // the last item of itemset2 (lexical order).
                for (int k = 0; k < itemset1.size(); k++) {
                    if (k == itemset1.size() - 1) {
                        // last position: the item of itemset1 must be
                        // strictly smaller than the one of itemset2
                        if (itemset1.get(k) >= itemset2.get(k)) {
                            continue loop1;
                        }
                    } else if (itemset1.get(k) < itemset2.get(k)) {
                        // the k-th item is smaller in itemset1:
                        // try the next itemset2
                        continue loop2;
                    } else if (itemset1.get(k) > itemset2.get(k)) {
                        // the k-th item is larger in itemset1: because of
                        // the lexical order, no later itemset2 can match,
                        // so move on to the next itemset1
                        continue loop1;
                    }
                }

                // compute the tids common to itemset1 and itemset2;
                // the support of the candidate is the intersection size
                Set<Integer> commonTids = new HashSet<Integer>(itemset1.getTransactionsIds());
                commonTids.retainAll(itemset2.getTransactionsIds());

                // if the combination of itemset1 and itemset2 is frequent
                if (commonTids.size() >= minSuppRelative) {
                    // create the candidate by appending the last item of
                    // itemset2 to the items of itemset1
                    int[] newItemset = new int[itemset1.size() + 1];
                    System.arraycopy(itemset1.itemset, 0, newItemset, 0, itemset1.size());
                    newItemset[itemset1.size()] = itemset2.get(itemset2.size() - 1);
                    Itemset candidate = new Itemset(newItemset);
                    candidate.setTIDs(commonTids);
                    // add it to the list of candidates
                    candidates.add(candidate);
                    // save it
                    saveItemset(candidate);
                }
            }
        }
        return candidates;
    }

    /**
     * Set the maximum size of the itemsets to be found.
     *
     * @param maxItemsetSize maximum itemset size
     */
    public void setMaxItemsetSize(int maxItemsetSize) {
        this.maxItemsetSize = maxItemsetSize;
    }

    /**
     * Save a frequent itemset to the output file or memory,
     * depending on what the user chose.
     *
     * @param itemset the itemset
     * @throws IOException if an error occurs while writing the output file
     */
    void saveItemset(Itemset itemset) throws IOException {
        itemsetCount++;
        if (writer != null) {
            // the result should be saved to a file
            writer.write(itemset.toString() + " #SUP: "
                    + itemset.getTransactionsIds().size());
            writer.newLine();
        } else {
            // otherwise the result is kept in memory
            patterns.addItemset(itemset, itemset.size());
        }
    }

    /**
     * Indicate whether the empty set should be included in the results.
     *
     * @param emptySetIsRequired if true, the empty set will be included
     */
    public void setEmptySetIsRequired(boolean emptySetIsRequired) {
        this.emptySetIsRequired = emptySetIsRequired;
    }

    /**
     * Print statistics about the latest algorithm execution to System.out.
     */
    public void printStats() {
        // FIX: the header previously said "APRIORI" although this
        // implementation is AprioriTID
        System.out.println("============= APRIORI-TID - STATS =============");
        System.out.println(" Transactions count from database : " + databaseSize);
        System.out.println(" Frequent itemsets count : " + itemsetCount);
        System.out.println(" Maximum memory usage : " +
                MemoryLogger.getInstance().getMaxMemory() + " mb");
        System.out.println(" Total time ~ " + (endTimeStamp - startTimestamp)
                + " ms");
        System.out
                .println("===================================================");
    }

    /**
     * Get the number of transactions in the last database read.
     *
     * @return the number of transactions
     */
    public int getDatabaseSize() {
        return databaseSize;
    }
}