package ca.pfv.spmf.algorithms.frequentpatterns.uapriori;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
/**
* This is an implementation of the U-Apriori algorithm as described by :<br/><br/>
*
* Chui, C., Kao, B., Hung, E. (2007), Mining Frequent Itemsets from Uncertain Data, PAKDD 2007, pp 47-58.
*
* @see ItemUApriori
* @see UncertainTransactionDatabase
* @see ItemsetUApriori
* @see ItemsetsUApriori
* @author Philippe Fournier-Viger
*/
public class AlgoUApriori {
// this is the database
protected UncertainTransactionDatabase database;
// variable indicating the current level for the Apriori generation
// (itemsets of size k)
protected int k;
// stats
protected int totalCandidateCount = 0; // number of candidates generated
protected int databaseScanCount = 0; // number of database scan
protected long startTimestamp; // start time of latest execution
protected long endTimestamp; // end time of latest execution
private int itemsetCount; // the number of itemsets found
// write to file
BufferedWriter writer = null;
/**
* Constructor
* @param database the database for applying this algorithm
*/
public AlgoUApriori(UncertainTransactionDatabase database) {
this.database = database;
}
/**
* Run this algorithm
* @param minsupp a minimum support threshold
* @param output the output file path for writing the result
* @throws IOException exception if error reading/writing files
*/
public void runAlgorithm(double minsupp, String output) throws IOException {
// record start time
startTimestamp = System.currentTimeMillis();
// reset variables for statistics
totalCandidateCount = 0;
databaseScanCount = 0;
itemsetCount=0;
// prepare the output file
writer = new BufferedWriter(new FileWriter(output));
// Generate candidates with size k = 1 (all itemsets of size 1)
k=1;
Set<ItemsetUApriori> candidatesSize1 = generateCandidateSize1();
// increase the number of candidates generated
totalCandidateCount+=candidatesSize1.size();
// calculate the support of each candidate of size 1
// by scanning the database
calculateSupportForEachCandidate(candidatesSize1);
// To build level 1, we keep only the frequent candidates.
// We scan the database one time to calculate the support of each candidate.
Set<ItemsetUApriori> level = createLevelWithFrequentCandidates(minsupp,
candidatesSize1);
// Now this is the recursive step
// itemsets of size k will be generated recursively starting from k=2
// by using itemsets of size k-1 until no candidates
// can be generated
k = 2;
// While the level is not empty
while (!level.isEmpty() ) {
// Generate candidates of size K
Set<ItemsetUApriori> candidatesK = generateCandidateSizeK(level);
// increase the candidate count
totalCandidateCount+=candidatesK.size();
// We scan the database one time to calculate the support
// of each candidates.
calculateSupportForEachCandidate(candidatesK);
// We build the level k+1 with all the candidates that have
// a support higher than the minsup threshold.
Set<ItemsetUApriori> levelK = createLevelWithFrequentCandidates(
minsupp, candidatesK);
level = levelK; // We keep only the last level...
k++;
}
// close the output file
writer.close();
// record end time
endTimestamp = System.currentTimeMillis();
}
/**
* Save an itemset to the output file.
* @param itemset the itemset
* @throws IOException exception if error writing the itemset to the file
*/
private void saveItemsetToFile(ItemsetUApriori itemset) throws IOException{
writer.write(itemset.toString() + " Support: " + itemset.getExpectedSupport());
writer.newLine();
itemsetCount++;
}
/**
* Take a set of candidates and compare them with the min expected support to keep
* only the itemset meeting that minimum threshold.
* @param minsupp the minimum expected threshold
* @param candidatesK a set of itemsets of size k
* @return the set of frequent itemsets of size k
* @throws IOException exception if error writing output file
*/
protected Set<ItemsetUApriori> createLevelWithFrequentCandidates(double minsupp,Set<ItemsetUApriori> candidatesK) throws IOException {
Set<ItemsetUApriori> levelK = new HashSet<ItemsetUApriori>();
// for each itemset
for (ItemsetUApriori candidate : candidatesK) {
// check if it has enough support
if (candidate.getExpectedSupport() >= minsupp) {
// if yes add it to the set of frequent itemset of size k
levelK.add(candidate);
// save the itemset to the output file
saveItemsetToFile(candidate);
}
}
// return frequent k-itemsets
return levelK;
}
/**
* Calculate the support of a set of candidates by scanning the database.
* @param candidatesK a set of candidates of size k
*/
protected void calculateSupportForEachCandidate(
Set<ItemsetUApriori> candidatesK) {
// increase database scan count
databaseScanCount++;
// for each transaction
for (ItemsetUApriori transaction : database.getTransactions()) {
// For each candidate of level K, we increase its support
// if it is included in the transaction.
// for each candidate
candidateLoop : for (ItemsetUApriori candidate : candidatesK) {
// initialize the expected support to 0
double expectedSupport = 0;
// for each item in candidate we will try to find it
for(ItemUApriori item : candidate.getItems()){
boolean found = false;
// for each item in the transaction
for(ItemUApriori itemT : transaction.getItems()){
// if we found the item
if(item.getId() == itemT.getId()){
found = true;
// update the expected support
if(expectedSupport == 0){
expectedSupport = itemT.getProbability();
}else{
expectedSupport *= itemT.getProbability();
}
break;
}
// if the lexical order is not respected then it is impossible
// that this item will be in this transaction so
// we stop searching for that item
else if (item.getId() < itemT.getId()){
break;
}
}
// if the last item that we searched was not found
// then the full itemset is not here, so we stop
if(found == false){
continue candidateLoop;
}
}
// If the candidate itemset was completely found,
// we increase the support of the candidate its calculated
// expected support.
candidate.increaseSupportBy(expectedSupport);
}
}
}
/**
* Generate candidate itemsets containing a single item.
* @return a set of candidate itemsets
*/
protected Set<ItemsetUApriori> generateCandidateSize1() {
// create the set of candidates as empty
Set<ItemsetUApriori> candidates = new HashSet<ItemsetUApriori>();
// for each item
for (ItemUApriori item : database.getAllItems()) {
// simply add it to the set of candidates
ItemsetUApriori itemset = new ItemsetUApriori();
itemset.addItem(item);
candidates.add(itemset);
}
return candidates;
}
/**
* Generate candidate itemsets of size K by using HWTUIs of size k-1
* @param levelK_1 itemsets of size k-1
* @return candidates of size K
*/
protected Set<ItemsetUApriori> generateCandidateSizeK(Set<ItemsetUApriori> levelK_1) {
// a set to store candidates
Set<ItemsetUApriori> candidates = new HashSet<ItemsetUApriori>();
// For each itemset I1 and I2 of level k-1
Object[] itemsets = levelK_1.toArray();
for(int i=0; i< levelK_1.size(); i++){
ItemsetUApriori itemset1 = (ItemsetUApriori)itemsets[i];
for(int j=0; j< levelK_1.size(); j++){
ItemsetUApriori itemset2 = (ItemsetUApriori)itemsets[j];
// If I1 is smaller than I2 according to lexical order and
// they share all the same items except the last one.
ItemUApriori missing = itemset1.allTheSameExceptLastItem(itemset2);
if(missing != null ){
// Then, create a new candidate by combining itemset1 and itemset2
ItemsetUApriori candidate = new ItemsetUApriori();
for(ItemUApriori item : itemset1.getItems()){
candidate.addItem(item);
}
candidate.addItem(missing);
// The candidate is tested to see if its subsets of size k-1 are included in
// level k-1 (they are frequent).
if(allSubsetsOfSizeK_1AreFrequent(candidate,levelK_1)){
// if it pass the test, add it to the set of candidates
candidates.add(candidate);
}
}
}
}
// return the set of candidates
return candidates;
}
/**
* Check if all subsets of size k-1 of a candidate itemset of size k are frequent.
* @param candidate the candidate itemset
* @param levelK_1 frequent itemsets of size k-1
* @return true if all subsets are frequent, otherwise false
*/
protected boolean allSubsetsOfSizeK_1AreFrequent(ItemsetUApriori candidate, Set<ItemsetUApriori> levelK_1) {
// To generate all the set of size K-1, we will proceed
// by removing each item, one by one.
//if only one item, return true because the empty set is always frequent
if(candidate.size() == 1){
return true;
}
// for each item
for(ItemUApriori item : candidate.getItems()){
// copy the itemset without this item to get a suset
ItemsetUApriori subset = candidate.cloneItemSetMinusOneItem(item);
boolean found = false;
// we scan itemsets of size k-1
for(ItemsetUApriori itemset : levelK_1){
// if we found the subset, then set found to true
// and stop this loop
if(itemset.isEqualTo(subset)){
found = true;
break;
}
}
// if the subset was not found, then we return false
if(found == false){
return false;
}
}
// all the subsets were found, so we return true
return true;
}
/**
* Print statistics about the latest execution.
*/
public void printStats() {
System.out
.println("============= U-APRIORI - STATS =============");
long temps = endTimestamp - startTimestamp;
// System.out.println(" Total time ~ " + temps + " ms");
System.out.println(" Transactions count from database : "
+ database.size());
System.out.println(" Candidates count : " + totalCandidateCount);
System.out.println(" Database scan count : " + databaseScanCount);
System.out.println(" The algorithm stopped at size " + (k - 1)
+ ", because there is no candidate");
System.out.println(" Uncertain itemsets count : " + itemsetCount);
System.out.println(" Total time ~ " + temps + " ms");
System.out
.println("===================================================");
}
}