package ca.pfv.spmf.algorithms.frequentpatterns.apriori_HT;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.algorithms.frequentpatterns.apriori_HT.ItemsetHashTree.LeafNode;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the Apriori algorithm that uses a hash-tree to
* store candidates, calculate their support and generate candidates efficiently.
* The other version (AlgoApriori) does not use a hash-tree.
* <br/><br/>
*
* The Apriori algorithm is described in :
* <br/><br/>
*
* Agrawal R., Srikant R. (1994). "Fast Algorithms for Mining Association Rules",
* Proceedings of the 20th VLDB Conference, Sept. 12-15 1994, Santiago, Chile, pp. 487-499.
* <br/><br/>
*
* The Apriori algorithm finds all the frequent itemsets and their support in a
* transaction database.
* <br/><br/>
*
* Note that the performance of the hash-tree version of Apriori depends on the branch count
* of the hash-tree. In my tests, I have used a value of 30 because it seems to provide
* the best results, but other values could also be used (see Agrawal & Srikant for details).
* The branch count is set through the hash_tree_branch_count parameter of runAlgorithm()
* and is passed to the ItemsetHashTree class.
*
*
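* A minimal usage sketch (the file names and parameter values below are
* illustrative placeholders, not files shipped with this class):
* <pre>{@code
* AlgoAprioriHT apriori = new AlgoAprioriHT();
* // minsup = 40% of the transactions, hash-tree branch count = 30
* apriori.runAlgorithm(0.4, "input.txt", "output.txt", 30);
* apriori.printStats();
* }</pre>
*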
* @see Itemset
* @see AbstractOrderedItemsetsAdapter
* @see ItemsetHashTree
* @author Philippe Fournier-Viger
*/
public class AlgoAprioriHT {
// the maximum level (itemset size) reached by Apriori
protected int k;
// For statistics
protected int totalCandidateCount = 0; // total number of candidates generated
protected long startTimestamp; // start time
protected long endTimestamp; // end time
private int itemsetCount; // number of itemsets found
private int hash_tree_branch_count; // the number of branches in the hash tree
// the relative minimum support used to find itemsets
private int minsupRelative;
// an in-memory representation of the transaction database
private List<int[]> database = null;
// write to file
BufferedWriter writer = null;
/**
* Default constructor
*/
public AlgoAprioriHT() {
}
/**
* Run the Apriori-HT algorithm
* @param minsup the minimum support threshold as a percentage of the transaction count (a value in [0,1])
* @param input path to the input file
* @param output path to save the result to an output file
* @param hash_tree_branch_count the number of child nodes for each node in the hash tree
* @throws IOException if an error occurs while reading/writing files
*/
public void runAlgorithm(double minsup, String input, String output, int hash_tree_branch_count) throws IOException {
// record start time
startTimestamp = System.currentTimeMillis();
// prepare object for writing the file
writer = new BufferedWriter(new FileWriter(output));
// reset statistics
itemsetCount = 0;
totalCandidateCount = 0;
MemoryLogger.getInstance().reset();
int transactionCount = 0;
// save the parameter
this.hash_tree_branch_count = hash_tree_branch_count;
// structure to count the support of each item
// Key: item Value: support count
Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>();
// the database in memory (initially empty)
database = new ArrayList<int[]>();
// scan the database to load it into memory and count the support of each single item at the same time
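// Each line is expected to contain the item identifiers (positive integers) of one
// transaction, separated by single spaces, e.g. the line "1 3 4" is a transaction
// containing the items 1, 3 and 4.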
BufferedReader reader = new BufferedReader(new FileReader(input));
String line;
// for each line (transaction) of the input file until the end of file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() ||
line.charAt(0) == '#' || line.charAt(0) == '%'
|| line.charAt(0) == '@') {
continue;
}
// split the line into items
String[] lineSplited = line.split(" ");
// create an array to store the items
int transaction[] = new int[lineSplited.length];
// for each item in the current transaction
for (int i=0; i< lineSplited.length; i++) {
// convert to integer
Integer item = Integer.parseInt(lineSplited[i]);
// add the item to the transaction
transaction[i] = item;
// increase the support count of the item
Integer count = mapItemCount.get(item);
if (count == null) {
mapItemCount.put(item, 1);
} else {
mapItemCount.put(item, ++count);
}
}
// add transaction to the database
database.add(transaction);
// increase the transaction count
transactionCount++;
}
// close the input file
reader.close();
// convert the minimum support percentage to a minimum support count
// (stored in minsupRelative) by multiplying it by the database size
this.minsupRelative = (int) Math.ceil(minsup * transactionCount);
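// For example, with minsup = 0.4 and 5 transactions, minsupRelative = ceil(0.4 * 5) = 2,
// so an itemset must appear in at least 2 transactions to be frequent.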
// System.out.println("database size = " +database.size() + " minsuprel = " + minsupRelative);
// Set variable k=1 because we start with itemsets of size 1
k = 1;
// Create the list of all frequent items of size 1
List<Integer> frequent1 = new ArrayList<Integer>();
// For each item
for(Entry<Integer, Integer> entry : mapItemCount.entrySet()){
// if its support is no less than the minimum support
if(entry.getValue() >= minsupRelative){
// keep the item into memory for generating itemsets of size 2
frequent1.add(entry.getKey());
// and also save it to the output file
saveItemsetToFile(entry.getKey(), entry.getValue());
}
}
mapItemCount = null; // we don't need it anymore
// Sort the list of frequent items of size 1 by lexical order because
// Apriori needs itemsets to be sorted according to a total order.
Collections.sort(frequent1, new Comparator<Integer>() {
public int compare(Integer o1, Integer o2) {
return o1 - o2;
}
});
// if there is no frequent item, we stop there!
if(frequent1.size() == 0){
// record the end time and close the output file before returning
endTimestamp = System.currentTimeMillis();
writer.close();
return;
}
// increase the number of candidates
totalCandidateCount += frequent1.size();
// Now, the algorithm generates frequent itemsets of size K level by level,
// by using frequent itemsets of size K-1, until no more
// candidates can be generated.
k = 2;
// While the level is not empty
int previousItemsetCount = itemsetCount;
// Create a hash-tree for storing candidates, to allow efficient support counting
ItemsetHashTree candidatesK = null;
do{
//check the memory usage
MemoryLogger.getInstance().checkMemory();
// Generate candidates of size K
if(k ==2){
// if K=2, use an optimized version of candidate generation
candidatesK = generateCandidate2(frequent1);
}else{
// Otherwise use the regular candidate generation procedure
candidatesK = generateCandidateSizeK(candidatesK, k);
}
// if no candidates were generated, we stop the algorithm
if(candidatesK.candidateCount ==0 ){
break;
}
// we keep the total number of candidates generated until now
// for statistics purposes
totalCandidateCount += candidatesK.candidateCount;
// We scan the database one time to calculate the support
// of each candidate and keep those with a support no less than minsup.
// This is done efficiently because the candidates are stored in a hash-tree.
for(int[] transaction: database){
// NEW OPTIMIZATION 2013: Skip transactions shorter than k!
if(transaction.length >= k) {
candidatesK.updateSupportCount(transaction);
}
// END OF NEW OPTIMIZATION
}
// We next save to file all the candidates that have a support
// no less than the minsup threshold and remove those that do not.
// for each leaf node in the hash-tree
for(LeafNode node = candidatesK.lastInsertedNode;node != null; node = node.nextLeafNode){
// for each list of candidate itemsets stored in that node
for(List<Itemset> listCandidate: node.candidates){
// if the list is not null
if(listCandidate != null){
// for each candidate itemset
for(int i=0; i<listCandidate.size(); i++){
Itemset candidate = listCandidate.get(i);
// if enough support, save the itemset
if (candidate.getAbsoluteSupport() >= minsupRelative) {
saveItemsetToFile(candidate);
}else{
// otherwise remove it from the list
// (decrement i so that the element shifted into position i is not skipped)
listCandidate.remove(i);
i--;
}
}
}
}
}
// continue with the next level if some new itemsets were found
// during the current iteration
k++;
}while(previousItemsetCount != itemsetCount);
// save endtime
endTimestamp = System.currentTimeMillis();
// check the memory usage
MemoryLogger.getInstance().checkMemory();
// close the file
writer.close();
}
/**
* Method to generate candidates of size k, where k > 2
* @param candidatesK_1 the candidates of size k-1
* @param k the size of the candidates to be generated
* @return the candidates of size k, stored in a hash-tree
*/
private ItemsetHashTree generateCandidateSizeK(ItemsetHashTree candidatesK_1, int k) {
// create the hash-tree to store the candidates of size K
ItemsetHashTree newCandidates = new ItemsetHashTree(k, hash_tree_branch_count);
// The generation will be done by comparing the leaves of the hash-tree
// containing the itemsets of size k-1.
// To generate a candidate, we need to combine two itemsets from the same leaf node.
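// For example, when k = 4, the size-3 itemsets {1,2,3} and {1,2,5} share their
// first two items and can be joined into the size-4 candidate {1,2,3,5}.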
// For each leaf node
for(LeafNode node = candidatesK_1.lastInsertedNode; node != null; node = node.nextLeafNode){
List<Itemset> subgroups [] = node.candidates;
// For each list of itemsets in this node
for(int i=0; i< subgroups.length; i++){
if(subgroups[i] == null){
continue;
}
// For each other list of itemsets in this node (starting from i to avoid
// considering the same pair of lists twice)
for(int j=i; j< subgroups.length; j++){
if(subgroups[j] == null){
continue;
}
// try to use these two lists of itemsets to generate candidates.
generate(subgroups[i], subgroups[j], candidatesK_1, newCandidates);
}
}
}
return newCandidates;
}
/**
* Method to generate candidates of size k from two lists of itemsets of size k-1
* @param list1 the first list
* @param list2 the second list (may be equal to the first list)
* @param candidatesK_1 the hash-tree containing the candidates of size k-1
* @param newCandidates the hash-tree to store the candidates of size k
*/
private void generate(List<Itemset> list1, List<Itemset> list2,
ItemsetHashTree candidatesK_1, ItemsetHashTree newCandidates) {
// For each pair of itemsets, itemset1 from list1 and itemset2 from list2
loop1: for (int i = 0; i < list1.size(); i++) {
int[] itemset1 = list1.get(i).itemset;
// if the two lists are the same, we will start from i+1 in the second list
// to avoid comparing pairs of itemsets twice.
int j = (list1 == list2)? i+1 : 0;
// For each itemset in list 2
loop2: for (; j < list2.size(); j++) {
int[] itemset2 = list2.get(j).itemset;
// we compare the items of itemset1 and itemset2.
// If they have all the same items except the last one,
// we will combine them to generate a candidate
for (int k = 0; k < itemset1.length; k++) {
// if k is not the last position of the itemsets
if (k != itemset1.length - 1) {
if (itemset2[k] > itemset1[k]) {
continue loop1; // we continue searching
}
if (itemset1[k] > itemset2[k]) {
continue loop2; // we continue searching
}
}
}
// If we reach this point, it is because the two itemsets share
// all items except the last one. Therefore, we can generate
// a new candidate.
// There are two cases depending on whether the last item of itemset1 is smaller
// or greater than the last item of itemset2. We do this just to make
// sure that items are added to the new candidate in lexicographical order
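// For example, if itemset1 = {1,2,5} and itemset2 = {1,2,3}, the last item of
// itemset2 (3) is smaller than the last item of itemset1 (5), so we copy
// {1,2,3} and append 5 to obtain the candidate {1,2,3,5}.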
int newItemset[] = new int[itemset1.length+1];
if(itemset2[itemset2.length -1] < itemset1[itemset1.length -1]){
// Create a new candidate by combining itemset1 and itemset2
System.arraycopy(itemset2, 0, newItemset, 0, itemset2.length);
newItemset[itemset1.length] = itemset1[itemset1.length -1];
}else{
// Create a new candidate by combining itemset1 and itemset2
System.arraycopy(itemset1, 0, newItemset, 0, itemset1.length);
newItemset[itemset1.length] = itemset2[itemset2.length -1];
}
// The candidate is tested to see if all its subsets of size k-1 are
// included in level k-1 (i.e. if they are all frequent).
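// For example, the candidate {1,2,3,5} is kept only if {2,3,5}, {1,3,5},
// {1,2,5} and {1,2,3} all appear in the hash-tree of level k-1.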
if (allSubsetsOfSizeK_1AreFrequent(newItemset, candidatesK_1)) {
// If yes, we add the candidate to the hash-tree
newCandidates.insertCandidateItemset(new Itemset(newItemset));
}
}
}
}
/**
* Method for generating the candidate itemsets of size 2.
* @param frequent1 The frequent itemsets of size 1
* @return the candidate itemsets of size 2 stored in a hash-tree.
*/
private ItemsetHashTree generateCandidate2(List<Integer> frequent1) {
// we create a hash-tree to store the candidates
ItemsetHashTree candidates = new ItemsetHashTree(2, hash_tree_branch_count);
// For each pair of frequent items
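// (e.g., the frequent items 1, 2 and 3 yield the candidates {1,2}, {1,3} and {2,3})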
for (int i = 0; i < frequent1.size(); i++) {
Integer item1 = frequent1.get(i);
for (int j = i + 1; j < frequent1.size(); j++) {
Integer item2 = frequent1.get(j);
// Create a new candidate by combining the two items and insert
// it in the hash-tree
candidates.insertCandidateItemset(new Itemset(new int []{item1, item2}));
}
}
return candidates; // return the hash-tree
}
/**
* This method checks if all the subsets of size k-1 of an itemset are frequent (i.e. if
* all the subsets are in the hash-tree of the previous level)
* @param itemset the itemset
* @param hashtreeCandidatesK_1 the hash-tree of the previous level
* @return true if all the subsets of size k-1 are in the hash-tree, false otherwise
*/
protected boolean allSubsetsOfSizeK_1AreFrequent(int[] itemset, ItemsetHashTree hashtreeCandidatesK_1) {
// generate all subsets by removing each item from the candidate, one by one
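// e.g., for the candidate {1,2,3}, posRemoved = 0, 1, 2 corresponds to checking
// the subsets {2,3}, {1,3} and {1,2}, respectively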
for(int posRemoved=0; posRemoved< itemset.length; posRemoved++){
// if we did not find it, the candidate is not a frequent itemset because
// at least one of its subsets does not appear in level k-1.
if(!hashtreeCandidatesK_1.isInTheTree(itemset, posRemoved)){
return false;
}
}
return true;
}
/**
* Method to save a frequent itemset to file
* @param itemset the itemset to be saved
* @throws IOException if an error occurs while writing to file
*/
void saveItemsetToFile(Itemset itemset) throws IOException {
writer.write(itemset.toString() + " #SUP: "
+ itemset.getAbsoluteSupport());
writer.newLine();
itemsetCount++;
}
/**
* Method to save a frequent itemset of size 1 to file.
* @param item the item contained in the itemset.
* @param support the support of the item.
* @throws IOException if an error happens while writing to file.
*/
void saveItemsetToFile(Integer item, Integer support) throws IOException {
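// For example, an item 1 with a support of 4 is written as the line "1 #SUP: 4"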
writer.write(item + " #SUP: " + support);
writer.newLine();
itemsetCount++;
}
/**
* Method to print statistics about the execution of the algorithm.
*/
public void printStats() {
System.out.println("============= APRIORI-HT - STATS =============");
System.out.println(" Candidates count : " + totalCandidateCount);
System.out.println(" The algorithm stopped at size " + (k - 1)
+ ", because there is no candidate");
System.out.println(" Frequent itemsets count : " + itemsetCount);
System.out.println(" Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb");
System.out.println(" Total time ~ " + (endTimestamp - startTimestamp) + " ms");
System.out.println("===================================================");
}
}