package ca.pfv.spmf.algorithms.frequentpatterns.fpgrowth;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the FPGROWTH algorithm (Han et al., 2004).
* FPGrowth is described here:
* <br/><br/>
*
* Han, J., Pei, J., & Yin, Y. (2000, May). Mining frequent patterns without candidate generation. In ACM SIGMOD Record (Vol. 29, No. 2, pp. 1-12). ACM
* <br/><br/>
*
* This is an optimized version that saves the result to a file,
* or keeps it in memory if no output path is provided
* by the user to the runAlgorithm() method.
*
* @see FPTree
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoFPGrowth {

    // ---- statistics about the latest execution ----
    private long startTimestamp; // start time of the latest execution
    private long endTime; // end time of the latest execution
    private int transactionCount = 0; // number of transactions read during the latest execution
    private int itemsetCount; // number of frequent itemsets found

    // ---- parameter ----
    /** The minimum support expressed as a number of transactions. */
    public int minSupportRelative;

    /** Writer for the output file; null when the result is kept in memory. */
    BufferedWriter writer = null;

    /** The patterns found; null when the result is written to a file. */
    protected Itemsets patterns = null;

    /**
     * Size of the reusable itemset buffers. The prefix buffer, the single-path
     * buffers and the output buffer all have this length, so an itemset or a
     * single path longer than this value does not fit in them.
     */
    final int BUFFERS_SIZE = 50;

    // Buffer holding the itemset currently being grown (the prefix "alpha").
    // The same array is reused through the whole recursion to limit allocations.
    private int[] itemsetBuffer = null;

    // Buffer holding the items of a single path, from the root towards the leaf.
    private int[] itemsetTempBuffer = null;

    // Buffer holding the node counters of a single path, parallel to itemsetTempBuffer.
    private int[] itemsetTempCountBuffer = null;

    // Buffer used to sort an itemset before writing it to the output file.
    private int[] itemsetOutputBuffer = null;

    /**
     * Constructor
     */
    public AlgoFPGrowth() {
    }

    /**
     * Run the FP-Growth algorithm.
     * <p>
     * All per-run statistics, including the transaction count, are reset at the
     * start of each call, so the same instance can be used for several runs.
     *
     * @param input the path to an input file containing a transaction database.
     * @param output the output file path for saving the result (if null, the result
     *        will be returned by the method instead of being saved).
     * @param minsupp the minimum support threshold, as a fraction of the number of transactions.
     * @return the result if no output file path is provided, otherwise null.
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException exception if error reading or writing files
     */
    public Itemsets runAlgorithm(String input, String output, double minsupp) throws FileNotFoundException, IOException {
        // record start time and reset the statistics of the previous run
        startTimestamp = System.currentTimeMillis();
        itemsetCount = 0;
        // the first database scan increments this counter, so it has to start from
        // zero; otherwise a second run would compute a wrong relative support
        transactionCount = 0;

        // initialize tool to record memory usage
        MemoryLogger.getInstance().reset();
        MemoryLogger.getInstance().checkMemory();

        if (output == null) {
            // the user wants to keep the result in memory
            writer = null;
            patterns = new Itemsets("FREQUENT ITEMSETS");
        } else {
            // the user wants to save the result to a file
            patterns = null;
            writer = new BufferedWriter(new FileWriter(output));
            itemsetOutputBuffer = new int[BUFFERS_SIZE];
        }

        try {
            // (1) First database scan: support of each single item (key: item, value: support)
            final Map<Integer, Integer> mapSupport = scanDatabaseToDetermineFrequencyOfSingleItems(input);

            // convert the minimum support from a fraction to a number of transactions
            this.minSupportRelative = (int) Math.ceil(minsupp * transactionCount);

            // (2) Second database scan: build the initial FP-tree
            FPTree tree = buildInitialTree(input, mapSupport);

            // create the header table of the tree using the support of single items
            tree.createHeaderList(mapSupport);

            // (3) Mine the FP-tree recursively, starting with an empty prefix,
            // but only if at least one item is frequent
            if (tree.headerList.size() > 0) {
                itemsetBuffer = new int[BUFFERS_SIZE];
                itemsetTempBuffer = new int[BUFFERS_SIZE];
                itemsetTempCountBuffer = new int[BUFFERS_SIZE];
                fpgrowth(tree, itemsetBuffer, 0, transactionCount, mapSupport);
            }
        } finally {
            // close the output file even if reading or mining failed
            if (writer != null) {
                writer.close();
            }
        }

        // record the execution end time
        endTime = System.currentTimeMillis();
        // check the memory usage
        MemoryLogger.getInstance().checkMemory();
        // return the result (null if it was saved to a file)
        return patterns;
    }

    /**
     * Tell whether a line of the input file must be ignored, i.e. whether it is
     * empty or starts with '#', '%' or '@' (comments and metadata).
     * @param line a line read from the input file
     * @return true if the line does not describe a transaction
     */
    private static boolean isSkippedLine(String line) {
        if (line.isEmpty()) {
            return true;
        }
        char first = line.charAt(0);
        return first == '#' || first == '%' || first == '@';
    }

    /**
     * Scan the database a second time to build the initial FP-tree. Items that do
     * not reach the minimum support are dropped, and the remaining items of each
     * transaction are sorted by descending support (ties broken by ascending item)
     * before the transaction is inserted.
     * @param input the path of the input file
     * @param mapSupport the support of each single item (key: item, value: support)
     * @return the FP-tree (its header list is not created yet)
     * @throws IOException exception if error while reading the file
     */
    private FPTree buildInitialTree(String input, final Map<Integer, Integer> mapSupport) throws IOException {
        // The ordering does not depend on the transaction, so a single comparator is shared.
        Comparator<Integer> bySupportThenItem = new Comparator<Integer>() {
            public int compare(Integer item1, Integer item2) {
                // higher support first
                int bySupport = mapSupport.get(item2).compareTo(mapSupport.get(item1));
                if (bySupport != 0) {
                    return bySupport;
                }
                // same support: smaller item first (compareTo cannot overflow, unlike a subtraction)
                return item1.compareTo(item2);
            }
        };

        FPTree tree = new FPTree();
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try {
            String line;
            // for each line (transaction) until the end of the file
            while ((line = reader.readLine()) != null) {
                if (isSkippedLine(line)) {
                    continue;
                }
                List<Integer> transaction = new ArrayList<Integer>();
                for (String itemString : line.split(" ")) {
                    Integer item = Integer.parseInt(itemString);
                    // only keep items that have the minimum support
                    if (mapSupport.get(item) >= minSupportRelative) {
                        transaction.add(item);
                    }
                }
                Collections.sort(transaction, bySupportThenItem);
                // add the sorted transaction to the FP-tree
                tree.addTransaction(transaction);
            }
        } finally {
            reader.close();
        }
        return tree;
    }

    /**
     * Mine an FP-tree recursively.
     * <p>
     * If the tree consists of a single path, every non-empty combination of the
     * items of that path is output together with the prefix. Otherwise, for each
     * item of the header list (in reverse order) the prefix extended with that item
     * is output, its conditional FP-tree is built and mined recursively.
     *
     * @param tree the FP-tree
     * @param prefix the buffer holding the current prefix, named "alpha"
     * @param prefixLength the number of items of the buffer that belong to the prefix
     * @param prefixSupport the support of the current prefix
     * @param mapSupport the frequency of items in the FP-Tree
     * @throws IOException exception if error writing the output file
     */
    private void fpgrowth(FPTree tree, int[] prefix, int prefixLength, int prefixSupport, Map<Integer, Integer> mapSupport) throws IOException {
        // Defensive: a tree without any node has nothing to mine.
        if (tree.root.childs.size() == 0) {
            return;
        }

        // Check whether the FP-tree contains a single path. While walking down,
        // the items and the counters of the path are copied into the temp buffers.
        boolean singlePath = true;
        // number of nodes of the path copied so far
        int position = 0;
        if (tree.root.childs.size() > 1) {
            // the root has more than one child: not a single path
            singlePath = false;
        } else {
            FPNode currentNode = tree.root.childs.get(0);
            while (true) {
                // a node with more than one child means it is not a single path
                if (currentNode.childs.size() > 1) {
                    singlePath = false;
                    break;
                }
                itemsetTempBuffer[position] = currentNode.itemID;
                itemsetTempCountBuffer[position] = currentNode.counter;
                position++;
                // a node without child is the end of the single path
                if (currentNode.childs.size() == 0) {
                    break;
                }
                currentNode = currentNode.childs.get(0);
            }
        }

        // Case 1: the FP-tree is a single path whose last node is frequent
        // (when singlePath is true at least one node was copied, so position >= 1)
        if (singlePath && itemsetTempCountBuffer[position - 1] >= minSupportRelative) {
            saveAllCombinationsOfPrefixPath(itemsetTempBuffer, itemsetTempCountBuffer, position, prefix, prefixLength);
            return;
        }

        // Case 2: general case. For each item in the header list, in reverse order.
        for (int i = tree.headerList.size() - 1; i >= 0; i--) {
            Integer item = tree.headerList.get(i);
            int support = mapSupport.get(item);

            // Create beta by appending the current item to the prefix alpha
            prefix[prefixLength] = item;
            // the support of beta is the smaller of the two supports
            int betaSupport = (prefixSupport < support) ? prefixSupport : support;
            // save beta
            saveItemset(prefix, prefixLength + 1, betaSupport);

            // === (A) Construct beta's conditional pattern base ===
            // It is the set of prefix paths of the FP-tree that co-occur with the item.
            List<List<FPNode>> prefixPaths = new ArrayList<List<FPNode>>();
            // Support of the items in the conditional pattern base (key: item, value: support)
            Map<Integer, Integer> mapSupportBeta = new HashMap<Integer, Integer>();

            // follow the node links of the item
            for (FPNode path = tree.mapItemNodes.get(item); path != null; path = path.nodeLink) {
                // skip nodes that hang directly under the root (itemID -1): empty prefix path
                if (path.parent.itemID == -1) {
                    continue;
                }
                List<FPNode> prefixPath = new ArrayList<FPNode>();
                // NOTE: the node itself is stored first only to carry the path count;
                // it is not really part of the prefix path.
                prefixPath.add(path);
                int pathCount = path.counter;

                // add all the ancestors of the node, updating item supports on the way
                for (FPNode parent = path.parent; parent.itemID != -1; parent = parent.parent) {
                    prefixPath.add(parent);
                    Integer previous = mapSupportBeta.get(parent.itemID);
                    if (previous == null) {
                        mapSupportBeta.put(parent.itemID, pathCount);
                    } else {
                        mapSupportBeta.put(parent.itemID, previous + pathCount);
                    }
                }
                prefixPaths.add(prefixPath);
            }

            // === (B) Construct beta's conditional FP-tree ===
            FPTree treeBeta = new FPTree();
            for (List<FPNode> prefixPath : prefixPaths) {
                treeBeta.addPrefixPath(prefixPath, mapSupportBeta, minSupportRelative);
            }

            // Mine the conditional tree recursively if it is not empty
            if (treeBeta.root.childs.size() > 0) {
                treeBeta.createHeaderList(mapSupportBeta);
                fpgrowth(treeBeta, prefix, prefixLength + 1, betaSupport, mapSupportBeta);
            }
        }
    }

    /**
     * Save the prefix extended with every non-empty combination of the items of a
     * single path.
     * <p>
     * The support given to a combination is the smallest counter among the path
     * nodes it contains, so that a combination that leaves out the deepest nodes
     * of the path is not reported with the counter of the last node.
     *
     * @param pathItems the items of the single path, from the root towards the leaf
     * @param pathCounts the node counters of the single path, parallel to pathItems
     * @param position the number of nodes in the single path
     * @param prefix the buffer holding the current prefix
     * @param prefixLength the current prefix length
     * @throws IOException if exception while writing to output file
     */
    private void saveAllCombinationsOfPrefixPath(int[] pathItems, int[] pathCounts, int position,
            int[] prefix, int prefixLength) throws IOException {
        // Each value of the mask selects one subset of the path (bit j = node j).
        // The shifts are done on long values: an int shift wraps around for paths
        // of 31 items or more.
        for (long mask = 1, max = 1L << position; mask < max; mask++) {
            int newPrefixLength = prefixLength;
            int support = Integer.MAX_VALUE;
            for (int j = 0; j < position; j++) {
                if ((mask & (1L << j)) != 0) {
                    prefix[newPrefixLength++] = pathItems[j];
                    if (pathCounts[j] < support) {
                        support = pathCounts[j];
                    }
                }
            }
            // save the itemset
            saveItemset(prefix, newPrefixLength, support);
        }
    }

    /**
     * This method scans the input database to calculate the support of single items.
     * It also increments the transaction count once per transaction read.
     * @param input the path of the input file
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException exception if error while reading the file
     * @return a map for storing the support of each item (key: item, value: support)
     */
    private Map<Integer, Integer> scanDatabaseToDetermineFrequencyOfSingleItems(String input)
            throws FileNotFoundException, IOException {
        // a map for storing the support of each item (key: item, value: support)
        Map<Integer, Integer> mapSupport = new HashMap<Integer, Integer>();
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try {
            String line;
            // for each line (transaction) until the end of file
            while ((line = reader.readLine()) != null) {
                if (isSkippedLine(line)) {
                    continue;
                }
                // for each item of the transaction, increase its support count
                for (String itemString : line.split(" ")) {
                    Integer item = Integer.parseInt(itemString);
                    Integer count = mapSupport.get(item);
                    if (count == null) {
                        mapSupport.put(item, 1);
                    } else {
                        mapSupport.put(item, count + 1);
                    }
                }
                // increase the transaction count
                transactionCount++;
            }
        } finally {
            // close the input file even if a line could not be parsed
            reader.close();
        }
        return mapSupport;
    }

    /**
     * Write a frequent itemset that is found to the output file or
     * keep it in memory if the user prefers that the result be saved in memory.
     * In both cases the items are sorted in ascending order first.
     * @param itemset the buffer containing the itemset
     * @param itemsetLength the number of items of the buffer that belong to the itemset
     * @param support the support of the itemset
     * @throws IOException if exception while writing to output file
     */
    private void saveItemset(int[] itemset, int itemsetLength, int support) throws IOException {
        // increase the number of itemsets found for statistics purpose
        itemsetCount++;

        if (writer != null) {
            // copy the itemset in the output buffer and sort it, so that the
            // buffer used by the mining procedure is left untouched
            System.arraycopy(itemset, 0, itemsetOutputBuffer, 0, itemsetLength);
            Arrays.sort(itemsetOutputBuffer, 0, itemsetLength);

            // items separated by single spaces, followed by the support
            StringBuilder buffer = new StringBuilder();
            for (int i = 0; i < itemsetLength; i++) {
                if (i > 0) {
                    buffer.append(' ');
                }
                buffer.append(itemsetOutputBuffer[i]);
            }
            buffer.append(" #SUP: ");
            buffer.append(support);

            // write to file and create a new line
            writer.write(buffer.toString());
            writer.newLine();
        } else {
            // the result is kept in memory: store a sorted copy of the itemset
            int[] itemsetArray = new int[itemsetLength];
            System.arraycopy(itemset, 0, itemsetArray, 0, itemsetLength);
            Arrays.sort(itemsetArray);

            Itemset itemsetObj = new Itemset(itemsetArray);
            itemsetObj.setAbsoluteSupport(support);
            patterns.addItemset(itemsetObj, itemsetLength);
        }
    }

    /**
     * Print statistics about the algorithm execution to System.out.
     */
    public void printStats() {
        System.out.println("============= FP-GROWTH 0.96r14 - STATS =============");
        long temps = endTime - startTimestamp;
        System.out.println(" Transactions count from database : " + transactionCount);
        System.out.print(" Max memory usage: " + MemoryLogger.getInstance().getMaxMemory() + " mb \n");
        System.out.println(" Frequent itemsets count : " + itemsetCount);
        System.out.println(" Total time ~ " + temps + " ms");
        System.out.println("===================================================");
    }

    /**
     * Get the number of transactions in the last transaction database read.
     * @return the number of transactions.
     */
    public int getDatabaseSize() {
        return transactionCount;
    }
}