package ca.pfv.spmf.algorithms.frequentpatterns.fpgrowth;
/* This file is copyright (c) 2008-2015 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
 * This is an implementation of the FPMax algorithm (Grahne & Zhu, 2003).
 * FPMax is described here:
 * <br/><br/>
 *
 * Grahne, G. and Zhu, J. (2003). High Performance Mining of Maximal Frequent Itemsets.
 * 6th International Workshop on High Performance Data Mining (HPDM '03). <br/><br/>
 *
 * This is an optimized version that saves the result to a file,
 * or keeps it in memory if no output path is provided
 * by the user to the runAlgorithm() method.
*
* @see FPTree
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger, 2015
*/
public class AlgoFPMax {
// for statistics
private long startTimestamp; // start time of the latest execution
private long endTime; // end time of the latest execution
private int transactionCount = 0; // transaction count in the database
private int itemsetCount; // number of freq. itemsets found
// parameter
public int minSupportRelative;// the relative minimum support
BufferedWriter writer = null; // object to write the output file
// The patterns that are found
	// (if the user wants to keep them in memory)
protected Itemsets patterns = null;
// This variable is used to determine the size of buffers to store itemsets.
// A value of 2000 should be enough
final int BUFFERS_SIZE = 2000;
// buffer for storing the current itemset that is mined when performing mining
// the idea is to always reuse the same buffer to reduce memory usage.
private int[] itemsetBuffer = null;
// This is the MFI tree for storing maximal itemsets
public MFITree mfiTree = null;
	// Map to store the support of single items in the original database
private Map<Integer, Integer> originalMapSupport = null;
// If set to true, the algorithm will show information for debugging in the console
private final boolean DEBUG = false;
// Comparator to compare the items based on the order of decreasing support in the original DB.
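	// For example (hypothetical supports): if item 4 and item 2 both have support 3,
	// the tie is broken by ascending numeric order, so 2 precedes 4.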
Comparator<Integer> comparatorOriginalOrder = new Comparator<Integer>(){
public int compare(Integer item1, Integer item2){
// compare the frequency
int compare = originalMapSupport.get(item2) - originalMapSupport.get(item1);
			// if the same frequency, break the tie by ascending lexical (numeric) order
			if(compare == 0){
				compare = (item1 - item2);
			}
			return compare;
}
};
/**
* Constructor
*/
public AlgoFPMax() {
}
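	/**
	 * A minimal usage sketch (not part of the original SPMF code; the input
	 * file name below is hypothetical). It mines maximal frequent itemsets
	 * from a transaction database in SPMF format with a 40% minimum support
	 * and writes them to "output.txt".
	 */
	public static void main(String[] args) throws IOException {
		AlgoFPMax algo = new AlgoFPMax();
		// run the algorithm; passing a non-null output path means the result
		// is written to the file rather than kept in memory
		algo.runAlgorithm("contextPasquier99.txt", "output.txt", 0.4);
		// print statistics about the execution
		algo.printStats();
	}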
/**
 * Method to run the FPMax algorithm.
* @param input the path to an input file containing a transaction database.
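 *              Each line of the file is a transaction: a space-separated list
 *              of integer item IDs. Lines starting with '#', '%' or '@' are
 *              treated as comments or metadata and are skipped (see the
 *              parsing code below).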
* @param output the output file path for saving the result (if null, the result
* will be returned by the method instead of being saved).
 * @param minsupp the minimum support threshold as a percentage (a value in [0,1]).
 * @return the patterns found if no output file path is provided, or null otherwise.
* @throws IOException exception if error reading or writing files
*/
public Itemsets runAlgorithm(String input, String output, double minsupp) throws FileNotFoundException, IOException {
// record start time
startTimestamp = System.currentTimeMillis();
// number of itemsets found
itemsetCount = 0;
//initialize tool to record memory usage
MemoryLogger.getInstance().reset();
MemoryLogger.getInstance().checkMemory();
		// if the user wants to keep the result in memory
if(output == null){
writer = null;
patterns = new Itemsets("FREQUENT ITEMSETS");
		}else{ // if the user wants to save the result to a file
patterns = null;
writer = new BufferedWriter(new FileWriter(output));
}
// (1) PREPROCESSING: Initial database scan to determine the frequency of each item
// The frequency is stored in a map:
// key: item value: support
originalMapSupport = scanDatabaseToDetermineFrequencyOfSingleItems(input);
		// convert the minimum support threshold from a percentage to a
		// relative minimum support (an absolute transaction count)
		this.minSupportRelative = (int) Math.ceil(minsupp * transactionCount);
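		// For example (hypothetical numbers): minsupp = 0.4 with 5 transactions
		// gives ceil(0.4 * 5) = 2, so an itemset must appear in at least 2 transactions.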
// Create the MFI Tree
mfiTree = new MFITree();
// (2) Scan the database again to build the initial FP-Tree
// Before inserting a transaction in the FPTree, we sort the items
// by descending order of support. We ignore items that
// do not have the minimum support.
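		// For example (hypothetical transaction): with supports 5:4, 2:3, 3:3 and
		// minSupportRelative = 3, the transaction {3, 5, 2, 9} where item 9 is
		// infrequent is inserted as the sorted list [5, 2, 3].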
FPTree tree = new FPTree();
// read the file
BufferedReader reader = new BufferedReader(new FileReader(input));
String line;
// for each line (transaction) until the end of the file
		while((line = reader.readLine()) != null){
// if the line is a comment, is empty or is a
// kind of metadata
			if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '%'
					|| line.charAt(0) == '@') {
continue;
}
String[] lineSplited = line.split(" ");
List<Integer> transaction = new ArrayList<Integer>();
// for each item in the transaction
for(String itemString : lineSplited){
Integer item = Integer.parseInt(itemString);
// only add items that have the minimum support
if(originalMapSupport.get(item) >= minSupportRelative){
transaction.add(item);
}
}
			// sort the items in the transaction by descending order of support
Collections.sort(transaction, comparatorOriginalOrder);
// add the sorted transaction to the fptree.
tree.addTransaction(transaction);
}
// close the input file
reader.close();
// We create the header table for the tree using the calculated support of single items
tree.createHeaderList(originalMapSupport);
// System.out.println(tree);
		// (3) We start to mine the FP-Tree by calling the recursive method.
		// Initially, the prefix alpha is empty.
		// if at least one item is frequent
if(tree.headerList.size() > 0) {
// initialize the buffer for storing the current itemset
itemsetBuffer = new int[BUFFERS_SIZE];
// Next we will recursively generate frequent itemsets using the fp-tree
fpMax(tree, itemsetBuffer, 0, transactionCount, originalMapSupport);
}
// close the output file if the result was saved to a file
if(writer != null){
writer.close();
}
// record the execution end time
endTime= System.currentTimeMillis();
// check the memory usage
MemoryLogger.getInstance().checkMemory();
// return the result (if saved to memory)
return patterns;
}
	/**
	 * Mine an FP-Tree recursively to find maximal itemsets (the FPMax procedure).
	 * @param tree the FP-tree
	 * @param prefix the current prefix, named "alpha"
	 * @param prefixLength the length of the current prefix
	 * @param prefixSupport the support of the current prefix
	 * @param mapSupport the frequency of items in the FP-Tree
	 * @throws IOException exception if error writing the output file
	 */
private void fpMax(FPTree tree, int [] prefix, int prefixLength, int prefixSupport, Map<Integer, Integer> mapSupport) throws IOException {
// ======= DEBUG ========
if(DEBUG) {
System.out.print("###### Prefix: ");
for(int k=0; k< prefixLength; k++) {
System.out.print(prefix[k] + " ");
}
System.out.println("\n");
System.out.println(tree);
} //========== END DEBUG =======
		// We first check if the FP-tree contains a single path
boolean singlePath = true;
// We will use a variable to keep the support of the single path if there is one
int singlePathSupport = 0;
		// This variable counts the number of items in the single path
		// (if there is one) plus the prefix length
int position = prefixLength;
if(tree.root.childs.size() > 1) {
			// if the root has more than one child, then it is not a single path
singlePath = false;
}else {
			// if the root has exactly one child, we need to recursively check the
			// children of that child to see if they also have a single child
FPNode currentNode = tree.root.childs.get(0);
while(true){
// if the current child has more than one child, it is not a single path!
if(currentNode.childs.size() > 1) {
singlePath = false;
break;
}
// otherwise, we copy the current item in the buffer and move to the child
itemsetBuffer[position] = currentNode.itemID;
singlePathSupport = currentNode.counter;
position++;
// if this node has no child, that means that this is the end of this path
// and it is a single path, so we break
if(currentNode.childs.size() == 0) {
break;
}
currentNode = currentNode.childs.get(0);
}
}
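		// Note: at this point, if there is a single path, singlePathSupport holds
		// the counter of the deepest node on the path; since counters never increase
		// along a path from the root, this is the support of the complete path.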
// Case 1: the FPtree contains a single path
if(singlePath && singlePathSupport >= minSupportRelative){
// We save the path, because it is a maximal itemset
saveItemset(itemsetBuffer, position, singlePathSupport);
}else {
// Case 2: There are multiple paths.
			// For each frequent item in the header table list of the tree, in reverse
			// order (i.e., from the least frequent to the most frequent item)
for(int i = tree.headerList.size()-1; i>=0; i--){
// get the item
Integer item = tree.headerList.get(i);
// get the item support
int support = mapSupport.get(item);
				// Create Beta by concatenating the item to the current prefix alpha
prefix[prefixLength] = item;
// calculate the support of the new prefix beta
int betaSupport = (prefixSupport < support) ? prefixSupport: support;
// === (A) Construct beta's conditional pattern base ===
				// It is a sub-database which consists of the set of prefix paths
				// in the FP-tree co-occurring with the prefix pattern.
List<List<FPNode>> prefixPaths = new ArrayList<List<FPNode>>();
FPNode path = tree.mapItemNodes.get(item);
// Map to count the support of items in the conditional prefix tree
// Key: item Value: support
Map<Integer, Integer> mapSupportBeta = new HashMap<Integer, Integer>();
while(path != null){
// if the path is not just the root node
if(path.parent.itemID != -1){
// create the prefixpath
List<FPNode> prefixPath = new ArrayList<FPNode>();
						// add this node
						prefixPath.add(path); // NOTE: this node is added only to keep its support;
						// strictly speaking, it is not part of the prefix path itself
int pathCount = path.counter;
//Recursively add all the parents of this node.
FPNode parent = path.parent;
while(parent.itemID != -1){
prefixPath.add(parent);
							// For each path, we also update the item supports at the same time.
							// If this is the first time we see this item:
if(mapSupportBeta.get(parent.itemID) == null){
// just add the path count
mapSupportBeta.put(parent.itemID, pathCount);
}else{
// otherwise, make the sum with the value already stored
mapSupportBeta.put(parent.itemID, mapSupportBeta.get(parent.itemID) + pathCount);
}
parent = parent.parent;
}
// add the path to the list of prefixpaths
prefixPaths.add(prefixPath);
}
// We will look for the next prefixpath
path = path.nodeLink;
}
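				// At this point, prefixPaths contains one list per occurrence of "item"
				// in the tree: the node itself (kept for its count) followed by its
				// ancestors up to, but excluding, the root.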
// ===== FPMAX ======
				// concatenate Beta with all the frequent items in the pattern base
				// to get Head U P
List<Integer> headWithP = new ArrayList<Integer>(mapSupportBeta.size() + prefixLength+1);
// concatenate the prefix
for(int z=0; z < prefixLength+1; z++) {
headWithP.add(prefix[z]);
}
// concatenate the other FREQUENT items in the pattern base
// for each item
for(Entry<Integer,Integer> entry: mapSupportBeta.entrySet()) {
// if the item is frequent
if(entry.getValue() >= minSupportRelative) {
headWithP.add(entry.getKey());
}
}
				// Sort Head U P according to the total order on items used by the header
				// list (descending order of support in the original database)
Collections.sort(headWithP, comparatorOriginalOrder);
//======= DEBUG ========
if(DEBUG) {
System.out.println(" CHECK2 : " + headWithP);
}
//========== END DEBUG =======
// CHECK IF HEAD U P IS A SUBSET OF A MFI ACCORDING TO THE MFI-TREE
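				// (If Head U P is already contained in a known maximal itemset, then no
				// itemset mined from this conditional tree can be maximal, so the whole
				// subtree can be pruned without being constructed.)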
if(mfiTree.passSubsetChecking(headWithP)) {
if(DEBUG) {
System.out.println(" passed!");
}
// (B) Construct beta's conditional FP-Tree using its prefix path
// Create the tree.
FPTree treeBeta = new FPTree();
// Add each prefixpath in the FP-tree.
for(List<FPNode> prefixPath : prefixPaths){
treeBeta.addPrefixPath(prefixPath, mapSupportBeta, minSupportRelative);
}
					// Recursively mine the beta tree if the root has at least one child
if(treeBeta.root.childs.size() > 0){
// Create the header list.
treeBeta.createHeaderList(originalMapSupport);
// recursive call
fpMax(treeBeta, prefix, prefixLength+1, betaSupport, mapSupportBeta);
}
// ======= After that, we still need to check if beta is a maximal itemset ====
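					// (The recursive call above may have added supersets of beta to the
					// MFI-tree; beta is saved only if it is still not subsumed by any of them.)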
List<Integer> temp = new ArrayList<Integer>(mapSupportBeta.size() + prefixLength+1);
for(int z=0; z < prefixLength+1; z++) {
temp.add(prefix[z]);
}
Collections.sort(temp, comparatorOriginalOrder);
					// if beta passes the test, we save it
if(mfiTree.passSubsetChecking(temp)) {
saveItemset(prefix, prefixLength+1, betaSupport);
}
//===========================================================
}
else if (DEBUG){
System.out.println(" failed!");
}
}
}
}
	/**
	 * Save a maximal itemset that is found: write it to the output file, or
	 * keep it in memory if the user prefers that the result be kept in memory.
	 * @param itemset a buffer containing the itemset
	 * @param itemsetLength the number of items in the itemset
	 * @param support the support of the itemset
	 * @throws IOException exception if error writing the output file
	 */
private void saveItemset(int [] itemset, int itemsetLength, int support) throws IOException {
// copy the itemset in the output buffer and sort items according to the
// order of decreasing support in the original database
int[] itemsetCopy = new int[itemsetLength];
System.arraycopy(itemset, 0, itemsetCopy, 0, itemsetLength);
sortOriginalOrder(itemsetCopy, itemsetLength);
if(DEBUG) {
// //======= DEBUG ========
System.out.print(" ##### SAVING : ");
for(int i=0; i< itemsetLength; i++) {
System.out.print(itemsetCopy[i] + " ");
}
System.out.println("\n");
// //========== END DEBUG =======
}
// add the itemset to the MFI-TREE
mfiTree.addMFI(itemsetCopy, itemsetCopy.length, support);
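		// (the MFI-tree is the structure later consulted by passSubsetChecking()
		// to prune candidates that are subsets of an already-found maximal itemset)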
// increase the number of itemsets found for statistics purpose
itemsetCount++;
// if the result should be saved to a file
if(writer != null){
// Create a string buffer
StringBuilder buffer = new StringBuilder();
// write the items of the itemset
for(int i=0; i< itemsetLength; i++){
buffer.append(itemsetCopy[i]);
if(i != itemsetLength-1){
buffer.append(' ');
}
}
// Then, write the support
buffer.append(" #SUP: ");
buffer.append(support);
// write to file and create a new line
writer.write(buffer.toString());
writer.newLine();
}// otherwise the result is kept into memory
else{
			// sort the itemset by lexical order before presenting it to the user
Arrays.sort(itemsetCopy);
Itemset itemsetObj = new Itemset(itemsetCopy);
itemsetObj.setAbsoluteSupport(support);
patterns.addItemset(itemsetObj, itemsetLength);
}
}
	/**
	 * Sort an array of items according to the total order of descending support.
	 * This bubble sort has an average-case performance of O(n^2).
	 * @param a array of integers
	 * @param length the number of elements of the array to sort (from index 0)
	 */
public void sortOriginalOrder(int [] a, int length){
// Perform a bubble sort
for(int i=0; i < length; i++){
for(int j= length -1; j>= i+1; j--){
boolean test = comparatorOriginalOrder.compare(a[j], a[j-1]) < 0;
if(test){
int temp = a[j];
a[j] = a[j-1];
a[j-1] = temp;
}
}
}
}
/**
* This method scans the input database to calculate the support of single items
* @param input the path of the input file
 * @throws IOException exception if error while reading the input file
* @return a map for storing the support of each item (key: item, value: support)
*/
private Map<Integer, Integer> scanDatabaseToDetermineFrequencyOfSingleItems(String input)
throws FileNotFoundException, IOException {
// a map for storing the support of each item (key: item, value: support)
Map<Integer, Integer> mapSupport = new HashMap<Integer, Integer>();
//Create object for reading the input file
BufferedReader reader = new BufferedReader(new FileReader(input));
String line;
// for each line (transaction) until the end of file
		while((line = reader.readLine()) != null){
// if the line is a comment, is empty or is a
// kind of metadata
			if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') {
continue;
}
// split the line into items
String[] lineSplited = line.split(" ");
// for each item
for(String itemString : lineSplited){
				// parse the item
				Integer item = Integer.parseInt(itemString);
				// increase the support count of the item
Integer count = mapSupport.get(item);
if(count == null){
mapSupport.put(item, 1);
}else{
mapSupport.put(item, ++count);
}
}
// increase the transaction count
transactionCount++;
}
// close the input file
reader.close();
return mapSupport;
}
/**
* Print statistics about the algorithm execution to System.out.
*/
public void printStats() {
System.out.println("============= FP-Max v0.96r14 - STATS =============");
long temps = endTime - startTimestamp;
System.out.println(" Transactions count from database : " + transactionCount);
System.out.print(" Max memory usage: " + MemoryLogger.getInstance().getMaxMemory() + " mb \n");
System.out.println(" Maximal frequent itemset count : " + itemsetCount);
System.out.println(" Total time ~ " + temps + " ms");
System.out.println("===================================================");
}
/**
* Get the number of transactions in the last transaction database read.
* @return the number of transactions.
*/
public int getDatabaseSize() {
return transactionCount;
}
}