package ca.pfv.spmf.algorithms.frequentpatterns.itemsettree;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* An implementation of the Itemset-tree
*
* It is based on the description in:
*
* Kubat, M., Hafez, A., Raghavan, V. V., Lekkala, J. R., Chen, W. K. (2003)
* Itemset Trees for Targeted Association Querying. Proc. of ICDE 2003.
*
* Copyright (c) 2008-2012 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
public class ItemsetTree extends AbstractItemsetTree implements Serializable{
/**
* Default constructor
*/
public ItemsetTree() {
super();
}
/**
* Build the itemset-tree based on an input file containing transactions
* @param input an input file
* @throws IOException exception if error while reading the file
*/
public void buildTree(String input)
throws IOException {
// record start time
startTimestamp = System.currentTimeMillis();
// reset memory usage statistics
MemoryLogger.getInstance().reset();
// create an empty root for the tree
root = new ItemsetTreeNode(null, 0);
// Scan the database to read the transactions
BufferedReader reader = new BufferedReader(new FileReader(input));
String line;
// for each line (transaction) until the end of file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true ||
line.charAt(0) == '#' || line.charAt(0) == '%'
|| line.charAt(0) == '@') {
continue;
}
// split the transaction into items
String[] lineSplited = line.split(" ");
// create a structure for storing the transaction
int[] itemset = new int[lineSplited.length];
// for each item in the transaction
for (int i=0; i< lineSplited.length; i++) {
// convert the item to integer and add it to the structure
itemset[i] = Integer.parseInt(lineSplited[i]);
}
// printTree();
// call the method "construct" to add the transaction to the tree
construct(null, root, itemset);
}
// close the input file
reader.close();
// check the memory usage
MemoryLogger.getInstance().checkMemory();
// close the file
endTimestamp = System.currentTimeMillis();
}
/**
* Add a transaction to the itemset tree.
* @param transaction the transaction to be added (array of ints)
*/
public void addTransaction(int[] transaction){
// call the "construct" algorithm to add it
construct(null, root, transaction);
}
/**
* Given the root of a sub-tree, add an itemset at the proper position in that tree
* @param r the root of the sub-tree
* @param s the itemset to be inserted
*/
private void construct(ItemsetTreeNode parentOfR, ItemsetTreeNode r, int[] s) {
// get the itemset in the root node
int[] sr = r.itemset;
// if the itemset in root node is the same as the one to be inserted,
// we just increase the support, and return.
if(same(s, sr)){
r.support++;
return;
}
// if the node to be inserted is an ancestor of the itemset of the root node
if(ancestorOf(s, sr)){
// create a new node for the itemset to be inserted with the support of
// the root node + 1
ItemsetTreeNode newNode = new ItemsetTreeNode(s, r.support +1);
// set the childs and parent pointers.
newNode.childs.add(r);
parentOfR.childs.remove(r);
parentOfR.childs.add(newNode);
// r.parent = newNode;
return; // return
}
// Otherwise, calculate the largest common ancestor
// of the itemset to be inserted and the root of the sutree
int[] l = getLargestCommonAncestor(s, sr);
if(l != null){ // if there is one largest common ancestor
// create a new node with that ancestor and the support of
// the root +1.
ItemsetTreeNode newNode = new ItemsetTreeNode(l, r.support +1);
// set the node childs and parent pointers
newNode.childs.add(r);
parentOfR.childs.remove(r);
parentOfR.childs.add(newNode);
// r.parent = newNode;
// append second children which is the itemset to be added with a
// support of 1
ItemsetTreeNode newNode2 = new ItemsetTreeNode(s, 1);
// update pointers for the new node
newNode.childs.add(newNode2);
// newNode2.parent = newNode;
return;
}
// else get the length of the root itemset
int indexLastItemOfR = (sr == null)? 0 : sr.length;
// increase the support of the root
r.support++;
// for each child of the root
for(ItemsetTreeNode ci : r.childs){
// if one children of the root is the itemset to be inserted s,
// then increase its support and stop
if(same(s, ci.itemset)){ // case 2
ci.support++;
return;
}
// if the itemset to be inserted is an ancestor of the child ci
if(ancestorOf(s, ci.itemset)){ // case 3
// create a new node between ci and r in the tree
// and update child /parents pointers
ItemsetTreeNode newNode = new ItemsetTreeNode(s, ci.support+ 1);
newNode.childs.add(ci);
// newNode.parent = r;
r.childs.remove(ci);
r.childs.add(newNode);
// ci.parent = newNode;
return;
}
// if the child ci is an ancestor of s
if(ancestorOf(ci.itemset, s)){ // case 4
// then make a recursive call to construct to handle this case.
construct(r, ci, s);
return;
}
// case 5
// if ci and s have a common ancestor that is larger than r:
if(ci.itemset[indexLastItemOfR] == s[indexLastItemOfR]){
// find the largest common ancestor
int[] ancestor = getLargestCommonAncestor(s, ci.itemset);
// create a new node for the ancestor itemset just found with the support
// of ci + 1
ItemsetTreeNode newNode = new ItemsetTreeNode(ancestor, ci.support+ 1);
// set r as aprent
// newNode.parent = r;
r.childs.add(newNode);
// add ci as a childre of the new node
newNode.childs.add(ci);
// ci.parent = newNode;
r.childs.remove(ci);
// create another new node for s with a support of 1, which
// will be the child of the first new node
ItemsetTreeNode newNode2 = new ItemsetTreeNode(s, 1);
// newNode2.parent = newNode;
newNode.childs.add(newNode2);
// end
return;
}
}
// Otherwise, case 1:
// A new node is created for s with a support of 1 and is added
// below the node r.
ItemsetTreeNode newNode = new ItemsetTreeNode(s, 1);
// newNode.parent = r;
r.childs.add(newNode);
}
/**
* Print statistics about the time and maximum memory usage for the construction
* of the itemset tree.
*/
public void printStatistics() {
System.out.println("========== ITEMSET TREE CONSTRUCTION - STATS ============");
System.out.println(" Tree construction time ~: " + (endTimestamp - startTimestamp)
+ " ms");
System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory());
nodeCount = 0;
totalItemCountInNodes = 0;
recursiveStats(root);
System.out.println(" Node count: " + nodeCount);
System.out.println(" Sum of items in all node: " + totalItemCountInNodes + " avg per node :" + totalItemCountInNodes / ((double)nodeCount));
System.out.println("=====================================");
}
private void recursiveStats(ItemsetTreeNode root) {
if(root != null && root.itemset!=null){
nodeCount++;
totalItemCountInNodes += root.itemset.length;
}
for(ItemsetTreeNode node : root.childs){
recursiveStats(node);
}
}
/**
* Print the tree to System.out.
*/
public void printTree() {
System.out.println(root.toString(new StringBuilder(),""));
}
/**
* Return a string representation of the tree.
*/
public String toString() {
return root.toString(new StringBuilder(), "");
}
/**
* Get the support of a given itemset s.
* @param s the itemset
* @return the support as an integer.
*/
public int getSupportOfItemset(int[] s) {
return count(s, root); // call the method count.
}
/**
* This method calculate the support of an itemset by using a subtree
* defined by its root.
*
* Note: this is implemented based on the algorithm "count" of Table 2 in the paper by Kubat et al.
// Note that there was a few problem in the algorithm in the paper.
// I had to change > by < in : ci.itemset[ci.itemset.length -1] < s[s.length -1]){
// also the count was not correct so i had to change the way it counted the support a little bit
// by using += instead of return.
*
* @param s the itemset
* @param root the root of the subtree
* @return the support as an integer
*/
private int count(int[] s, ItemsetTreeNode root) {
// the variable count will be used to count the support
int count =0;
// for each child of the root
for(ItemsetTreeNode ci : root.childs){
// if the first item of the itemset that we are looking for
// is smaller than the first item of the child, we need to look
// further in that tree.
if(ci.itemset[0] <= s[0]){
// if s is included in ci, add the support of ci to the current count.
if(includedIn(s, ci.itemset)){
count += ci.support;
}else if(ci.itemset[ci.itemset.length -1] < s[s.length -1]){
// otherwise, if the last item of ci is smaller than
// the last item of s, then make a recursive call to explore
// the subtree where ci is the root
count += count(s, ci);
}
}
}
// return the total count
return count;
}
/**
* Check if an itemset is contained in another
* @param itemset1 the first itemset
* @param itemset2 the second itemset
* @return true if yes, otherwise false
*/
private boolean includedIn(int[] itemset1, int[] itemset2) {
int count = 0; // the current position of itemset1 that we want to find in itemset2
// for each item in itemset2
for(int i=0; i< itemset2.length; i++){
// if we found the item
if(itemset2[i] == itemset1[count]){
// we will look for the next item of itemset1
count++;
// if we have found all items already, return true
if(count == itemset1.length){
return true;
}
}
}
// it is not included, so return false!
return false;
}
/**
* This method pass through the itemset tree to get all itemsets
* that are subsuming a given itemset "s" and their support. Note that
* this method may also return infrequent itemsets that can be filtered by
* additional processing after.
* @param s the itemset
* @return an hashtable countaining itemsets and their support.
*/
public HashTableIT getFrequentItemsetSubsuming(int[] s){
// create a hash table to contain the itemsets to be more efficient
// we set the default size of the internal array to 1000
HashTableIT hash = new HashTableIT(1000);
// create an hashset to store the items of the itemset
HashSet<Integer> seti = new HashSet<Integer>();
for(int i=0; i< s.length; i++){
seti.add(s[i]);
}
// call the method selective mining for finding the sets subsuming s
selectiveMining(s, seti, root, hash);
return hash;
}
// /**
// * This method finds itemsets subsuming a given itemset. It is a recursive method that
// * scan a subtree of the itemset-tree. It stores itemsets found in an hashtable together
// * with their support.
// * @param s the itemset s
// * @param seti the items from the itemset s stored in a HashSet<Integer> for more efficiency for inclusion checking
// * @param t the root of the subtree
// * @param hash the hashtable for storing the result
// */
// private void selectiveMining(int[] s, HashSet<Integer> seti, ItemsetTreeNode t, HashTableIT hash) {
// // for all child nodes of the given root of the subtree
// for(ItemsetTreeNode ci : t.childs){
// // if the first item of s is smaller or equal to the
// // first item of the child
// if(ci.itemset[0] <= s[0]){
// // Check if s is included in ci
// if(includedIn(s, ci.itemset)){
// // if ci has not child, put s in the hashtable with
// // the support of ci, and then
// // call recursive add.
// // Note: This part is not explained correctly in the paper,
// // i had to figure it out by myself and fix it.
// if(ci.childs.size() ==0){
// hash.put(s, ci.support);
// recursiveAdd(s, seti, ci.itemset, ci.support, hash, 0);
// }else{
// // otherwise recursively explore subtree with ci as root.
// selectiveMining(s, seti, ci, hash);
// }
//
// }
// else if(ci.itemset[ci.itemset.length -1] < s[s.length -1]){
// // else if the last item of ci is smaller than the last
// // item of s, we also need to recursively explore subtree
// // with ci as root.
// selectiveMining(s, seti, ci, hash);
// }
// }
// }
// }
/**
* This method finds itemsets subsuming a given itemset. It is a recursive method that
* scan a subtree of the itemset-tree. It stores itemsets found in an hashtable together
* with their support.
* @param s the itemset s
* @param seti the items from the itemset s stored in a HashSet<Integer> for more efficiency for inclusion checking
* @param t the root of the subtree
* @param hash the hashtable for storing the result
* @return the cumulative support of the t's immediate children. This is needed to ensure t is correctly incorporated into the hashtable (and with the correct support).
*/
private int selectiveMining(int[] s, HashSet<Integer> seti, ItemsetTreeNode t, HashTableIT hash) {
//initializes the running cumulative support of t's immediate children
int childrenSup = 0;
// for all child nodes of the given root of the subtree
for(ItemsetTreeNode ci : t.childs){
//Add ci's support to the cumulative count
childrenSup += ci.support;
// if the first item of s is smaller or equal to the
// first item of the child
if(ci.itemset[0] <= s[0]){
// Check if s is included in ci
if(includedIn(s, ci.itemset)){
// if ci has not child, put s in the hashtable with
// the support of ci, and then
// call recursive add.
// Note: This part is not explained correctly in the paper,
// i had to figure it out by myself and fix it.
if(ci.childs.size() ==0){
hash.put(s, ci.support);
recursiveAdd(s, seti, ci.itemset, ci.support, hash, 0);
}else{
// otherwise recursively explore subtree with ci as root.
// Note, we subtract the count returned by selectiveMining (which contains the cumulative
//support of the ci's immediate children) from ci's support
//remainingSup thus indicates how many times ci's itemset appeared by itself in the database)
int remainingSup = ci.support - selectiveMining(s, seti, ci, hash);
//If remainingSup is greater than 0, then it means ci's children do not fully account for all of
//the occurrences of ci's itemset. In other words, ci's itemset appeared by itself remainingSup times
//Hence, we need to put s in the hashtable with remainingSup, and then call recursive add on ci
//with remainingSup.
if (remainingSup > 0)
{
hash.put(s, remainingSup);
recursiveAdd(s, seti, ci.itemset, remainingSup, hash, 0);
}
}
}
else if(ci.itemset[ci.itemset.length -1] < s[s.length -1]){
// else if the last item of ci is smaller than the last
// item of s, we also need to recursively explore subtree
// with ci as root.
selectiveMining(s, seti, ci, hash);
}
}
}
return childrenSup;
}
/**
* Perform a recursive add (as based on the procedure presented in the paper by Kubat et al.)
* @param s an itemset s
* @param seti the items from the itemset s in a HashSet of integers
* @param ci an itemset tree node ci
* @param cisupport the support of the itemset associated to ci
* @param hash an hashtable used to store itemset and their support
* @param pos the current position in the itemset ci
*/
private void recursiveAdd(int[] s, HashSet<Integer> seti, int[] ci, int cisupport, HashTableIT hash, int pos) {
// if we have reached the end of ci, then stop
if(pos >= ci.length){
return;
}
// if the itemset i contain the item as position pos in ci
if(!seti.contains(ci[pos])){
// create a new itemset "newS" by concatening the
// item as position pos inn ci with the itemset s.
// Note that the resulting itemset must be lexicographically ordered
// so we copy the item one by one and check where the item at
// position pos should be inserted.
int[] newS = new int[s.length+1]; // create the new itemset
int j=0; // current position
boolean added = false; //indicate if we have added the item at pos already
// for each item in s
for(Integer item : s){
// if added already or the current item is smaller than the one at pos
if(added || item < ci[pos]){
// we add the item from s
newS[j++] = item;
}else{
// otherwise, we insert the item at position pos
newS[j++] = ci[pos];
newS[j++] = item;
added = true; // we set that variable to true to not insert it twice!
}
}
// if the item at position pos was not yet added, that means
// that he should be inserted in the last position because he his
// greater than all other items
if(j < s.length+1){
newS[j++] = ci[pos];
}
// add the new itemset to the hashtable with the support of ci
hash.put(newS, cisupport);
// make a recursive call with the next position in ci with the new itemset
recursiveAdd(newS, seti, ci, cisupport, hash, pos+1);
}
// make a recursive call with the next position in ci with itemset "S"
recursiveAdd(s, seti, ci, cisupport, hash, pos+1);
}
}