package ca.pfv.spmf.algorithms.frequentpatterns.itemsettree; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import ca.pfv.spmf.algorithms.ArraysAlgos; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; import ca.pfv.spmf.tools.MemoryLogger; /** * This is the original implementation of the Memory Efficient Itemset-tree as * proposed in: * * Fournier-Viger, P., Mwamikazi, E., Gueniche, T., Faghihi, U. (2013). Memory * Efficient Itemset Tree for Targeted Association Rule Mining. * Proc. 9th International Conference on Advanced Data Mining and Applications * (ADMA 2013) Part II, Springer LNAI 8347, pp. 95-106. * * Copyright (c) 2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ public class MemoryEfficientItemsetTree extends AbstractItemsetTree implements Serializable { private static final long serialVersionUID = 1L; long sumBranchesLength; // sum of branches length int totalNumberOfBranches; // total number of branches // This variable is commented and was only used // for testing the performance of random queries // HashSet<Integer> items = new HashSet<Integer>(); /** * Default constructor */ public MemoryEfficientItemsetTree() { super(); } /** * Build the itemset-tree based on an input file containing transactions * @param input an input file * @throws IOException exception if error while reading the file */ public void buildTree(String input) throws IOException { // record start time startTimestamp = System.currentTimeMillis(); // reset memory usage statistics MemoryLogger.getInstance().reset(); // create an empty root for the tree root = new ItemsetTreeNode(null, 0); // Scan the database to read the transactions BufferedReader reader = new BufferedReader(new FileReader(input)); String line; // for each line (transaction) until the end of file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the transaction into items String[] lineSplited = line.split(" "); // create a structure for storing the transaction int[] itemset = new int[lineSplited.length]; // for each item in the transaction for (int i=0; i< lineSplited.length; i++) { // convert the item to integer and add it to the structure itemset[i] = Integer.parseInt(lineSplited[i]); // The next line is commented and was only used // for testing the performance of random queries //items.add(itemset[i]); } // printTree(); // call the method "construct" to add the transaction to the tree construct(null, root, itemset, null); // System.out.println("."); } // close the input file reader.close(); // check the memory usage MemoryLogger.getInstance().checkMemory(); // close the file endTimestamp = System.currentTimeMillis(); } /** * Add a transaction to the itemset tree. * @param transaction the transaction to be added (array of ints) */ public void addTransaction(int[] transaction){ // call the "construct" algorithm to add it construct(null, root, transaction, null); } /** * Given the root of a sub-tree, add an itemset at the proper position in that tree * @param r the root of the sub-tree * @param s the itemset to be inserted * @param prefix the current item(s) explored in this branch of the tree until the current node r. */ private void construct(ItemsetTreeNode parentOfR, ItemsetTreeNode r, int[] s, int[] prefix) { // if the itemset in root node is the same as the one to be inserted, // we just increase the support, and return. if(same(s, prefix, r.itemset)){ r.support++; return; } int[] rprefix = append(prefix, r.itemset); // if the node to be inserted is an ancestor of the itemset of the root node // then insert the itemset between r and its parent // Before: parent_of_r --> r // After: parent_of_r --> s --> r // e.g. for a regular itemset tree // {2}:4 --> {2,3,4,5,6}:6 // we insert {2,3} // {2}�4 --> {2,3}:7 --> {2,3,4,5,6}:6 // e.g. for a compact itemset tree // r_parent r // {2}:4 --> {3,4,5,6}:6 // we insert s={2,3} // r_parent s' r' // {2}:4 --> {3}:7 --> {4,5,6}:6 if(ancestorOf(s, rprefix)){ // Calculate s' and r' by using the prefix int[] sprime = copyItemsetWithoutItemsFrom(s, prefix); int[] rprime = copyItemsetWithoutItemsFrom(rprefix, sprime); // create a new node for the itemset to be inserted with the support of // the subtree root node + 1 ItemsetTreeNode newNodeS = new ItemsetTreeNode(sprime, r.support +1); // set the childs and parent pointers. newNodeS.childs.add(r); parentOfR.childs.remove(r); parentOfR.childs.add(newNodeS); // r.parent = newNodeS; r.itemset = rprime; return; // return } // Otherwise, calculate the largest common ancestor // of the itemset to be inserted and the root of the sutree int[] l = getLargestCommonAncestor(s, rprefix); if(l != null){ // if there is one largest common ancestor int[] sprime = copyItemsetWithoutItemsFrom(s, l); int[] rprime = copyItemsetWithoutItemsFrom(r.itemset, l); // create a new node with that ancestor and the support of // the root +1. ItemsetTreeNode newNode = new ItemsetTreeNode(l, r.support +1); // set the node childs and parent pointers newNode.childs.add(r); parentOfR.childs.remove(r); parentOfR.childs.add(newNode); // parentOfR = newNode; r.itemset = rprime; // append second children which is the itemset to be added with a // support of 1 ItemsetTreeNode newNode2 = new ItemsetTreeNode(sprime, 1); // update pointers for the new node newNode.childs.add(newNode2); // newNode2.parent = newNode; return; } // else get the length of the root itemset int indexLastItemOfR = (rprefix == null)? 0 : rprefix.length; // increase the support of the root r.support++; // for each child of the root for(ItemsetTreeNode ci : r.childs){ int[] ciprefix = append(rprefix, ci.itemset); // if one children of the root is the itemset to be inserted s, // then increase its support and stop if(same(s, ciprefix)){ // case 2 ci.support++; return; } // if the itemset to be inserted is an ancestor of the child ci if(ancestorOf(s, ciprefix)){ // case 3 int[] sprime = copyItemsetWithoutItemsFrom(s, rprefix); int[] ciprime = copyItemsetWithoutItemsFrom(ci.itemset, s); // create a new node between ci and r in the tree // and update child /parents pointers ItemsetTreeNode newNode = new ItemsetTreeNode(sprime, ci.support+ 1); newNode.childs.add(ci); // newNode.parent = r; r.childs.remove(ci); r.childs.add(newNode); // ci.parent = newNode; ci.itemset = ciprime; return; } // if the child ci is an ancestor of s if(ancestorOf(ciprefix, s)){ // case 4 // then make a recursive call to construct to handle this case. construct(r, ci, s, rprefix); return; } // case 5 // if ci and s have a common ancestor that is larger than r: if(ciprefix[indexLastItemOfR] == s[indexLastItemOfR]){ // find the largest common ancestor int[] ancestor = getLargestCommonAncestor(s, ciprefix); // create a new node for the ancestor itemset just found with the support // of ci + 1 int[] ancestorprime = copyItemsetWithoutItemsFrom(ancestor, rprefix); ItemsetTreeNode newNode = new ItemsetTreeNode(ancestorprime, ci.support+ 1); // set r as parent // newNode.parent = r; r.childs.add(newNode); // add ci as a children of the new node ci.itemset = copyItemsetWithoutItemsFrom(ci.itemset, ancestorprime); newNode.childs.add(ci); // ci.parent = newNode; r.childs.remove(ci); // create another new node for s with a support of 1, which // will be the child of the first new node int[] sprime = copyItemsetWithoutItemsFromArrays(s, ancestorprime, rprefix); ItemsetTreeNode newNode2 = new ItemsetTreeNode(sprime, 1); // newNode2.parent = newNode; newNode.childs.add(newNode2); // end return; } } // Otherwise, case 1: // A new node is created for s with a support of 1 and is added // below the node r. int[] sprime = copyItemsetWithoutItemsFrom(s, rprefix); ItemsetTreeNode newNode = new ItemsetTreeNode(sprime, 1); // newNode.parent = r; r.childs.add(newNode); } /** * Make a copy of an itemset while removing items that appears in * two itemsets named "prefix" and "s". * @param r the itemset * @param prefix the other itemset named "prefix" * @param s the other itemset named "s" * @return the itemset */ private int[] copyItemsetWithoutItemsFromArrays(int[] r, int[] prefix, int[] s) { // create an empty itemset List<Integer> rprime = new ArrayList<Integer>(r.length); // for each item in r loop1: for(Integer rvalue : r){ // if the other itemset prefix is not null if(prefix != null){ // for each item from the prefix for(int pvalue : prefix){ // if it is the current item in r if(pvalue == rvalue){ // skip this item from r continue loop1; // if the current item from prefix is larger // than the current item from r, // then break because itemsets are lexically ordered // so there will be no match. }else if(pvalue > rvalue){ break; } } } // if s is not null if(s != null){ // for each item in s for(int svalue : s){ // if this item in s is the current item in r if(rvalue == svalue){ // skip it (don't add it to the new itemset) continue loop1; // if the current item from s is larger // than the current item from r, // then break because itemsets are lexically ordered // so there will be no match. }else if(svalue > rvalue){ break; } } } rprime.add(rvalue); } // transform the new itemset "rprime" from ArrayList // to an array. int[] rprimeArray = new int[rprime.size()]; for(int i=0; i< rprime.size(); i++){ rprimeArray[i] = rprime.get(i); } // return the array return rprimeArray; } /** * Make a copy of an itemset without items from a second itemset. * @param itemset1 the first itemset * @param itemset2 the second itemset * @return the new itemset */ private int[] copyItemsetWithoutItemsFrom(int[] itemset1, int[] itemset2) { // if the second itemset is null, just return the first itemset if(itemset2 == null){ return itemset1; } // create a new itemset List<Integer> itemset1prime = new ArrayList<Integer>(itemset1.length); // for each item in the first itemset loop1: for(int i1value : itemset1){ // for each it in the second itemset for(int i2value : itemset2){ // if the items match, don't add the current item // from itemset1 to the new itemset if(i2value == i1value){ continue loop1; // otherwise, if the current item from "itemset2" // is larger than the current item from "itemset1" // there will be no match because itemsets are // lexically ordered. }else if(i2value > i1value){ break; } } // if the current item from itemset1 was not in itemset2, // then add it to the new itemset itemset1prime.add(i1value); } // convert the new itemset from an ArrayList to an array int[] itemset1primeArray = new int[itemset1prime.size()]; for(int i=0; i< itemset1prime.size(); i++){ itemset1primeArray[i] = itemset1prime.get(i); } // return the array return itemset1primeArray; } /** * Check if itemset1 is the same as the concatenation of prefix and itemset2 * @param itemset1 the first itemset * @param prefix a prefix * @param itemset2 another itemset * @return true if the same otherwise false */ private boolean same(int[] itemset1, int[] prefix, int[] itemset2) { if(prefix == null) { return same(itemset1, itemset2); } // if one is null, then returns false if(itemset2 == null || itemset1 == null){ return false; } // if they don't have the same size, then they cannot // be equal if(itemset1.length != itemset2.length + prefix.length){ return false; } // otherwise, loop on items from itemset1 // and check if they are the same as itemset 2 int i = 0; while(i < prefix.length){ if(itemset1[i] != prefix[i]){ // if one is different then they are not the same return false; } i++; } int j = 0; while(j< itemset2.length){ if(itemset1[j++] != itemset2[i++]){ // if one is different then they are not the same return false; } } // otherwise they are the same return true; } /** * Method that append two itemsets to create a larger one * @param a1 the first itemset * @param a2 the second itemset * @return the new itemset */ public int[] append(int[] a1, int[] a2){ //if the first itemset is null, return the second one if(a1 == null){ return a2; } //if the second itemset is null, return the first one if(a2 == null){ return a1; } // create the new itemset int[] newArray = new int[a1.length + a2.length]; // copy the first itemset in the new itemset int i=0; for(; i< a1.length; i++){ newArray[i] = a1[i]; } // copy the second itemset in the new itemset for(int j =0; j< a2.length; j++){ newArray[i++] = a2[j]; } // return the new itemset return newArray; } /** * Print statistics about the time and maximum memory usage for the construction * of the itemset tree. */ public void printStatistics() { System.gc(); System.out.println("========== MEMORY EFFICIENT ITEMSET TREE CONSTRUCTION - STATS ============"); System.out.println(" Tree construction time ~: " + (endTimestamp - startTimestamp) + " ms"); System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory()); nodeCount = 0; totalItemCountInNodes = 0; sumBranchesLength = 0; totalNumberOfBranches = 0; recursiveStats(root, 1); System.out.println(" Node count: " + nodeCount); System.out.println(" Sum of items in all node: " + totalItemCountInNodes + " avg per node :" + totalItemCountInNodes / ((double)nodeCount)); System.out.println("====================================="); } /** * Recursive method to calculate statistics about the itemset tree * @param root the root node of the current subtree * @param length the cummulative sum of length of itemsets */ private void recursiveStats(ItemsetTreeNode root, int length) { // if the root is not null or the empty set if(root != null && root.itemset!=null){ // increase node count nodeCount++; // increase the total number of items totalItemCountInNodes += root.itemset.length; } // for each child node, make a recursive call for(ItemsetTreeNode node : root.childs){ recursiveStats(node, ++length); } // if no child, this node is a leaf, so // add the cummulative length of this branch to the sum // and add 1 to the total number of branches. if(root.childs.size() == 0) { sumBranchesLength += length; totalNumberOfBranches += 1; } } /** * Print the tree to System.out. */ public void printTree() { System.out.println(root.toString(new StringBuilder(),"")); } /** * Return a string representation of the tree. */ public String toString() { return root.toString(new StringBuilder(), ""); } /** * Get the support of a given itemset s. * @param s the itemset * @return the support as an integer. */ public int getSupportOfItemset(int[] s) { return count(s, root, new int[0]); // call the method count. } /** * This method calculate the support of an itemset by using a subtree * defined by its root. * * Note: this is implemented based on the algorithm "count" of Table 2 in the paper by Kubat et al. // Note that there was a few problem in the algorithm in the paper. // I had to change > by < in : ci.itemset[ci.itemset.length -1] < s[s.length -1]){ // also the count was not correct so i had to change the way it counted the support a little bit // by using += instead of return. * * @param s the itemset * @param root the root of the subtree * @param startFrom the items to match starting from position j in s * @return the support as an integer */ private int count(int[] s, ItemsetTreeNode root, int[] prefix) { // the variable count will be used to count the support int count =0; // for each child of the root for(ItemsetTreeNode ci : root.childs){ // if the first item of the itemset that we are looking for // is smaller than the first item of the child, we need to look // further in that tree. int[] ciprefix = append(prefix, ci.itemset); if(ciprefix[0] <= s[0]){ // if s is included in ci, add the support of ci to the current count. if(ArraysAlgos.includedIn(s, ciprefix)){ count += ci.support; }else if(ciprefix[ciprefix.length -1] < s[s.length -1]){ // otherwise, if the last item of ci is smaller than // the last item of s, then make a recursive call to explore // the subtree where ci is the root count += count(s, ci, ciprefix); } } } // return the total count return count; } /** * Get the frequent itemsets subsuming a given itemset for a given minimum support value. * @param is the itemset * @param minsup the minimum support threshold (integer) * @return an hashtable containing the frequent itemsets */ public HashTableIT getFrequentItemsetSubsuming(int[] is, int minsup) { // call the recursive method HashTableIT hashTable = getFrequentItemsetSubsuming(is); // after finding the itemsets we do a loop to remove those with a support lower than minsup, // This does not seems efficient but that is how the authors of the paper do it. // for each position in the internal array of the hash table for(List<Itemset> list : hashTable.table){ // if that position is not empty if(list != null){ // loop over the itemsets stored at that position Iterator<Itemset> it = list.iterator(); while (it.hasNext()) { // if the itemset is infrequent, remove it Itemset itemset = (Itemset) it.next(); if(itemset.support < minsup){ it.remove(); } } } } // then we return the hash table return hashTable; } /** * This method pass through the itemset tree to get all itemsets * that are subsuming a given itemset "s" and their support. Note that * this method may also return infrequent itemsets that can be filtered by * additional processing after. * @param s the itemset * @return an hashtable countaining itemsets and their support. */ public HashTableIT getFrequentItemsetSubsuming(int[] s){ // create a hash table to contain the itemsets to be more efficient // we set the default size of the internal array to 1000 HashTableIT hash = new HashTableIT(1000); // create an hashset to store the items of the itemset HashSet<Integer> seti = new HashSet<Integer>(); for(int i=0; i< s.length; i++){ seti.add(s[i]); } // call the method selective mining for finding the sets subsuming s selectiveMining(s, seti, root, hash, null); return hash; } /** * This method finds itemsets subsuming a given itemset. It is a recursive method that * scan a subtree of the itemset-tree. It stores itemsets found in an hashtable together * with their support. * @param s the itemset s * @param seti the items from the itemset s stored in a HashSet<Integer> for more efficiency for inclusion checking * @param t the root of the subtree * @param hash the hashtable for storing the result * @return the cumulative support of the t's immediate children. This is needed to ensure t is correctly incorporated into the hashtable (and with the correct support). */ private int selectiveMining(int[] s, HashSet<Integer> seti, ItemsetTreeNode t, HashTableIT hash, int[] prefix) { //initializes the running cumulative support of t's immediate children int childrenSup = 0; // for all child nodes of the given root of the subtree for(ItemsetTreeNode ci : t.childs){ //Add ci's support to the cumulative count childrenSup += ci.support; int[] ciprefix = append(prefix, ci.itemset); // if the first item of s is smaller or equal to the // first item of the child if(ciprefix[0] <= s[0]){ // Check if s is included in ci if(ArraysAlgos.includedIn(s, ciprefix)){ // if ci has not child, put s in the hashtable with // the support of ci, and then // call recursive add. // Note: This part is not explained correctly in the paper, // i had to figure it out by myself and fix it. if(ci.childs.size() ==0){ hash.put(s, ci.support); recursiveAdd(s, seti, ciprefix, ci.support, hash, 0); }else{ // otherwise recursively explore subtree with ci as root. // Note, we subtract the count returned by selectiveMining (which contains the cumulative //support of the ci's immediate children) from ci's support //remainingSup thus indicates how many times ci's itemset appeared by itself in the database) int remainingSup = ci.support - selectiveMining(s, seti, ci, hash, ciprefix); //If remainingSup is greater than 0, then it means ci's children do not fully account for all of //the occurrences of ci's itemset. In other words, ci's itemset appeared by itself remainingSup times //Hence, we need to put s in the hashtable with remainingSup, and then call recursive add on ci //with remainingSup. if (remainingSup > 0) { hash.put(s, remainingSup); recursiveAdd(s, seti, ciprefix, remainingSup, hash, 0); } } } else if(ciprefix[ciprefix.length -1] < s[s.length -1]){ // else if the last item of ci is smaller than the last // item of s, we also need to recursively explore subtree // with ci as root. selectiveMining(s, seti, ci, hash, ciprefix); } } } return childrenSup; } /** * Perform a recursive add (as based on the procedure presented in the paper by Kubat et al.) * @param s an itemset s * @param seti the items from the itemset s in a HashSet of integers * @param ci an itemset tree node ci * @param cisupport the support of the itemset associated to ci * @param hash an hashtable used to store itemset and their support * @param pos the current position in the itemset ci */ private void recursiveAdd(int[] s, HashSet<Integer> seti, int[] ci, int cisupport, HashTableIT hash, int pos) { // if we have reached the end of ci, then stop if(pos >= ci.length){ return; } // if the itemset i contain the item as position pos in ci if(!seti.contains(ci[pos])){ // create a new itemset "newS" by concatening the // item as position pos inn ci with the itemset s. // Note that the resulting itemset must be lexicographically ordered // so we copy the item one by one and check where the item at // position pos should be inserted. int[] newS = new int[s.length+1]; // create the new itemset int j=0; // current position boolean added = false; //indicate if we have added the item at pos already // for each item in s for(Integer item : s){ // if added already or the current item is smaller than the one at pos if(added || item < ci[pos]){ // we add the item from s newS[j++] = item; }else{ // otherwise, we insert the item at position pos newS[j++] = ci[pos]; newS[j++] = item; added = true; // we set that variable to true to not insert it twice! } } // if the item at position pos was not yet added, that means // that he should be inserted in the last position because he his // greater than all other items if(j < s.length+1){ newS[j++] = ci[pos]; } // add the new itemset to the hashtable with the support of ci hash.put(newS, cisupport); // make a recursive call with the next position in ci with the new itemset recursiveAdd(newS, seti, ci, cisupport, hash, pos+1); } // make a recursive call with the next position in ci with itemset "S" recursiveAdd(s, seti, ci, cisupport, hash, pos+1); } }