package ca.pfv.spmf.algorithms.frequentpatterns.apriori_HT; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.util.ArrayList; import java.util.List; import ca.pfv.spmf.algorithms.ArraysAlgos; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; /** * This class represents an itemset hash tree as used by the AprioriHT algorithm * (a version of Apriori implemented with a hash tree). * <br/><br/> * * In the original Apriori paper it is suggested to not subdivide a node until * there is enough itemsets in that node. In this implementations, all nodes are * always subdivided. * * @see AlgoAprioriHT * @author Philippe Fournier-Viger */ public class ItemsetHashTree { // this constant indicates how many child nodes a node should have private int branch_count = 30; // the size of the itemsets that are inserted into this tree private int itemsetSize; // the number of itemsets that have been inserted into this tree int candidateCount; // the root node of the tree InnerNode root; // the last leaf node that was added to the tree LeafNode lastInsertedNode = null; /** * Constructor * @param itemsetSize the size of the itemsets that will be inserted in the tree */ public ItemsetHashTree(int itemsetSize, int branch_count){ this.itemsetSize = itemsetSize; this.branch_count = branch_count; root = new InnerNode(); // create root node } /** * Inserts an itemset in the hash-tree * @param itemset the itemset to be inserted */ public void insertCandidateItemset(Itemset itemset){ candidateCount++; // increase the counter for the number of itemsets in the tree insertCandidateItemset(root, itemset, 0); // insert the itemset } /** * Inserts an itemset in the hash-tree (this is called recursively to search where * to insert the itemset) * @param node the current node to be explored * @param itemset the itemset to be inserted * @param level the current level in the tree (root = level 1 ...) */ private void insertCandidateItemset(Node node, Itemset itemset, int level){ // use the modulo to know which child we should explore int branchIndex = itemset.itemset[level] % branch_count; // if we have reached the level of leaf nodes if(node instanceof LeafNode){ // insert the itemset in the appropriate list of the leaf node List<Itemset> list = ((LeafNode)node).candidates[branchIndex]; if(list == null){ list = new ArrayList<Itemset>(); ((LeafNode)node).candidates[branchIndex] = list; } list.add(itemset); }else{ Node nextNode = ((InnerNode)node).childs[branchIndex]; if(nextNode == null){ if(level == itemsetSize - 2){ nextNode = new LeafNode(); ((LeafNode)nextNode).nextLeafNode = lastInsertedNode; lastInsertedNode = (LeafNode)nextNode; }else{ nextNode = new InnerNode(); } ((InnerNode)node).childs[branchIndex] = nextNode; } insertCandidateItemset(nextNode, itemset, level+1); } } /** * Abstract class for a node in the hash-tree. */ abstract class Node{ } /** * Class for nodes that are not a leaves in the hash-tree. */ class InnerNode extends Node{ Node childs[ ] = new Node[branch_count]; // contains a list of child nodes } /** * Class for leaf nodes in the hash-tree. */ class LeafNode extends Node{ // contains a list of list of candidates final List<Itemset> [] candidates = new ArrayList[branch_count]; // a pointer to the leaf node that was created just before this one. // It is used to navigate quickly between leaves. LeafNode nextLeafNode = null; } /** * This method increase the support count of all itemsets contained in the hash-tree * that are contained in a transaction. * @param transaction the transaction. */ public void updateSupportCount(int[] transaction) { updateSupportCount(transaction, root, 0, new int[]{}); } /** * Recursive method for increasing the support count of all itemsets contained in the hash-tree * that are contained in a transaction. * @param transaction the transaction * @param node the current node that is explored * @param firstPositionToCheck the current position in the transaction to be explored * @param prefix the current prefix that is being explored */ private void updateSupportCount(int[] transaction, InnerNode node, int firstPositionToCheck, int [] prefix) { // the index of the last item in the transaction int lastPosition = transaction.length -1; // the index of the last item that can be the first item in lexical order of an itemset in the transaction int lastPositionToCheck = transaction.length - itemsetSize + prefix.length; // for each item until lastPositionToCheck for(int i=firstPositionToCheck; i <= lastPositionToCheck; i++){ int itemI = transaction[i]; // calculate which branch to explore int branchIndex = itemI% branch_count; Node nextNode = node.childs[branchIndex]; if(nextNode == null){ // there is no node, so nothing to do! we stop exploring this path... }else if(nextNode instanceof InnerNode){ // if the node is not a leaf node, // we create the new prefix by adding item i (the current item) int [] newPrefix = new int[prefix.length+1]; System.arraycopy(prefix, 0, newPrefix, 0, prefix.length); newPrefix[prefix.length] = itemI; // we call the method recursively updateSupportCount(transaction, (InnerNode) nextNode, i+1, newPrefix); }else{ // if the node is a leaf node LeafNode theNode = (LeafNode) nextNode; // we search for an additional item that could be added for(int j= i+1; j <= lastPosition; j++){ int itemJ = transaction[j]; // we check which branch int branchIndexNextNode = itemJ% branch_count; List<Itemset> listCandidates = theNode.candidates[branchIndexNextNode]; // if the branch is not null if(listCandidates != null){ // we check if the resulting itemset is in this branch. for(Itemset candidate: listCandidates){ // if so, we increase its support count if(sameAsPrefix(candidate.itemset, prefix, itemI, itemJ)){ candidate.support++; } } } } } } } /** * This method checks if an itemset exists in the tree. * @param itemset the itemset * @param posRemoved the position of an item that should be ignored in this itemset * @return true if the itemset appears in the tree */ public boolean isInTheTree(int[] itemset, int posRemoved) { // we start from the root.. Node node = root; int count = 0; // we will consider each item of the itemset to go down in the hash tree loop: for(int i=0; i< itemset.length; i++){ // if the current item is the item to be ignored, we ignore it if(i== posRemoved){ continue; } count++; // we check which branch we should explore int branchIndex = itemset[i] % branch_count; // if this is the last item of the itemset, this node is a leaf node if(count == itemsetSize){ // if the leaf node is null, the itemset is not there, so we return false. if(node == null){ return false; } // we check the appropriate branch of the leaf node List<Itemset> list = ((LeafNode)node).candidates[branchIndex]; // if it is null, then the itemset is not there if(list == null){ return false; } // Otherwise, we perform a binary search to check if the itemset // appear there. int first = 0; int last = list.size() - 1; // the binary search while( first <= last ) { int middle = ( first + last ) / 2; if(ArraysAlgos.sameAs(list.get(middle).getItems(), itemset, posRemoved) < 0 ){ first = middle + 1; // the itemset compared is larger than the subset according to the lexical order } else if(ArraysAlgos.sameAs(list.get(middle).getItems(), itemset, posRemoved) > 0 ){ last = middle - 1; // the itemset compared is smaller than the subset is smaller according to the lexical order } else{ break loop; // It was found, so we return true; } } return false; // it was not found }else{ // if it is not a leaf node if(node == null){ return false; } // we explore the next node in the appropriate branch node = ((InnerNode)node).childs[branchIndex]; } } return true; } /** * A method that check if an itemset is equal to another itemset called "prefix" + itemI + itemJ * @param itemset1 the itemset * @param prefix the itemset called "prefix" * @param itemI an item that should be appended to prefix * @param itemJ a second item that should be appended to prefix * @return */ private boolean sameAsPrefix(int [] itemset1, int [] prefix, int itemI, int itemJ) { for(int i=0; i < prefix.length; i++){ if(itemset1[i] != prefix[i]){ return false; } } return itemset1[itemset1.length -2] == itemI && itemset1[itemset1.length -1] == itemJ; } // // we use binary search to quickly find where to insert the itemset. // private void insertInOrder(Itemset candidate, List<Itemset> level) { // if(level.size() ==0){ // level.add(candidate); // }else{ // int insertPos = Collections.binarySearch(level, candidate, new Comparator<Itemset>(){ // @Override // public int compare(Itemset arg0, Itemset arg1) { // return sameAs(arg0, arg1.items); // REMOVE THE -1, it is useless... // } // }); // level.add(-insertPos - 1, candidate); // } // } // // // private String toString(int[] newItemset) { // StringBuilder temp = new StringBuilder(); // for(Integer integer: newItemset){ // temp.append(integer); // temp.append(" "); // } // return temp.toString(); // } }