package ca.pfv.spmf.algorithms.frequentpatterns.estDec; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Hashtable; import java.util.List; /** * This is an implementation of a estDecPlusTree. <br/> * <br/> * * This implementation was made by Azadeh Soltani <br/> * <br/> * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). <br/> * <br/> * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. <br/> * <br/> * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. <br/> * <br/> * * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. * * @see Algo_estDecPlus * @see CPTreeNode * @author Azadeh Soltani */ public class CPTree { private double N; // |Dk| private double d; // decay rate private double delta; // delta threshold // number of frequent itemsets found by mining the CPTree int patternCount = 0; // Hashtable for storing frequent itemsets into memory // (used if result is saved to memory) Hashtable<int[], Double> patterns; // writer used if result is saved to file private BufferedWriter writer; // the minsup parameter private double minsup; // the minsig parameter private double minsig; // the minmerge parameter private double minmerg; // the tree root CPTreeNode root; int[] itemsetBuffer = new int[500]; /** * Constructor */ CPTree(double decay, double mins, double minSigValue, double deltaValue, double minMergeValue) { N = 0; minsup = mins; minsig = minSigValue; minmerg = minMergeValue; d = decay; delta = deltaValue; root = new CPTreeNode(); // null node } /** * Set the decay rate * @param b decay base * @param h decay-base life */ void setDecayRate(double b, double h) { d = Math.pow(b, -1.0 / h); } /** * Method for updating parameters (Phase 1: parameter updating phase) */ void updateParams() { // |Dk| = |Dk| + 1 N = N + 1; } /******************************************************************** * Method for inserting new possible frequent itemsets to the tree based on * the new transaction (Phase 3 : Insertion Phase). * * @param transaction * the new transaction m *********************************/ void insertItemset(int[] transaction) { // create a new transaction List<Integer> transaction2 = new ArrayList<Integer>(); // add each item from the given transaction that has enough support to // the new transaction for (int item : transaction) { CPTreeNode child = root.getChildWithID(item, -1); if (child == null) // if i does not belong to the tree, insert a node m // representing i into the tree root.children.add(new CPTreeNode(item, root, (short) -1, 1)); else if (child.counter1 / (N) >= minsig) { transaction2.add(item); } } // insert the new transaction for (int ind = 0; ind < transaction2.size(); ++ind) { Integer item = transaction2.get(ind); CPTreeNode child = root.getChildWithID(item, -1); if (child != null) { itemsetBuffer[0]=item; insert_n_itemsets(child, (short) 0, transaction2, ind + 1, itemsetBuffer,1); } } } /******************************************************************** * Method for obtaining the count of an itemset while ignoring an item at a * given position. * * @param itemset * the itemset * @param pos * the index of the item to be ignored in the itemset ********************************************************************/ double getCountOfItemset(int[] itemset) { CPTreeNode currentNode = root.getChildWithID(itemset[0], -1); int ind = 1; short parentInd = 0; int l = 1; int oldPInd; CPTreeNode parentNode = currentNode; while (true) { if (ind >= itemset.length) break; oldPInd = parentInd; parentInd = currentNode.getInnerIndexWithID(itemset[ind], parentNode, parentInd); if (parentInd != -1) { ind++; l++; continue; } else { currentNode = currentNode.getChildWithID(itemset[ind], oldPInd); if (currentNode != null) { parentNode = currentNode; parentInd = 0; l = 1; ind++; } else return 0; } }// while return currentNode.estimateMergeCount(l, currentNode.getLongestLevel()); } /******************************************************************** * Method for estimating the count of n-itemset from its n-1 subsets * * @param currentNode * , transaction, index ********************************************************************/ double estimateCount(int[] itemset,int length) { double min = Double.MAX_VALUE; // We will consider each subset of length n-1 of the itemset "itemset" to // find the minimum count of its n-1 subsets. // This is done by removing each item from "itemset" one at a time and checking // the count of the resulting itemset // We create a temporary array for storing any n-1 itemset int[] itemset2 = new int[length-1]; for (int i = 0; i < length; ++i) { // copy all items except the i-th item to the temporary array System.arraycopy(itemset, 0, itemset2, 0, i); System.arraycopy(itemset, i+1, itemset2, i, length - i - 1); // get the count of the itemset without the i-th item double c = getCountOfItemset(itemset2); if (c==0) return 0; if (c < min) min = c; } return min; } /******************************************************************** * Recursive method for inserting all itemsets corresponding to the a * transaction * * @param currentNode * a tree node * @param PI * the inner parent index * * @param transaction * the transaction * @param ind * the depth of the current node with respect to the root of the tree * @param itemset * @param length * the length of the itemset * ********************************************************************/ public void insert_n_itemsets(CPTreeNode currentNode, short PI, List<Integer> transaction, int ind, int[] itemset, int length) { // stop recursion if (ind >= transaction.size()) return; int item = transaction.get(ind); itemsetBuffer[length] = item; // look if there is a node already in the est-Tree short PI2; PI2 = currentNode.getInnerIndexWithID(item, currentNode, PI); if (PI2 != -1) { insert_n_itemsets(currentNode, PI2, transaction, ind + 1, itemset,length+1); } else { CPTreeNode child = currentNode.getChildWithID(item, PI); if (child != null) insert_n_itemsets(child, (short) 0, transaction, ind + 1, itemset,length+1); else { if (currentNode.counter1 / N >= minsig) { double c = estimateCount(itemsetBuffer,length+1); if (c / N >= minsig) { child = new CPTreeNode(item, currentNode, PI, c); currentNode.children.add(child); if (((currentNode.counter1 - child.counter2) / N) < delta && (child.counter2 / N) > minmerg) merge(currentNode, child); } } } } insert_n_itemsets(currentNode, PI, transaction, ind + 1, itemset,length); } /******************************************************************** * Method for force pruning * * @param root * t ********************************************************************/ void forcePruning(CPTreeNode currentNode) { for (int i = 0; i < currentNode.children.size(); ++i) { CPTreeNode node = currentNode.children.get(i); if (node.counter1 / N < minsig && currentNode.itemIDList != null) { currentNode.children.remove(i--); }else { forcePruning(node); } } } /******************************************************************** * Recursive method for finding frequent patterns. * * @param root root of the current subtree * @param pattern current pattern * @throws IOException ********************************************************************/ void patternMining(CPTreeNode currentNode, int[] pattern) throws IOException { if (currentNode.itemIDList != null && currentNode.itemIDList.size() > 0) { // list of itemset of patterns corresponding to each itemIdList item List<int[]> itemsetList = new ArrayList<int[]>(); // Create a new pattern by concatening // itemset "pattern" with the first item of itemIDList int[] concatenation = new int[pattern.length+1]; System.arraycopy(pattern, 0, concatenation, 0, pattern.length); concatenation[pattern.length] = currentNode.itemIDList.get(0); itemsetList.add(concatenation); // Compute support of that pattern double s = currentNode.computeSupport(N, 1); // if the support is enough, save that pattern if (s >= minsup) { patternCount++; // if store into file if (patterns == null) { writeItemset(concatenation, s); } else { // else, store into memory patterns.put(concatenation, s); } } for (int i = 1; i < currentNode.itemIDList.size(); ++i) { int PIn = currentNode.parents.get(i).pInd; // create another pattern by concatenating the pattern // at position PIn with the i-th item of "itemIDList" int[] patternPIn = itemsetList.get(PIn); int[] concatenation2 = new int[patternPIn.length+1]; System.arraycopy(patternPIn, 0, concatenation2, 0, patternPIn.length); concatenation2[patternPIn.length] = currentNode.itemIDList.get(i); itemsetList.add(concatenation2); // Compute support of that pattern s = currentNode.computeSupport(N, currentNode.getLevel(i)); // If the support is enough, save that pattern if (s >= minsup) { patternCount++; // if store into file if (patterns == null) { writeItemset(concatenation2, s); } else { // else, store into memory patterns.put(concatenation2, s); } }// if s }// for i // Recursively mine using the child nodes of the current node for (CPTreeNode node : currentNode.children) { patternMining(node, itemsetList.get(node.parents.get(0).pInd)); } } } /******************************************************************** * Method for finding frequent patterns and save them into memory * * @param root * root of the current subtree ********************************************************************/ Hashtable<int[], Double> patternMining_saveToMemory() throws IOException { // Initialize hashtable for storing frequent patterns into memory patterns = new Hashtable<int[], Double>(); patternCount = 0; // recursive method for pattern mining for (CPTreeNode node : root.children) patternMining(node, new int[0]); return patterns; // return patterns found } /******************************************************************** * Method for finding frequent patterns and save them into file * * @param root * the root of the curent subtree * @param outputPath * the output file path * @throws IOException ********************************************************************/ void patternMining_saveToFile(String outputPath) throws IOException { patterns = null; // because we will not save into memory writer = new BufferedWriter(new FileWriter(outputPath)); patternCount = 0; // recursive method for pattern mining for (CPTreeNode node : root.children) patternMining(node, new int[0]); writer.close(); } /******************************************************************** * Method for writing frequent patterns in output file * * @param itemset * the pattern to be saved * @param support * a double value ********************************************************************/ void writeItemset(int[] itemset, double support) throws IOException { StringBuilder buffer = new StringBuilder(); // for each item for (Integer item : itemset) { // write the item buffer.append(item); buffer.append(" "); } // write the support buffer.append("#SUP: "); buffer.append(support); writer.write(buffer.toString()); writer.newLine(); } /** * merge two parameter nodes * * @param mp * the parent node * @param m * the child node */ public void merge(CPTreeNode mp, CPTreeNode m) { // System.out.println("MERGE"); int l = mp.itemIDList.size(); mp.itemIDList.addAll(m.itemIDList); mp.parents.add(m.parents.get(0)); for (int j = 1; j < m.parents.size(); ++j) { mp.parents.add(new ParentNode(mp, (short) (l + m.parents.get(j).pInd))); } for (CPTreeNode mc : m.children) { ParentNode p = mc.parents.get(0); p.pNode = mp; p.pInd = (short) (l + p.pInd); mc.parents.set(0, p); mp.children.add(mc); } if (mp.counter2>m.counter2) mp.counter2=m.counter2; mp.children.remove(m); } /** * split the node mf * * * @param m a node which should be split */ public void split(CPTreeNode m) { int longestLevel = m.getLongestLevel(); // int l = m.itemIDList.size(); for (int j = 1; j < m.itemIDList.size(); ++j) if (m.isLeafLevel(j)) { CPTreeNode m2 = new CPTreeNode(); m2.itemIDList.add(m.itemIDList.get(j)); m2.parents.add(m.parents.get(j)); m.itemIDList.set(j, null); m2.counter1 = m.estimateMergeCount(m.getLevel(j), longestLevel); m2.counter2 = m2.counter1; for (int k = m.children.size()-1; k >= 0; --k) { CPTreeNode mc = m.children.get(k); if (mc.parents.get(0).pInd == j) { // make mc be a child node of m2 mc.parents.set(0, new ParentNode(m2, (short) 0)); m.children.remove(mc); m2.children.add(mc); } } // make m2 be a child node of m m.children.add(m2); } for (int k = m.itemIDList.size()-1; k >= 0 ; --k) if (m.itemIDList.get(k) == null) { m.itemIDList.remove(k); m.parents.remove(k); for (int y = 1; y < m.parents.size(); ++y) { ParentNode x = m.parents.get(y); if (x.pInd >k) { x.pInd--; m.parents.set(y, x); } }// for for (CPTreeNode mx : m.children) { ParentNode x = mx.parents.get(0); if (x.pInd > k) { x.pInd--; mx.parents.set(0, x); } }// for }// if int newLongestLevel = m.getLongestLevel(); m.counter2 = m.estimateMergeCount(newLongestLevel, longestLevel); } /** * */ public void traverse(CPTreeNode m, CPTreeNode mp, int q, int[] transaction) { if (q != -1 && m.parents.get(0).pInd != q && m.parents.get(0).pNode != mp) return; if (Arrays.binarySearch(transaction, m.itemIDList.get(0)) < 0) return; m.update(d); if (m.counter1 / N < minsig ) { mp.children.remove(m); return; } else { List<Integer> leafCommonItemInds = new ArrayList<Integer>(); List<Integer> levelParents = new ArrayList<Integer>(); int i = 1; // level if (m.isLeafLevel(0)) leafCommonItemInds.add(0); else{ levelParents.add(0); while (true) { levelParents=FindLevelCommonItems(m, levelParents, leafCommonItemInds, transaction); if (levelParents.size() != 0) i++; else break; }//while }//else if (i==m.getLongestLevel()) m.counter2 = m.counter2 * d + 1; if ((mp.counter1 - m.counter2) / N < delta && m.counter2 / N >= minmerg) { if (mp != root ) merge(mp, m); } else if ((m.counter1 - m.counter2) / N > delta && m.counter2 / N >= minmerg && m.itemIDList.size() > 1) split(m); for (int j : leafCommonItemInds) { for (int f = 0; f < m.children.size(); ++f) { CPTreeNode mc = m.children.get(f); traverse(mc, m, j, transaction); } } }// else } /** * */ List<Integer> FindLevelCommonItems(CPTreeNode m, List<Integer> levelParents, List<Integer> leafCommonItemInds, int[] transaction) { ArrayList<Integer> newParents = new ArrayList<Integer>(); for (int k = levelParents.get(0)+1; k < m.itemIDList.size(); ++k) { if (Arrays.binarySearch(transaction, m.itemIDList.get(k)) >= 0) { int pInd=m.parents.get(k).pInd; if (levelParents.contains(pInd)){ newParents.add(k); if (m.isLeafLevel(k)) leafCommonItemInds.add(k); } else break; } }// for return newParents; } @Override /** * Method for getting a string representation of the CP-tree * (to be used for debugging purposes). * @return a string */ public String toString() { return root.toString(""); } int nodeCount(CPTreeNode currentNode) { int s=1; for (CPTreeNode child: currentNode.children) s+= nodeCount(child); return s; } }// class