package ca.pfv.spmf.algorithms.frequentpatterns.cfpgrowth; /* This file is copyright (c) 2008-2013 Azadeh Soltani * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets; import ca.pfv.spmf.tools.MemoryLogger; /** * This is an implementation of the CFPGrowth++ algorithm. CFPGrowth++ was proposed in this paper: * <br/><br/> * * Kiran, R. U., & Reddy, P. K. (2011). Novel techniques to reduce search space * in multiple minimum supports-based frequent pattern mining algorithms. * In Proceedings of the 14th International Conference on Extending Database * Technology, ACM (pp. 11-20). * * and it is an optimization of the original CFPGrowth algorithm: * * Hu, Y. H., & Chen, Y. L. (2006). Mining association rules with multiple minimum supports: a new mining algorithm and a support tuning mechanism. Decision Support Systems, 42(1), 1-24. * <br/><br/> * * This is an optimized version that saves the result to a file * or keep it into memory if no output path is provided * by the user to the runAlgorithm method(). * * This implementation was made by Azadeh Soltani based on the FPGrowth * implementation by Philippe Fournier-Viger * * @see MISNode * @see MISTree * @author Azadeh Soltani */ public class AlgoCFPGrowth { // for statistics private long startTimestamp; // start time of the latest execution private long endTime; // end time of the latest execution private int transactionCount = 0; // transaction count in the database private int itemsetCount; // number of freq. itemsets found // object to write the output file BufferedWriter writer = null; // The patterns that are found // (if the user want to keep them into memory) protected Itemsets patterns = null; // the comparator that is used to compare the item ordering final Comparator<Integer> itemComparator; // array indicating the minimum support for each item int MIS[]; // the minimum MIS int minMIS; /** Object to check the maximum memory usage */ private MemoryLogger memoryLogger = null; /** * Constructor */ public AlgoCFPGrowth() { // Create a comparator that will be used to establish a total // order between items. itemComparator = new Comparator<Integer>() { public int compare(Integer o1, Integer o2) { // compare according to MIS value int compare = MIS[o2] - MIS[o1]; if (compare == 0) { // if the same MIS, we check the lexical // ordering! return (o1 - o2); } return compare; } }; } /** * Run the algorithm. * @param input the path to an input file containing a transaction database. * @param output the output file path for saving the result (if null, the result * will be returned by the method instead of being saved). * @param MISIn path to a file containing the MIS thresholds. * @return the result if no output file path is provided. * @throws IOException if error reading/writing files */ public Itemsets runAlgorithm(String input, String output, String MISIn) throws FileNotFoundException, IOException { // record start time startTimestamp = System.currentTimeMillis(); //initialize tool to record memory usage memoryLogger = new MemoryLogger(); memoryLogger.checkMemory(); // if the user want to keep the result into memory if(output == null){ writer = null; patterns = new Itemsets("FREQUENT ITEMSETS"); }else{ // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(output)); } // (1) PREPROCESSING: Perform an initial database scan to determine the // MIS of each item // This map is used to count the support of each item // Key: item Value: support final Map<Integer, Integer> mapSupport = new HashMap<Integer, Integer>(); // az---initializing MISs-------------- initMISfromFile(MISIn); // reset the number of frequent itemsets to 0 itemsetCount = 0; // (2) Scan the database to build the initial FP-Tree // Before inserting a transaction in the FPTree, we sort the items // by decreasing order of MIS. MISTree tree = new MISTree(); BufferedReader reader = new BufferedReader(new FileReader(input)); String line; // read the transaction database line (transaction) by line // until the end of file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the current transaction into items (they are separated by spaces) String[] lineSplited = line.split(" "); List<Integer> transaction = new ArrayList<Integer>(); // for each item in the transaction for (String itemString : lineSplited) { // convert item to integer Integer item = Integer.parseInt(itemString); // increase the support of the item by 1 Integer count = mapSupport.get(item); if (count == null) { mapSupport.put(item, 1); } else { mapSupport.put(item, ++count); } // all items are added to transactions transaction.add(item); } transactionCount++; // increase the number of transactions // sort item in the transaction by non increasing order of MIS Collections.sort(transaction, this.itemComparator); // add the sorted transaction to the MISTree. tree.addTransaction(transaction); }// while reader.close(); // close the input file // tree.print(tree.root); // We create the header table for the tree tree.createHeaderList(this.itemComparator); // We search for for items with support smaller than minMIS and remove // them from the tree boolean sw = false; // for each item for (Entry<Integer, Integer> entry : mapSupport.entrySet()) { // if the support is lower than the minimum MIS value if (entry.getValue() < minMIS) { // remove from header list tree.deleteFromHeaderList(entry.getKey(), itemComparator); // System.out.println(entry.getKey()); // remove from the tree tree.MISPruning(entry.getKey()); // System.out.println(entry.getKey()); // tree.print(tree.root); sw = true; }// if }// for // merge child node with the same item id if (sw == true) { tree.MISMerge(tree.root); } // tree.print(tree.root); // (5) We start to mine the FP-Tree by calling the recursive method. // Initially, prefix alpha is empty. int[] prefixAlpha = new int[0]; if(tree.headerList.size() > 0) { cfpgrowth(tree, prefixAlpha, transactionCount, mapSupport); } // check the memory usage memoryLogger.checkMemory(); // close the output file if the result was saved to a file if(writer != null){ writer.close(); } // record end time endTime = System.currentTimeMillis(); return patterns; } /** * Read MIS values from the MIS file. * @param input path to the file containing the MIS values * @throws IOException if error occurs while reading the file or the file does not exist */ private void initMISfromFile(String input) throws FileNotFoundException, IOException { // create object to read the file BufferedReader reader = new BufferedReader(new FileReader(input)); String line; minMIS = Integer.MAX_VALUE; // to store the minimum MIS value int maxItemID = 0; // to store the largest item id // A map to store the MIS of each item // key : item value : MIS final Map<Integer, Integer> mapMIS = new HashMap<Integer, Integer>(); // For reach line (transaction) until the end of the file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the line according to spaces String[] lineSplited = line.split(" "); // convert item to integer Integer item = Integer.parseInt(lineSplited[0]); // convert MIS to integer Integer itemMIS = Integer.parseInt(lineSplited[1]); // update minimum MIS if necessary if ((minMIS > itemMIS) && (itemMIS != 0)) { minMIS = itemMIS; } // record the MIS for that item in the map mapMIS.put(item, itemMIS); // update maximum item ID if necessary if (item > maxItemID) { maxItemID = item; } } // Store the values from the map in an array for more efficiency MIS = new int[maxItemID + 1]; for (Entry<Integer, Integer> entry : mapMIS.entrySet()) { MIS[entry.getKey()] = entry.getValue(); } // close the file reader.close(); } // /** // * This method used the frequency of items to generate an MIS value by // * using the function presented in the article of MISApriori // * (This method is an alternative to initMISfromFile() and is not used // * by default). // * @param input the input file // * @param mapSupport a // * @param beta // * @param LS // * @return // * @throws FileNotFoundException // * @throws IOException // */ // private int initMISfromFrequency(String input, // final Map<Integer, Integer> mapSupport, double beta, double LS) // throws FileNotFoundException, IOException { // int maxItemID = 0; // BufferedReader reader = new BufferedReader(new FileReader(input)); // String line; // // for each transaction // while (((line = reader.readLine()) != null)) { // String[] lineSplited = line.split(" "); // // for each item in the transaction // for (String itemString : lineSplited) { // // increase the support count of the item // Integer item = Integer.parseInt(itemString); // Integer count = mapSupport.get(item); // if (count == null) { // mapSupport.put(item, 1); // // az // if (maxItemID < item) // maxItemID = item; // } else { // mapSupport.put(item, ++count); // } // } // transactionCount++; // } // reader.close(); // MIS = new int[maxItemID + 1]; // minMIS = 1; // int LSRelative = (int) Math.ceil(LS * transactionCount); // for (Entry<Integer, Integer> entry : mapSupport.entrySet()) { // // calculate the MIS value // MIS[entry.getKey()] = (int) (beta * entry.getValue()); // if (MIS[entry.getKey()] < LSRelative) { // MIS[entry.getKey()] = LSRelative; // }// if // if (MIS[entry.getKey()] < minMIS) { // minMIS = MIS[entry.getKey()]; // }// if // }// for // // return minMIS; // } // // // end az------------------------------------------------------------------- /** * This method mines pattern from a Prefix-Tree recursively * * @param tree The Prefix Tree * @param prefix The current prefix "alpha" * @param mapSupport The frequency of each item in the prefix tree. * @throws IOException exception if error writing the output file. */ private void cfpgrowth(MISTree tree, int[] prefixAlpha, int prefixSupport, Map<Integer, Integer> mapSupport) throws IOException { // String test = ""; // for(int item : prefixAlpha){ // test += item + " "; // } // System.out.println(test); // We check if there is only one item in the header table if (tree.headerList.size() == 1) { MISNode node = tree.mapItemNodes.get(tree.headerList.get(0)); // If this node has no child if (node.nodeLink == null) { // If the support of this node is higher than the MIS of the first item // of the current prefix alpha if (node.counter >= MIS[prefixAlpha[0]]) { //write the itemset to the output file writeItemsetToFile(prefixAlpha, node.itemID, node.counter); } // end of code that i moved } else { // recursive call cfpgrowthMoreThanOnePath(tree, prefixAlpha, prefixSupport, mapSupport); } } else { // There is more than one path, recursive call cfpgrowthMoreThanOnePath(tree, prefixAlpha, prefixSupport, mapSupport); } } /** * Mine an FP-Tree having more than one path. * * @param tree the FP-tree * @param prefix the current prefix, named "alpha" * @param mapSupport the frequency of items in the FP-Tree * @throws IOException exception if error writing the file */ private void cfpgrowthMoreThanOnePath(MISTree tree, int[] prefixAlpha, int prefixSupport, Map<Integer, Integer> mapSupport) throws IOException { // We process each frequent item in the header table list of the tree in // reverse order. for (int i = tree.headerList.size() - 1; i >= 0; i--) { // get the item and its support Integer item = tree.headerList.get(i); int support = mapSupport.get(item); // if the item is not frequent, we skip it int mis = (prefixAlpha.length == 0) ? MIS[item] : MIS[prefixAlpha[0]]; // pfv if (support < mis) continue; // Let's Beta be the concatenation of Alpha with the current item int betaSupport = (prefixSupport < support) ? prefixSupport : support; // az // int mis = (prefixAlpha.length == 0) ? MIS[item] : // MIS[prefixAlpha[0]]; // pfv // if the support is higher than the MIS if (support >= mis) { // save the itemset to the file writeItemsetToFile(prefixAlpha, item, betaSupport); } // === Construct beta's conditional pattern base === // It is a subdatabase which consists of the set of prefix paths // in the FP-tree co-occuring with the suffix pattern. List<List<MISNode>> prefixPaths = new ArrayList<List<MISNode>>(); MISNode path = tree.mapItemNodes.get(item); // for each path while (path != null) { // if the path is not just the root node if (path.parent.itemID != -1) { // create the prefixpath List<MISNode> prefixPath = new ArrayList<MISNode>(); // add this node. prefixPath.add(path); // NOTE: we add it just to keep its support, // actually it should not be part of the prefixPath // Recursively add all the parents of this node. MISNode parent = path.parent; while (parent.itemID != -1) { prefixPath.add(parent); parent = parent.parent; } prefixPaths.add(prefixPath); } // We will look for the next prefixpath path = path.nodeLink; } // (A) Calculate the frequency of each item in the prefixpath Map<Integer, Integer> mapSupportBeta = new HashMap<Integer, Integer>(); // for each prefixpath for (List<MISNode> prefixPath : prefixPaths) { // the support of the prefixpath is the support of its first // node. int pathCount = prefixPath.get(0).counter; // for each node, except the first one, we // count the frequency for (int j = 1; j < prefixPath.size(); j++) { // Get the node MISNode node = prefixPath.get(j); if (mapSupportBeta.get(node.itemID) == null) { mapSupportBeta.put(node.itemID, pathCount); } else { mapSupportBeta.put(node.itemID, mapSupportBeta.get(node.itemID) + pathCount); } } } // (B) Construct beta's conditional FP-Tree MISTree treeBeta = new MISTree(); // add each prefixpath in the FP-tree for (List<MISNode> prefixPath : prefixPaths) { treeBeta.addPrefixPath(prefixPath, mapSupportBeta, minMIS); } // create the header list treeBeta.createHeaderList(itemComparator); // System.out.println(); // treeBeta.print(treeBeta.root); // Mine recursively the Beta tree. if (treeBeta.root.childs.size() > 0) { // create beta int[] beta = new int[prefixAlpha.length + 1]; System.arraycopy(prefixAlpha, 0, beta, 0, prefixAlpha.length); beta[prefixAlpha.length] = item; // recursive call to the main method to mine the conditional tree cfpgrowth(treeBeta, beta, betaSupport, mapSupportBeta); } } } /** * Write a frequent itemset that is found to the output file. * @param itemset an itemset * @param lastItem an item that should be appended to the itemset * @param support the support of "itemset" + "item". */ private void writeItemsetToFile(int[] itemset, int lastItem, int support) throws IOException { // increase the number of frequent itemsets found itemsetCount++; // if the result should be saved to a file if(writer != null){ // Create a string buffer StringBuilder buffer = new StringBuilder(); // write the items of the itemset for(int i=0; i< itemset.length; i++){ buffer.append(itemset[i]); buffer.append(' '); } buffer.append(lastItem); // Then, write the support buffer.append(" #SUP: "); buffer.append(support); // write to file and create a new line writer.write(buffer.toString()); writer.newLine(); }// otherwise the result is kept into memory else{ // concatenate the last item to the itemset int [] itemsetWithLastItem = new int[itemset.length+1]; System.arraycopy(itemset, 0, itemsetWithLastItem, 0, itemset.length); itemsetWithLastItem[itemset.length] = lastItem; Arrays.sort(itemsetWithLastItem); // ADDED TO FIX ASSOCIATION RULE BUG FOR CFPGROWTH+ // create an object Itemset and add it to the set of patterns // found. Itemset itemsetObj = new Itemset(itemsetWithLastItem); itemsetObj.setAbsoluteSupport(support); patterns.addItemset(itemsetObj, itemsetObj.size()); } } /** * Print statistics about the algorithm execution to System.out. */ public void printStats() { System.out.println("============= CFP-GROWTH++ - STATS ============="); long temps = endTime - startTimestamp; System.out.println(" Transactions count from database : " + transactionCount); System.out.print(" Max memory usage: " + memoryLogger.getMaxMemory() + " mb \n"); System.out.println(" Frequent itemsets count : " + itemsetCount); System.out.println(" Total time ~ " + temps + " ms"); System.out .println("==================================================="); } /** * Get the number of transactions in the last transaction database read. * @return the number of transactions. */ public int getDatabaseSize() { return transactionCount; } }