package ca.pfv.spmf.algorithms.frequentpatterns.hmine;

/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import ca.pfv.spmf.tools.MemoryLogger;

/**
 * An implementation of the HMine algorithm for mining frequent itemsets from a
 * transaction database.<br/><br/>
 *
 * It is based on the description in:<br/><br/>
 *
 * Pei et al. (2007) H-Mine: Fast and space-preserving frequent pattern mining
 * in large databases. IIE Transactions, 39, 593-605.<br/><br/>
 *
 * I have tried to follow the description of HMine(mem) in the article as
 * closely as possible. One observation is that the links for an item in the
 * header table are simply what is called a "tid set" in some other algorithms,
 * because links always point to the first element of a transaction. The
 * algorithm is thus simpler than I first thought.
 *
 * @author Philippe Fournier-Viger
 */
public class AlgoHMine {

	// the minimum support threshold chosen by the user
	private int minsup;

	// object to write the output file
	BufferedWriter writer = null;

	// the number of frequent itemsets found (for statistics)
	private int frequentCount;

	// the start time and end time of the last algorithm execution
	long startTimestamp;
	long endTimestamp;

	// in-memory database where each position in the main list is a
	// transaction represented as a list of integers
	List<List<Integer>> database;

	/**
	 * Default constructor
	 */
	public AlgoHMine() {
	}

	/**
	 * Run the algorithm.
	 * @param input the path of the input file (a transaction database)
	 * @param output the output file path for writing the result
	 * @param minsup the minimum support threshold
	 * @throws IOException exception if error while writing the file
	 */
	public void runAlgorithm(String input, String output, int minsup)
			throws IOException {
		// record the start time
		startTimestamp = System.currentTimeMillis();

		// create object for writing the output file
		writer = new BufferedWriter(new FileWriter(output));

		// reset the number of itemsets found
		frequentCount = 0;

		// reset the memory usage checking utility
		MemoryLogger.getInstance().reset();

		// remember the minimum support threshold set by the user
		this.minsup = minsup;

		// (1) Scan the database and count the support of each item.
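		// As a hypothetical running example, suppose that minsup = 2 and
		// that the input file contains the three transactions below (each
		// line is a transaction of space-separated item ids):
		//   1 2 4
		//   2 3
		//   1 2 3
		// This first scan would then produce the support counts
		// {1=2, 2=3, 3=2, 4=1}.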
		// The support of items is stored in a map where
		// key = item   value = support count
		Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>();

		// scan the database
		BufferedReader reader = new BufferedReader(new FileReader(input));
		String line;
		// for each line (transaction) until the end of the file
		while ((line = reader.readLine()) != null) {
			// if the line is a comment, is empty or is a kind of metadata
			if (line.isEmpty() || line.charAt(0) == '#'
					|| line.charAt(0) == '%' || line.charAt(0) == '@') {
				continue;
			}

			// split the line into items
			String[] lineSplited = line.split(" ");
			// for each item in the transaction
			for (String itemString : lineSplited) {
				// increase the support count of the item by 1
				Integer item = Integer.parseInt(itemString);
				Integer count = mapItemCount.get(item);
				if (count == null) {
					mapItemCount.put(item, 1);
				} else {
					mapItemCount.put(item, ++count);
				}
			}
		}
		// close the input file
		reader.close();

		// (2) Scan the database again to construct the in-memory database
		// without infrequent items and to record the tidset of each item.

		// Create the structure of the in-memory database
		database = new ArrayList<List<Integer>>();
		// Create a map for recording the tidset of each item
		// Key: item   Value: tidset as a list of integers
		Map<Integer, List<Integer>> mapItemTidset = new HashMap<Integer, List<Integer>>();
		// TODO: For optimization, we could use a treemap sorted by descending
		// order of support.

		// Read the file again
		BufferedReader reader2 = new BufferedReader(new FileReader(input));
		String line2;
		int tid = 0;
		// for each line (transaction) until the end of the file
		while ((line2 = reader2.readLine()) != null) {
			// if the line is a comment, is empty or is a kind of metadata
			// (note that this check must use "line2", the line read by this
			// second scan, not "line" from the first scan)
			if (line2.isEmpty() || line2.charAt(0) == '#'
					|| line2.charAt(0) == '%' || line2.charAt(0) == '@') {
				continue;
			}

			// split the transaction into items
			String[] lineSplited = line2.split(" ");
			// Create a transaction object for storing the items as integers
			List<Integer> transaction = new ArrayList<Integer>();
			// for each item in the transaction
			for (String itemString : lineSplited) {
				// convert the item to an integer
				Integer item = Integer.parseInt(itemString);
				// if the item is frequent
				if (mapItemCount.get(item) >= minsup) {
					// add the item to this transaction
					transaction.add(item);
					// update the tidset of the item by adding
					// the tid of the current transaction
					List<Integer> tidset = mapItemTidset.get(item);
					if (tidset == null) {
						tidset = new ArrayList<Integer>();
						mapItemTidset.put(item, tidset);
					}
					tidset.add(tid);
				}
			}
			// add the transaction to the in-memory database
			database.add(transaction);
			// increase the id of the current transaction by 1 to get the id
			// of the next transaction that will be read
			tid++;
		}
		// close the input file
		reader2.close();
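		// Continuing the hypothetical example above, item 4 is infrequent
		// (support 1 < minsup), so the in-memory database keeps only
		// [1, 2], [2, 3] and [1, 2, 3], and the recorded tidsets are
		// 1 -> {0, 2}, 2 -> {0, 1, 2} and 3 -> {1, 2}.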
		// (3) For each frequent item, save it to the output file, and then
		// call the HMINE recursive method.
		for (Entry<Integer, List<Integer>> entry : mapItemTidset.entrySet()) {
			// Create a prefix containing only that item
			int[] prefix = new int[1];
			prefix[0] = entry.getKey();
			// save that prefix to the output file
			writeOut(prefix, entry.getValue().size());
			// make a recursive call to grow that prefix to find
			// larger frequent itemsets
			hmine(prefix, entry.getKey(), entry.getValue());
		}

		// record the end time
		endTimestamp = System.currentTimeMillis();
		// close the output file
		writer.close();
	}

	/**
	 * This is the recursive procedure for growing a prefix to find larger
	 * frequent itemsets.
	 * @param prefix the prefix
	 * @param itemProjection the last item added to the prefix
	 * @param links a set of tids
	 * @throws IOException exception if error while writing the output file
	 */
	private void hmine(int[] prefix, Integer itemProjection, List<Integer> links)
			throws IOException {
		// Scan the projected database and calculate the links (tids) for each
		// item appearing after the item "itemProjection".
		// We will store these tidsets in a map
		// Key: item   Value: tidset as a list of integers
		Map<Integer, List<Integer>> mapItemTidset = new HashMap<Integer, List<Integer>>();

		// for each transaction containing the item, perform the projection
		for (Integer tid : links) {
			boolean seen = false;
			// for each item in that transaction
			for (Integer item : database.get(tid)) {
				// if we have seen "itemProjection" already
				if (seen) {
					// get the tidset of the current item and add
					// the tid of the current transaction to its tidset
					List<Integer> tidset = mapItemTidset.get(item);
					if (tidset == null) {
						tidset = new ArrayList<Integer>();
						mapItemTidset.put(item, tidset);
					}
					tidset.add(tid);
				}
				// if this is the item for the projection, we remember it
				if (itemProjection.equals(item)) {
					seen = true;
				}
			}
		}

		// For each item having the minimum support in the projected database,
		// save it to the output file, and then recursively call H-Mine.
		for (Entry<Integer, List<Integer>> entry : mapItemTidset.entrySet()) {
			// if the item is frequent
			if (entry.getValue().size() >= minsup) {
				// create a new prefix by appending the current item
				int[] newPrefix = new int[prefix.length + 1];
				System.arraycopy(prefix, 0, newPrefix, 0, prefix.length);
				newPrefix[prefix.length] = entry.getKey();
				// save the new prefix to the output file with its support
				// (the size of its tidset)
				writeOut(newPrefix, entry.getValue().size());
				// call the hmine procedure to recursively grow this prefix
				// to try to find larger frequent itemsets
				hmine(newPrefix, entry.getKey(), entry.getValue());
			}
		}
		// check the memory usage for statistics
		MemoryLogger.getInstance().checkMemory();
	}

	/**
	 * Write a frequent itemset to the output file.
	 * @param itemset the itemset
	 * @param support the support of the itemset
	 */
	private void writeOut(int[] itemset, int support) throws IOException {
		// increase the number of frequent itemsets found until now
		frequentCount++;
		// create a StringBuilder
		StringBuilder buffer = new StringBuilder();
		// append the items from the itemset to the StringBuilder
		for (int i = 0; i < itemset.length; i++) {
			buffer.append(itemset[i]);
			if (i != itemset.length - 1) {
				buffer.append(' ');
			}
		}
		// append the support of the itemset
		buffer.append(" #SUP: ");
		buffer.append(support);
		// write the StringBuilder to the file and create a new line
		// so that we are ready for writing the next itemset
		writer.write(buffer.toString());
		writer.newLine();
	}
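	// Note: continuing the hypothetical example above, the recursion would
	// write the following lines to the output file (in no particular order):
	//   1 #SUP: 2
	//   2 #SUP: 3
	//   3 #SUP: 2
	//   1 2 #SUP: 2
	//   2 3 #SUP: 2
	// The itemset {1, 3} is not written because its tidset in the projected
	// database of prefix [1] is {2}, whose size is below minsup.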
	/**
	 * Print statistics about the latest execution of the algorithm
	 * to System.out.
	 */
	public void printStatistics() {
		System.out.println("========== HMINE - STATS ============");
		System.out.println(" Number of frequent itemsets: " + frequentCount);
		System.out.println(" Total time ~: " + (endTimestamp - startTimestamp) + " ms");
		System.out.println(" Max memory: " + MemoryLogger.getInstance().getMaxMemory());
		System.out.println("=====================================");
	}
}
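// Example usage (a minimal sketch; the file paths are hypothetical, and SPMF
// normally invokes this class from a separate main/test class):
//
//   AlgoHMine algo = new AlgoHMine();
//   algo.runAlgorithm("input.txt", "output.txt", 2);
//   algo.printStatistics();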