package ca.pfv.spmf.algorithms.frequentpatterns.relim; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import ca.pfv.spmf.tools.MemoryLogger; /** * This is an implementation of the RELIM algorithm for mining frequent itemsets. RELIM is proposed by : * <br/><br/> * * Borgelt, C. (2005) Keeping Things Simple: Finding Frequent Item Sets by Recursive Elimination * Workshop Open Source Data Mining Software (OSDM'05, Chicago, IL), 66-70. * ACM Press, New York, NY, USA 2005<br/><br/> * * RELIM is not a very efficient frequent itemset mining algorithm, but I decided to implement it * because it is simple.<br/><br/> * * Note that it might not be implemented in a very optimized way. One reason is that in the original * article there is no pseudo-code for the algorithm. * * @see DatabaseStructureRelim * @author Philippe Fournier-Viger */ public class AlgoRelim { // for statistics private long startTimestamp; // the start time private long endTimestamp; // the end time private int relativeMinsupp; // the minimum support as a relative value (integer) // the array private int items[]; // object to write the result to a file BufferedWriter writer = null; // the number of frequent itemsets found (for // statistics) private int frequentCount; /** * Default constructor */ public AlgoRelim() { } /** * Run the algorithm * @param minsupp minimum support threshold * @param input the file path of the input file * @param output the file path of the desired output file * @throws IOException exception if error reading/writing files */ public void runAlgorithm(double minsupp, String input, String output) throws IOException { // record start time startTimestamp = System.currentTimeMillis(); // prepare output file writer = new BufferedWriter(new FileWriter(output)); // reset the number of itemsets found to 0 frequentCount = 0; // reset the utility for checking the memory usage MemoryLogger.getInstance().reset(); // reset the number of transactions to 0 int transactionCount =0; // (1) Scan the database and count the support of each item (in a map) // for this map : key = item value = tidset final Map<Integer, Integer> mapSupport = new HashMap<Integer, Integer>(); // scan the database BufferedReader reader = new BufferedReader(new FileReader(input)); String line; // for each line (transaction) until the end of file while (((line = reader.readLine()) != null)){ // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the transaction into items String[] lineSplited = line.split(" "); // for each item in the // transaction for (String itemString : lineSplited) { // convert item to integer Integer item = Integer.parseInt(itemString); // increase the support count of the item Integer count = mapSupport.get(item); if (count == null) { mapSupport.put(item, 1); } else { mapSupport.put(item, ++count); } } // increase transaction count transactionCount++; } // close the input file reader.close(); // transform the minimum support from absolute to relative value // by multiplying by the number of transactions this.relativeMinsupp = (int) Math.ceil(minsupp * transactionCount); // (2) Sort items by frequency and then lexical ordering // a list to store items List<Integer> listItems = new ArrayList<Integer>(); // for each item for(Entry<Integer,Integer> entry : mapSupport.entrySet()){ Integer item = entry.getKey(); // if it is frequent add it to the list if(mapSupport.get(item) >= relativeMinsupp){ listItems.add(item); } } // sort the list Collections.sort(listItems, new Comparator<Integer>(){ public int compare(Integer item1, Integer item2){ // compare the support int compare = mapSupport.get(item1) - mapSupport.get(item2); // if same support, use lexical order if(compare ==0){ return (item1- item2); } // otherwise, use the support return compare; } }); //(3) Create initial database structure // This array will contain the support of each item // position i = support of item i int supports[] = new int[listItems.size()]; // put all frequent items in an array items = new int[listItems.size()]; for(int i=0; i< listItems.size(); i++){ items[i] = listItems.get(i); } // create adatabase structure DatabaseStructureRelim initialDatabase = new DatabaseStructureRelim(supports); initialDatabase.initializeTransactions(); // insert transactions into initial database structure... reader = new BufferedReader(new FileReader(input)); // for each line (transaction) until the end of file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the transaction into items String[] lineSplited = line.split(" "); // create a list to store items of the transaction List<Integer> transaction = new ArrayList<Integer>(); //for each item for (String itemString : lineSplited) { // convert item to integer Integer item = Integer.parseInt(itemString); // if frequent add it to the transaction otherwise ignore it if(mapSupport.get(item) >= relativeMinsupp){ transaction.add(item); } } // if the transaction is empty, then we just ignore it if(transaction.size() ==0){ continue; } // Otherwise sort the transaction according to the frequency of items Collections.sort(transaction, new Comparator<Integer>(){ public int compare(Integer item1, Integer item2){ // first compare the support int compare = mapSupport.get(item1) - mapSupport.get(item2); // if equals then use the lexical order if(compare ==0){ return (item1 - item2); } //otherwise use the support return compare; } }); // increase the support of the first item of this transaciton int firstItem = transaction.get(0); int indexArray = listItems.indexOf(firstItem); supports[indexArray]++; // insert transaction in the data structure initialDatabase.transactions.get(indexArray).add(transaction.subList(1, transaction.size())); } // close the input file reader.close(); // (7) START RECURSION // call the recursive procedure to discover itemsets recursion(initialDatabase, new int[0]); // check the memory usage MemoryLogger.getInstance().checkMemory(); // close the output file writer.close(); // record end time endTimestamp = System.currentTimeMillis(); } /** * Recursive method for discovering frequent itemsets starting with a given prefix. * @param database the database structure * @param prefix the current prefix * @throws IOException exception if error writing to the output file */ private void recursion(DatabaseStructureRelim database, int[] prefix) throws IOException { // for each item for(int i=0; i< items.length; i++){ // if the support is higher than 0 if(database.supports[i] > 0 ){ // Check if frequent if(database.supports[i]>= relativeMinsupp){ // (1) add the frequent itemset to the set of frequent itemsets found! writeOut(items[i], prefix, database.supports[i]); } // for each transaction for this item database.supports[i] = 0; // empty list for i // create new prefix int[] newSupportPrefix = new int[database.supports.length]; // create new database structure for that prefix DatabaseStructureRelim databasePrefix = new DatabaseStructureRelim(newSupportPrefix); databasePrefix.initializeTransactions(); // for each transaction in the database for(List<Integer> transaction : database.transactions.get(i)){ // if the transaction is empty, then skip it if(transaction.size() == 0){ continue; } // Get the first item Integer firstItem = transaction.get(0); // find its position in the item array int index = getIndexOf(firstItem); // increase its support database.supports[index]++; // increase its support with respect to the new prefix newSupportPrefix[index]++; // if the transaction has more than two items if(transaction.size() >= 2){ // create sublist as described in the paper List<Integer> subList = transaction.subList(1, transaction.size()); // Get the database prefix databasePrefix.transactions.get(index).add(subList); // add the sublist at the item index database.transactions.get(index).add(subList); } } // Create the new prefix for recursion by appending the item at i int []newPrefix = new int[prefix.length+1]; System.arraycopy(prefix, 0, newPrefix, 0, prefix.length); newPrefix[prefix.length] = items[i]; // recursive call recursion(databasePrefix, newPrefix); } } // check the memory usage for statistics purpose MemoryLogger.getInstance().checkMemory(); } /** * Get the position of an item in the list of all items * @param item the item that is searched * @return the position (integer) or -1 if it is not there */ private int getIndexOf(int item){ // for each item for(int i=0; i < items.length; i++){ // if it is equal to the item that we search, return the position if(item == items[i]){ return i; } } // not found, then return -1 return -1; } /** * Write a frequent itemset to the output file. * @param prefix the itemset * @param item an item that should be appended to the itemset * @param support the support of the itemset with the item * @throws IOException exception if error while writing to the output file. */ private void writeOut(int item, int[]prefix, int support) throws IOException{ // increase the number of itemsets found frequentCount++; // create a string uffer StringBuilder buffer = new StringBuilder(); // add the item buffer.append(item); buffer.append(" "); // next add all other items from the itemset for (int i = 0; i < prefix.length; i++) { buffer.append(prefix[i]); if (i != prefix.length - 1) { buffer.append(' '); } } //Finally, write the support. buffer.append(" #SUP: "); buffer.append(support); writer.write(buffer.toString()); writer.newLine(); // create new line to be ready for next itemset } /** * Print statistics about the latest execution of the algorithm to System.out */ public void printStatistics() { System.out.println("========== RELIM - STATS ============"); System.out.println(" Number of frequent itemsets: " + frequentCount); System.out.println(" Total time ~: " + (endTimestamp - startTimestamp) + " ms"); System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory()); System.out.println("====================================="); } }