package ca.pfv.spmf.algorithms.frequentpatterns.MSApriori; /* This file is copyright (c) 2008-2013 Azadeh Soltani, Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; import ca.pfv.spmf.tools.MemoryLogger; /** * This is an implementation of the MSApriori algorithm as described by :<br/><br/> * * Bing Liu et al. (1999). Mining Association Rules with Multiple Minimum Supports, Proceedings of KDD 1999. * <br/><br/> * * This implementation was made by AZADEH SOLTANI based on the Apriori implementation by Philippe Fournier-Viger * * @see Itemset * @author Azadeh Soltani, Philippe Fournier-Viger */ public class AlgoMSApriori { // the current level in the apriori generation (itemsets of size k) protected int k; // the array of MIS value where the position i indicate the MIS of item with ID i. int MIS[]; // for statistics protected long startTimestamp; // start time of latest execution protected long endTimestamp; // end time of latest execution private int itemsetCount; // number of frequent itemsets generated // the LS value as an integer private int LSRelative; // an in-memory representation of the transaction database // where position i represents the ith transaction as an integer array private List<Integer[]> database = null; // the comparator that is used to sort items by MIS values final Comparator<Integer> itemComparator; // object to write the output file BufferedWriter writer = null; /** * Constructor */ public AlgoMSApriori() { itemComparator = new Comparator<Integer>() { public int compare(Integer o1, Integer o2) { // first compare by MIS values int compare = MIS[o1] - MIS[o2]; //if the same MIS, we check the lexical ordering! if(compare ==0){ // return (o1 - o2); } // otherwise use MIS value return compare; } }; } /** * Run the algorithm * @param input an input file containing a transaction database * @param output an output file path for writing the result * @param beta the parameter Beta for generating MIS values for all items (see paper) * @param LS the parameter LS for generating MIS values for all items (see paper) * @throws IOException exception if error while writing the output file */ public void runAlgorithm(String input, String output, double beta, double LS) throws IOException { // record the start time startTimestamp = System.currentTimeMillis(); // Prepare for writing the output file writer = new BufferedWriter(new FileWriter(output)); // Reset the number of itemset found to 0 itemsetCount = 0; // reseter the utility for recording the max memory usage MemoryLogger.getInstance().reset(); // variable to store the maximum item id in the database int maxItemID = -1; // pfv // the number of transaction in the database int transactionCount = 0; // map to count the support of each item // key: item value: support of the item Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>(); // the database in memory (intially empty) database = new ArrayList<Integer[]>(); // scan the database to load it into memory and count the support of // each single item at the same time BufferedReader reader = new BufferedReader(new FileReader(input)); String line; // for each line (transaction) until the end of the file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the line into items String[] lineSplited = line.split(" "); // create an array of integer to store the transaction in memory Integer transaction[] = new Integer[lineSplited.length]; // for each item in the transaction for (int i = 0; i < lineSplited.length; i++) { // convert the item to integer Integer item = Integer.parseInt(lineSplited[i]); // add it to the in memory transaction transaction[i] = item; // increase the support count of the item Integer count = mapItemCount.get(item); if (count == null) { mapItemCount.put(item, 1); // if this is the largest item ID encountered, then remember it if (item > maxItemID) { maxItemID = item; } } else { mapItemCount.put(item, ++count); } } // add the transaction to the in memory database database.add(transaction); // increase the transaction count transactionCount++; } // close the input file reader.close(); // initialize array for storing the MIS values for each item MIS = new int[maxItemID + 1]; // transform the LS value to a relative value by multiplying by // the number of transactions this.LSRelative = (int) Math.ceil(LS * transactionCount); // pfv // Start generating frequent itemsets of size 1 k = 1; // Create a set M to store all items List<Integer> M = new ArrayList<Integer>(); // for each item for (Entry<Integer, Integer> entry : mapItemCount.entrySet()) { // add the item to M M.add(entry.getKey()); // calculate the MIS value for that item by using the formula described in the paper MIS[entry.getKey()] = (int) (beta * entry.getValue()); // if the MIS value for that item is lower than LS, then set it to LS if (MIS[entry.getKey()] < LSRelative){ MIS[entry.getKey()] = LSRelative; } //if the support of the item is higher than its MIS value if (entry.getValue() >= MIS[entry.getKey()]) { // save the item to the output file with its support saveItemsetToFile(entry.getKey(), entry.getValue()); } } // sort the list of items by MIS order Collections.sort(M, itemComparator); //pfv // if no frequent item was found, we stop there! if (itemsetCount == 0) { return; } // create the set F (as described in the paper) List<Integer> F = new ArrayList<Integer>(); // this variable will be used to store the smallest MIS value higher // such that the corresponding item has a support no less than it double minMIS = -1; int i; // for each item for (i = 0; i < M.size(); i++) { Integer item = M.get(i); //if its support is higher or = to its MIS value if (mapItemCount.get(item) >= MIS[item]) { // add it to F F.add(item); // set the min MIS to this value minMIS = MIS[item]; break; // break }// if }// for // for each folowing item in M for (i++; i < M.size(); i++) { Integer item = M.get(i); // if it has a support higher or equal to his MIS value if (mapItemCount.get(item) >= minMIS){ // add it to F F.add(item); } }// forj // sort the database by MIS order for (Integer[] transaction : database) { Arrays.sort(transaction, itemComparator); } // Now, the algorithm will discover itemset of size k > 1 starting from k=2 List<Itemset> level = null; k = 2; // Generate candidates and test them for k>1 by inscreasing k at each iteration // until no candidates can be generated do { // check the memory usage MemoryLogger.getInstance().checkMemory(); // Generate candidates of size K List<Itemset> candidatesK; // Generate candidates if (k == 2) { // if k=2 we use an optimization for candidate generation candidatesK = generateCandidate2(F, mapItemCount); } else { // otherwise, we use the general procedure for candidate generation candidatesK = generateCandidateSizeK(level); } // We scan the database one time to calculate the support // of each candidates and keep those with higher suport. // for each transaction for (Integer[] transaction : database) { // for each candidate loopCand: for (Itemset candidate : candidatesK) { // We will check if the candidate is contained in the transaction by // looking for each item one by one starting from pos =0. int pos = 0; // for each item in the transaction for (int item : transaction) { // if we have found the item at position pos if (item == candidate.get(pos)) { // search the next item from the candidate pos++; // if all items have been found if (pos == candidate.size()) { // then increase the support count of the candidate candidate.increaseTransactionCount(); continue loopCand; } // because of the total order, if the item at position pos is larger // than the current item in the transaction we can stop checking // this candidate because it will not be contained in the transaction. } else if (itemComparator.compare(item, candidate.get(pos)) > 0){ // pfv continue loopCand; } } } } // We build the level k+1 with all the candidates that have // a support higher than MIS[0] level = new ArrayList<Itemset>(); // for each candidate for (Itemset candidate : candidatesK) { // if its support is higher than the MIS of the first item // (because they are sorted by MIS order) if (candidate.getAbsoluteSupport() >= MIS[candidate.get(0)]) { // add it to the next level of candidate level.add(candidate); // save the itemset to the file saveItemsetToFile(candidate); } } k++; } while (level.isEmpty() == false); // record the end time endTimestamp = System.currentTimeMillis(); // check the memory usage MemoryLogger.getInstance().checkMemory(); // close the output file writer.close(); } /** * Generate candidates of size 2 by using frequent itemsets of size 1. * @param frequent1 frequent itemsets of size 1 (single items) * @param mapItemCount a map indicating the support of each item (key: item, value: support) * @return the set of candidates of size 2 */ private List<Itemset> generateCandidate2(List<Integer> frequent1, Map<Integer, Integer> mapItemCount) { // list to store the candidates List<Itemset> candidates = new ArrayList<Itemset>(); // For each itemset I1 and I2 of level k-1 for (int i = 0; i < frequent1.size(); i++) { Integer item1 = frequent1.get(i); for (int j = i + 1; j < frequent1.size(); j++) { Integer item2 = frequent1.get(j); // Create a new candidate by combining itemset1 and itemset2 candidates.add(new Itemset(new int[] { item1, item2 })); } } // return the set of candidates return candidates; } /** * Generate candidates of size K by using frequent itemsets of size K-1. * @return the set of candidates of size K */ protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1) { // list to store the candidates generated List<Itemset> candidates = new ArrayList<Itemset>(); // For each itemset I1 and I2 of level k-1 loop1: for (int i = 0; i < levelK_1.size(); i++) { int[] itemset1 = levelK_1.get(i).getItems(); loop2: for (int j = i + 1; j < levelK_1.size(); j++) { int[] itemset2 = levelK_1.get(j).getItems(); // we compare items of itemset1 and itemset2. // If they have all the same k-1 items and the last item of // itemset1 is smaller than the last item of itemset2, we will combine them to generate a // candidate // for each item in itemset1 and itemset2 at position k for (int k = 0; k < itemset1.length; k++) { // if it is the last position if (k == itemset1.length - 1) { // the item from itemset1 should be smaller (lexical // order) // and different from the one of itemset2 if(itemComparator.compare(itemset1[k], itemset2[k]) > 0){ // if not then continue the loop continue loop1; } } // if they are not the last items, and else if (itemset1[k] != itemset2[k]){ // the item from itemset 1 is smaller than the one from itemset2 if(itemComparator.compare(itemset1[k], itemset2[k]) < 0){ // pfv continue loop2; // we continue searching } // the item from itemset 1 is larger than the one from itemset2 else if (itemComparator.compare(itemset1[k], itemset2[k]) > 0) { // pfv continue loop1; // we stop searching: because of MIS order } } } // Create a new candidate by combining itemset1 and itemset2 int newItemset[] = new int[itemset1.length + 1]; System.arraycopy(itemset1, 0, newItemset, 0, itemset1.length); newItemset[itemset1.length] = itemset2[itemset2.length - 1]; // The candidate is tested to see if its subsets of size k-1 are // included in level k-1 (to check if they are frequent). if (allSubsetsOfSizeK_1AreFrequent(newItemset, levelK_1)) { // if yes, then add to candidates candidates.add(new Itemset(newItemset)); } } } // return candidates return candidates; } // -------------------------------------------------------------------------------------------- protected boolean allSubsetsOfSizeK_1AreFrequent(int[] c, List<Itemset> levelK_1) { // generate all subsets by always each item from the candidate, one by // one for (int posRemoved = 0; posRemoved < c.length; posRemoved++) { // az ****************************** // if it does not contain first item of candidate and // MIS(c[0])!=MIS(c[1]) there is no need to check if ((posRemoved == 0) && MIS[c[0]] != MIS[c[1]]) { continue; } // end az****************************** // the binary search // perform a binary search to check if the subset appears in level // k-1. int first = 0; int last = levelK_1.size() - 1; boolean found = false; // the binary search while( first <= last ) { int middle = ( first + last ) >>> 1; // divide by 2 if(sameAs(levelK_1.get(middle), c, posRemoved) < 0 ){ first = middle + 1; // the itemset compared is larger than the subset according to the lexical order } else if(sameAs(levelK_1.get(middle), c, posRemoved) > 0 ){ last = middle - 1; // the itemset compared is smaller than the subset is smaller according to the lexical order } else{ found = true; // we have found it so we stop break; } } if (found == false) { // if we did not find it, that means that // candidate is not a frequent itemset // because // at least one of its subsets does not appear in level k-1. return false; } } return true; } /** * Method to check if two itemsets are equals * @param itemset the first itemset * @param candidate the second itemset * @param postRemoved a position that should be ignored from itemset "candidate" for the comparison * @return 0 if they are the same, <0 if "itemset" is smaller than candidate according to the MIS order, otherwise >0 */ private int sameAs(Itemset itemset, int[] candidate, int posRemoved) { // j will be position of the item that we are searching from "candidate" // in "itemset" int j = 0; // for each item in "itemset" for (int i = 0; i < itemset.size(); i++) { // if it is the position that should be ignored, then skip it if (j == posRemoved) { j++; } // if we have found the item at position j if (itemset.get(i) == candidate[j]) { // then we will search for the next one j++; }else{ // they are different so use the comparator to compare the two // items according to the MIS order return itemComparator.compare(itemset.get(i), candidate[j]); // pfv } } // they are the same! return 0; } /** * Save an itemset to the output file */ private void saveItemsetToFile(Itemset itemset) throws IOException { // write the itemset with its support count writer.write(itemset.toString() + " #SUP: " + itemset.getAbsoluteSupport()); writer.newLine(); // increase the number of itemsets found itemsetCount++; } /** * Save a frequent item to the output file */ private void saveItemsetToFile(Integer item, Integer support) throws IOException { // write the item with its support writer.write(item + " #SUP: " + support); writer.newLine(); // increase the number of itemsets found itemsetCount++; } /** * Print statistics about the latest execution of the algorithm to System.out. */ public void printStats() { System.out.println("============= MSAPRIORI - STATS ============="); System.out.println(" The algorithm stopped at level " + (k - 1) + ", because there is no candidate"); System.out.println(" Frequent itemsets count : " + itemsetCount); System.out.println(" Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb"); System.out.println(" Total time ~ " + (endTimestamp - startTimestamp) + " ms"); System.out .println("==================================================="); } }