package ca.pfv.spmf.algorithms.associationrules.agrawal94_association_rules; /* This file is copyright (c) 2008-2015Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import ca.pfv.spmf.algorithms.ArraysAlgos; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets; /** * This is an implementation of the "faster algorithm" for generating association rules, * described in Agrawal & * al. 1994, IBM Research Report RJ9839, June 1994. * <br/><br/> * * This implementation saves the result to a file * or can alternatively keep it into memory if no output * path is provided by the user when the runAlgorithm() * method is called. * * @see AssocRule * @see AssocRules * @author Philippe Fournier-Viger **/ public class AlgoAgrawalFaster94 { // the frequent itemsets that will be used to generate the rules private Itemsets patterns; // variable used to store the result if the user choose to save // the result in memory rather than to an output file protected AssocRules rules; // object to write the output file if the user wish to write to a file protected BufferedWriter writer = null; // for statistics protected long startTimestamp = 0; // last execution start time protected long endTimeStamp = 0; // last execution end time protected int ruleCount = 0; // number of rules generated protected int databaseSize = 0; // number of transactions in database // parameters protected double minconf; protected double minlift; protected boolean usingLift = true; /** * Default constructor */ public AlgoAgrawalFaster94(){ } /** * Run the algorithm * @param patterns a set of frequent itemsets * @param output an output file path for writing the result or null if the user want this method to return the result * @param databaseSize the number of transactions in the database * @param minconf the minconf threshold * @return the set of association rules if the user wished to save them into memory * @throws IOException exception if error writing to the output file */ public AssocRules runAlgorithm(Itemsets patterns, String output, int databaseSize, double minconf) throws IOException { // save the parameters this.minconf = minconf; this.minlift = 0; usingLift = false; // start the algorithm return runAlgorithm(patterns, output, databaseSize); } /** * Run the algorithm * @param patterns a set of frequent itemsets * @param output an output file path for writing the result or null if the user want this method to return the result * @param databaseSize the number of transactions in the database * @param minconf the minconf threshold * @param minlift the minlift threshold * @return the set of association rules if the user wished to save them into memory * @throws IOException exception if error writing to the output file */ public AssocRules runAlgorithm(Itemsets patterns, String output, int databaseSize, double minconf, double minlift) throws IOException { // save the parameters this.minconf = minconf; this.minlift = minlift; usingLift = true; // start the algorithm return runAlgorithm(patterns, output, databaseSize); } /** * Run the algorithm for generating association rules from a set of itemsets. * @param patterns the set of itemsets * @param output the output file path. If null the result is saved in memory and returned by the method. * @param databaseSize the number of transactions in the original database * @return the set of rules found if the user chose to save the result to memory * @throws IOException exception if error while writting to file */ private AssocRules runAlgorithm(Itemsets patterns, String output, int databaseSize) throws IOException { // if the user want to keep the result into memory if(output == null){ writer = null; rules = new AssocRules("ASSOCIATION RULES"); }else{ // if the user want to save the result to a file rules = null; writer = new BufferedWriter(new FileWriter(output)); } this.databaseSize = databaseSize; // record the time when the algorithm starts startTimestamp = System.currentTimeMillis(); // initialize variable to count the number of rules found ruleCount = 0; // save itemsets in a member variable this.patterns = patterns; // SORTING // First, we sort all itemsets having the same size by lexical order // We do this for optimization purposes. If the itemsets are sorted, it allows to // perform two optimizations: // 1) When we need to calculate the support of an itemset (in the method // "calculateSupport()") we can use a binary search instead of browsing the whole list. // 2) When combining itemsets to generate candidate, we can use the // lexical order to avoid comparisons (in the method "generateCandidates()"). // For itemsets of the same size for(List<Itemset> itemsetsSameSize : patterns.getLevels()){ // Sort by lexicographical order using a Comparator Collections.sort(itemsetsSameSize, new Comparator<Itemset>() { @Override public int compare(Itemset o1, Itemset o2) { // The following code assume that itemsets are the same size return ArraysAlgos.comparatorItemsetSameSize.compare(o1.getItems(), o2.getItems()); } }); } // END OF SORTING // Now we will generate the rules. // For each frequent itemset of size >=2 that we will name "lk" for (int k = 2; k < patterns.getLevels().size(); k++) { for (Itemset lk : patterns.getLevels().get(k)) { // create a variable H1 for recursion List<int[]> H1_for_recursion = new ArrayList<int[]>(); // For each itemset "itemsetSize1" of size 1 that is member of lk for(int item : lk.getItems()) { int itemsetHm_P_1[] = new int[] {item}; // make a copy of lk without items from hm_P_1 int[] itemset_Lk_minus_hm_P_1 = ArraysAlgos.cloneItemSetMinusOneItem(lk.getItems(), item); // Now we will calculate the support and confidence // of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1 int support = calculateSupport(itemset_Lk_minus_hm_P_1); // THIS COULD BE // OPTIMIZED ? double supportAsDouble = (double) support; // calculate the confidence of the rule : itemset_Lk_minus_hm_P_1 ==> hm_P_1 double conf = lk.getAbsoluteSupport() / supportAsDouble; // if the confidence is lower than minconf if(conf < minconf || Double.isInfinite(conf)){ continue; } double lift = 0; int supportHm_P_1 = 0; // if the user is using the minlift threshold, we will need // to also calculate the lift of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1 if(usingLift){ // if we want to calculate the lift, we need the support of hm_P_1 supportHm_P_1 = calculateSupport(itemsetHm_P_1); // if we want to calculate the lift, we need to add this. // calculate the lift double term1 = ((double)lk.getAbsoluteSupport()) /databaseSize; double term2 = supportAsDouble /databaseSize; double term3 = ((double)supportHm_P_1 / databaseSize); lift = term1 / (term2 * term3); // if the lift is not enough if(lift < minlift){ continue; } } // If we are here, it means that the rule respect the minconf and minlift parameters. // Therefore, we output the rule. saveRule(itemset_Lk_minus_hm_P_1, support, itemsetHm_P_1, supportHm_P_1, lk.getAbsoluteSupport(), conf, lift); // Then we keep the itemset hm_P_1 to find more rules using this itemset and lk. H1_for_recursion.add(itemsetHm_P_1); // ================ END OF WHAT I HAVE ADDED } // Finally, we make a recursive call to continue explores rules that can be made with "lk" apGenrules(k, 1, lk, H1_for_recursion); } } // close the file if we saved the result to a file if(writer != null){ writer.close(); } // record the end time of the algorithm execution endTimeStamp = System.currentTimeMillis(); // Return the rules found if the user chose to save the result to memory rather than a file. // Otherwise, null will be returned return rules; } /** * The ApGenRules as described in p.14 of the paper by Agrawal. * (see the Agrawal paper for more details). * @param k the size of the first itemset used to generate rules * @param m the recursive depth of the call to this method (first time 1, then 2...) * @param lk the itemset that is used to generate rules * @param Hm a set of itemsets that can be used with lk to generate rules * @throws IOException exception if error while writing output file */ private void apGenrules(int k, int m, Itemset lk, List<int[]> Hm) throws IOException { // if the itemset "lk" that is used to generate rules is larger than the size of itemsets in "Hm" if (k > m + 1) { // Create a list that we will be used to store itemsets for the recursive call List<int[]> Hm_plus_1_for_recursion = new ArrayList<int[]>(); // generate candidates using Hm List<int[]> Hm_plus_1 = generateCandidateSizeK(Hm); // for each such candidates for (int[] hm_P_1 : Hm_plus_1) { // We subtract the candidate from the itemset "lk" int[] itemset_Lk_minus_hm_P_1 = ArraysAlgos.cloneItemSetMinusAnItemset(lk.getItems(), hm_P_1); // We will now calculate the support of the rule Lk/(hm_P_1) ==> hm_P_1 // we need it to calculate the confidence int support = calculateSupport(itemset_Lk_minus_hm_P_1); double supportAsDouble = (double)support; // calculate the confidence of the rule Lk/(hm_P_1) ==> hm_P_1 double conf = lk.getAbsoluteSupport() / supportAsDouble; // if the confidence is not enough than we don't need to consider // the rule Lk/(hm_P_1) ==> hm_P_1 anymore so we continue if(conf < minconf || Double.isInfinite(conf)){ continue; } double lift = 0; int supportHm_P_1 = 0; // if the user is using the minlift threshold, then we will need to calculate the lift of the // rule as well and check if the lift is higher or equal to minlift. if(usingLift){ // if we want to calculate the lift, we need the support of Hm+1 supportHm_P_1 = calculateSupport(hm_P_1); // calculate the lift of the rule: Lk/(hm_P_1) ==> hm_P_1 double term1 = ((double)lk.getAbsoluteSupport()) /databaseSize; double term2 = (supportAsDouble) /databaseSize; lift = term1 / (term2 * ((double)supportHm_P_1 / databaseSize)); // if the lift is not enough if(lift < minlift){ continue; } } // The rule has passed the confidence and lift threshold requirements, // so we can output it saveRule(itemset_Lk_minus_hm_P_1, support, hm_P_1, supportHm_P_1, lk.getAbsoluteSupport(), conf, lift); // if k == m+1, then we cannot explore further rules using Lk since Lk will be too small. if(k != m+1) { Hm_plus_1_for_recursion.add(hm_P_1); } } // recursive call to apGenRules to find more rules using "lk" apGenrules(k, m + 1, lk, Hm_plus_1_for_recursion); } } /** * Calculate the support of an itemset by looking at the frequent patterns * of the same size. * Because patterns are sorted by lexical order, we use a binary search. * This is MUCH MORE efficient than just browsing the full list of patterns. * * @param itemset the itemset. * @return the support of the itemset */ private int calculateSupport(int[] itemset) { // We first get the list of patterns having the same size as "itemset" List<Itemset> patternsSameSize = patterns.getLevels().get(itemset.length); // // We perform a binary search to find the position of itemset in this list int first = 0; int last = patternsSameSize.size() - 1; while( first <= last ) { int middle = ( first + last ) >>1 ; // >>1 means to divide by 2 int[] itemsetMiddle = patternsSameSize.get(middle).getItems(); int comparison = ArraysAlgos.comparatorItemsetSameSize.compare(itemset, itemsetMiddle); if(comparison > 0 ){ first = middle + 1; // the itemset compared is larger than the subset according to the lexical order } else if(comparison < 0 ){ last = middle - 1; // the itemset compared is smaller than the subset is smaller according to the lexical order } else{ // we have found the itemset, so we return its support. return patternsSameSize.get(middle).getAbsoluteSupport(); } } // The following line will not happen because in the context of this algorithm, we will // always search for itemsets that are frequent and thus will be in the list of patterns. // We just put the following line to avoid compilation error and detect if the error if this // case was ever to happen. return 0; // throw new RuntimeException("INVALID SUPPORT - THIS SHOULD NOT HAPPEN BECAUSE ALL ITEMSETS HAVE TO BE FREQUENT"); } /** * Generating candidate itemsets of size k from frequent itemsets of size * k-1. This is called "apriori-gen" in the paper by agrawal. This method is * also used by the Apriori algorithm for generating candidates. * Note that this method is very optimized. It assumed that the list of * itemsets received as parameter are lexically ordered. * * @param levelK_1 a set of itemsets of size k-1 * @return a set of candidates */ protected List<int[]> generateCandidateSizeK(List<int[]> levelK_1) { // create a variable to store candidates List<int[]> candidates = new ArrayList<int[]>(); // For each itemset I1 and I2 of level k-1 loop1: for (int i = 0; i < levelK_1.size(); i++) { int[] itemset1 = levelK_1.get(i); loop2: for (int j = i + 1; j < levelK_1.size(); j++) { int[] itemset2 = levelK_1.get(j); // we compare items of itemset1 and itemset2. // If they have all the same k-1 items and the last item of // itemset1 is smaller than // the last item of itemset2, we will combine them to generate a // candidate for (int k = 0; k < itemset1.length; k++) { // if they are the last items if (k == itemset1.length - 1) { // the one from itemset1 should be smaller (lexical // order) // and different from the one of itemset2 if (itemset1[k] >= itemset2[k]) { continue loop1; } } // if they are not the last items, and else if (itemset1[k] < itemset2[k]) { continue loop2; // we continue searching } else if (itemset1[k] > itemset2[k]) { continue loop1; // we stop searching: because of lexical // order } } // Create a new candidate by combining itemset1 and itemset2 int lastItem1 = itemset1[itemset1.length -1]; int lastItem2 = itemset2[itemset2.length -1]; int newItemset[]; if(lastItem1 < lastItem2) { // Create a new candidate by combining itemset1 and itemset2 newItemset = new int[itemset1.length+1]; System.arraycopy(itemset1, 0, newItemset, 0, itemset1.length); newItemset[itemset1.length] = lastItem2; candidates.add(newItemset); }else { // Create a new candidate by combining itemset1 and itemset2 newItemset = new int[itemset1.length+1]; System.arraycopy(itemset2, 0, newItemset, 0, itemset2.length); newItemset[itemset2.length] = lastItem1; candidates.add(newItemset); } } } // return the set of candidates return candidates; } /** * Print statistics about the algorithm execution to System.out. */ public void printStats() { System.out.println("============= ASSOCIATION RULE GENERATION v0.96f- STATS ============="); System.out.println(" Number of association rules generated : " + ruleCount); System.out.println(" Total time ~ " + (endTimeStamp - startTimestamp) + " ms"); System.out.println("==================================================="); } /** * Save a rule to the output file or in memory depending * if the user has provided an output file path or not * @param itemset1 left itemset of the rule * @param supportItemset1 the support of itemset1 if known * @param itemset2 right itemset of the rule * @param supportItemset2 the support of itemset2 if known * @param absoluteSupport support of the rule * @param conf confidence of the rule * @param lift lift of the rule * @throws IOException exception if error writing the output file */ protected void saveRule(int[] itemset1, int supportItemset1, int[] itemset2, int supportItemset2, int absoluteSupport, double conf, double lift) throws IOException { ruleCount++; // if the result should be saved to a file if(writer != null){ StringBuilder buffer = new StringBuilder(); // write itemset 1 for (int i = 0; i < itemset1.length; i++) { buffer.append(itemset1[i]); if (i != itemset1.length - 1) { buffer.append(" "); } } // write separator buffer.append(" ==> "); // write itemset 2 for (int i = 0; i < itemset2.length; i++) { buffer.append(itemset2[i]); if (i != itemset2.length - 1) { buffer.append(" "); } } // write separator buffer.append(" #SUP: "); // write support buffer.append(absoluteSupport); // write separator buffer.append(" #CONF: "); // write confidence buffer.append(doubleToString(conf)); if(usingLift){ buffer.append(" #LIFT: "); buffer.append(doubleToString(lift)); } writer.write(buffer.toString()); writer.newLine(); }// otherwise the result is kept into memory else{ rules.addRule(new AssocRule(itemset1, itemset2, supportItemset1, absoluteSupport, conf, lift)); } } /** * Convert a double value to a string with only five decimal * @param value a double value * @return a string */ String doubleToString(double value) { // convert it to a string with two decimals DecimalFormat format = new DecimalFormat(); format.setMinimumFractionDigits(0); format.setMaximumFractionDigits(5); return format.format(value); } }