package ca.pfv.spmf.algorithms.associationrules.Indirect; /* This file is copyright (c) 2008-2012 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset; /** * This is an implementation of the INDIRECT algorithm for generating indirect association rules. * <br/><br/> * * The implementation is based on the description in the book: * Tan, Steinbach & Kumar (2006) "Introduction to data mining", chapter 7, p. 469, Algorithm 7.2. * and the KDD 2000 paper by Tan et al. * <br/><br/> * * However, note that the algorithm is not exactly the same as what the authors did, because there * is not enough details in the original paper and in the book. To implement the algorithm, I therefore * had to make some choices based on what I tought what the best or easiest way to do it. 
 * <br/><br/>
 *
 * Also, note that instead of using the IS measure to compute the dependency between itemsets, I chose
 * to use the confidence. The confidence is easier to calculate.
 * <br/><br/>
 *
 * Also, note that there is some faster algorithm that exists for generating indirect association rules
 * that have been proposed after INDIRECT (but not implemented in SPMF).
 * <br/><br/>
 *
 * Lastly, note that in my implementation I use an AprioriTID like procedure for generating frequent itemsets that
 * are needed to generate indirect rules. However I do not save the frequent itemsets to file because we don't
 * need to keep them (we just want to generate the indirect rules).
 * <br/><br/>
 *
 * If you find some errors or have some ideas for optimization, please let me know by contacting me on my website.
 * <br/><br/>
 *
 * One possible optimization that I could do in the future would be to use BitSet instead of HashSet
 * to represent the tids sets.
 * <br/><br/>
 *
 * @author Philippe Fournier-Viger
 */
public class AlgoINDIRECT {

	// Map of: item -> set of the ids of the transactions that contain the item.
	// It is built during the single database scan performed by runAlgorithm().
	Map<Integer, Set<Integer>> mapItemTIDS = new HashMap<Integer, Set<Integer>>();

	// Parameters
	// minimum support converted to an absolute number of transactions
	int minSuppRelative;
	// minimum confidence threshold (a relative value, kept as given by the user)
	double minconf = 0;
	// "ts" threshold converted to an absolute number of transactions
	double tsRelative = 0;

	// for statistics
	long startTimestamp = 0; // start time of the last algorithm execution
	long endTimeStamp = 0; // end time of the last algorithm execution

	// object to write the result to disk
	BufferedWriter writer = null;

	// the number of indirect rules found by the last execution
	private int ruleCount;

	// the size of the database (number of transactions read)
	private int tidcount =0;

	/**
	 * Default constructor
	 */
	public AlgoINDIRECT() {
	}

	/**
	 * Run the algorithm.
	 * @param input the input file path
	 * @param output the output file path
	 * @param minsup the minimum support threshold (a relative value in [0,1])
	 * @param ts the ts threshold (a relative value in [0,1])
	 * @param minconf the minconf threshold (a relative value in [0,1])
	 * @throws IOException exception if there is an error while writing the output file.
*/ public void runAlgorithm(String input, String output, double minsup, double ts, double minconf) throws NumberFormatException, IOException { // record the algorithm start time startTimestamp = System.currentTimeMillis(); // create an object to write the output file writer = new BufferedWriter(new FileWriter(output)); this.minconf = minconf; // save minconf // (1) count the tid set of each item in the database in one database pass // To count, we use a map, where // key = item // value = set of transactions ids of transactions containing that item mapItemTIDS = new HashMap<Integer, Set<Integer>>(); // id item, count BufferedReader reader = new BufferedReader(new FileReader(input)); String line; tidcount=0; // variable used to know the line number that we are reading // for each line (transactions) of the input file while( ((line = reader.readLine())!= null)){ // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the line according to spaces String[] lineSplited = line.split(" "); // for each item on that line for(String stringItem : lineSplited){ // convert the item from string to int int item = Integer.parseInt(stringItem); // get the current tids set of this item Set<Integer> tids = mapItemTIDS.get(item); // if no set, create a new one if(tids == null){ tids = new HashSet<Integer>(); mapItemTIDS.put(item, tids); } // add the current transaction id to the set tids.add(tidcount); } tidcount++; // increase the transaction id } reader.close(); // close input file // Convert the absolute minimum support and absolute ts value // to relative value by multiplying by the number of transactions // in the transaction database. 
this.minSuppRelative = (int) Math.ceil(minsup * tidcount); this.tsRelative= (int) Math.ceil(ts * tidcount); // This algorithm use an Apriori-style generation (level by level) // To build level 1, we keep only the frequent items. int k=1; // create the variable to store itemset from level 1 List<Itemset> level = new ArrayList<Itemset>(); // For each item Iterator<Entry<Integer, Set<Integer>>> iterator = mapItemTIDS.entrySet().iterator(); while (iterator.hasNext()) { // If the current item is frequent Map.Entry<Integer, Set<Integer>> entry = (Map.Entry<Integer, Set<Integer>>) iterator.next(); if(entry.getValue().size() >= minSuppRelative){ // add the item to this level Integer item = entry.getKey(); Itemset itemset = new Itemset(item); itemset.setTIDs(mapItemTIDS.get(item)); level.add(itemset); }else{ // otherwise the item is not frequent we don't // need to keep it into memory. iterator.remove(); } } // Sort itemsets of size 1 according to lexicographical order. Collections.sort(level, new Comparator<Itemset>(){ public int compare(Itemset o1, Itemset o2) { return o1.get(0) - o2.get(0); } }); // Now we recursively find larger itemset to generate rules // starting from k = 2 and until there is no more candidates. k = 2; // While the level is not empty while (!level.isEmpty() ) { // We build the level k+1 with all the candidates that have // a support higher than the minsup threshold. level = generateCandidateSizeK(level, k); // We keep only the last level... k++; } // close the file writer.close(); endTimeStamp = System.currentTimeMillis(); } /** * Generate candidate of size K by using frequent itemsets of size k-1. * (this process is similar to Apriori). 
	 * @param levelK_1 frequent itemsets of size k-1
	 * @param level the value of k
	 * @return the list of candidates of size k
	 * @throws IOException exception if there is an error writing the file
	 */
	protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1, int level) throws IOException {
		// create an empty list to store the candidates
		List<Itemset> nextLevel = new ArrayList<Itemset>();

		// For each itemset I1 and I2 of level k-1
		loop1: for(int i=0; i< levelK_1.size(); i++){
			Itemset itemset1 = levelK_1.get(i);
			loop2: for(int j=i+1; j< levelK_1.size(); j++){
				Itemset itemset2 = levelK_1.get(j);

				// we compare items of itemset1 and itemset2.
				// If they have all the same k-1 items and the last item of itemset1 is smaller than
				// the last item of itemset2, we will combine them to generate a candidate
				for(int k=0; k< itemset1.size(); k++){
					// if they are the last items
					if(k == itemset1.size()-1){
						// the one from itemset1 should be smaller (lexical order)
						// and different from the one of itemset2
						if(itemset1.getItems()[k] >= itemset2.get(k)){
							continue loop1;
						}
					}
					// if they are not the last items, and
					else if(itemset1.getItems()[k] < itemset2.get(k)){
						continue loop2; // we continue searching
					}
					else if(itemset1.getItems()[k] > itemset2.get(k)){
						continue loop1; // we stop searching: because of lexical order
					}
				}

				// ======= GENERATE ITEMSETS OF NEXT LEVEL AS IN APRIORI ======================
				// The tid set of the new candidate is the intersection of the tid sets
				// of itemset1 and itemset2 (AprioriTID-style support counting).
				Set<Integer> list = new HashSet<Integer>();
				for(Integer val1 : itemset1.getTransactionsIds()){
					if(itemset2.getTransactionsIds().contains(val1)){
						list.add(val1);
					}
				}
				// keep the candidate only if it is frequent
				if(list.size() >= minSuppRelative){
					// Create a new candidate by combining itemset1 and itemset2:
					// the items of itemset1 followed by the last item of itemset2
					int newItemset[] = new int[itemset1.size()+1];
					System.arraycopy(itemset1.itemset, 0, newItemset, 0, itemset1.size());
					newItemset[itemset1.size()] = itemset2.getItems()[itemset2.size() -1];
					Itemset candidate = new Itemset(newItemset);
					candidate.setTIDs(list);
					// add the candidate to the set of candidates
					nextLevel.add(candidate);
				}
			}
		}

		// TRY ALL COMBINATIONS TO GENERATE INDIRECT RULES FROM ITEMSETS OF SIZE K, IF K > 2 -- NOT VERY EFFICIENT
		if(level > 2){
			// WE NEED TO FIND TWO ITEMSETS WITH ONLY TWO ITEMS a,b THAT ARE DIFFERENT
			// SO WE COMPARE EACH ITEMSET OF SIZE K WITH EACH OTHER ITEMSET OF SIZE K.
			for(int i=0; i< levelK_1.size(); i++){
				for(int j=i+1; j< levelK_1.size(); j++){
					Itemset candidate1 = levelK_1.get(i);
					Itemset candidate2 = levelK_1.get(j);
					// We check if the pair of itemsets has exactly one item that is different.
					loopX: for(Integer a : candidate1.getItems()){
						// if candidate2 does not contain item a
						if(candidate2.contains(a) == false){
							Integer b = null;
							// for each item of candidate 2
							for(Integer itemM : candidate2.getItems()){
								// if candidate1 does not contain that item
								if(candidate1.contains(itemM) == false){
									if(b!= null){
										continue loopX; // more than two items are different... we don't want that.
									}
									b = itemM; // the item that is different
								}
							}
							// if there is only one item that is different, then we call this method
							// to check if we can create an indirect rule such that it would meet the
							// ts threshold and the minconf threshold.
							testIndirectRule(candidate1, a, b);
						}
					}
				}
			}
		}
		return nextLevel;
	}

	/**
	 * Test if an indirect rule satisfies the criterion for an indirect association rules ts and minconf.
	 * @param itemset a potential itemset that could be a mediator if we remove "a" and "b"
	 * @param a the item a
	 * @param b the item b
	 * @throws IOException exception if error while writing to the output file
	 */
	private void testIndirectRule(Itemset itemset, Integer a, Integer b) throws IOException {
		// These sets are respectively the sets of ids
		// of transactions containing "a" and "b"
		Set<Integer> tidsA = mapItemTIDS.get(a);
		Set<Integer> tidsB = mapItemTIDS.get(b);

		// Calculate the support of {a,b} by doing
		// the intersection of these two sets.
		int supportAB = 0; // variable to count the number of IDs in that intersection
		// for each ID in tidsA
		for(Integer tidFromA : tidsA){
			// if it appears in tidsB
			if(tidsB.contains(tidFromA)){
				// increase the number of IDs shared by both
				supportAB++;
			}
		}

		// if the support of {a,b} is lower than the "ts" threshold
		// ("a" and "b" must rarely occur together for an indirect rule)
		if(supportAB < tsRelative ){
			// compute the support of Y U {a}, where Y is the mediator
			int supAY =0;
			// for each tid of transactions containing "a"
			loop1: for(Integer tidA: tidsA){
				// for each item in "itemset"
				for(Integer item: itemset.getItems()){
					// if this item is not "a" and not "b" (i.e. it belongs to the mediator Y)
					if(!item.equals(a) && !item.equals(b)){
						// if this item does NOT appear in that transaction,
						// the transaction does not support Y U {a}
						if(!mapItemTIDS.get(item).contains(tidA)){
							continue loop1;
						}
					}
				}
				supAY++; // increase the support of Y U {a}
			}
			// Calculate the confidence of Y U {a}
			double confAY = supAY / ((double)tidsA.size()) ;
			// if the confidence is high enough
			if(confAY >= minconf){
				// We do the same thing....
				// This time we compute the support of Y U {b}
				int supBY =0; // variable to calculate the support of Y U {b}
				// for each tid of transactions containing "b"
				loop2: for(Integer tidB: tidsB){
					// for each item in "itemset"
					for(Integer item: itemset.getItems()){
						// if this item is not "a" and not "b"
						if(!item.equals(a) && !item.equals(b)){
							// if this item does NOT appear in that transaction,
							// the transaction does not support Y U {b}
							if(!mapItemTIDS.get(item).contains(tidB)){
								continue loop2;
							}
						}
					}
					supBY++; // increase the support of Y U {b}
				}
				// Calculate the confidence of Y U {b}
				double confBY = supBY / ((double)tidsB.size()) ;
				// if the confidence is high enough
				if(confBY >= minconf){
					// save the rule
					saveRule(a, b, itemset, confAY, confBY, supAY, supBY);
				}
			}
		}
	}

	/**
	 * Save an indirect association rule.
* @param a the item a * @param b the item b * @param itemset the mediator if we remove a and b * @param confAY confidence of Y U {a} * @param confBY confidence of Y U {b} * @param supAY support of Y U {a} * @param supBY support of Y U {b} * @throws IOException exception if error writing the file */ private void saveRule(Integer a, Integer b, Itemset itemset, double confAY, double confBY, int supAY, int supBY) throws IOException{ ruleCount++; // we increase the number of rule found // we create a string buffer because it is more efficient // for creating a string step by step StringBuilder buffer = new StringBuilder(); // we append all the elements from the rule to the StringBuilder buffer.append("(a="); buffer.append(a); buffer.append(" b="); buffer.append(b); buffer.append(" | mediator="); for(int i=0; i < itemset.size(); i++){ if(!itemset.get(i).equals(a) && !itemset.get(i).equals(b)){ buffer.append(itemset.get(i)); buffer.append(" "); } } buffer.append(")"); buffer.append(" #sup(a,mediator)= "); buffer.append(supAY); buffer.append(" #sup(b,mediator)= "); buffer.append(supBY); buffer.append(" #conf(a,mediator)= "); buffer.append(confAY); buffer.append(" #conf(b,mediator)= "); buffer.append(confBY); // we write to the file writer.write(buffer.toString()); writer.newLine(); // we write a new line } /** * Print statistics about the last algorithm execution to System.out. */ public void printStats() { System.out .println("============= INDIRECT RULES GENERATION - STATS ============="); System.out.println(" Transactions count from database : " + tidcount); System.out.println(" Indirect rule count : " + ruleCount); System.out.println(" Total time ~ " + (endTimeStamp - startTimestamp)+ " ms"); System.out .println("==================================================="); } }