package ca.pfv.spmf.algorithms.frequentpatterns.dci_closed; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; /** * This is a basic implementation of the "DCI_Closed" algorithm (see AlgoDCI_Closed_Optimized * for the optimized version). * <br/><br/> * * The DCI_Closed algorithm finds all closed itemsets in a transaction database. * <br/><br/> * * This algorithm was originally proposed in the article: * <br/><br/> * * Lucchese, C., Orlando, S. & Perego, Raffaele (2004), DCI_Closed: a fast and memory efficient * algorithm to mine frequent closed itemsets, Proc. 2nd IEEE ICDM Workshop on Frequent Itemset * Mining Implementations at ICDM 2004. * <br/><br/> * * Implementation note: <br/> * - My implementation assumes that there is no item named "0". * <br/><br/> * * Possible optimizations:<br/> * - use a bit matrix like it is suggested in the article<br/> * - remove elements from postsets and use a linkedlist for postsets.<br/> * - closedset could be an array.<br/> * - etc. * *@author Philippe Fournier-Viger */ public class AlgoDCI_Closed { // number of closed itemsets found int closedCount =0; // the number of transaction in the transaction database int tidCount =0; // the largest item in the transaction database int maxItemId =1; // relative minimum support set by the user private int minSuppRelative; // object to write the output file BufferedWriter writer = null; // Map to store the database as a verticabl database // Key: item value : Set of Ids of transactions containing the item Map<Integer, Set<Integer>> database = null; /** * Default constructor */ public AlgoDCI_Closed() { } /** * Run the algorithm. * @param input the path of an input file (transaction database). * @param output the path of the output file for writing the result * @param minsup a minimum support threshold * @throws IOException exception if error while writing/reading files */ public void runAlgorithm(String input, String output, int minsup) throws IOException { // record start time long startTimestamp = System.currentTimeMillis(); // reset number of itemsets found closedCount=0; System.out.println("Running the DCI-Closed algorithm"); // Prepare object to write the output file writer = new BufferedWriter(new FileWriter(output)); // save the minimum support this.minSuppRelative = minsup; // (1) CREATE VERTICAL DATABASE INTO MEMORY createVerticalDatabase(input); // (2) INITIALIZE VARIABLES FOR THE FIRST CALL TO THE "DCI_CLOSED" PROCEDURE // (as described in the paper) List<Integer> closedset = new ArrayList<Integer>(); Set<Integer> closedsetTIDs = new HashSet<Integer>(); List<Integer> preset = new ArrayList<Integer>(); List<Integer> postset = new ArrayList<Integer>(maxItemId); // Create postset and sort it by descending order or support. // For each item: for(int i=1; i<= maxItemId; i++){ // Get the tidset of item i Set<Integer> tidset = database.get(i); // if the item is frequent if(tidset != null && tidset.size() >= minSuppRelative){ // add it to postset postset.add(i); } } // Sort items by support ascending order. // But use the lexicographical order if // the support is the same for two items. Collections.sort(postset, new Comparator<Integer>(){ public int compare(Integer item1, Integer item2) { int size1 = database.get(item1).size(); // support is the size of the tidset int size2 = database.get(item2).size(); // support is the size of the tidset // if the support is the same if(size1 == size2){ // use the lexical order return (item1 < item2) ? -1 : 1; } // otherwise, use the support return size1 - size2; } }); // System.out.println(postset); // (3) CALL THE "DCI_CLOSED" RECURSIVE PROCEDURE dci_closed(true, closedset, closedsetTIDs, postset, preset); // print statistics System.out.println("========== DCI_CLOSED - STATS ============"); System.out.println(" Number of transactions: " + tidCount ); System.out.println(" Number of frequent closed itemsets: " + closedCount ); System.out.println(" Total time ~: " + (System.currentTimeMillis() - startTimestamp) + " ms"); // close the file writer.close(); } /** * The method "DCI_CLOSED" as described in the paper. * @param firstime true if this method is called for the first time * @param closedset the closed set (see paper). * @param closedsetTIDs the tids set of the closed set * @param postset the postset (see paper for full details) * @param preset the preset (see paper) * @exception IOException if error writing the output file */ private void dci_closed(boolean firstTime, List<Integer> closedset, Set<Integer> closedsetTIDs, List<Integer> postset, List<Integer> preset) throws IOException { //L2: for all i in postset for(Integer i : postset){ // L4 Calculate the tidset of newgen // where newgen is "closedset" U {i} Set<Integer> newgenTIDs; // if the first time if(firstTime){ // it is the tidset of it newgenTIDs = database.get(i); }else{ // otherwise we intersect the tidset of closedset and the // tidset of i newgenTIDs = intersectTIDset(closedsetTIDs, database.get(i)); } // if newgen has a support no less than minsup if(newgenTIDs.size() >= minSuppRelative){ // L3: newgen = closedset U {i} // Create the itemset for newgen List<Integer> newgen = new ArrayList<Integer>(closedset.size()+1); newgen.addAll(closedset); newgen.add(i); // L5: if newgen is not a duplicate if(is_dup(newgenTIDs, preset) == false){ // L6: ClosedsetNew = newGen List<Integer> closedsetNew = new ArrayList<Integer>(); closedsetNew.addAll(newgen); // calculate tidset Set<Integer> closedsetNewTIDs = new HashSet<Integer>(); // if first time if(firstTime){ // the new tidset of closed set is the tidset of i closedsetNewTIDs = database.get(i); }else{ // otherwise, we add the tidset of newgen closedsetNewTIDs.addAll(newgenTIDs); } // L7 : PostsetNew = emptyset List<Integer> postsetNew = new ArrayList<Integer>(); // L8 for each j in Postset such that i _ j : for(Integer j : postset){ // if i is smaller than j according to the total order on items if(smallerAccordingToTotalOrder(i, j)){ // L9 // if the tidset of j contains the tidset of newgen if(database.get(j).containsAll(newgenTIDs)){ closedsetNew.add(j); // recalculate TIDS of closedsetNEW by intersection Set<Integer> jTIDs = database.get(j); Iterator<Integer> iter = closedsetNewTIDs.iterator(); while(iter.hasNext()){ Integer tid = iter.next(); if(jTIDs.contains(tid) == false){ iter.remove(); } } }else{ // otherwise add j to the new postset postsetNew.add(j); } } } // L15 : write out Closed_setNew and its support writeOut(closedsetNew, closedsetNewTIDs.size()); // L16: recursive call // FIXED: we have to make a copy of preset before the recursive call List<Integer> presetNew = new ArrayList<Integer>(preset); dci_closed(false, closedsetNew, closedsetNewTIDs, postsetNew, presetNew); // L17 : Preset = Preset U {i} preset.add(i); } } } } /** * Check if an item is smaller than another according to the support ascending order * or if the support is the same, use the lexicographical order. * @param i an item * @param j another item */ private boolean smallerAccordingToTotalOrder(Integer i, Integer j) { // compare the support int size1 = database.get(i).size(); // support of i is the tidset of i int size2 = database.get(j).size();// support of j is the tidset of j // if the support is the same if(size1 == size2){ // use the lexical order return (i < j) ? true : false; } // otherwise use the support return size2 - size1 >0; } /** * Write a frequent closed itemset that is found to the output file. */ private void writeOut(List<Integer> closedset, int support) throws IOException { // increase the number of closed itemsets closedCount++; StringBuilder buffer = new StringBuilder(); // for each item in the closed itemset Iterator<Integer> iterItem = closedset.iterator(); while(iterItem.hasNext()){ // append the item buffer.append(iterItem.next()); // if it is not the last item, append a space if(iterItem.hasNext()){ buffer.append(' '); }else{ break; } } // append the support buffer.append(" #SUP: "); buffer.append(support); // append the buffer writer.write(buffer.toString()); writer.newLine(); } /** * The method "is_dup" as described in the paper. * @param newgenTIDs the tidset of newgen * @param preset the itemset "preset" */ private boolean is_dup(Set<Integer> newgenTIDs, List<Integer> preset) { // L25 // for each integer j in preset for(Integer j : preset){ // L26 : // If tidset of newgen is included in tids of j, return true if(database.get(j).containsAll(newgenTIDs)){ // IMPORTANT // NOTE THAT IN ORIGINAL PAPER THEY WROTE FALSE, BUT IT SHOULD BE TRUE return true; } } return false; // NOTE THAT IN ORIGINAL PAPER THEY WROTE TRUE, BUT IT SHOULD BE FALSE } /** * Perform the intersection of two tidsets. * @param tidset1 a first tidset * @param tidset2 a second tidset * @return the intersection of the two tidsets. */ private Set<Integer> intersectTIDset(Set<Integer> tidset1, Set<Integer> tidset2) { // Create the new tidset Set<Integer> tidset = new HashSet<Integer>(); // if tidset1 is larger than tidset2 if(tidset1.size() > tidset2.size()){ // for each tid in tidset2 for(Integer tid : tidset2){ // if the tid is in tidset1 if(tidset1.contains(tid)){ // add it to the new tidset tidset.add(tid); } } }else{ // otherwise for(Integer tid : tidset1){ // if the tid is in tidset2 if(tidset2.contains(tid)){ // add it to the new tidset tidset.add(tid); } } } // return the new tidset return tidset; } /** * Create the in-memory vertical database by reading the input file. * @param input an input file path. * @throws IOException exception if an error while reading the file */ private void createVerticalDatabase(String input) throws IOException { // Prepare object to read the input file BufferedReader reader = new BufferedReader(new FileReader(input)); String line; // variable to count the number of transactions tidCount =0; maxItemId = 0; // the vertical database is a map: key= item value= tidset database = new HashMap<Integer, Set<Integer>>(); // for each line (transaction) in the transaction database while( ((line = reader.readLine())!= null)){ // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the line according to spaces String[] lineSplited = line.split(" "); // for each item for(String itemString : lineSplited){ // convert the item to integer Integer item = Integer.parseInt(itemString); // update the tidset of the item // by adding the tid of the current transaction Set<Integer> tidset = database.get(item); if(tidset == null){ tidset = new HashSet<Integer>(); database.put(item, tidset); } tidset.add(tidCount); // if this item is larger than maxItemId, replace it. if(item > maxItemId){ maxItemId = item; } } //increase the number of transactions read until now tidCount++; } // close the input file reader.close(); } }