package ca.pfv.spmf.algorithms.frequentpatterns.aprioriTID; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.BitSet; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset; import ca.pfv.spmf.tools.MemoryLogger; /** * This is an implementation of the AprioriTID algorithm. This version is very fast * because it uses bit vector for representing TID SETS (transaction id sets).<br/><br/> * * The AprioriTID algorithm finds all the frequents itemsets and their support * in a transaction database and save them to file.<br/><br/> * * AprioriTID was originally proposed in :<br/><br/> * * Agrawal R, Srikant R. "Fast Algorithms for Mining Association Rules", VLDB. * Sep 12-15 1994, Chile, 487-99,<br/><br/> * * * @see Itemset * @author Philippe Fournier-Viger */ public class AlgoAprioriTID_Bitset { // the current level protected int k; // variables for counting support of items Map<Integer, BitSet> mapItemTIDS = new HashMap<Integer, BitSet>(); // the minimum support threshold int minSuppRelative; // Special parameter to set the maximum size of itemsets to be discovered int maxItemsetSize = Integer.MAX_VALUE; long startTimestamp = 0; // start time of latest execution long endTimeStamp = 0; // end time of latest execution // object to write the output file BufferedWriter writer = null; // the number of frequent itemsets found private int itemsetCount; private int tidcount = 0; /** * Default constructor */ public AlgoAprioriTID_Bitset() { } public void runAlgorithm(String input, String output, double minsup) throws NumberFormatException, IOException { // record start time startTimestamp = System.currentTimeMillis(); // reset number of itemsets found itemsetCount = 0; // create object for writing the output file writer = new BufferedWriter(new FileWriter(output)); // initialize variable to count the number of transactions tidcount = 0; // read the input file line by line until the end of the file // (each line is a transaction) mapItemTIDS = new HashMap<Integer, BitSet>(); // key : item value: tidset of the item as a bitset BufferedReader reader = new BufferedReader(new FileReader(input)); String line; // for each line (transaction) until the end of file while (((line = reader.readLine()) != null)) { // check memory usage MemoryLogger.getInstance().checkMemory(); // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split line into items according to spaces String[] lineSplited = line.split(" "); // for each item for (String stringItem : lineSplited) { // convert from string to integer int item = Integer.parseInt(stringItem); // update the tidset of the item BitSet tids = mapItemTIDS.get(item); if (tids == null) { tids = new BitSet(); mapItemTIDS.put(item, tids); } tids.set(tidcount); } // increase the transaction count tidcount++; } reader.close(); // close the input file // convert the support from a relative minimum support (%) to an // absolute minimum support this.minSuppRelative = (int) Math.ceil(minsup * tidcount); // To build level 1, we keep only the frequent items. // We scan the database one time to calculate the support of each // candidate. k = 1; List<Itemset> level = new ArrayList<Itemset>(); // For each item Iterator<Entry<Integer, BitSet>> iterator = mapItemTIDS.entrySet() .iterator(); while (iterator.hasNext()) { // for the current item Map.Entry<Integer, BitSet> entry = (Map.Entry<Integer, BitSet>) iterator .next(); // get the support count (cardinality of the tidset) int cardinality = entry.getValue().cardinality(); // if the item is frequent if (cardinality >= minSuppRelative) { // add the item to the set of frequent itemsets of size 1 Integer item = entry.getKey(); Itemset itemset = new Itemset(item); itemset.setTIDs(mapItemTIDS.get(item), cardinality); level.add(itemset); // save the itemset saveItemsetToFile(itemset); } else { iterator.remove(); // if the item is not frequent we don't // need to keep it into memory. } } // sort itemsets of size 1 according to lexicographical order. Collections.sort(level, new Comparator<Itemset>() { public int compare(Itemset o1, Itemset o2) { return o1.get(0) - o2.get(0); } }); // Generate candidates with size k = 1 (all itemsets of size 1) k = 2; // While the level is not empty while (!level.isEmpty() && k <= maxItemsetSize) { // We build the level k+1 with all the candidates that have // a support higher than the minsup threshold. level = generateCandidateSizeK(level); ; // We keep only the last level... k++; } // close the file writer.close(); // save end time endTimeStamp = System.currentTimeMillis(); } /** * Method to generate itemsets of size k from frequent itemsets of size K-1. * @param levelK_1 frequent itemsets of size k-1 * @return itemsets of size k */ protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1) throws IOException { // create a variable to store candidates List<Itemset> candidates = new ArrayList<Itemset>(); // For each itemset I1 and I2 of level k-1 loop1: for (int i = 0; i < levelK_1.size(); i++) { Itemset itemset1 = levelK_1.get(i); loop2: for (int j = i + 1; j < levelK_1.size(); j++) { Itemset itemset2 = levelK_1.get(j); // we compare items of itemset1 and itemset2. // If they have all the same k-1 items and the last item of // itemset1 is smaller than // the last item of itemset2, we will combine them to generate a // candidate for (int k = 0; k < itemset1.size(); k++) { // if they are the last items if (k == itemset1.size() - 1) { // the one from itemset1 should be smaller (lexical // order) // and different from the one of itemset2 if (itemset1.getItems()[k] >= itemset2.get(k)) { continue loop1; } } // if they are not the last items, and else if (itemset1.getItems()[k] < itemset2.get(k)) { continue loop2; // we continue searching } else if (itemset1.getItems()[k] > itemset2.get(k)) { continue loop1; // we stop searching: because of lexical // order } } // NOW COMBINE ITEMSET 1 AND ITEMSET 2 Integer missing = itemset2.get(itemset2.size() - 1); // create list of common tids BitSet list = (BitSet) itemset1.getTransactionsIds().clone(); list.and(itemset2.getTransactionsIds()); int cardinality = list.cardinality(); if (cardinality >= minSuppRelative) { // Create a new candidate by combining itemset1 and itemset2 int newItemset[] = new int[itemset1.size()+1]; System.arraycopy(itemset1.itemset, 0, newItemset, 0, itemset1.size()); newItemset[itemset1.size()] = itemset2.getItems()[itemset2.size() -1]; Itemset candidate = new Itemset(newItemset); candidate.setTIDs(list, cardinality); candidates.add(candidate); saveItemsetToFile(candidate); } } } return candidates; } /** * Set the maximum itemset size of itemsets to be found * @param maxItemsetSize maximum itemset size. */ public void setMaxItemsetSize(int maxItemsetSize) { this.maxItemsetSize = maxItemsetSize; } /** * Save an itemset to the output file. * @param itemset the itemset to be saved * @throws IOException an exception if error while writing the file. */ void saveItemsetToFile(Itemset itemset) throws IOException { writer.write(itemset.toString() + " #SUP: " + itemset.cardinality); writer.newLine(); itemsetCount++; // increase frequent itemset count } /** * Print statistics about the algorithm execution to System.out. */ public void printStats() { System.out.println("============= APRIORI - STATS ============="); System.out.println(" Transactions count from database : " + tidcount); System.out.println(" Frequent itemsets count : " + itemsetCount); System.out.println(" Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb"); System.out.println(" Total time ~ " + (endTimeStamp - startTimestamp) + " ms"); System.out .println("==================================================="); } }