package ca.pfv.spmf.algorithms.frequentpatterns.pascal;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.algorithms.ArraysAlgos;
import ca.pfv.spmf.algorithms.frequentpatterns.apriori_HT.ItemsetHashTree;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the PASCAL algorithm. It is an Apriori-based
* algorithm that use information about generators to skip some database scans. <br/>
* <br/>
*
* The PASCAL algorithm is described in : <br/>
* <br/>
*
* Yves Bastide, Rafik Taouil, Nicolas Pasquier et al. (2002) Pascal : un
* algorithme d'extraction des motifs fr�quents, 65-95. In Techniques et Science
* Informatiques 21 (1). <br/>
* <br/>
*
* The PASCAL algorithm finds all the frequents itemsets and their support in a
* transaction database. It also identify itemsets that are generators. <br/>
* <br/>
*
*
* @see ItemsetPascal
* @see AbstractOrderedItemsetsAdapter
* @see ItemsetHashTree
* @author Philippe Fournier-Viger
*/
public class AlgoPASCAL {
// the maximul level reached by Apriori
protected int k;
// For statistics
protected int totalCandidateCount = 0; // total number of candidates
// generated
protected long startTimestamp; // start time
protected long endTimestamp; // end time
private int itemsetCount; // number of itemsets found
// the relative minimum support used to find itemsets
private int minsupRelative;
// an in-memory representation of the transaction database
private List<int[]> database = null;
// write to file
BufferedWriter writer = null;
/**
* Default constructor
*/
public AlgoPASCAL() {
}
/**
* Run the Apriori-HT algorithm
*
* @param minsup
* the minimum support threshold
* @param input
* path to the input file
* @param output
* path to save the result to an output file
* @throws IOException
* if an error while reading/writing files
*/
public void runAlgorithm(double minsup, String input, String output) throws IOException {
// record start time
startTimestamp = System.currentTimeMillis();
// prepare object for writing the file
writer = new BufferedWriter(new FileWriter(output));
// reset statistics
itemsetCount = 0;
totalCandidateCount = 0;
MemoryLogger.getInstance().reset();
int transactionCount = 0;
// structure to count the support of each item
// Key: item Value: support count
Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>();
// the database in memory (intially empty)
database = new ArrayList<int[]>();
// scan the database to load it into memory and count the support of
// each single item at the same time
BufferedReader reader = new BufferedReader(new FileReader(input));
String line;
// for each line (transaction) of the input file until the end of file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true || line.charAt(0) == '#'
|| line.charAt(0) == '%' || line.charAt(0) == '@') {
continue;
}
// split the line into items
String[] lineSplited = line.split(" ");
// create an array to store the items
int transaction[] = new int[lineSplited.length];
// for each item in the current transaction
for (int i = 0; i < lineSplited.length; i++) {
// convert to integer
Integer item = Integer.parseInt(lineSplited[i]);
// add the item to the transaction
transaction[i] = item;
// increase the support count of the item
Integer count = mapItemCount.get(item);
if (count == null) {
mapItemCount.put(item, 1);
} else {
mapItemCount.put(item, ++count);
}
}
// add transaction to the database
database.add(transaction);
// increase the transaction count
transactionCount++;
}
// close the input file
reader.close();
// convert absolute minimum support to a relative minimum support
// by multiplying by the database size
this.minsupRelative = (int) Math.ceil(minsup * transactionCount);
// System.out.println("database size = " +database.size() +
// " minsuprel = " + minsupRelative);
// Set variable k=1 because we start with itemsets of size 1
k = 1;
// Create the list of all frequent items of size 1
List<ItemsetPascal> frequent1 = new ArrayList<ItemsetPascal>();
// For each item
for (Entry<Integer, Integer> entry : mapItemCount.entrySet()) {
// if its support is higher than the support
int itemsetSupport = entry.getValue();
if (itemsetSupport >= minsupRelative) {
// keep the item into memory for generating itemsets of size 2
// ------ CODE SPECIFIC TO PASCAL --------
// an itemset of size 1 is a generator if it has not the support
// equal to the transaction count.
ItemsetPascal itemset = new ItemsetPascal(
new int[] { entry.getKey() });
itemset.isGenerator = (itemsetSupport != transactionCount);
itemset.pred_sup = transactionCount;
// ------ END OF CODE SPECIFIC TO PASCAL --------
frequent1.add(itemset);
// and also save it to the output file
saveItemsetToFile(itemset);
}
}
mapItemCount = null; // we don't need it anymore
// Sort the list of frequent items of size 1 by lexical order because
// Apriori need itemset sorted by a total order.
Collections.sort(frequent1, new Comparator<ItemsetPascal>() {
public int compare(ItemsetPascal o1, ItemsetPascal o2) {
return o1.get(0) - o2.get(0);
}
});
// if no frequent item, we stop there!
if (frequent1.size() == 0) {
return;
}
// increase the number of candidates
totalCandidateCount += frequent1.size();
// Now we will perform a loop to find all frequent itemsets of size > 1
// starting from size k = 2.
// The loop will stop when no candidates can be generated.
List<ItemsetPascal> level = null;
k = 2;
do {
// we check the memory usage
MemoryLogger.getInstance().checkMemory();
// Generate candidates of size K
List<ItemsetPascal> candidatesK;
// if we are at level k=2, we use an optimization to generate
// candidates
if (k == 2) {
candidatesK = generateCandidate2(frequent1);
} else {
// otherwise we use the regular way to generate candidates
candidatesK = generateCandidateSizeK(level);
}
// we add the number of candidates generated to the total
totalCandidateCount += candidatesK.size();
// We scan the database one time to calculate the support
// of each candidates and keep those with higher suport.
for (ItemsetPascal candidate : candidatesK) {
// CODE SPECIFIC TO PASCAL
// PRUNING STRATEGY
// for each itemset that is a generator, if the support
// is the same as the minimum of its subsets, then
// it is not a generator and we want to remember that
if(candidate.isGenerator == false) {
continue;
}
// END CODE SPECIFIC TO PASCAL
// For each transaction:
loop: for (int[] transaction : database) {
// NEW OPTIMIZATION 2013: Skip transactions shorter than k!
if (transaction.length < k) {
// System.out.println("test");
continue;
}
// END OF NEW OPTIMIZATION
// a variable that will be use to check if
// all items of candidate are in this transaction
int pos = 0;
// for each item in this transaction
for (int item : transaction) {
// if the item correspond to the current item of
// candidate
if (item == candidate.itemset[pos]) {
// we will try to find the next item of candidate
// next
pos++;
// if we found all items of candidate in this
// transaction
if (pos == candidate.itemset.length) {
// we increase the support of this candidate
candidate.support++;
continue loop;
}
// Because of lexical order, we don't need to
// continue scanning the transaction if the current
// item
// is larger than the one that we search in
// candidate.
} else if (item > candidate.itemset[pos]) {
continue loop;
}
}
}
}
// We build the level k+1 with all the candidates that have
// a support higher than the minsup threshold.
level = new ArrayList<ItemsetPascal>();
for (ItemsetPascal candidate : candidatesK) {
// if the support is > minsup
if (candidate.getAbsoluteSupport() >= minsupRelative) {
// CODE SPECIFIC TO PASCAL
// for each itemset that is a generator, if the support
// is the same as the minimum of its subsets, then
// it is not a generator and we want to remember that
if(candidate.getAbsoluteSupport() == candidate.pred_sup) {
candidate.isGenerator = false;
}
// END CODE SPECIFIC TO PASCAL
// add the candidate
level.add(candidate);
// the itemset is frequent so save it into results
saveItemsetToFile(candidate);
}
}
// we will generate larger itemsets next.
k++;
} while (level.isEmpty() == false);
// record end time
endTimestamp = System.currentTimeMillis();
// check the memory usage
MemoryLogger.getInstance().checkMemory();
// close the output file if the result was saved to a file.
if (writer != null) {
writer.close();
}
}
/**
* This method generates candidates itemsets of size 2 based on itemsets of
* size 1.
*
* @param frequent1
* the list of frequent itemsets of size 1.
* @return a List of Itemset that are the candidates of size 2.
*/
private List<ItemsetPascal> generateCandidate2(List<ItemsetPascal> frequent1) {
List<ItemsetPascal> candidates = new ArrayList<ItemsetPascal>();
// For each itemset I1 and I2 of level k-1
for (int i = 0; i < frequent1.size(); i++) {
ItemsetPascal itemset1 = frequent1.get(i);
int item1 = itemset1.get(0);
for (int j = i + 1; j < frequent1.size(); j++) {
ItemsetPascal itemset2 = frequent1.get(j);
int item2 = itemset2.get(0);
// CODE SPECIFIC TO PASCAL
ItemsetPascal itemset = new ItemsetPascal(new int[] { item1, item2 });
itemset.isGenerator
= itemset1.isGenerator && itemset2.isGenerator;
itemset.pred_sup = Math.min(itemset1.getAbsoluteSupport(), itemset2.getAbsoluteSupport());
if(itemset.isGenerator == false) {
itemset.support = itemset.pred_sup;
}
// END OF CODE SPECIFIC TO PASCAL
// Create a new candidate by combining itemset1 and itemset2
candidates.add(itemset);
}
}
return candidates;
}
/**
* Method to generate itemsets of size k from frequent itemsets of size K-1.
*
* @param levelK_1
* frequent itemsets of size k-1
* @return itemsets of size k
*/
protected List<ItemsetPascal> generateCandidateSizeK(List<ItemsetPascal> levelK_1) {
// create a variable to store candidates
List<ItemsetPascal> candidates = new ArrayList<ItemsetPascal>();
// For each itemset I1 and I2 of level k-1
loop1: for (int i = 0; i < levelK_1.size(); i++) {
int[] itemset1 = levelK_1.get(i).itemset;
loop2: for (int j = i + 1; j < levelK_1.size(); j++) {
int[] itemset2 = levelK_1.get(j).itemset;
// we compare items of itemset1 and itemset2.
// If they have all the same k-1 items and the last item of
// itemset1 is smaller than
// the last item of itemset2, we will combine them to generate a
// candidate
for (int k = 0; k < itemset1.length; k++) {
// if they are the last items
if (k == itemset1.length - 1) {
// the one from itemset1 should be smaller (lexical
// order)
// and different from the one of itemset2
if (itemset1[k] >= itemset2[k]) {
continue loop1;
}
}
// if they are not the last items, and
else if (itemset1[k] < itemset2[k]) {
continue loop2; // we continue searching
} else if (itemset1[k] > itemset2[k]) {
continue loop1; // we stop searching: because of lexical
// order
}
}
// Create a new candidate by combining itemset1 and itemset2
int newItemset[] = new int[itemset1.length + 1];
System.arraycopy(itemset1, 0, newItemset, 0, itemset1.length);
newItemset[itemset1.length] = itemset2[itemset2.length - 1];
// The candidate is tested to see if its subsets of size k-1 are
// included in
// level k-1 (they are frequent).
ItemsetPascal newItemsetPascal = new ItemsetPascal(newItemset);
if (allSubsetsOfSizeK_1AreFrequent(newItemsetPascal, levelK_1)) {
// ------ CODE SPECIFIC TO PASCAL --------
// if the candidate is not a generator then the support of this
// itemset is the same as the lowest support of its subsets of size k-1
if(newItemsetPascal.isGenerator == false) {
newItemsetPascal.support = newItemsetPascal.pred_sup;
}
// ------ END CODE SPECIFIC TO PASCAL --------
candidates.add(newItemsetPascal);
}
}
}
return candidates; // return the set of candidates
}
/**
* Method to check if all the subsets of size k-1 of a candidate of size k
* are freuqnet
*
* @param candidate
* a candidate itemset of size k
* @param levelK_1
* the frequent itemsets of size k-1
* @return true if all the subsets are frequet
*/
protected boolean allSubsetsOfSizeK_1AreFrequent(ItemsetPascal candidateItemset,
List<ItemsetPascal> levelK_1) {
int[] candidate = candidateItemset.itemset;
// generate all subsets by always each item from the candidate, one by
// one
for (int posRemoved = 0; posRemoved < candidate.length; posRemoved++) {
// perform a binary search to check if the subset appears in level
// k-1.
int first = 0;
int last = levelK_1.size() - 1;
// variable to remember if we found the subset
boolean found = false;
// the binary search
while (first <= last) {
int middle = ( first + last ) >>> 1; // divide by 2
if (ArraysAlgos.sameAs(levelK_1.get(middle).getItems(), candidate, posRemoved) < 0) {
first = middle + 1; // the itemset compared is larger than
// the subset according to the lexical
// order
} else if (ArraysAlgos.sameAs(levelK_1.get(middle).getItems(), candidate, posRemoved) > 0) {
last = middle - 1; // the itemset compared is smaller than
// the subset is smaller according to
// the lexical order
} else {
// WE HAVE FOUND IT
found = true;
// ------ CODE SPECIFIC TO PASCAL --------
// CHECK IF THE SUBSET IS A GENERATOR
int supportMiddle = levelK_1.get(middle).getAbsoluteSupport();
boolean isAGenerator = levelK_1.get(middle).isGenerator;
if(isAGenerator == false) {
// IF NOT THEN THE candidate itemset is also not a generator
candidateItemset.isGenerator = false;
}
// if the support of this subset is smaller than the
// support of all subsets until now, we remember it.
if(supportMiddle < candidateItemset.pred_sup) {
candidateItemset.pred_sup = supportMiddle;
}
// ------ END OF CODE SPECIFIC TO PASCAL --------
break;
}
}
if (found == false) { // if we did not find it, that means that
// candidate is not a frequent itemset
// because
// at least one of its subsets does not appear in level k-1.
return false;
}
}
return true;
}
/**
* Method to save a frequent itemset to file
*
* @param itemset
* @throws IOException
*/
void saveItemsetToFile(ItemsetPascal itemset) throws IOException {
writer.write(itemset.toString() + " #SUP: "
+ itemset.getAbsoluteSupport() + " #IS_GENERATOR " + itemset.isGenerator);
writer.newLine();
System.out.println(itemset.toString());
itemsetCount++;
}
/**
* Method to print statistics about the execution of the algorithm.
*/
public void printStats() {
System.out.println("============= PASCAL - STATS =============");
System.out.println(" Candidates count : " + totalCandidateCount);
System.out.println(" The algorithm stopped at size " + (k - 1)
+ ", because there is no candidate");
System.out.println(" Frequent itemsets count : " + itemsetCount);
System.out.println(" Maximum memory usage : "
+ MemoryLogger.getInstance().getMaxMemory() + " mb");
System.out.println(" Total time ~ " + (endTimestamp - startTimestamp)
+ " ms");
System.out
.println("===================================================");
}
}