package ca.pfv.spmf.algorithms.frequentpatterns.apriori_inverse;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.algorithms.ArraysAlgos;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the AprioriInverse algorithm as described by:
* <br/><br/>
*
* Yun Sing Koh, Nathan Rountree: Finding Sporadic Rules Using Apriori-Inverse.
* PAKDD 2005: 97-106 <br/><br/>
*
* and the original Apriori article:<br/><br/>
*
* Agrawal R., Srikant R.: "Fast Algorithms for Mining Association Rules", VLDB,
* Sept. 12-15 1994, Chile, pp. 487-499.<br/><br/>
*
* The AprioriInverse algorithm finds all perfectly rare itemsets. A perfectly
* rare itemset is an itemset such that all its subsets are rare, that is,
* their support does not exceed the maximum support threshold "maxsup". The
* algorithm is very similar to the original Apriori algorithm. The main
* difference is that it uses the additional threshold "maxsup", and that
* items of size 1 are kept as candidates only if their support is not
* higher than maxsup.<br/><br/>
*
* This is an optimized version that saves the result to a file,
* or keeps it in memory if no output path is provided
* by the user to the runAlgorithm() method.
*
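* A minimal usage sketch (the input file name below is only an illustrative
* assumption):
* <pre>{@code
* AlgoAprioriInverse apriori = new AlgoAprioriInverse();
* // minsup = 1%, maxsup = 60%; a null output path keeps the result in memory
* Itemsets rareItemsets = apriori.runAlgorithm(0.01, 0.60, "input.txt", null);
* apriori.printStats();
* }</pre>
*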
* @see Itemset
* @see Itemsets
* @author Philippe Fournier-Viger
*/
public class AlgoAprioriInverse {
// the current level k in the breadth-first search
protected int k;
// variables for statistics
protected int totalCandidateCount = 0; // number of candidates generated during the last execution
protected long startTimestamp; // start time of last execution
protected long endTimestamp; // end time of last execution
private int itemsetCount; // number of itemsets found during the last execution
private int databaseSize;
// the relative minimum support threshold (converted from the percentage given by the user)
private int minsupRelative;
// the relative maximum support threshold (converted from the percentage given by the user)
private int maxsupRelative;
// A memory representation of the database.
// Each position in the list represents a transaction
private List<int[]> database = null;
// The patterns that are found
// (if the user wants to keep them in memory)
protected Itemsets patterns = null;
// object to write the output file (if the user wants to write to a file)
BufferedWriter writer = null;
/**
* Default constructor
*/
public AlgoAprioriInverse() {
}
/**
* Method to run the algorithm
* @param minsup a minimum support threshold as a percentage (e.g. 0.4 means 40%)
* @param maxsup a maximum support threshold as a percentage (e.g. 0.4 means 40%)
* @param input the path of an input file
* @param output the path of the output file if the result should be saved to a file. If null,
* the result will be kept in memory and this
* method will return the result.
* @throws IOException exception if error while writing or reading the input/output file
*/
public Itemsets runAlgorithm(double minsup, double maxsup, String input, String output) throws IOException {
// if the user wants to keep the result in memory
if(output == null){
writer = null;
patterns = new Itemsets("SPORADIC ITEMSETS");
}else{ // if the user wants to save the result to a file
patterns = null;
writer = new BufferedWriter(new FileWriter(output));
}
// record the start time
startTimestamp = System.currentTimeMillis();
// set the number of itemset found to zero
itemsetCount = 0;
// set the number of candidate found to zero
totalCandidateCount = 0;
// reset the utility for checking the memory usage
MemoryLogger.getInstance().reset();
// READ THE INPUT FILE
// variable to count the number of transactions
databaseSize = 0;
// Map to count the support of each item
// Key: item Value : support
Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>(); // to count the support of each item
database = new ArrayList<int[]>(); // the database in memory (initially empty)
// scan the database to load it into memory and count the support of each single item at the same time
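// Each line of the input file is a transaction given as space-separated item
// identifiers, with items expected to be listed in increasing order
// (for example, the line "1 3 5" is a transaction containing items 1, 3 and 5);
// the support-counting scan further below relies on this ordering.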
BufferedReader reader = new BufferedReader(new FileReader(input));
String line;
// for each line (transaction) until the end of the file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true ||
line.charAt(0) == '#' || line.charAt(0) == '%'
|| line.charAt(0) == '@') {
continue;
}
// split the line according to spaces
String[] lineSplited = line.split(" ");
// create an array of int to store the items in this transaction
int transaction[] = new int[lineSplited.length];
// for each item in this line (transaction)
for (int i=0; i< lineSplited.length; i++) {
// transform this item from a string to an integer
Integer item = Integer.parseInt(lineSplited[i]);
// store the item in the memory representation of the database
transaction[i] = item;
// increase the support count
Integer count = mapItemCount.get(item);
if (count == null) {
mapItemCount.put(item, 1);
} else {
mapItemCount.put(item, ++count);
}
}
// add the transaction to the database
database.add(transaction);
// increase the number of transactions
databaseSize++;
}
// close the input file
reader.close();
// convert the minimum and maximum support thresholds from percentages
// to relative supports expressed as transaction counts
this.minsupRelative = (int) Math.ceil(minsup * databaseSize);
this.maxsupRelative = (int) Math.ceil(maxsup * databaseSize);
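// For example, if minsup = 0.4 and the database contains 5 transactions,
// then minsupRelative = ceil(0.4 * 5) = 2 transactions.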
// we start looking for itemset of size 1
k = 1;
// We keep all items whose support is within [minsup, maxsup] as the frequent items of size 1
List<Integer> frequent1 = new ArrayList<Integer>();
for(Entry<Integer, Integer> entry : mapItemCount.entrySet()){
if(entry.getValue() >= minsupRelative &&
entry.getValue() <= maxsupRelative){ // THE ONLY BIG DIFFERENCE WITH APRIORI IS HERE... WE CHECK MAXSUP FOR ITEMSETS OF SIZE 1
frequent1.add(entry.getKey());
saveItemsetToFile(entry.getKey(), entry.getValue());
}
}
mapItemCount = null;
// We sort the list of candidates by lexical order
// (Apriori needs to use a total order, otherwise it does not work)
Collections.sort(frequent1, new Comparator<Integer>() {
public int compare(Integer o1, Integer o2) {
return o1 - o2;
}
});
// If there is no frequent item, the algorithm stops!
if(frequent1.size() == 0){
// close the output file if we used it
if(writer != null){
writer.close();
}
return patterns;
}
// add the number of frequent items of size 1 to the total number of candidates
totalCandidateCount += frequent1.size();
// Now we will perform a loop to find all frequent itemsets of size > 1
// starting from size k = 2.
// The loop will stop when no candidates can be generated.
List<Itemset> level = null;
k = 2;
do{
// we check the memory usage
MemoryLogger.getInstance().checkMemory();
// Generate candidates of size K
List<Itemset> candidatesK;
// if we are at level k=2, we use an optimization to generate candidates
if(k ==2){
candidatesK = generateCandidate2(frequent1);
}else{
// otherwise we use the regular way to generate candidates
candidatesK = generateCandidateSizeK(level);
}
// we add the number of candidates generated to the total
totalCandidateCount += candidatesK.size();
// We scan the database once to calculate the support
// of each candidate and keep those with enough support.
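// For instance, the candidate {2, 5} has its support increased by one for a
// transaction such as {1, 2, 4, 5}, since both of its items are found while
// scanning the transaction in increasing order.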
// For each transaction:
for(int[] transaction: database){
// for each candidate:
loopCand: for(Itemset candidate : candidatesK){
// a variable that will be used to check if
// all items of the candidate are in this transaction
int pos = 0;
// for each item in this transaction
for(int item: transaction){
// if the item corresponds to the current item of the candidate
if(item == candidate.itemset[pos]){
// we will look for the next item of the candidate
pos++;
// if we found all the items of the candidate in this transaction
if(pos == candidate.itemset.length){
// we increase the support of this candidate
candidate.support++;
continue loopCand;
}
// Because of the lexical order, we don't need to
// continue scanning the transaction if the current item
// is larger than the one we are looking for in the candidate.
}else if(item > candidate.itemset[pos]){
continue loopCand;
}
}
}
}
// We build level k with all the candidates that have
// a support at least equal to the minsup threshold.
level = new ArrayList<Itemset>();
for (Itemset candidate : candidatesK) {
// if the support is >= minsup
if (candidate.getAbsoluteSupport() >= minsupRelative) {
// add the candidate
level.add(candidate);
// the itemset is frequent so save it into results
saveItemset(candidate);
}
}
// we will generate larger itemsets next.
k++;
}while(level.isEmpty() == false);
// record end time
endTimestamp = System.currentTimeMillis();
// check the memory usage
MemoryLogger.getInstance().checkMemory();
// close the output file if the result was saved to a file.
if(writer != null){
writer.close();
}
return patterns;
}
/**
* Return the number of transactions in the last database read by the algorithm.
* @return the number of transactions
*/
public int getDatabaseSize() {
return databaseSize;
}
/**
* This method generates candidate itemsets of size 2 based on
* the frequent items (itemsets of size 1).
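* For example, the frequent items 2 and 5 are combined into the candidate
* itemset {2, 5}.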
* @param frequent1 the list of frequent items (itemsets of size 1).
* @return a List of Itemset that are the candidates of size 2.
*/
private List<Itemset> generateCandidate2(List<Integer> frequent1) {
List<Itemset> candidates = new ArrayList<Itemset>();
// For each pair of frequent items item1 and item2
for (int i = 0; i < frequent1.size(); i++) {
Integer item1 = frequent1.get(i);
for (int j = i + 1; j < frequent1.size(); j++) {
Integer item2 = frequent1.get(j);
// Create a new candidate by combining item1 and item2
candidates.add(new Itemset(new int []{item1, item2}));
}
}
return candidates;
}
/**
* Method to generate candidate itemsets of size k from frequent itemsets of size k-1.
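* For example, the frequent itemsets {1, 2, 5} and {1, 2, 7}, which share all but
* their last item, are combined into the candidate {1, 2, 5, 7}.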
* @param levelK_1 frequent itemsets of size k-1
* @return the candidate itemsets of size k
*/
protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1) {
// create a variable to store candidates
List<Itemset> candidates = new ArrayList<Itemset>();
// For each pair of itemsets itemset1 and itemset2 of level k-1
loop1: for (int i = 0; i < levelK_1.size(); i++) {
int[] itemset1 = levelK_1.get(i).itemset;
loop2: for (int j = i + 1; j < levelK_1.size(); j++) {
int[] itemset2 = levelK_1.get(j).itemset;
// we compare the items of itemset1 and itemset2.
// If they share the same first k-2 items and the last item of
// itemset1 is smaller than the last item of itemset2,
// we combine them to generate a candidate
for (int k = 0; k < itemset1.length; k++) {
// if they are the last items
if (k == itemset1.length - 1) {
// the one from itemset1 should be smaller (lexical
// order)
// and different from the one of itemset2
if (itemset1[k] >= itemset2[k]) {
continue loop1;
}
}
// if they are not the last items
else if (itemset1[k] < itemset2[k]) {
continue loop2; // we continue searching
} else if (itemset1[k] > itemset2[k]) {
continue loop1; // we stop searching: because of lexical
// order
}
}
// Create a new candidate by combining itemset1 and itemset2
int newItemset[] = new int[itemset1.length+1];
System.arraycopy(itemset1, 0, newItemset, 0, itemset1.length);
newItemset[itemset1.length] = itemset2[itemset2.length -1];
// The candidate is kept only if all its subsets of size k-1
// are included in level k-1 (i.e., they are all frequent).
if (allSubsetsOfSizeK_1AreFrequent(newItemset, levelK_1)) {
candidates.add(new Itemset(newItemset));
}
}
}
return candidates; // return the set of candidates
}
/**
* Method to check if all the subsets of size k-1 of a candidate of size k are frequent
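* For example, for the candidate {1, 2, 5}, the subsets {2, 5}, {1, 5} and {1, 2}
* must all appear among the frequent itemsets of size k-1.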
* @param candidate a candidate itemset of size k
* @param levelK_1 the frequent itemsets of size k-1
* @return true if all the subsets are frequent
*/
protected boolean allSubsetsOfSizeK_1AreFrequent(int[] candidate, List<Itemset> levelK_1) {
// generate all subsets of size k-1 by removing each item from the candidate, one by one
for(int posRemoved=0; posRemoved< candidate.length; posRemoved++){
// perform a binary search to check if the subset appears in level k-1.
int first = 0;
int last = levelK_1.size() - 1;
// variable to remember if we found the subset
boolean found = false;
// the binary search
while( first <= last )
{
int middle = ( first + last ) >>> 1; // divide by 2
int comparison = ArraysAlgos.sameAs(levelK_1.get(middle).getItems(), candidate, posRemoved);
if(comparison < 0 ){
first = middle + 1; // the itemset at 'middle' is smaller than the subset according to the lexical order, so search the upper half
}
else if(comparison > 0 ){
last = middle - 1; // the itemset at 'middle' is larger than the subset according to the lexical order, so search the lower half
}
else{
found = true; // we have found it so we stop
break;
}
}
if(found == false){ // if we did not find it, that means that the candidate is not a frequent itemset because
// at least one of its subsets does not appear in level k-1.
return false;
}
}
return true;
}
/**
* Save an itemset to the file or to memory if no output file path was provided
* @param itemset the itemset to be saved.
* @throws IOException exception if error while writing the file
*/
void saveItemset(Itemset itemset) throws IOException {
// increase frequent itemset count
itemsetCount++;
// if the result should be saved to a file
if(writer != null){
writer.write(itemset.toString() + " #SUP: "
+ itemset.getAbsoluteSupport());
writer.newLine();
}// otherwise the result is kept into memory
else{
patterns.addItemset(itemset, itemset.size());
}
}
/**
* Save an itemset of size 1 to the file or to memory if no output file path was provided
* @param item a single item (an itemset of size 1)
* @param support the support of the item
* @throws IOException exception if error while writing the file
*/
void saveItemsetToFile(Integer item, Integer support) throws IOException {
itemsetCount++; // increase frequent itemset count
// if the result should be saved to a file
if(writer != null){
writer.write(item + " #SUP: " + support);
writer.newLine();
}// otherwise the result is kept into memory
else{
Itemset itemset = new Itemset(item);
itemset.setAbsoluteSupport(support);
patterns.addItemset(itemset, 1);
}
}
/**
* Print statistics about the algorithm execution to System.out.
*/
public void printStats() {
System.out.println("============= APRIORI INVERSE - STATS =============");
System.out.println(" Candidates count : " + totalCandidateCount);
System.out.println(" The algorithm stopped at size " + (k - 1)
+ ", because there is no candidate");
System.out.println(" Sporadic itemsets count : " + itemsetCount);
System.out.println(" Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb");
System.out.println(" Total time ~ " + (endTimestamp - startTimestamp) + " ms");
System.out.println("===================================================");
}
}