package ca.pfv.spmf.algorithms.frequentpatterns.vme;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset;
/**
* This is an implementation of the VME algorithm (Deng and Xu, 2011) for
* erasable itemset mining.<br/><br/>
*
* The VME algorithm finds all the erasable itemsets from a product database.<br/><br/>
*
* Actually, this algorithm is only a slight modification of the AprioriTID algorithm.<br/><br/>
*
* I have implemented it mostly as described in the paper, with some modifications to make
* it more efficient.<br/>
* First, the authors suggested generating all candidates of a level before
* removing the unerasable ones. This is inefficient. Instead, in my implementation,
* I check the "gain" (loss of profit) directly after generating a candidate so that I can
* eliminate it right away. <br/>
* Second, it is unnecessary to check the subsets as the authors
* suggest, because a vertical representation is used. <br/>
* Third, the authors suggest storing the profit of transactions
* in the PID lists. This is not memory efficient. In an implementation it is better to
* store the profit of each transaction only once in a hashtable.
*
* @see Itemset
* @author Philippe Fournier-Viger
*/
public class AlgoVME {

    // Vertical representation of the database.
    // key: item    value: tidset of the item (ids of the transactions containing it)
    Map<Integer, Set<Integer>> mapItemTIDs = new HashMap<Integer, Set<Integer>>();

    // Profit of each transaction, stored only once.
    // key: transaction id    value: transaction profit
    Map<Integer, Integer> mapTransactionProfit = new HashMap<Integer, Integer>();

    // for statistics
    long startTimestamp = 0; // start time of latest execution
    long endTimeStamp = 0;   // end time of latest execution

    // the maximum profit loss (overall profit x threshold)
    double maxProfitLoss = 0;

    // the overall profit (sum of the profit of all transactions)
    double overallProfit = 0;

    // the number of erasable itemsets found by the latest execution
    private int erasableItemsetCount = 0;

    // object to write the output file
    BufferedWriter writer = null;

    /**
     * Default constructor
     */
    public AlgoVME() {
    }

    /**
     * Run the VME algorithm.
     * <br/><br/>
     * Each non-skipped line of the input file is split on single spaces: the first
     * token is parsed as the (integer) profit of the transaction and the remaining
     * tokens as the (integer) items of the transaction. Lines that are empty or that
     * start with '#', '%' or '@' are skipped.
     * <br/><br/>
     * All the state of a previous execution is discarded, so the same instance can be
     * used for several executions. The input and output files are closed even if an
     * exception is thrown.
     *
     * @param input path to an input file
     * @param output path to be used for writing the output file
     * @param threshold the threshold chosen by the user. The maximum profit loss
     *        tolerated for an erasable itemset is the overall profit multiplied
     *        by this threshold.
     * @throws NumberFormatException if a profit or an item cannot be parsed as an integer
     * @throws IOException exception if error reading/writing files
     */
    public void runAlgorithm(String input, String output, double threshold) throws NumberFormatException, IOException {
        // record start time
        startTimestamp = System.currentTimeMillis();

        // discard everything that a previous execution may have left behind,
        // otherwise tidsets and profits of an older database would be mixed
        // with the ones of the current database
        mapItemTIDs = new HashMap<Integer, Set<Integer>>();
        mapTransactionProfit = new HashMap<Integer, Integer>();
        erasableItemsetCount = 0;
        overallProfit = 0;

        // create writer
        writer = new BufferedWriter(new FileWriter(output));
        try {
            // Scan the database to get the overall profit, the profit of each
            // transaction (product) and the tidset of each item.
            readDatabase(input);

            // Calculate max profit loss
            maxProfitLoss = overallProfit * threshold;

            // Find erasable itemsets of size 1
            List<Itemset> level = findErasableItemsetsOfSize1();

            // Iteratively generate erasable itemsets of size k>1 by using
            // erasable itemsets of size k-1 and stop
            // when no candidates can be generated
            while (!level.isEmpty()) {
                level = generateCandidateSizeK(level);
            }
        } finally {
            // close the output file, whether the mining succeeded or not
            writer.close();
        }

        // record end time
        endTimeStamp = System.currentTimeMillis();
    }

    /**
     * Read the input file once and fill the overall profit, the map of transaction
     * profits and the map of tidsets. Transaction ids are assigned from 0, in the
     * order of the non-skipped lines.
     *
     * @param input path to the input file
     * @throws NumberFormatException if a profit or an item cannot be parsed as an integer
     * @throws IOException exception if error reading the file
     */
    private void readDatabase(String input) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try {
            int tid = 0;
            String line;
            // for each transaction (line) until the end of file
            while ((line = reader.readLine()) != null) {
                // if the line is a comment, is empty or is a kind of metadata
                if (isSkippedLine(line)) {
                    continue;
                }
                // split the line
                String[] lineSplited = line.split(" ");

                // the profit is in first position of the line
                int profit = Integer.parseInt(lineSplited[0]);
                overallProfit += profit;
                mapTransactionProfit.put(tid, profit);

                // the items are in the following positions
                for (int j = 1; j < lineSplited.length; j++) {
                    int item = Integer.parseInt(lineSplited[j]);
                    // get the tidset of that item
                    // and update it with the tid of this transaction
                    Set<Integer> tids = mapItemTIDs.get(item);
                    if (tids == null) {
                        tids = new HashSet<Integer>();
                        mapItemTIDs.put(item, tids);
                    }
                    tids.add(tid);
                }
                tid++; // increase the tid for next transaction
            }
        } finally {
            // close the input file, whether the parsing succeeded or not
            reader.close();
        }
    }

    /**
     * Check if a line of the input file should be skipped.
     *
     * @param line a line of the input file
     * @return true if the line is empty or starts with '#', '%' or '@'
     */
    private static boolean isSkippedLine(String line) {
        if (line.isEmpty()) {
            return true;
        }
        char first = line.charAt(0);
        return first == '#' || first == '%' || first == '@';
    }

    /**
     * Find the erasable itemsets of size 1, save them to the output file and
     * remove from memory the items that are not erasable.
     *
     * @return the erasable itemsets of size 1, sorted by ascending item
     * @throws IOException exception if error while writing to output file
     */
    private List<Itemset> findErasableItemsetsOfSize1() throws IOException {
        List<Itemset> level = new ArrayList<Itemset>();

        // for each item
        Iterator<Entry<Integer, Set<Integer>>> iterator = mapItemTIDs.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<Integer, Set<Integer>> entry = iterator.next();
            // the loss resulting from erasing that item
            long loss = calculateLoss(entry.getValue());

            // if the loss is not more than the max profit loss
            if (loss <= maxProfitLoss) {
                // it is an erasable itemset
                Itemset itemset = new Itemset(entry.getKey());
                itemset.setTIDs(entry.getValue());
                level.add(itemset);
                // save it to the output file
                saveItemsetToFile(itemset, loss);
            } else {
                // otherwise, not erasable so we remove it from memory
                // (through the iterator, because we are iterating over the map)
                iterator.remove();
            }
        }

        // sort items because apriori based algorithms need
        // a total order for candidate generation
        Collections.sort(level, new Comparator<Itemset>() {
            @Override
            public int compare(Itemset o1, Itemset o2) {
                // explicit comparison: a subtraction could overflow
                int first = o1.get(0);
                int second = o2.get(0);
                return first < second ? -1 : (first > second ? 1 : 0);
            }
        });
        return level;
    }

    /**
     * Calculate the loss of profit for a set of transactions, that is the sum of
     * the profit of these transactions.
     *
     * @param tids the ids of the transactions
     * @return the sum of the profits (as a long, so that the sum of many int
     *         profits cannot overflow)
     */
    private long calculateLoss(Set<Integer> tids) {
        long loss = 0;
        for (Integer tid : tids) {
            loss += mapTransactionProfit.get(tid);
        }
        return loss;
    }

    /**
     * Generate the erasable itemsets of size k by using the erasable itemsets of
     * size k-1. Each itemset that is generated and erasable is directly saved to
     * the output file.
     * <br/><br/>
     * Two itemsets of size k-1 are combined if they share their k-2 first items.
     * The early exits of the loops assume that the itemsets of the list are in
     * lexical order, which is the case for the lists built by this class.
     *
     * @param levelK_1 erasable itemsets of size k-1
     * @return erasable itemsets of size k
     * @throws IOException exception if error while writing to output file
     */
    protected List<Itemset> generateCandidateSizeK(List<Itemset> levelK_1) throws IOException {
        // create list to store the itemsets of size k
        List<Itemset> candidates = new ArrayList<Itemset>();

        // For each itemset I1 and I2 of level k-1
        loop1: for (int i = 0; i < levelK_1.size(); i++) {
            Itemset itemset1 = levelK_1.get(i);
            loop2: for (int j = i + 1; j < levelK_1.size(); j++) {
                Itemset itemset2 = levelK_1.get(j);

                // we compare items of itemset1 and itemset2.
                // If they have all the same k-2 first items and the last item of itemset1
                // is smaller than the last item of itemset2, we will combine them
                for (int k = 0; k < itemset1.size(); k++) {
                    // if they are the last items
                    if (k == itemset1.size() - 1) {
                        // the one from itemset1 should be smaller (lexical order)
                        // and different from the one of itemset2
                        if (itemset1.getItems()[k] >= itemset2.getItems()[k]) {
                            continue loop1;
                        }
                    }
                    // if they are not the last items, and the prefixes differ
                    else if (itemset1.getItems()[k] < itemset2.get(k)) {
                        continue loop2; // we continue searching
                    }
                    else if (itemset1.getItems()[k] > itemset2.get(k)) {
                        continue loop1; // we stop searching: because of lexical order
                    }
                }

                // NOW COMBINE ITEMSET 1 AND ITEMSET 2
                // create the union of tids
                Set<Integer> unionTIDS = new HashSet<Integer>(itemset1.getTransactionsIds());
                unionTIDS.addAll(itemset2.getTransactionsIds());

                // calculate the loss: the profit of all the transactions of the union
                long loss = calculateLoss(unionTIDS);

                // if the loss is not more than the max profit loss
                // that we can tolerate, the combination is erasable
                if (loss <= maxProfitLoss) {
                    // Create the new itemset: itemset1 followed by the last item of itemset2
                    int newItemset[] = new int[itemset1.size() + 1];
                    System.arraycopy(itemset1.itemset, 0, newItemset, 0, itemset1.size());
                    newItemset[itemset1.size()] = itemset2.getItems()[itemset2.size() - 1];
                    Itemset candidate = new Itemset(newItemset);
                    candidate.setTIDs(unionTIDS);
                    // add the itemset to the itemsets of size k
                    candidates.add(candidate);
                    // save the itemset to the output file
                    saveItemsetToFile(candidate, loss);
                }
            }
        }
        return candidates;
    }

    /**
     * Save an itemset to the output file and increase the count of erasable itemsets.
     *
     * @param itemset the itemset
     * @param loss the loss of profit resulting from erasing the itemset
     * @throws IOException exception if error while writing to output file
     */
    private void saveItemsetToFile(Itemset itemset, long loss) throws IOException {
        // write the itemset
        writer.write(itemset.toString() + " #LOSS: " + loss);
        writer.newLine();
        // increase the itemset count
        erasableItemsetCount++;
    }

    /**
     * Print statistics about the latest execution to System.out.
     */
    public void printStats() {
        System.out
                .println("=============  VME - STATS =============");
        long temps = endTimeStamp - startTimestamp;
        System.out.println("Overall profit: " + overallProfit);
        System.out.println("Maximum profit loss (over. profit x treshold): " + maxProfitLoss);
        System.out.println(" Erasable itemset count : " + erasableItemsetCount);
        System.out.println(" Total time ~ " + temps + " ms");
        System.out
                .println("===================================================");
    }
}