package ca.pfv.spmf.algorithms.frequentpatterns.two_phase;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the "HUINIV-Mine" algorithm for high-utility
* itemset mining in databases where items may have negative profit values.
* HUINIV-Mine is described in this paper: <br/><br/>
*
* Chu, Chun-Jung, Vincent S. Tseng, and Tyne Liang.
* "An efficient algorithm for mining high utility itemsets with negative item
* values in large databases." Applied Mathematics and Computation 215.2 (2009):
* 767-778.<br/><br/>
*
* @see ItemsetsTP
* @see ItemsetTP
* @see TransactionTP
* @see UtilityTransactionDatabaseTP
* @author Philippe Fournier-Viger 2014
*/
public class AlgoHUINIVMine {

	/** the set of high utility itemsets found by the algorithm */
	private ItemsetsTP highUtilityItemsets = null;

	/** the database given to the last call of runAlgorithm() */
	protected UtilityTransactionDatabaseTP database;

	/** the min utility threshold */
	int minUtility;

	// for statistics
	/** start time of the last execution (ms) */
	long startTimestamp = 0;
	/** end time of the last execution (ms) */
	long endTimestamp = 0;
	/** the number of candidates generated during phase 1 */
	private int candidatesCount;

	//========= DIFFERENCE FROM TWO-PHASE ======/
	/** the set of negative items (items seen with a utility < 0 in the database) **/
	Set<Integer> negativeItems = new HashSet<Integer>();
	//===========================================/

	/**
	 * Default constructor
	 */
	public AlgoHUINIVMine() {
	}

	/**
	 * Run the HUINIV-Mine algorithm.
	 * <br/><br/>
	 * Phase 1 generates candidate itemsets whose transaction-weighted utility (TWU)
	 * is no less than the threshold. Phase 2 discards candidates that contain only
	 * negative items, computes the exact utility of the remaining ones and keeps
	 * those whose utility is no less than the threshold.
	 * <br/><br/>
	 * Side effects: the items of every transaction of the database are sorted by
	 * ascending item id, and the MemoryLogger is reset.
	 *
	 * @param database a transaction database containing utility information.
	 * @param minUtility the min utility threshold
	 * @return the set of high utility itemsets
	 */
	public ItemsetsTP runAlgorithm(UtilityTransactionDatabaseTP database, int minUtility) {
		// save the parameters
		this.database = database;
		this.minUtility = minUtility;

		// forget the negative items of a previous execution, otherwise itemsets
		// of this database could be wrongly discarded in phase 2
		negativeItems.clear();

		// reset the utility to check the memory usage
		MemoryLogger.getInstance().reset();
		// record start time
		startTimestamp = System.currentTimeMillis();

		// initialize the set of HUIs (high utility itemsets)
		highUtilityItemsets = new ItemsetsTP("HIGH UTILITY ITEMSETS");
		// reset candidate count
		candidatesCount = 0;

		// =================== PHASE 1: GENERATE CANDIDATES ===================

		// Map to store the tidset of each item
		// key: item   value: tidset as a set of integers
		Map<Integer, Set<Integer>> mapItemTidsets = new HashMap<Integer, Set<Integer>>();
		// Map to store the TWU of each item (key: item , value: TWU).
		// A long is used so that summing many transaction utilities cannot overflow.
		Map<Integer, Long> mapItemTWU = new HashMap<Integer, Long>();

		// Scan the database one time to get the tidset and the TWU of each item
		for (int tid = 0; tid < database.size(); tid++) {
			TransactionTP transaction = database.getTransactions().get(tid);

			// for each item in the current transaction
			for (int j = 0; j < transaction.getItems().size(); j++) {
				ItemUtility itemUtilityObj = transaction.getItems().get(j);
				int item = itemUtilityObj.item;

				//========= DIFFERENCE FROM TWO-PHASE ======/
				// remember the items appearing with a negative utility
				if (itemUtilityObj.utility < 0) {
					negativeItems.add(item);
				}
				//===========================================

				// Add the tid of this transaction to the tidset of the item
				Set<Integer> tidset = mapItemTidsets.get(item);
				if (tidset == null) {
					tidset = new HashSet<Integer>();
					mapItemTidsets.put(item, tidset);
				}
				tidset.add(tid);

				// Add the transaction utility to the TWU of the item
				Long previousTwu = mapItemTWU.get(item);
				long newTwu = (previousTwu == null) ? 0L : previousTwu.longValue();
				newTwu += transaction.getTransactionUtility();
				mapItemTWU.put(item, newTwu);
			}
		}

		// Sort the items of each transaction by ascending item id
		for (TransactionTP transaction : database.getTransactions()) {
			Collections.sort(transaction.getItems(), new Comparator<ItemUtility>() {
				public int compare(ItemUtility o1, ItemUtility o2) {
					// explicit comparison instead of a subtraction, which could overflow
					if (o1.item < o2.item) {
						return -1;
					}
					return (o1.item == o2.item) ? 0 : 1;
				}
			});
		}

		// Create a candidate itemset for each item having a TWU >= minUtility.
		// Items are visited by ascending id because the candidate generation
		// relies on each level being in lexical order.
		List<ItemsetTP> candidatesSize1 = new ArrayList<ItemsetTP>();
		List<Integer> sortedItems = new ArrayList<Integer>(mapItemTWU.keySet());
		Collections.sort(sortedItems);
		for (Integer item : sortedItems) {
			// items with a negative id have never been considered as candidates
			if (item < 0) {
				continue;
			}
			// if it is a HTWUI (see formal definition in paper)
			if (mapItemTWU.get(item) >= minUtility) {
				// Create the itemset with this item and set its tidset
				ItemsetTP itemset = new ItemsetTP();
				itemset.addItem(item);
				itemset.setTIDset(mapItemTidsets.get(item));
				// add it to candidates
				candidatesSize1.add(itemset);
				// add it to the structure storing the candidates
				highUtilityItemsets.addItemset(itemset, itemset.size());
			}
		}

		// From candidates of size 1, we repeatedly create candidates of greater size
		// until no candidates can be generated
		List<ItemsetTP> currentLevel = candidatesSize1;
		while (true) {
			// Generate candidates of size K+1
			int countBefore = highUtilityItemsets.getItemsetsCount();
			currentLevel = generateCandidateSizeK(currentLevel, highUtilityItemsets);
			// if no new candidates are found, then we stop because no more candidates will be found.
			if (countBefore == highUtilityItemsets.getItemsetsCount()) {
				break;
			}
		}
		// the Phase 1 of the algorithm is now completed!

		// check memory usage
		MemoryLogger.getInstance().checkMemory();
		// update the number of candidates generated until now
		candidatesCount = highUtilityItemsets.getItemsetsCount();

		// ============ PHASE 2: Calculate exact utility of each candidate ============
		// for each level of HTWUIs found in phase 1
		for (List<ItemsetTP> level : highUtilityItemsets.getLevels()) {
			// for each HTWUI in that level
			Iterator<ItemsetTP> iterItemset = level.iterator();
			while (iterItemset.hasNext()) {
				ItemsetTP candidate = iterItemset.next();

				//========= DIFFERENCE FROM TWO-PHASE ======/
				// an itemset made only of negative items is discarded
				// (this is applied to itemsets of every length)
				boolean discard = onlyContainsNegativeItems(candidate.getItems());
				// ==================================================

				if (!discard) {
					// compute the real utility of the candidate;
					// if lower than min-utility it is not a HUI
					accumulateExactUtility(candidate);
					discard = candidate.getUtility() < minUtility;
				}

				if (discard) {
					iterItemset.remove(); // delete it
					highUtilityItemsets.decreaseCount(); // decrease number of itemsets found
				}
			}
		}

		// check memory usage
		MemoryLogger.getInstance().checkMemory();
		// record end time
		endTimestamp = System.currentTimeMillis();

		// Return all high utility itemsets found!
		return highUtilityItemsets;
	}

	/**
	 * Add to the utility of a candidate its utility in each transaction
	 * where it appears. Only the transactions of the candidate's tidset are
	 * scanned; if the candidate has no tidset, the whole database is scanned.
	 * @param candidate the candidate itemset
	 */
	private void accumulateExactUtility(ItemsetTP candidate) {
		List<TransactionTP> transactions = database.getTransactions();
		Set<Integer> tidset = candidate.getTIDset();
		if (tidset == null) {
			// no tidset available: check every transaction
			for (TransactionTP transaction : transactions) {
				addUtilityInTransaction(candidate, transaction);
			}
		} else {
			for (Integer tid : tidset) {
				addUtilityInTransaction(candidate, transactions.get(tid));
			}
		}
	}

	/**
	 * Add the utility of a candidate in a given transaction to the utility of
	 * the candidate, if the number of matching items in the transaction is
	 * equal to the size of the candidate.
	 * @param candidate the candidate itemset
	 * @param transaction the transaction
	 */
	private void addUtilityInTransaction(ItemsetTP candidate, TransactionTP transaction) {
		// the utility of "candidate" in the current transaction
		int transactionUtility = 0;
		// the number of items from "candidate" appearing in this transaction
		int matchesCount = 0;
		// for each item of the transaction
		for (int i = 0; i < transaction.size(); i++) {
			// if it appears in "candidate"
			if (candidate.getItems().contains(transaction.get(i).item)) {
				transactionUtility += transaction.getItemsUtilities().get(i).utility;
				matchesCount++;
			}
		}
		// if the number of matches is the size of "candidate", it means
		// that it appears completely in the transaction
		if (matchesCount == candidate.size()) {
			candidate.incrementUtility(transactionUtility);
		}
	}

	//========= DIFFERENCE FROM TWO-PHASE ======/
	/**
	 * Checks if all the items of an itemset are negative profit items
	 * @param items the items of the itemset
	 * @return true if all items are negative (also true for an empty list). Otherwise, false.
	 */
	private boolean onlyContainsNegativeItems(List<Integer> items) {
		for (Integer item : items) {
			if (!negativeItems.contains(item)) {
				return false;
			}
		}
		return true;
	}
	//===================================================

	/**
	 * Generate candidate HTWUIs of size K by using HTWUIs of size k-1.
	 * Two itemsets of size k-1 are combined when they share their first k-2 items
	 * and the last item of the first one is smaller than the last item of the second one.
	 * A combination is kept if its TWU is no less than the min utility threshold.
	 * @param levelK_1 HTWUIs of size k-1 (expected to be in lexical order)
	 * @param candidatesHTWUI structure to store the HTWUIs
	 * @return the last level stored in candidatesHTWUI after the generation
	 */
	protected List<ItemsetTP> generateCandidateSizeK(List<ItemsetTP> levelK_1, ItemsetsTP candidatesHTWUI) {
		// For each itemset I1 and I2 of level k-1
		loop1: for (int i = 0; i < levelK_1.size(); i++) {
			ItemsetTP itemset1 = levelK_1.get(i);
			loop2: for (int j = i + 1; j < levelK_1.size(); j++) {
				ItemsetTP itemset2 = levelK_1.get(j);

				// we compare items of itemset1 and itemset2.
				// If they have all the same k-1 items and the last item of itemset1 is smaller than
				// the last item of itemset2, we will combine them to generate a candidate
				for (int k = 0; k < itemset1.size(); k++) {
					// if they are the last items
					if (k == itemset1.size() - 1) {
						// the one from itemset1 should be smaller (lexical order)
						// and different from the one of itemset2
						if (itemset1.getItems().get(k) >= itemset2.get(k)) {
							continue loop1;
						}
					}
					// if they are not the last items, and
					else if (itemset1.getItems().get(k) < itemset2.get(k)) {
						continue loop2; // we continue searching
					}
					else if (itemset1.getItems().get(k) > itemset2.get(k)) {
						continue loop1; // we stop searching: because of lexical order
					}
				}

				// NOW COMBINE ITEMSET 1 AND ITEMSET 2
				Integer missing = itemset2.get(itemset2.size() - 1);

				// create the set of common tids, iterating over the smaller tidset
				Set<Integer> smaller = itemset1.getTIDset();
				Set<Integer> larger = itemset2.getTIDset();
				if (smaller.size() > larger.size()) {
					Set<Integer> swap = smaller;
					smaller = larger;
					larger = swap;
				}
				Set<Integer> tidset = new HashSet<Integer>();
				for (Integer tid : smaller) {
					if (larger.contains(tid)) {
						tidset.add(tid);
					}
				}

				// Calculate TWU of itemset
				// it is defined as the sum of the transaction utility (TU) for the
				// tidset of the itemset (a long is used to avoid an overflow)
				long twu = 0;
				for (Integer tid : tidset) {
					twu += database.getTransactions().get(tid).getTransactionUtility();
				}

				// if the transaction weighted utility (TWU) is high enough
				if (twu >= minUtility) {
					// Create a new candidate by combining itemset1 and itemset2
					ItemsetTP candidate = new ItemsetTP();
					for (int k = 0; k < itemset1.size(); k++) {
						candidate.addItem(itemset1.get(k));
					}
					candidate.addItem(missing);
					// set its tidset
					candidate.setTIDset(tidset);
					// add it to the set of HTWUIs of size K
					candidatesHTWUI.addItemset(candidate, candidate.size());
				}
			}
		}
		// return the last level of HTWUIs
		return candidatesHTWUI.getLevels().get(candidatesHTWUI.getLevels().size() - 1);
	}

	/**
	 * Print statistics about the latest algorithm execution to System out.
	 * This method should be called after runAlgorithm().
	 */
	public void printStats() {
		System.out
				.println("============= HUINIV-MINE ALGORITHM - STATS =============");
		System.out.println(" Transactions count from database : "
				+ database.size());
		System.out.println(" Candidates count : " + candidatesCount);
		System.out.println(" High-utility itemsets count : " + highUtilityItemsets.getItemsetsCount());
		System.out.println(" Total time ~ " + (endTimestamp - startTimestamp) + " ms");
		System.out
				.println("===================================================");
	}
}