package ca.pfv.spmf.algorithms.frequentpatterns.itemsettree;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
/**
* This class contains methods that are shared by the Itemset-Tree
* and Memory-Efficient Itemset Tree Implementations.
*
* Copyright (c) 2008-2012 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
abstract class AbstractItemsetTree {

    /** Root of the itemset tree. */
    ItemsetTreeNode root = null;

    // ---- statistics about tree construction ----

    /** Number of nodes in the tree (recalculated by printStatistics()). */
    int nodeCount;

    /** Total number of items stored in nodes (recalculated by printStatistics()). */
    long totalItemCountInNodes;

    /** Start time of tree construction (buildTree()). */
    long startTimestamp;

    /** End time of tree construction (buildTree()). */
    long endTimestamp;

    /**
     * Calculate the largest common ancestor of two given itemsets
     * (as defined in the paper), i.e. their longest common prefix.
     * <p>
     * The prefix is only returned when it is non-empty AND strictly shorter
     * than both itemsets; an ancestor that would be equal to one of the two
     * itemsets is deliberately rejected.
     *
     * @param itemset1 the first itemset (may be null)
     * @param itemset2 the second itemset (may be null)
     * @return a new array holding the common prefix, or null if either
     *         argument is null, if the common prefix is empty, or if the
     *         common prefix is as long as the shorter of the two itemsets
     */
    protected int[] getLargestCommonAncestor(int[] itemset1, int[] itemset2) {
        // a null itemset has no common ancestor with anything
        if (itemset1 == null || itemset2 == null) {
            return null;
        }

        // only positions present in both itemsets can be compared
        int minLength = Math.min(itemset1.length, itemset2.length);

        // count the leading items shared by both itemsets; because of the
        // lexical ordering we stop at the first position that differs
        int count = 0;
        while (count < minLength && itemset1[count] == itemset2[count]) {
            count++;
        }

        // count == 0         : the common ancestor is the empty set
        // count == minLength : the prefix equals the shorter itemset, which
        //                      we do not want to return as an ancestor
        if (count == 0 || count == minLength) {
            return null;
        }

        // copy the first "count" items of itemset1 into a fresh array
        int[] common = new int[count];
        System.arraycopy(itemset1, 0, common, 0, count);
        return common;
    }

    /**
     * Check if a first itemset is a (proper) ancestor of a second itemset,
     * i.e. if the first itemset is a strict prefix of the second one.
     * A null itemset is treated as the empty set.
     *
     * @param itemset1 the candidate ancestor (null means the empty set)
     * @param itemset2 the candidate descendant (null means the empty set)
     * @return true if itemset1 is an ancestor of itemset2, otherwise false
     */
    protected boolean ancestorOf(int[] itemset1, int[] itemset2) {
        // the empty set (null) has no ancestor
        if (itemset2 == null) {
            return false;
        }
        // the empty set (null) is the ancestor of every non-null itemset
        if (itemset1 == null) {
            return true;
        }
        // an ancestor must be strictly shorter than its descendant, so an
        // itemset of equal or greater length cannot be an ancestor
        if (itemset1.length >= itemset2.length) {
            return false;
        }
        // every item of itemset1 must match the item at the same
        // position in itemset2
        for (int i = 0; i < itemset1.length; i++) {
            if (itemset1[i] != itemset2[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if two itemsets are equal (same length and same items at the
     * same positions).
     *
     * @param itemset1 the first itemset
     * @param itemset2 the second itemset
     * @return true if both are non-null and identical item by item;
     *         false otherwise (in particular when either one is null,
     *         even if both are null)
     */
    protected boolean same(int[] itemset1, int[] itemset2) {
        // a null itemset is never considered the same as anything
        if (itemset1 == null || itemset2 == null) {
            return false;
        }
        // itemsets of different sizes cannot be equal
        if (itemset1.length != itemset2.length) {
            return false;
        }
        // compare position by position
        for (int i = 0; i < itemset1.length; i++) {
            if (itemset1[i] != itemset2[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Get the frequent itemsets subsuming a given itemset for a given
     * minimum support value.
     *
     * @param is the itemset
     * @param minsup the minimum support threshold (integer); itemsets whose
     *        support is strictly lower are removed from the result
     * @return a hash table containing the frequent itemsets
     */
    public HashTableIT getFrequentItemsetSubsuming(int[] is, int minsup) {
        // collect all subsuming itemsets (frequent or not)
        HashTableIT hashTable = getFrequentItemsetSubsuming(is);

        // Then remove those with a support lower than minsup.
        // This does not seem efficient but that is how the authors of the
        // paper do it.
        for (List<Itemset> list : hashTable.table) {
            // skip empty positions of the internal array
            if (list == null) {
                continue;
            }
            // use the iterator so that removal during traversal is safe
            Iterator<Itemset> it = list.iterator();
            while (it.hasNext()) {
                Itemset itemset = it.next();
                if (itemset.support < minsup) {
                    it.remove();
                }
            }
        }
        return hashTable;
    }

    /**
     * This method passes through the itemset tree to get all itemsets
     * that are subsuming a given itemset "s" and their support. Note that
     * this method may also return infrequent itemsets that can be filtered by
     * additional processing after.
     *
     * @param s the itemset
     * @return a hash table containing itemsets and their support.
     */
    abstract protected HashTableIT getFrequentItemsetSubsuming(int[] s);

    /**
     * Generate all association rules with a given itemset as antecedent.
     * For every frequent itemset C subsuming S that is larger than S, the
     * rule S --> (C - S) is produced when its confidence, computed as
     * support(C) / support(S), is no less than minconf.
     *
     * @param s the itemset to be used as antecedent (must not be null)
     * @param minsup the minsup threshold to be used
     * @param minconf the minconf threshold to be used
     * @return a list of association rules
     */
    public List<AssociationRuleIT> generateRules(int[] s, int minsup, double minconf) {
        List<AssociationRuleIT> rules = new ArrayList<AssociationRuleIT>();

        // put the items of s in a hash set for quick inclusion checking
        HashSet<Integer> seti = new HashSet<Integer>();
        for (int i = 0; i < s.length; i++) {
            seti.add(s[i]);
        }

        // support of the antecedent, used for calculating the confidence
        int suppS = getSupportOfItemset(s);

        // all frequent itemsets subsuming s
        HashTableIT frequentItemsets = getFrequentItemsetSubsuming(s, minsup);

        for (List<Itemset> list : frequentItemsets.table) {
            // skip empty positions of the hash table
            if (list == null) {
                continue;
            }
            // iterate over all itemsets in the same bucket
            for (Itemset c : list) {
                // An itemset of the same size as S would give an empty
                // consequent C - S, so no rule can be generated from it.
                if (c.size() == s.length) {
                    continue;
                }

                // build the consequent l = C - S by copying the items of c
                // that are not in S (primitive loop variable: no boxing
                // except for the set lookup itself)
                int[] l = new int[c.itemset.length - s.length];
                int pos = 0;
                for (int item : c.itemset) {
                    if (!seti.contains(item)) {
                        l[pos++] = item;
                    }
                }

                // calculate the confidence of S --> C - S
                int suppC = getSupportOfItemset(c.itemset);
                // Note: the formula for calculating the confidence is wrong
                // in the paper. It is not g(l) / g(c) but it should be
                // g(c) / g(s).
                double conf = (double) suppC / suppS;

                // keep the rule if the confidence is no less than minconf
                if (conf >= minconf) {
                    AssociationRuleIT rule = new AssociationRuleIT();
                    rule.itemset1 = s;
                    rule.itemset2 = l;
                    rule.support = suppC;
                    rule.confidence = conf;
                    rules.add(rule);
                }
            }
        }
        return rules;
    }

    /**
     * Get the support of a given itemset s.
     *
     * @param s the itemset
     * @return the support as an integer.
     */
    public abstract int getSupportOfItemset(int[] s);
}