package ca.pfv.spmf.algorithms.sequential_rules.trulegrowth;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.algorithms.ArraysAlgos;
import ca.pfv.spmf.input.sequence_database_list_integers.Sequence;
import ca.pfv.spmf.input.sequence_database_list_integers.SequenceDatabase;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is the original implementation of the TRULEGROWTH algorithm for mining sequential rules
* with a window size constraint. The TRuleGrowth algorithm is a variation of the RuleGrowth
* algorithm that adds the window size constraint. It is described in this paper:
* <br/><br/>
*
* Fournier-Viger, P., Wu, C.-W., Tseng, V.S., Nkambou, R. (2012).
* Mining Sequential Rules Common to Several Sequences with the Window Size Constraint.
* Proceedings of the 25th Canadian Conf. on Artificial Intelligence (AI 2012),
* Springer, LNAI 7310, pp.299-304.
* <br/><br/>
*
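* A minimal usage sketch (the file paths and parameter values below are
* placeholders, not values prescribed by the library):
* <pre>{@code
* AlgoTRuleGrowth algo = new AlgoTRuleGrowth();
* // minsup = 50% of the sequences, minconf = 0.75, window size = 2
* algo.runAlgorithm(0.5, 0.75, "input.txt", "output.txt", 2);
* algo.printStats();
* }</pre>
* <br/><br/>
*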
*@see Sequence
*@see SequenceDatabase
*@author Philippe Fournier-Viger
*/
public class AlgoTRuleGrowth {
// *** for statistics ***
long timeStart = 0; // start time of latest execution
long timeEnd = 0; // end time of latest execution
//*** internal variables ***/
// A map to record the occurrences of each item in each sequence
// KEY: an item
// VALUE: a map of key: sequence ID, value: occurrences of the item in that sequence.
// (note: an occurrence is an itemset position)
Map<Integer, Map<Integer, Occurence>> mapItemCount;
// minimum support threshold (an absolute number of sequences)
int minsuppRelative;
// The number of patterns found
int ruleCount;
// object to write the output file
BufferedWriter writer = null;
// *** parameters ***
SequenceDatabase database; // a sequence database
double minconf; // minimum confidence
int windowSize = 0; // window size
// the maximum size of the antecedent of rules (optional)
int maxAntecedentSize = Integer.MAX_VALUE;
// the maximum size of the consequent of rules (optional)
int maxConsequentSize = Integer.MAX_VALUE;
/**
* Default constructor
*/
public AlgoTRuleGrowth() {
}
/**
* Run the algorithm.
* @param minSupport the minsup threshold as a percentage (e.g., 0.05 = 5%)
* @param minConfidence minimum confidence (a value between 0 and 1).
* @param input the input file path
* @param output the output file path
* @param windowSize a window size
* @throws IOException exception if there is an error reading/writing files
*/
public void runAlgorithm(double minSupport, double minConfidence, String input, String output, int windowSize ) throws IOException{
// load the input file into memory
try {
this.database = new SequenceDatabase();
database.loadFile(input);
} catch (Exception e) {
e.printStackTrace();
}
// convert minimum support to an absolute minimum support (integer)
this.minsuppRelative = (int) Math.ceil(minSupport * database.size());
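// (e.g., if minSupport = 0.05 and the database contains 100 sequences,
// then minsuppRelative = ceil(0.05 * 100) = 5 sequences)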
// run the algorithm
runAlgorithm(input, output, minsuppRelative, minConfidence, windowSize);
}
/**
* Run the algorithm.
* @param input the input file path
* @param output the output file path
* @param relativeMinSupport the minsup parameter as a relative value (an integer number of sequences)
* @param minConfidence minimum confidence (a value between 0 and 1).
* @param windowSize a window size
* @throws IOException exception if there is an error reading/writing files
*/
public void runAlgorithm(String input, String output, int relativeMinSupport, double minConfidence, int windowSize) throws IOException{
// save the minconf parameter
this.minconf = minConfidence;
// read the database into memory
if(database == null){
try {
this.database = new SequenceDatabase();
database.loadFile(input);
} catch (Exception e) {
e.printStackTrace();
}
}
// We add 1 to the window size so that it follows
// the same definition as in the published article.
this.windowSize = windowSize + 1;
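// (e.g., a windowSize parameter of 2 becomes 3 internally)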
// if minsup is 0, set it to 1 to avoid generating
// rules that do not appear in the database
this.minsuppRelative = relativeMinSupport;
if(this.minsuppRelative == 0){ // protection
this.minsuppRelative = 1;
}
// reset the stats for memory usage
MemoryLogger.getInstance().reset();
// prepare the object for writing the output file
writer = new BufferedWriter(new FileWriter(output));
// save the start time
timeStart = System.currentTimeMillis(); // for stats
// Scan the database once to count the support of each item
// and record its occurrences, then scan the database again
// to remove the items that are not frequent.
removeItemsThatAreNotFrequent(database);
// Put frequent items in a list.
List<Integer> listFrequents = new ArrayList<Integer>();
// for each item
for(Entry<Integer,Map<Integer, Occurence>> entry : mapItemCount.entrySet()){
// if it is frequent
if(entry.getValue().size() >= minsuppRelative){
// add it to the list
listFrequents.add(entry.getKey());
}
}
// We will now try to generate rules with one item in the
// antecedent and one item in the consequent using
// the frequent items.
// For each pair of frequent items i and j such that i != j
for(int i=0; i< listFrequents.size(); i++){
// get the item i and its map of occurrences
Integer intI = listFrequents.get(i);
Map<Integer,Occurence> occurencesI = mapItemCount.get(intI);
for(int j=i+1; j< listFrequents.size(); j++){
// get the item j and its map of occurrences
Integer intJ = listFrequents.get(j);
Map<Integer,Occurence> occurencesJ = mapItemCount.get(intJ);
// (1) We will now calculate the tidsets
// of itemset I, itemset J, the rule I -->J
// and the rule J-->I
Set<Integer> tidsI = new HashSet<Integer>();
Set<Integer> tidsJ = null;
Set<Integer> tidsIJ = new HashSet<Integer>();
Set<Integer> tidsJI= new HashSet<Integer>();
// for each occurrence of I
looptid: for(Occurence occI : occurencesI.values()){
// add the sequence ID of that occurrence to tidsI
tidsI.add(occI.sequenceID);
// if J does not appear in that sequence continue loop
Occurence occJ = occurencesJ.get(occI.sequenceID);
if(occJ == null){
continue looptid;
}
// make a big loop to compare if I appears before
// J in that sequence and
// if J appears before I
boolean addedIJ= false;
boolean addedJI= false;
// for each occurrence of I in that sequence
loopIJ: for(Short posI : occI.occurences){
// for each occurrence of J in that sequence
for(Short posJ : occJ.occurences){
if(!posI.equals(posJ) && Math.abs(posI - posJ) <= windowSize){
if(posI <= posJ){
// if I is before J
tidsIJ.add(occI.sequenceID);
addedIJ = true;
}else{
// if J is before I
tidsJI.add(occI.sequenceID);
addedJI = true;
}
// if we have found that I is before J and J is before I
// we don't need to continue.
if(addedIJ && addedJI){
break loopIJ;
}
}
}
}
}
// (2) check if I ==> J has enough common tids
// If yes, we create the rule I ==> J
if(tidsIJ.size() >= minsuppRelative){
// calculate the confidence of I ==> J
double confIJ = ((double)tidsIJ.size()) / occurencesI.size();
// create itemset of the rule I ==> J
int[] itemset1 = new int[]{intI};
int[] itemset2 = new int[]{intJ};
// if the confidence is high enough, save the rule
if(confIJ >= minConfidence){
saveRule(tidsIJ, confIJ, itemset1, itemset2);
}
// Calculate tidsJ.
tidsJ = new HashSet<Integer>();
for(Occurence occJ : occurencesJ.values()){
tidsJ.add(occJ.sequenceID);
}
// recursive call to try to expand the rule on the left and
// right sides
if(itemset1.length < maxAntecedentSize) {
expandLeft(itemset1, itemset2, tidsI, tidsIJ);
}
if(itemset2.length < maxConsequentSize) {
expandRight(itemset1, itemset2, tidsI, tidsJ, tidsIJ);
}
}
// check if J ==> I has enough common tids
// If yes, we create the rule J ==> I
if(tidsJI.size() >= minsuppRelative){
double confJI = ((double)tidsJI.size()) / occurencesJ.size();
// create the itemsets of that rule
int[] itemset1 = new int[]{intI};
int[] itemset2 = new int[]{intJ};
// if the rule has enough confidence, save it!
if(confJI >= minConfidence){
saveRule(tidsJI, confJI, itemset2, itemset1);
}
// Calculate tidsJ.
if(tidsJ == null){
tidsJ = new HashSet<Integer>();
for(Occurence occJ : occurencesJ.values()){
tidsJ.add(occJ.sequenceID);
}
}
// recursive call to try to expand the rule
if(itemset1.length < maxConsequentSize) {
expandRight(itemset2, itemset1, tidsJ, tidsI, tidsJI);
}
if(itemset2.length < maxAntecedentSize) {
expandLeft(itemset2, itemset1, tidsJ, tidsJI);
}
}
}
}
// save the end time for the execution of the algorithm
timeEnd = System.currentTimeMillis(); // for stats
// close the file
writer.close();
database = null;
}
/**
* This method searches for items for expanding the left side of a rule I --> J
* with any item c. This results in rules of the form I U {c} --> J. The method makes sure that:
* - c is not already included in I or J
* - c appears at least minsup times in tidsIJ before the last occurrence of J
* - c is lexically bigger than all items in I
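* For instance (illustrative values), expanding the rule {1} --> {4} with c = 2
* would produce the candidate rule {1,2} --> {4}.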
* @param itemsetI the left side of the rule (see paper)
* @param itemsetJ the right side of the rule (see paper)
* @param tidsI the tids set of I
* @param tidsIJ the tids set of the rule I --> J
* @throws IOException exception if error while writing the output file
*/
private void expandLeft(int[] itemsetI, int[] itemsetJ,
Collection<Integer> tidsI,
Collection<Integer> tidsIJ) throws IOException {
// The following map will be used to count the support of each item
// c that could potentially extend the rule.
// The map associates a set of tids (value) to an item (key).
Map<Integer, Set<Integer>> frequentItemsC = new HashMap<Integer, Set<Integer>>();
// We scan the sequences where I-->J appears to search for items c
// that we could add to generate a larger rule IU{c} --> J
// For each tid of sequence containing I-->J
for(Integer tid : tidsIJ){
Sequence sequence = database.getSequences().get(tid);
// these maps are used when scanning the sequence to determine
// what is currently inside the window and what falls outside the window.
// We use a LinkedHashMap so as to keep the order of what is inserted into the maps.
// These maps have key: an item, value: the position of an itemset
LinkedHashMap<Integer, Integer> mapMostLeftFromI = new LinkedHashMap<Integer, Integer>();
LinkedHashMap<Integer, Integer> mapMostLeftFromJ = new LinkedHashMap<Integer, Integer>();
// key: item value: list of positions of itemsets
LinkedHashMap<Integer, LinkedList<Integer>> mapMostRightFromJ = new LinkedHashMap<Integer, LinkedList<Integer>>();
// position of the last itemset that was scanned when searching for items "c"
int lastItemsetScannedForC = Integer.MAX_VALUE;
// For each itemset starting from the last in this sequence
int k= sequence.size()-1;
do{
final int firstElementOfWindow = k;
final int lastElementOfWindow = k + windowSize -1;
// remove items from J that fall outside the time window
int previousJSize = mapMostLeftFromJ.size();
removeElementOutsideWindow(mapMostLeftFromJ, lastElementOfWindow);
// important: if J was all there, but became smaller, we need to clear the
// hashmap for items of I.
int currentJSize = mapMostLeftFromJ.size();
if(previousJSize == itemsetJ.length && previousJSize != currentJSize){
mapMostLeftFromI.clear();
}
// remove items from I that fall outside the time window
removeElementOutsideWindow(mapMostLeftFromI, lastElementOfWindow);
// For each item of the current itemset
for(Integer item : sequence.get(k)){
// record the leftmost position seen so far of each item in I or J
// if we saw J completely already, and the current item is in I
if(mapMostLeftFromJ.size() == itemsetJ.length && ArraysAlgos.contains(itemsetI, item)){
// then we add its position to the map for items from I
addToLinked(mapMostLeftFromI, item, k);
// otherwise, if it is an item from J
}else if(ArraysAlgos.contains(itemsetJ, item)){
// add its position to the map of positions for J
addToLinked(mapMostLeftFromJ, item, k);
LinkedList<Integer> list = mapMostRightFromJ.get(item);
if(list == null){
list = new LinkedList<Integer>();
addToLinked(mapMostRightFromJ, item, list);
}
list.add(k);
}
}
// if all the items of I ==> J are in the current window
if(mapMostLeftFromI.size() == itemsetI.length && mapMostLeftFromJ.size() == itemsetJ.length){
//remove items from mostRight that fall outside the time window.
// at the same time, calculate the minimum index for items of J.
int minimum = Integer.MAX_VALUE;
// for each position in the map of positions of J
for(LinkedList<Integer> list: mapMostRightFromJ.values()){
while(true){
// get the last position
Integer last = list.getLast();
// if it is outside the window, remove it
if(last > lastElementOfWindow){
list.removeLast();
}else{
// otherwise update the minimum and break
if(last < minimum){
minimum = last -1;
}
break;
}
}
}
// we need to scan for items c to extend the rule...
// Such an item c has to appear in the window before the last occurrence of J (before "minimum")
// and if it was scanned before, it should not be scanned again.
int itemsetC = minimum;
if(itemsetC >= lastItemsetScannedForC){
itemsetC = lastItemsetScannedForC -1;
}
// for each itemset after the first element of the window, starting from the
// itemset just before the last occurrence of J
for(; itemsetC >= firstElementOfWindow; itemsetC--){
for(Integer itemC : sequence.get(itemsetC)){
// if lexical order is not respected or c is included in the rule already.
if(ArraysAlgos.containsLEXPlus(itemsetI, itemC)
|| ArraysAlgos.containsLEX(itemsetJ, itemC)){
continue; // skip it
}
// otherwise, get the tidset of "c"
Set<Integer> tidsItemC = frequentItemsC.get(itemC);
// if there is no tidset, create one
if(tidsItemC == null){
tidsItemC = new HashSet<Integer>();
frequentItemsC.put(itemC, tidsItemC);
}
// add the tid to the tidset of c
tidsItemC.add(tid);
}
}
// update the last itemset scanned
lastItemsetScannedForC = firstElementOfWindow;
}
k--; // go to the previous itemset in the sequence (we scan the sequence backward)
}while(k >= 0 && lastItemsetScannedForC >0);
}
// For each item c found, we create a rule IU{c} ==> J
for(Entry<Integer, Set<Integer>> entry : frequentItemsC.entrySet()){
Set<Integer> tidsIC_J = entry.getValue();
// if the support of IU{c} ==> J is enough
if(tidsIC_J.size() >= minsuppRelative){
Integer itemC = entry.getKey();
int [] itemsetIC = new int[itemsetI.length+1];
System.arraycopy(itemsetI, 0, itemsetIC, 0, itemsetI.length);
itemsetIC[itemsetI.length] = itemC;
// Calculate tids containing IU{c} within the time window which is necessary
// to calculate the confidence
Set<Integer> tidsIC = new HashSet<Integer>();
// for each sequence containing I
loop1: for(Integer tid: tidsI){
// get the sequence
Sequence sequence = database.getSequences().get(tid);
// To check if IU{c} is contained in that sequence, we will use a map
// such that key: an item, value: the position of an itemset
LinkedHashMap<Integer, Integer> mapAlreadySeenFromIC = new LinkedHashMap<Integer, Integer>();
// For each itemset
for(int k=0; k< sequence.size(); k++){
// For each item
for(Integer item : sequence.get(k)){
// record the last position of each item in IU{C}
if(ArraysAlgos.contains(itemsetIC, item)){
addToLinked(mapAlreadySeenFromIC, item, k);
}
}
// as we are moving through the sequence,
// remove items that fall outside the time window
Iterator<Entry<Integer, Integer>> iter = mapAlreadySeenFromIC.entrySet().iterator();
while(iter.hasNext()){
// if falling outside of the window
Entry<Integer, Integer> entryMap = iter.next();
if(entryMap.getValue() < k - windowSize +1){
// remove the item
iter.remove();
}else{
// otherwise break
break;
}
}
// if all the items of IU{c} are inside the current window, then record the tid
if(mapAlreadySeenFromIC.keySet().size() == itemsetIC.length){
tidsIC.add(tid);
continue loop1;
}
}
}
// Create rule and calculate its confidence of IU{c} ==> J
// defined as: sup(IU{c} -->J) / sup(IU{c})
double confIC_J = ((double)tidsIC_J.size()) / tidsIC.size();
// if the confidence is high enough, then it is a valid rule
if(confIC_J >= minconf){
// save the rule
saveRule(tidsIC_J, confIC_J, itemsetIC, itemsetJ);
}
// recursive call to expand left side of the rule
if(itemsetIC.length < maxAntecedentSize) {
expandLeft(itemsetIC, itemsetJ, tidsIC, tidsIC_J );
}
}
}
// check the memory usage
MemoryLogger.getInstance().checkMemory();
}
/**
* This method inserts a key and a value in a LinkedHashMap while making sure that
* the key is moved to the end of the iteration order. This is necessary because
* when a key is re-inserted with put(), the iteration order of a LinkedHashMap
* remains that of the first insertion.
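* A minimal sketch of the LinkedHashMap behavior that this method works around
* (assuming a map in insertion order):
* <pre>{@code
* LinkedHashMap<Integer, Integer> map = new LinkedHashMap<Integer, Integer>();
* map.put(1, 0);
* map.put(2, 1);
* map.put(1, 7);   // iteration order is still [1, 2]
* map.remove(1);
* map.put(1, 7);   // iteration order is now [2, 1]
* }</pre>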
* @param map the map
* @param key a key
* @param value a value
*/
private <K, V> void addToLinked(LinkedHashMap<K, V> map,
K key, V value) {
// if the map contains the key already
if(map.containsKey(key)){
// remove it
map.remove(key);
}
// then put it back (the key is now last in the iteration order)
map.put(key, value);
}
/**
* This method removes, from a map containing the leftmost position of each item,
* the elements that fall outside the current window.
* key: an item, value: an itemset position
* @param mapMostLeftFromItemset the map
* @param lastElementOfWindow the last itemset of the window in terms of itemset position
* in the sequence.
*/
private void removeElementOutsideWindow(
LinkedHashMap<Integer, Integer> mapMostLeftFromItemset,
final int lastElementOfWindow) {
// iterate over elements of the map
Iterator<Entry<Integer, Integer>> iter = mapMostLeftFromItemset.entrySet().iterator();
while(iter.hasNext()){
// if the position is outside the window, remove it
if(iter.next().getValue() > lastElementOfWindow){
iter.remove();
}else{
// otherwise, we break
break;
}
}
}
/**
* This method removes, from a map containing the rightmost position of each item,
* the elements that fall before the current window.
* key: an item, value: an itemset position
* @param mapMostRightfromI the map
* @param firstElementOfWindow the first itemset of the window in terms of itemset position
* in the sequence.
*/
private void removeElementOutsideWindowER(
LinkedHashMap<Integer, Integer> mapMostRightfromI,
final int firstElementOfWindow) {
// iterate over elements of the map
Iterator<Entry<Integer, Integer>> iter = mapMostRightfromI.entrySet().iterator();
while(iter.hasNext()){
Entry<Integer, Integer> entry = iter.next();
// if the position is outside the window, remove it
if(entry.getValue() < firstElementOfWindow){
iter.remove();
}else{
// otherwise, we break
break;
}
}
}
/**
* This method searches for items for expanding the right side of a rule I --> J
* with any item c. This results in rules of the form I --> J U {c}. The method makes sure that:
* - c is not already included in I or J
* - c appears at least minsup times in tidsIJ after the first occurrence of I
* - c is lexically bigger than all items in J
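* For instance (illustrative values), expanding the rule {1} --> {4} with c = 5
* would produce the candidate rule {1} --> {4,5}.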
* @param itemsetI the left side of the rule (see paper)
* @param itemsetJ the right side of the rule (see paper)
* @param tidsI the tids set of I
* @param tidsJ the tids set of J
* @param tidsIJ the tids set of the rule I --> J
* @throws IOException exception if error while writing the output file
*/
private void expandRight(int[] itemsetI, int[] itemsetJ,
Set<Integer> tidsI,
Collection<Integer> tidsJ,
Collection<Integer> tidsIJ) throws IOException {
// The following map will be used to count the support of each item
// c that could potentially extend the rule.
// The map associates a set of tids (value) to an item (key).
Map<Integer, Set<Integer>> frequentItemsC = new HashMap<Integer, Set<Integer>>();
// For each tid of sequence containing I-->J
for(Integer tid : tidsIJ){
// get the sequence
Sequence sequence = database.getSequences().get(tid);
// these maps are used when scanning the sequence to determine
// what is currently inside the window and what falls outside the window.
// We use a LinkedHashMap so as to keep the order of what is inserted into the maps.
// These maps have key: an item, value: the position of an itemset
LinkedHashMap<Integer, Integer> mapMostRightFromI = new LinkedHashMap<Integer, Integer>();
LinkedHashMap<Integer, Integer> mapMostRightFromJ = new LinkedHashMap<Integer, Integer>();
// key: item value: list of positions of itemsets
LinkedHashMap<Integer, LinkedList<Integer>> mapMostLeftFromI = new LinkedHashMap<Integer, LinkedList<Integer>>();
int lastItemsetScannedForC = Integer.MIN_VALUE;
// For each itemset starting from the first in this sequence
int k= 0;
do{
final int firstElementOfWindow = k - windowSize +1;
int lastElementOfWindow = k;
// remove items from I that fall outside the time window
int previousISize = mapMostRightFromI.size();
removeElementOutsideWindowER(mapMostRightFromI, firstElementOfWindow);
// important: if I was all there, but became smaller, we need to clear the
// hashmap for items of J.
int currentISize = mapMostRightFromI.size();
if(previousISize == itemsetI.length && previousISize != currentISize){
mapMostRightFromJ.clear();
}
// remove items from J that fall outside the time window
removeElementOutsideWindowER(mapMostRightFromJ, firstElementOfWindow);
// For each item of the current itemset
for(Integer item : sequence.get(k)){
// record the rightmost position seen so far of each item in I or J
// if we saw I completely already, and the current item is in J
if(mapMostRightFromI.size() == itemsetI.length && ArraysAlgos.contains(itemsetJ, item)){
// then we add its position to the map for items most right from J
addToLinked(mapMostRightFromJ, item, k);
// otherwise, if it is an item from I
}else if(ArraysAlgos.contains(itemsetI, item)){
// add its position to the map of rightmost positions for I
addToLinked(mapMostRightFromI, item, k);
LinkedList<Integer> list = mapMostLeftFromI.get(item);
if(list == null){
list = new LinkedList<Integer>();
addToLinked(mapMostLeftFromI, item, list);
}
list.add(k);
}
}
// if all the items of IJ are in the current window
if(mapMostRightFromI.size() == itemsetI.length && mapMostRightFromJ.size() == itemsetJ.length){
//remove items from mostLeft that fall outside the time window.
// at the same time, calculate the minimum index for items of I.
int minimum = 1;
// for each position from I in mostLeft
for(LinkedList<Integer> list: mapMostLeftFromI.values()){
while(true){
// get the last position
Integer last = list.getLast();
// if outside the window
if(last < firstElementOfWindow){
// remove the position
list.removeLast();
}else{
// otherwise update the minimum
if(last > minimum){
minimum = last + 1;
}
// then break
break;
}
}
}
// we need to scan for items c to extend the rule...
// Such an item c has to appear in the window after the first occurrence of I (at or after "minimum")
// and if it was scanned before, it should not be scanned again.
int itemsetC = minimum;
if(itemsetC < lastItemsetScannedForC){
itemsetC = lastItemsetScannedForC +1;
}
// for each itemset up to the last element of the window, starting from the
// itemset just after the first occurrence of I
for(; itemsetC <= lastElementOfWindow; itemsetC++){
for(Integer itemC : sequence.get(itemsetC)){
// We will consider if we could create a rule I --> J U{c}.
// If lexical order is not respected or c is included in the rule already,
// then we skip it.
if(ArraysAlgos.containsLEX(itemsetI, itemC)
|| ArraysAlgos.containsLEXPlus(itemsetJ, itemC)){
continue;
}
// otherwise, get the tidset of "c"
Set<Integer> tidsItemC = frequentItemsC.get(itemC);
// if there is no tidset, create one
if(tidsItemC == null){
//if we did not see "c" yet, create a new tidset for "c"
tidsItemC = new HashSet<Integer>();
frequentItemsC.put(itemC, tidsItemC);
}
// add the current tid to the tidset of "c"
tidsItemC.add(tid);
}
}
// update last itemset scanned
lastItemsetScannedForC = lastElementOfWindow;
}
k++; // go to the next itemset
}while(k < sequence.size() && lastItemsetScannedForC < sequence.size()-1);
}
// For each item c found, we create a rule I ==> JU {c}
for(Entry<Integer, Set<Integer>> entry : frequentItemsC.entrySet()){
// get the tidset of I ==> JU {c}
Set<Integer> tidsI_JC = entry.getValue();
// if the support of I ==> JU{c} is enough
if(tidsI_JC.size() >= minsuppRelative){
Integer itemC = entry.getKey();
// create the itemset JU{c}
int[] itemsetJC = new int[itemsetJ.length+1];
System.arraycopy(itemsetJ, 0, itemsetJC, 0, itemsetJ.length);
itemsetJC[itemsetJ.length]= itemC;
// calculate the tids where JU{c} occurs within the time window
Set<Integer> tidsJC = new HashSet<Integer>();
// for each sequence containing J
loop1: for(Integer tid: tidsJ){
// get the sequence
Sequence sequence = database.getSequences().get(tid);
// To check if JU{c} is contained in that sequence, we will use a map
// such that key: an item, value: the position of an itemset
LinkedHashMap<Integer, Integer> mapAlreadySeenFromJC = new LinkedHashMap<Integer, Integer>();
// For each itemset
for(int k=0; k< sequence.size(); k++){
// For each item
for(Integer item : sequence.get(k)){
// if the item is in JU{C}, then
// record the last position of the item for the map of JU{C}
if(ArraysAlgos.contains(itemsetJC, item)){
addToLinked(mapAlreadySeenFromJC, item, k);
}
}
// remove items that fall outside the time window
// For each position in the map
Iterator<Entry<Integer, Integer>> iter = mapAlreadySeenFromJC.entrySet().iterator();
while(iter.hasNext()){
// if the position is outside the window
Entry<Integer, Integer> entryMap = iter.next();
if(entryMap.getValue() < k - windowSize +1){
// remove the position
iter.remove();
}else{
// otherwise break
break;
}
}
// if all the items of JU{c} are inside the current window, then record the tid
if(mapAlreadySeenFromJC.keySet().size() == itemsetJC.length){
tidsJC.add(tid);
// then continue the loop!
continue loop1;
}
}
}
// Create rule and calculate its confidence of I ==> J U{c}
// defined as: sup(I -->J U{c}) / sup(I)
double confI_JC = ((double)tidsI_JC.size()) / tidsI.size();
// if the confidence is enough
if(confI_JC >= minconf){
// then it is a valid rule so save it
saveRule(tidsI_JC, confI_JC, itemsetI, itemsetJC);
}
// recursively try to expand the left and right side
// of the rule
if(itemsetJC.length < maxConsequentSize) {
expandRight(itemsetI, itemsetJC, tidsI, tidsJC, tidsI_JC);
}
if(itemsetI.length < maxAntecedentSize) {
expandLeft(itemsetI, itemsetJC, tidsI, tidsI_JC);
}
}
}
// check the memory usage
MemoryLogger.getInstance().checkMemory();
}
/**
* This method calculates the frequency of each item in one database pass.
* Then it removes all items that are not frequent in another database pass.
* @param database : a sequence database
* @return A map such that key = item
* value = a map where a key = tid and a value = Occurence
* This map allows knowing the frequency of each item and its occurrences in each sequence.
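* For instance (illustrative values), with minsuppRelative = 2, an item that
* appears in only one sequence is removed from every itemset where it occurs.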
*/
private Map<Integer, Map<Integer, Occurence>> removeItemsThatAreNotFrequent(SequenceDatabase database) {
// (1) Count the support of each item in the database in one database pass
mapItemCount = new HashMap<Integer, Map<Integer, Occurence>>(); // <item, Map<tid, occurence>>
// for each sequence in the database
for(Sequence sequence : database.getSequences()){
// for each itemset in that sequence
for(short j=0; j< sequence.getItemsets().size(); j++){
List<Integer> itemset = sequence.get(j);
// for each item in that sequence
for(int i=0; i< itemset.size(); i++){
Integer itemI = itemset.get(i);
// get the map of occurrences of that item
Map<Integer, Occurence> occurences = mapItemCount.get(itemI);
// if the map of occurrences of that item is null, create a new one
if(occurences == null){
occurences = new HashMap<Integer, Occurence>();
mapItemCount.put(itemI, occurences);
}
// add this sequence id to the occurrences of that item
Occurence occurence = occurences.get(sequence.getId());
if(occurence == null){
occurence = new Occurence(sequence.getId());
occurences.put(sequence.getId(), occurence);
}
// add the current itemset position to the occurrences of that item
occurence.add(j);
}
}
}
// (2) remove all items that are not frequent from the database
// for each sequence
for(Sequence sequence : database.getSequences()){
int i=0;
// for each itemset
while(i < sequence.getItemsets().size()){
List<Integer> itemset = sequence.getItemsets().get(i);
int j=0;
// for each item
while(j < itemset.size()){
int count = mapItemCount.get(itemset.get(j)).size();
// if the item is not frequent remove it
if(count < minsuppRelative){
itemset.remove(j);
}else{
// otherwise go to next item
j++;
}
}
i++; // go to next itemset
}
}
// return the map of occurrences of items
return mapItemCount;
}
/**
* Save a rule I ==> J to the output file
* @param tidsIJ the tids containing the rule
* @param confIJ the confidence
* @param itemsetI the left part of the rule
* @param itemsetJ the right part of the rule
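* <br/>An output line has the following form (the values shown are illustrative):
* <pre>1,2 ==> 3 #SUP: 5 #CONF: 0.75</pre>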
* @throws IOException exception if error writing the file
*/
private void saveRule(Set<Integer> tidsIJ, double confIJ, int[] itemsetI, int[] itemsetJ) throws IOException {
// increase the number of rules found
ruleCount++;
// create a string builder
StringBuilder buffer = new StringBuilder();
// write itemset 1 (antecedent)
for(int i=0; i<itemsetI.length; i++){
buffer.append(itemsetI[i]);
if(i != itemsetI.length -1){
buffer.append(",");
}
}
// write separator
buffer.append(" ==> ");
// write itemset 2 (consequent)
for(int i=0; i<itemsetJ.length; i++){
buffer.append(itemsetJ[i]);
if(i != itemsetJ.length -1){
buffer.append(",");
}
}
// write support
buffer.append(" #SUP: ");
buffer.append(tidsIJ.size());
// write confidence
buffer.append(" #CONF: ");
buffer.append(confIJ);
writer.write(buffer.toString());
writer.newLine();
}
/**
* Set the maximum number of items that a rule antecedent may contain (optional).
* @param maxAntecedentSize the maximum number of items
*/
public void setMaxAntecedentSize(int maxAntecedentSize) {
this.maxAntecedentSize = maxAntecedentSize;
}
/**
* Set the maximum number of items that a rule consequent may contain (optional).
* @param maxConsequentSize the maximum number of items
*/
public void setMaxConsequentSize(int maxConsequentSize) {
this.maxConsequentSize = maxConsequentSize;
}
/**
* Print statistics about the last algorithm execution to System.out.
*/
public void printStats() {
System.out.println("============= TRULEGROWTH - STATS =============");
// System.out.println("minsup: " + minsuppRelative);
System.out.println("Sequential rules count: " + ruleCount);
System.out.println("Total time : " + (timeEnd - timeStart) + " ms");
System.out.println("Max memory (mb)" + MemoryLogger.getInstance().getMaxMemory());
System.out.println("=====================================");
}
/**
* Get the total runtime of the last execution.
* @return the runtime in milliseconds.
*/
public double getTotalTime(){
return timeEnd - timeStart;
}
}