package ca.pfv.spmf.algorithms.sequential_rules.rulegen;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.AlgoPrefixSpan;
import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.SequentialPattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.SequentialPatterns;
import ca.pfv.spmf.input.sequence_database_list_integers.SequenceDatabase;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the RuleGen algorithm proposed by Zaki et al to generate sequential rules where
* the antecedent and consequent are sequential patterns. The RuleGen algorithm is described in:
* <br/><br/>
*
* M. J. Zaki, "SPADE: An Efficient Algorithm for Mining Frequent Sequences," Machine Learning, vol. 42, no. 1-2, pp. 31-60, 2001.
* <br/><br/>
*
* However, note that instead of using the SPADE algorithm, we use the PrefixSpan algorithm because at the time
* that this algorithm was implemented there was no implementation of SPADE in SPMF and PrefixSpan is a fast
* algorithm.
*
* @see AlgoPrefixSpan
* @see SequentialPattern
* @see SequentialPatterns
* @author Philippe Fournier-Viger
*/
public class AlgoRuleGen {

    // start time of the latest execution (milliseconds, from System.currentTimeMillis())
    private long startTime;

    // end time of the latest execution (milliseconds, from System.currentTimeMillis())
    private long endTime;

    // the number of rules found during the latest execution
    private int patternCount;

    // object used to write the rules to the output file;
    // it is opened and closed by runAlgorithm()
    BufferedWriter writer = null;

    /**
     * Default constructor.
     */
    public AlgoRuleGen() {
    }

    /**
     * Run the algorithm: mine the frequent sequential patterns of the input file
     * with PrefixSpan, then write to the output file every rule "a ==> b" between
     * two patterns such that a is strictly contained in b and the confidence of the
     * rule is at least minconf.
     *
     * @param minsup the minimum support threshold (passed as is to PrefixSpan)
     * @param minconf the minimum confidence threshold
     * @param input the input file path
     * @param output the output file path for saving the result
     * @throws IOException exception if there is an error reading/writing files
     */
    public void runAlgorithm(int minsup, double minconf, String input, String output) throws IOException {
        // Reset the rule counter so that the statistics describe this execution only,
        // even if the same object is used to run the algorithm several times.
        patternCount = 0;

        // Prepare object for writing the output file
        writer = new BufferedWriter(new FileWriter(output));
        try {
            // record the start time
            startTime = System.currentTimeMillis();

            // Load the sequence database taken as input
            SequenceDatabase sequenceDatabase = new SequenceDatabase();
            sequenceDatabase.loadFile(input);

            // STEP 1: Apply the PrefixSpan algorithm to generate frequent sequential patterns
            AlgoPrefixSpan algo = new AlgoPrefixSpan();
            SequentialPatterns patternsLists = algo.runAlgorithm(sequenceDatabase, null, minsup);

            // The patterns are not modified below, so the number of levels
            // can be read once instead of at each loop iteration.
            int levelCount = patternsLists.getLevels().size();

            // STEP 2: Generate rules of the form a ==> b,
            // where a and b are sequential patterns
            // such that a is a subsequence of b.
            // For each seq. pattern a (pattern1) of level i
            for (int i = 0; i < levelCount; i++) {
                for (int j = 0; j < patternsLists.getLevel(i).size(); j++) {
                    SequentialPattern pattern1 = patternsLists.getLevel(i).get(j);

                    // for each seq. pattern b (pattern2) of level k > i
                    for (int k = i + 1; k < levelCount; k++) {
                        for (int m = 0; m < patternsLists.getLevel(k).size(); m++) {
                            SequentialPattern pattern2 = patternsLists.getLevel(k).get(m);

                            // try to generate a rule a ==> b
                            tryToGenerateRule(pattern1, pattern2, minconf);
                            // try to generate a rule b ==> a
                            tryToGenerateRule(pattern2, pattern1, minconf);
                        }
                    }
                }
            }

            // check the memory usage
            MemoryLogger.getInstance().checkMemory();
            // record the end time
            endTime = System.currentTimeMillis();
        } finally {
            // Always close the output file, even if reading the input, mining the
            // patterns or writing a rule failed, so that the file handle is not leaked.
            writer.close();
        }
    }

    /**
     * Try to generate a rule between two sequential patterns. The rule
     * pattern1 ==> pattern2 is written to the output file if pattern1 is strictly
     * contained in pattern2 and if the confidence is high enough.
     *
     * @param pattern1 a sequential pattern (the antecedent of the candidate rule)
     * @param pattern2 another sequential pattern (the consequent of the candidate rule)
     * @param minconf the minimum confidence threshold
     * @throws IOException exception if there is an error writing to the output file
     */
    private void tryToGenerateRule(SequentialPattern pattern1, SequentialPattern pattern2, double minconf) throws IOException {
        // if pattern1 is not contained in pattern2, we stop
        // because we want that pattern1 is strictly included in pattern2
        if (!strictlyContains(pattern2, pattern1)) {
            return;
        }

        // calculate the confidence of the rule pattern1 ==> pattern2,
        // that is: support(pattern2) / support(pattern1)
        double conf = ((double) pattern2.getAbsoluteSupport()) / pattern1.getAbsoluteSupport();

        // if not enough confidence, then the rule is not valid
        if (conf < minconf) {
            return;
        }

        // otherwise it is a valid rule so increase the rule count
        patternCount++;

        // then save it to file, using the format:
        // antecedent ==> consequent #SUP: support #CONF: confidence
        StringBuilder buffer = new StringBuilder();
        // write the rule
        buffer.append(pattern1.itemsetsToString());
        buffer.append(" ==> ");
        buffer.append(pattern2.itemsetsToString());
        // write support (the support of the rule is the support of pattern2)
        buffer.append(" #SUP: ");
        buffer.append(pattern2.getAbsoluteSupport());
        // write confidence
        buffer.append(" #CONF: ");
        buffer.append(conf);

        writer.write(buffer.toString()); // write to file
        writer.newLine(); // write a new line
    }

    /**
     * This method checks if a seq. pattern "pattern2" is strictly contained in a
     * seq. pattern "pattern1". Here, "strictly" means that pattern1 must have more
     * itemsets than pattern2, and that the itemsets of pattern2 must each be included
     * in distinct itemsets of pattern1 while preserving their order.
     *
     * @param pattern1 a sequential pattern
     * @param pattern2 another sequential pattern
     * @return true if the pattern1 contains pattern2.
     */
    boolean strictlyContains(SequentialPattern pattern1, SequentialPattern pattern2) {
        // if pattern2 is larger or equal in size, then it cannot be contained in pattern1
        if (pattern1.size() <= pattern2.size()) {
            return false;
        }

        // To see if pattern2 is strictly contained in pattern1,
        // we will search for each itemset i of pattern2 in pattern1 by advancing
        // in pattern1 one itemset at a time.
        int i = 0; // position in pattern2
        int j = 0; // position in pattern1
        while (true) {
            // if the itemset at current position in pattern1 contains the itemset
            // at current position in pattern2
            if (pattern1.getItemsets().get(j).containsAll(pattern2.get(i))) {
                // go to next itemset in pattern2
                i++;
                // if we reached the end of pattern2, then return true
                if (i == pattern2.size()) {
                    return true;
                }
            }

            // go to next itemset in pattern1
            j++;

            // if we reached the end of pattern1, then pattern2 is not strictly included
            // in it, and return false
            if (j >= pattern1.size()) {
                return false;
            }

            // lastly, for optimization, we check how many itemsets are left to be matched.
            // if there are fewer itemsets left in pattern1 than in pattern2, then it will
            // be impossible to get a total match, and so we return false.
            if ((pattern1.size() - j) < pattern2.size() - i) {
                return false;
            }
        }
    }

    /**
     * Print statistics to System.out about the latest execution of the algorithm.
     */
    public void printStats() {
        System.out
                .println("============= SEQUENTIAL RULES - STATS =============");
        System.out.println("Sequential rules count: " + patternCount);
        System.out.println("Total time : " + (endTime - startTime) + " ms");
        System.out.println("Max memory: " + MemoryLogger.getInstance().getMaxMemory());
        System.out
                .println("===================================================");
    }
}