package ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP;
/*
* Copyright Antonio Gomariz Peñalver 2013
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.Item;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.SequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.Sequences;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.patterns.Pattern;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the GSP algorithm. GSP was proposed by Srikant
* and Agrawal 1996.
*<br/><br/>
*
* NOTE: This implementation saves the patterns to a file as soon as a level of
* patterns is found, or keeps the patterns in memory if no output path is provided by the user.
*
* @author Antonio Gomariz Peñalver
*/
public class AlgoGSP {

    /**
     * Relative minimum support threshold. Range: from 0 up to 1.
     */
    protected double minSupRelative;
    /**
     * Minimum gap value received by the constructor. It is only stored here;
     * it is not read anywhere in this class.
     */
    protected double minGap;
    /**
     * Maximum gap value received by the constructor. It is only stored here;
     * it is not read anywhere in this class.
     */
    protected double maxGap;
    /**
     * Window size value received by the constructor. It is only stored here;
     * it is not read anywhere in this class.
     */
    protected double windowSize;
    /**
     * Absolute minimum support threshold. It indicates the minimum number of
     * sequences in which a pattern has to appear to be considered frequent.
     * It is computed in {@link #runAlgorithm} as
     * {@code ceil(minSupRelative * database.size())}, with a floor of 1.
     */
    protected double minSupAbsolute;
    /**
     * Set of frequent patterns. Whether the user chooses to save the result
     * in a file or in memory, it is used to keep the different k-levels of
     * frequent sequences.
     */
    protected Sequences patterns;
    /**
     * Timestamps (milliseconds) taken right before and right after the mining
     * phase; used to measure how long the algorithm takes.
     */
    protected long start, end;
    /**
     * List with the frequent 1-sequences, i.e. the frequent items.
     */
    protected List<Pattern> frequentItems;
    /**
     * Abstraction creator handed to the candidate generator and to the
     * support counter.
     */
    private AbstractionCreator abstractionCreator;
    /**
     * Flag indicating whether {@link #patterns} has already been sorted.
     */
    private boolean isSorted;
    /**
     * Counter for the frequent patterns found by the current/last execution.
     */
    private int numberOfFrequentPatterns;
    /**
     * Writer for the output file, or null when no output path was given.
     * NOTE: it stays non-null (although closed) after a run that wrote to a
     * file, because the reporting methods test it against null to decide
     * whether the patterns are available in memory.
     */
    BufferedWriter writer = null;
    /**
     * If true, sequence identifiers are written along with each pattern.
     */
    boolean outputSequenceIdentifiers = false;

    /**
     * Constructor for the GSP algorithm. It initializes most of the class'
     * attributes.
     *
     * @param minSupRelative relative minimum support, from 0 up to 1
     * @param mingap value stored as the minimum gap
     * @param maxgap value stored as the maximum gap
     * @param windowSize value stored as the window size
     * @param abstractionCreator the abstraction creator used for candidate
     * generation and support counting
     */
    public AlgoGSP(double minSupRelative, double mingap, double maxgap, double windowSize, AbstractionCreator abstractionCreator) {
        this.minSupRelative = minSupRelative;
        this.minGap = mingap;
        this.maxGap = maxgap;
        this.windowSize = windowSize;
        this.abstractionCreator = abstractionCreator;
        this.isSorted = false;
    }

    /**
     * Method that runs the GSP algorithm on the database given as parameter.
     *
     * @param database a sequence database
     * @param keepPatterns flag activated if we want to keep the resulting
     * patterns or not
     * @param verbose flag activated for debugging purposes
     * @param outputFilePath an output file path, or null to keep the result
     * in memory only
     * @param outputSequenceIdentifiers if true output sequence identifiers with each pattern
     * @return the frequent sequences found in the original database
     * @throws IOException if the output file cannot be opened, written or
     * closed
     */
    public Sequences runAlgorithm(SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
        this.outputSequenceIdentifiers = outputSequenceIdentifiers;
        patterns = new Sequences("FREQUENT SEQUENTIAL PATTERNS");
        /* Per-run state starts fresh: otherwise a reused instance would report
         * a cumulative pattern count and would skip sorting the new result.
         */
        numberOfFrequentPatterns = 0;
        isSorted = false;

        if (outputFilePath == null) {
            // the user wants to keep the result in memory
            writer = null;
        } else {
            // the user wants to save the result to a file
            writer = new BufferedWriter(new FileWriter(outputFilePath));
        }

        try {
            /* We calculate in how many sequences a pattern has to appear to be
             * correctly considered as frequent
             */
            this.minSupAbsolute = (int) Math.ceil(minSupRelative * database.size());
            if (this.minSupAbsolute == 0) { // protection
                this.minSupAbsolute = 1;
            }

            CandidateGeneration candidateGenerator = new CandidateGeneration();
            SupportCounting supportCounter = new SupportCounting(database, abstractionCreator);

            // reset the stats about memory usage
            MemoryLogger.getInstance().reset();

            start = System.currentTimeMillis();
            runGsp(database, candidateGenerator, supportCounter, keepPatterns, verbose);
            end = System.currentTimeMillis();
        } finally {
            // Close the output file even when mining fails, so the handle is not leaked
            if (writer != null) {
                writer.close();
            }
        }
        return patterns;
    }

    /**
     * The actual method that executes GSP. It starts from the frequent
     * 1-sequences level.
     *
     * @param database a sequence database
     * @param candidateGenerator a CandidateGenerator
     * @param supportCounter a supportCounting element
     * @param keepPatterns flag activated if we want to keep the resulting
     * patterns or not
     * @param verbose flag activated for debugging purposes
     * @throws IOException if writing a level to the output file fails
     */
    protected void runGsp(SequenceDatabase database, CandidateGeneration candidateGenerator, SupportCounting supportCounter, boolean keepPatterns, boolean verbose) throws IOException {
        // We get the frequent items found in the original database
        frequentItems = database.frequentItems();
        /* And we add them as the 1-level of patterns. NOTE: we need them
         * for generating the candidates
         */
        patterns.addSequences(frequentItems, 1);
        /* Set where we temporarily keep the current frequent k-level.
         * It was called Lk in the original algorithm.
         */
        Set<Pattern> frequentSet = new LinkedHashSet<Pattern>(frequentItems.size());
        frequentSet.addAll(frequentItems);
        // Candidate set of the level being processed
        List<Pattern> candidateSet;
        /* Index of the frequent patterns by item. It starts empty and is
         * replaced after every support counting pass.
         */
        Map<Item, Set<Pattern>> indexationMap = new HashMap<Item, Set<Pattern>>();
        // The frequent items count as frequent patterns too
        numberOfFrequentPatterns += frequentItems.size();

        int k = 1;
        // MAIN LOOP: one iteration per level, as long as the last level was not empty
        while (frequentSet != null && !frequentSet.isEmpty()) {
            // We move on to the k+1 level
            k++;
            if (verbose) {
                System.out.println("k=" + k);
                System.out.println("generating candidates...");
            }
            candidateSet = candidateGenerator.generateCandidates(frequentSet, abstractionCreator, indexationMap, k, minSupAbsolute);
            frequentSet = null;
            // A null candidate set means there is nothing left to explore
            if (candidateSet == null) {
                break;
            }
            // Otherwise we continue counting the support of each candidate of the set
            if (verbose) {
                System.out.println(candidateSet.size() + " Candidates have been created!");
                System.out.println("checking frequency...");
            }
            // check the memory usage for statistics
            MemoryLogger.getInstance().checkMemory();
            frequentSet = supportCounter.countSupport(candidateSet, k, minSupAbsolute);
            if (verbose) {
                System.out.println(frequentSet.size() + " frequent patterns\n");
            }
            // check the memory usage for statistics
            MemoryLogger.getInstance().checkMemory();
            // We update the number of frequent patterns with those found at this level
            numberOfFrequentPatterns += frequentSet.size();
            /* And we prepare the next iteration, updating the indexation map and
             * the frequent level capable of generating the new candidates
             */
            indexationMap = supportCounter.getIndexationMap();
            patterns.addSequences(new ArrayList<Pattern>(frequentSet), k);

            /* Finally, the previous level is released, but only when the new
             * level is not empty: otherwise the previous level is the last
             * one and it is left in place for the code after the loop.
             */
            int previousLevel = k - 1;
            if (!frequentSet.isEmpty()) {
                if (!keepPatterns) {
                    // we are not interested in keeping the patterns
                    patterns.delete(previousLevel);
                } else if (writer != null) {
                    // we are interested, but we keep them in a file
                    writeLevel(previousLevel);
                }
            }
        }
        /* When the loop is over, if we were interested in keeping the output in
         * a file, we store the last level found.
         */
        if (keepPatterns && writer != null) {
            writeLevel(patterns.getLevelCount());
        }
        // check the memory usage for statistics
        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * Writes every pattern of the given level to the output file, one per
     * line, and then deletes that level from {@link #patterns}. The caller
     * must make sure that {@link #writer} is not null.
     *
     * @param level the level to write and delete
     * @throws IOException if writing to the output file fails
     */
    private void writeLevel(int level) throws IOException {
        for (Pattern seq : patterns.getLevel(level)) {
            writer.write(seq.toStringToFile(outputSequenceIdentifiers));
            writer.newLine();
        }
        patterns.delete(level);
    }

    /**
     * Method to print some statistics about the execution. It uses the standard
     * format. The patterns are sorted the first time a report is requested
     * after a run, and they are appended to the report only when no output
     * file was used.
     *
     * @return a String with the information in it.
     */
    public String printStatistics() {
        if (!isSorted) {
            patterns.sort();
            isSorted = true;
        }
        StringBuilder sb = new StringBuilder(200);
        sb.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
        sb.append(runningTime());
        sb.append(" ms\n");
        sb.append(" Frequent sequences count : ");
        sb.append(numberOfFrequentPatterns);
        sb.append('\n');
        sb.append(" Max memory (mb):");
        sb.append(MemoryLogger.getInstance().getMaxMemory());
        sb.append('\n');
        // The patterns are only listed when they were kept in memory
        if (writer == null) {
            sb.append(patterns.toString());
        }
        sb.append("===================================================\n");
        return sb.toString();
    }

    /**
     * Method to get the patterns in the format used for the output file. The
     * patterns are sorted the first time a report is requested after a run.
     *
     * @return a String with the information in it
     */
    public String printedOutputToSaveInFile() {
        if (!isSorted) {
            patterns.sort();
            isSorted = true;
        }
        StringBuilder output = new StringBuilder();
        output.append(patterns.toStringToFile(outputSequenceIdentifiers));
        return output.toString();
    }

    /**
     *
     * @return The number of frequent sequences found by GSP in the last
     * execution
     */
    public int getNumberOfFrequentPatterns() {
        return numberOfFrequentPatterns;
    }

    /**
     *
     * @return the frequent patterns found by the last execution of GSP. It only
     * works under a Save_To_Memory option; null is returned when an output
     * file was used.
     */
    public String getPatterns() {
        String output = null;
        if (writer == null) {
            output = patterns.toString();
        }
        return output;
    }

    /**
     * Time that GSP takes completing the execution
     * @return the runtime in milliseconds, as a long
     */
    public long runningTime() {
        return (end - start);
    }

    /**
     * Return the absolute minimum support, i.e. the minimum number of sequences
     * where a pattern must appear
     * @return the minsup value
     */
    public double getMinSupAbsolut() {
        return minSupAbsolute;
    }

    /**
     * It cleans the most important attributes. It is safe to call it before
     * any execution, when there is nothing to clean yet.
     */
    public void clear() {
        if (patterns != null) {
            patterns.clear();
        }
        if (frequentItems != null) {
            frequentItems.clear();
        }
        abstractionCreator = null;
    }
}