package ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.Itemset;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.Sequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.database.SequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.patterns.Pattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.savers.Saver;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.savers.SaverIntoFile;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.savers.SaverIntoMemory;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.tries.Trie;
import ca.pfv.spmf.tools.MemoryLogger;
/**
* This is an implementation of the ClaSP algorithm. ClaSP was proposed by A.
* Gomariz et al. in 2013.
*
* NOTE: This implementation either saves each pattern to a file as soon as it
* is found or keeps the patterns in memory, depending on what the user chooses.
*
* Copyright Antonio Gomariz Peñalver 2013
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*
* @author agomariz
*/
public class AlgoCM_ClaSP {

    /**
     * The absolute minimum support threshold, i.e. the minimum number of
     * sequences in which a pattern has to appear. It is stored as a double
     * and cast to a long when it is handed to the search.
     */
    protected double minSupAbsolute;
    /**
     * Saver that receives the resulting patterns. It is created by
     * {@link #runAlgorithm}: an in-memory saver when no output path is given,
     * a file saver otherwise. It is null until runAlgorithm has been called.
     */
    Saver saver = null;
    /**
     * Start and end points (milliseconds) used to calculate the overall time
     * taken by the algorithm
     */
    public long overallStart, overallEnd;
    /**
     * Start and end points (milliseconds) used to calculate the time taken by
     * the main part of the ClaSP algorithm
     */
    protected long mainMethodStart, mainMethodEnd;
    /**
     * Start and end points (milliseconds) used to calculate the time taken by
     * the post-processing step that removes the non-closed patterns
     */
    protected long postProcessingStart, postProcessingEnd;
    /**
     * Trie root that starts with the empty pattern and from which we will be
     * able to access all the frequent patterns generated by ClaSP. It is null
     * until the algorithm has been run.
     */
    protected Trie FrequentAtomsTrie;
    /**
     * The abstraction creator
     */
    private AbstractionCreator abstractionCreator;
    /**
     * Number of frequent patterns found by the algorithm
     */
    private int numberOfFrequentPatterns;
    /**
     * Flag to indicate if we are interested in only finding the closed
     * sequences
     */
    private boolean findClosedPatterns;
    /**
     * Flag forwarded to the pattern enumeration; presumably it enables the
     * pruning methods during the search (confirm against
     * FrequentPatternEnumeration_ClaSP).
     */
    private boolean executePruningMethods;
    /**
     * Join counter copied from the pattern enumeration at the end of a run.
     */
    public long joinCount; // PFV 2013

    /**
     * Constructor of the class that calls ClaSP algorithm.
     *
     * @param support Absolute minimum support
     * @param abstractionCreator the abstraction creator
     * @param findClosedPatterns flag to indicate if we are interested in only
     * finding the closed patterns
     * @param executePruningMethods flag forwarded to the pattern enumeration
     * (presumably enables its pruning methods)
     */
    public AlgoCM_ClaSP(double support, AbstractionCreator abstractionCreator, boolean findClosedPatterns, boolean executePruningMethods) {
        this.minSupAbsolute = support;
        this.abstractionCreator = abstractionCreator;
        this.findClosedPatterns = findClosedPatterns;
        this.executePruningMethods = executePruningMethods;
    }

    /**
     * Actual call to ClaSP algorithm. The output can be either kept or
     * ignored. The patterns are handed to a saver that works either in a file
     * or in the main memory.
     *
     * @param database Original database in which we want to search for the
     * frequent patterns.
     * @param keepPatterns Flag indicating if we want to keep the output or not
     * @param verbose Flag for debugging purposes
     * @param outputFilePath Path of the file in which we want to store the
     * frequent patterns. If this value is null, an in-memory saver is used.
     * @param outputSequenceIdentifiers indicates if sequence ids should be
     * output with each pattern found.
     * @throws IOException propagated from the saver or from the search
     */
    public void runAlgorithm(SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath, boolean outputSequenceIdentifiers) throws IOException {
        if (outputFilePath == null) {
            // No file path: the results are kept in memory
            saver = new SaverIntoMemory(outputSequenceIdentifiers);
        } else {
            // Otherwise the results go to the given file
            saver = new SaverIntoFile(outputFilePath, outputSequenceIdentifiers);
        }
        // reset the stats about memory usage
        MemoryLogger.getInstance().reset();
        // keeping the starting time
        overallStart = System.currentTimeMillis();
        try {
            // Starting ClaSP algorithm
            claSP(database, (long) minSupAbsolute, keepPatterns, verbose, findClosedPatterns, executePruningMethods);
            // keeping the ending time
            overallEnd = System.currentTimeMillis();
        } finally {
            // Always let the saver finish, even when the search fails, so
            // that a file-backed saver is not left open.
            saver.finish();
        }
    }

    /**
     * The actual method for extracting frequent sequences. It builds the
     * item co-occurrence maps from the database, runs the pattern enumeration
     * and, depending on the flags, either removes the non-closed patterns or
     * hands every pattern found to the saver.
     *
     * Note that {@code database.clear()} is called as soon as the
     * co-occurrence maps are built, so callers should not expect to reuse the
     * database afterwards (assuming clear() discards its content).
     *
     * @param database The original database
     * @param minSupAbsolute the absolute minimum support
     * @param keepPatterns flag indicating if we are interested in keeping the
     * output of the algorithm
     * @param verbose Flag for debugging purposes
     * @param findClosedPatterns flag indicating if the post-processing step
     * that removes the non-closed patterns has to be executed
     * @param executePruningMethods flag forwarded to the pattern enumeration
     * @throws IOException propagated from the enumeration or the saver
     */
    protected void claSP(SequenceDatabase database, long minSupAbsolute, boolean keepPatterns, boolean verbose, boolean findClosedPatterns, boolean executePruningMethods) throws IOException {
        // We get the initial trie whose children are the frequent 1-patterns
        FrequentAtomsTrie = database.frequentItems();

        // NEW-CODE-PFV 2013
        // Map: key: item value: another item that followed the first item + support
        // (could be replaced with a triangular matrix...)
        Map<Integer, Map<Integer, Integer>> coocMapAfter = new HashMap<Integer, Map<Integer, Integer>>(1000);
        Map<Integer, Map<Integer, Integer>> coocMapEquals = new HashMap<Integer, Map<Integer, Integer>>(1000);
        buildCooccurrenceMaps(database, coocMapAfter, coocMapEquals);

        // The sequences are not needed any more once the maps are built
        database.clear();
        // For debugging, the footprint of the co-occurrence maps can be
        // estimated with getObjectSize(...).
        // END-OF-NEW-CODE

        // Initialization of the class that is in charge of finding the frequent patterns
        FrequentPatternEnumeration_ClaSP frequentPatternEnumeration = new FrequentPatternEnumeration_ClaSP(abstractionCreator, minSupAbsolute, saver, findClosedPatterns, executePruningMethods);

        this.mainMethodStart = System.currentTimeMillis();
        // We run the search, starting from the empty pattern
        frequentPatternEnumeration.dfsPruning(new Pattern(), FrequentAtomsTrie, verbose, coocMapAfter, coocMapEquals);
        this.mainMethodEnd = System.currentTimeMillis();

        // Once we have finished, we keep the number of frequent patterns that we found
        numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();
        // check the memory usage for statistics
        MemoryLogger.getInstance().checkMemory();
        if (verbose) {
            System.out.println("ClaSP: The algorithm takes " + (mainMethodEnd - mainMethodStart) / 1000 + " seconds and finds " + numberOfFrequentPatterns + " patterns");
        }

        if (findClosedPatterns) {
            // We are interested in closed patterns: run the post-processing step
            List<Entry<Pattern, Trie>> outputPatternsFromMainMethod = FrequentAtomsTrie.preorderTraversal(null);
            this.postProcessingStart = System.currentTimeMillis();
            frequentPatternEnumeration.removeNonClosedPatterns(outputPatternsFromMainMethod, keepPatterns);
            this.postProcessingEnd = System.currentTimeMillis();
            numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();
            if (verbose) {
                System.out.println("ClaSP:The post-processing algorithm to remove the non-Closed patterns takes " + (postProcessingEnd - postProcessingStart) / 1000 + " seconds and finds " + numberOfFrequentPatterns + " Closed patterns");
            }
        } else if (keepPatterns) {
            // No post-processing: every pattern of the trie is handed to the saver
            List<Entry<Pattern, Trie>> outputPatternsFromMainMethod = FrequentAtomsTrie.preorderTraversal(null);
            for (Entry<Pattern, Trie> p : outputPatternsFromMainMethod) {
                saver.savePattern(p.getKey());
            }
        }

        numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns();
        frequentPatternEnumeration.clear();
        // check the memory usage for statistics
        MemoryLogger.getInstance().checkMemory();
        joinCount = frequentPatternEnumeration.joinCount;
    }

    /**
     * Fills the two co-occurrence maps from the sequences of the database.
     * For an item A, {@code coocMapEquals} counts the sequences in which an
     * item B appears at a later position of an itemset that contains A, and
     * {@code coocMapAfter} counts the sequences in which B appears in an
     * itemset located after the first itemset that contains A. Each pair is
     * counted at most once per sequence. An inner map is only created when
     * its first pair is found.
     *
     * @param database the database whose sequences are scanned
     * @param coocMapAfter map to fill: item, following item, count
     * @param coocMapEquals map to fill: item, item of the same itemset, count
     */
    private static void buildCooccurrenceMaps(SequenceDatabase database,
            Map<Integer, Map<Integer, Integer>> coocMapAfter,
            Map<Integer, Map<Integer, Integer>> coocMapEquals) {
        for (Sequence seq : database.getSequences()) {
            // Items whose "after" pairs were already counted for this sequence
            Set<Integer> alreadySeenA = new HashSet<Integer>();
            // For each item, the same-itemset partners already counted for this sequence
            Map<Integer, Set<Integer>> alreadySeenB_equals = new HashMap<Integer, Set<Integer>>();

            for (int i = 0; i < seq.getItemsets().size(); i++) {
                Itemset itemsetA = seq.get(i);
                for (int j = 0; j < itemsetA.size(); j++) {
                    Integer itemA = (Integer) itemsetA.get(j).getId();

                    Set<Integer> equalSet = alreadySeenB_equals.get(itemA);
                    if (equalSet == null) {
                        equalSet = new HashSet<Integer>();
                        alreadySeenB_equals.put(itemA, equalSet);
                    }

                    // For each item after itemA in the same itemset
                    Map<Integer, Integer> mapCoocItemEquals = coocMapEquals.get(itemA);
                    for (int k = j + 1; k < itemsetA.size(); k++) {
                        Integer itemB = (Integer) itemsetA.get(k).getId();
                        // add() returns false when the pair was already counted
                        if (equalSet.add(itemB)) {
                            if (mapCoocItemEquals == null) {
                                mapCoocItemEquals = new HashMap<Integer, Integer>();
                                coocMapEquals.put(itemA, mapCoocItemEquals);
                            }
                            incrementCount(mapCoocItemEquals, itemB);
                        }
                    }

                    // The first occurrence of itemA already saw every later
                    // itemset, so later occurrences add nothing new.
                    if (alreadySeenA.contains(itemA)) {
                        continue;
                    }

                    // For each item in the itemsets after the current one
                    Map<Integer, Integer> mapCoocItemAfter = coocMapAfter.get(itemA);
                    Set<Integer> alreadySeenB_after = new HashSet<Integer>();
                    for (int k = i + 1; k < seq.getItemsets().size(); k++) {
                        Itemset itemsetB = seq.get(k);
                        for (int m = 0; m < itemsetB.size(); m++) {
                            Integer itemB = (Integer) itemsetB.get(m).getId();
                            if (!alreadySeenB_after.add(itemB)) {
                                continue;
                            }
                            if (mapCoocItemAfter == null) {
                                mapCoocItemAfter = new HashMap<Integer, Integer>();
                                coocMapAfter.put(itemA, mapCoocItemAfter);
                            }
                            incrementCount(mapCoocItemAfter, itemB);
                        }
                    }
                    alreadySeenA.add(itemA);
                }
            }
        }
    }

    /**
     * Adds one to the counter stored for the given item, starting at 1 when
     * the item has no counter yet.
     *
     * @param counts the counters, indexed by item
     * @param item the item whose counter is incremented
     */
    private static void incrementCount(Map<Integer, Integer> counts, Integer item) {
        Integer frequency = counts.get(item);
        if (frequency == null) {
            counts.put(item, 1);
        } else {
            counts.put(item, frequency + 1);
        }
    }

    /**
     * Measures the size of the Java-serialized form of an object. Only meant
     * for ad-hoc debugging (e.g. estimating the size of the co-occurrence
     * maps); it is not called by the algorithm itself.
     *
     * @param object the object to serialize
     * @return the size of the serialized form, in megabytes
     * @throws IOException if the object cannot be serialized
     */
    private double getObjectSize(
            Object object)
            throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // try-with-resources closes (and flushes) the stream even if
        // writeObject fails
        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
            oos.writeObject(object);
        }
        return baos.size() / 1024d / 1024d;
    }

    /**
     * Method to show the outlined information about the search for frequent
     * sequences by means of ClaSP algorithm
     *
     * @return the statistics of the last run as a printable text. The saver
     * summary is omitted when the algorithm has not been run yet.
     */
    public String printStatistics() {
        StringBuilder r = new StringBuilder(200);
        r.append("============= Algorithm - STATISTICS =============\n Total time ~ ");
        r.append(getRunningTime());
        r.append(" ms\n");
        r.append(" Frequent sequences count : ");
        r.append(numberOfFrequentPatterns);
        r.append('\n');
        r.append(" Join count : ");
        r.append(joinCount);
        // each statistic goes on its own line
        r.append('\n');
        r.append(" Max memory (mb):");
        r.append(MemoryLogger.getInstance().getMaxMemory());
        r.append('\n');
        // saver only exists once runAlgorithm has been called
        if (saver != null) {
            r.append(saver.print());
        }
        r.append("\n===================================================\n");
        return r.toString();
    }

    /**
     * It gets the number of patterns found by the last run. When the
     * post-processing step was executed, this is the count reported by the
     * enumeration after the non-closed patterns were removed.
     *
     * @return the number of patterns found
     */
    public int getNumberOfFrequentPatterns() {
        return numberOfFrequentPatterns;
    }

    /**
     * It gets the time spent by the algorithm in its execution.
     *
     * @return the elapsed time in milliseconds
     */
    public long getRunningTime() {
        return (overallEnd - overallStart);
    }

    /**
     * It empties the trie (if the algorithm has been run) and releases the
     * abstraction creator.
     */
    public void clear() {
        // The trie only exists once the algorithm has been run
        if (FrequentAtomsTrie != null) {
            FrequentAtomsTrie.removeAll();
        }
        abstractionCreator = null;
    }

    /**
     * Get the trie (internal structure used by ClaSP).
     *
     * @return the trie, or null if the algorithm has not been run yet
     */
    public Trie getFrequentAtomsTrie() {
        return FrequentAtomsTrie;
    }
}