package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalsequentialpatterns;

/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Set;

import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.AbstractAlgoPrefixSpan;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.AlgoBIDEPlus;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.AlgoFournierViger08;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.AlgoPrefixSpanMDSPM;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.Sequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.Sequences;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns.AlgoDim;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns.MDPattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns.MDPatterns;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns.MDPatternsDatabase;
import ca.pfv.spmf.tools.MemoryLogger;

/**
 * Implementation of the SeqDim algorithm for multi-dimensional
 * sequential pattern mining proposed by Pinto et al (2001).
 * <br/><br/>
 * SeqDIM is a generic algorithm that can be used in theory with any
 * combination of sequential pattern mining algorithm and MD-pattern mining
 * algorithm.<br/>
 * In SPMF, the sequential pattern mining algorithm must be chosen from:<br/>
 * AlgoPrefixSpanMDSPM / AlgoBIDEPlus / AlgoFournierViger08 <br/>
 * In SPMF, the MD-pattern mining algorithm is AlgoDim and it offers to choose
 * between Charm and AprioriClose (see the AlgoDim class for details).
 * <br/><br/>
 * Output handling, as implemented in this class:
 * <ul>
 * <li>when closed patterns are NOT requested, every md-sequential pattern is
 * written to the output file as soon as it is found and nothing is added by
 * this class to the returned {@link MDSequences};</li>
 * <li>when closed patterns are requested, every candidate is first kept in
 * memory, and only the candidates that pass the redundancy check are written
 * to the output file. The returned {@link MDSequences} still holds all the
 * candidates, because the redundancy check does not remove anything from
 * it.</li>
 * </ul>
 *
 * @see AlgoFournierViger08
 * @see AlgoDim
 * @see AlgoBIDEPlus
 * @see AlgoPrefixSpanMDSPM
 * @see MDSequence
 * @see MDSequences
 * @see MDSequenceDatabase
 * @author Philippe Fournier-Viger
 */
public class AlgoSeqDim {

	// The set of frequent MDSequences kept in memory by the algorithm
	protected MDSequences sequences = new MDSequences("FREQUENT MD-SEQUENCES");

	private long startTime; // the start time of the algorithm
	private long endTime; // the end time of the algorithm

	// if true, only closed patterns are written to the output file
	private boolean mineClosedPatterns = false;

	// object to write the output to a file
	BufferedWriter writer = null;

	// the number of md-sequential patterns reported by printStatistics()
	private int patternCount;

	// the number of mdsequences in the mdsequence database
	private int databaseSize = 0;

	/**
	 * Run the algorithm.
	 *
	 * @param database an MDSequence database
	 * @param algoPrefixSpan a PrefixSpan-based algorithm (BIDE, PrefixSpan or Fournier08)
	 * @param algoDim an instance of the DIM algorithm
	 * @param mineClosedPatterns if true, only closed md-sequential patterns are
	 *        written to the output file
	 * @param output a path for writing the result to an output file
	 * @return the in-memory set of md-sequential patterns (see the class
	 *         description for what it contains in each mode)
	 * @throws IOException if an error occurs while writing to the output file
	 */
	public MDSequences runAlgorithm(MDSequenceDatabase database,
			AbstractAlgoPrefixSpan algoPrefixSpan, AlgoDim algoDim,
			boolean mineClosedPatterns, String output) throws IOException {
		// reset the utility for memory usage logging
		MemoryLogger.getInstance().reset();
		// reset the number of patterns found
		patternCount = 0;
		// start from an empty result set so that patterns found by a previous
		// call on this instance are not mixed with those of this run
		sequences = new MDSequences("FREQUENT MD-SEQUENCES");
		// save start time
		startTime = System.currentTimeMillis();

		// prepare object to write output file
		writer = new BufferedWriter(new FileWriter(output));
		try {
			// save the number of mdsequences in the database
			databaseSize = database.size();
			// save user preference
			this.mineClosedPatterns = mineClosedPatterns;

			// (1) First mine sequential patterns by applying
			// a PrefixSpan-based algorithm
			Sequences sequencesFound = algoPrefixSpan.runAlgorithm(database
					.getSequenceDatabase());

			// (2) For each frequent sequential pattern found,
			// form a projected MD-Database
			// and then find MD-patterns within the projected database
			final double minsupp = algoPrefixSpan.getMinSupp();
			// for each level
			for (int j = 0; j < sequencesFound.getLevelCount(); j++) {
				List<Sequence> sequencesList = sequencesFound.getLevel(j);
				// for each sequential pattern
				for (Sequence sequence : sequencesList) {
					// try to use this sequential pattern to
					// generate md-sequential patterns
					trySequence(sequence, database, minsupp, algoDim);
				}
			}

			// (3) If the user wants closed patterns only, we eliminate
			// non-closed multidimensional sequential patterns
			if (mineClosedPatterns) {
				removeRedundancy();
			}

			// record end time
			endTime = System.currentTimeMillis();
			// check memory usage
			MemoryLogger.getInstance().checkMemory();
		} finally {
			// always release the output file, even if mining or writing failed
			writer.close();
		}

		// return the set of MD sequential patterns
		return sequences;
	}

	/**
	 * Try to use a sequential pattern to generate MD sequential patterns.
	 *
	 * @param sequence a sequential pattern
	 * @param database the MD sequence database
	 * @param minsupp the minsup threshold (double)
	 * @param algoDim an instance of the DIM algorithm
	 * @throws IOException if an error occurs while writing to the output file
	 */
	private void trySequence(Sequence sequence, MDSequenceDatabase database,
			double minsupp, AlgoDim algoDim) throws IOException {
		// (a) Create a projected database by using only
		// the sequences containing the given sequential pattern.
		MDPatternsDatabase newContexte = createProjectedDatabase(
				sequence.getSequencesID(), database.getPatternDatabase());

		// An empty projection cannot yield any MD-pattern, and the support
		// rescaling below would divide by zero (giving Infinity or NaN).
		if (newContexte.size() == 0) {
			return;
		}

		// (b) Run the DIM algorithm on the projected database.
		// To do that we need to adjust the minimum support based
		// on the number of sequences in the projected database as follows
		double newMinSupp = minsupp * database.size() / newContexte.size();
		// Run the DIM algorithm
		MDPatterns patterns = algoDim.runAlgorithm(newContexte, newMinSupp);

		// (c) Create MD-Sequences by combining the mdpatterns found
		// with the sequential pattern received as parameter
		// for each level
		for (int i = 0; i < patterns.getLevelCount(); i++) {
			// for each mdpattern
			for (MDPattern pattern : patterns.getLevel(i)) {
				// combine the mdpattern with the seq. pattern to
				// form a md sequential pattern
				MDSequence mdsequence = new MDSequence(0, pattern, sequence);

				// check if there are only wildcards in this mdpattern
				// NOTE(review): this iterates over getPatternsID(); confirm
				// against MDPattern that these are the values that can be
				// equal to MDPattern.WILDCARD.
				boolean onlyWildcards = true;
				for (Integer id : pattern.getPatternsID()) {
					if (id != MDPattern.WILDCARD) {
						onlyWildcards = false;
						break;
					}
				}

				if (onlyWildcards) {
					// if only wildcards, then the support is the support
					// of the sequential pattern
					mdsequence.setSupport(sequence.getSequencesID().size());
				} else {
					// otherwise it is the support of the mdpattern
					mdsequence.setSupport(pattern.getAbsoluteSupport());
				}
				// finally we save the md-sequential pattern
				savePattern(sequence, mdsequence);
			}
		}
	}

	/**
	 * Save an md seq. pattern to the output file or to memory.
	 *
	 * @param sequence the sequence in the md seq. pattern
	 * @param mdsequence the md seq. pattern
	 * @throws IOException if an error occurs while writing to the output file
	 */
	private void savePattern(Sequence sequence, MDSequence mdsequence)
			throws IOException {
		if (mineClosedPatterns) {
			// the user wants only closed patterns: keep the candidate in
			// memory so that removeRedundancy() can filter it later
			sequences.addSequence(mdsequence, sequence.size());
		} else {
			// the user wants all patterns: write to file right away
			writeToFile(mdsequence);
		}
		// increase number of md seq. patterns found
		// (in closed mode, removeRedundancy() replaces this count by the
		// number of patterns actually written)
		patternCount++;
	}

	/**
	 * Write a md sequence to the output file, on its own line.
	 *
	 * @param mdsequence an md sequence
	 * @throws IOException if an error occurs while writing to the output file
	 */
	private void writeToFile(MDSequence mdsequence) throws IOException {
		// create string buffer
		StringBuilder buffer = new StringBuilder();
		// append mdpattern
		buffer.append(mdsequence.getMdpattern().toStringShort());
		// append sequence
		buffer.append(mdsequence.getSequence().toStringShort());
		// append support
		buffer.append(" #SUP: ");
		buffer.append(mdsequence.getAbsoluteSupport());
		// write to file
		writer.write(buffer.toString());
		writer.newLine();
	}

	/**
	 * Create a projected MD-pattern database by keeping only the
	 * MDPatterns corresponding to a set of sequence IDs.
	 *
	 * @param patternsIds
	 *            The set of sequence IDs
	 * @param patternsDatabase
	 *            The original md patterns database
	 * @return A new database containing only the MDPatterns to keep.
	 */
	private MDPatternsDatabase createProjectedDatabase(
			Set<Integer> patternsIds, MDPatternsDatabase patternsDatabase) {
		// create projected database
		MDPatternsDatabase projectedDatabase = new MDPatternsDatabase();
		// for each pattern
		for (MDPattern pattern : patternsDatabase.getMDPatterns()) {
			// if the id is in the set of desired ids
			if (patternsIds.contains(pattern.getId())) {
				// add to the projected database
				projectedDatabase.addMDPattern(pattern);
			}
		}
		// return projected database
		return projectedDatabase;
	}

	/**
	 * Print statistics about the algorithm execution.
	 *
	 * @param databaseSize the number of MDsequences in the original database
	 *        (currently not used by this method).
	 */
	public void printStatistics(int databaseSize) {
		StringBuilder r = new StringBuilder(140);
		r.append("============= SEQ-DIM - STATISTICS =============\n Total time ~ ");
		r.append(endTime - startTime);
		r.append(" ms\n");
		r.append(" max memory : ");
		r.append(MemoryLogger.getInstance().getMaxMemory());
		r.append("\n Frequent sequences count : ");
		r.append(patternCount);
		System.out.println(r.toString());
		System.out
				.println("===================================================");
	}

	/**
	 * Eliminate non-closed multidimensional sequential patterns by simply
	 * looping and eliminating redundant patterns. This is necessary if we want
	 * to mine closed multi-dim. seq. patterns, because: closed sequential patt.
	 * mining + closed itemset mining != closed multi-dim seq. patt. mining.
	 * For more details about this, see the paper published by
	 * Panida Songram, Veera Boonjing and Sarun Intakosum (2006) that
	 * explains why we can do that.
	 * <br/><br/>
	 * Patterns that are not redundant are written to the output file, and the
	 * pattern count is set to the number of patterns written. The in-memory
	 * set of patterns is left untouched.
	 *
	 * @throws IOException if an error occurs while writing to the output file
	 */
	private void removeRedundancy() throws IOException {
		final int levelCount = sequences.getLevels().size();
		// number of patterns that survive the redundancy check
		int closedCount = 0;

		// For each level, from the highest one down to level 1.
		// NOTE(review): level 0 is never visited; presumably MDSequences keeps
		// level 0 empty - confirm.
		for (int i = levelCount - 1; i > 0; i--) {
			// for each md sequential pattern
			for (MDSequence sequence : sequences.getLevel(i)) {
				// We check if the md sequential pattern is
				// strictly included in another
				// md sequential pattern having the same support.
				boolean included = false;
				// for each level starting from the pattern's own level
				for (int j = i; j < levelCount && !included; j++) {
					// for each other md sequential pattern
					for (MDSequence sequence2 : sequences.getLevel(j)) {
						// if the first md sequential pattern is included
						// in the second and they have the same support
						if (sequence != sequence2
								&& sequence2.getAbsoluteSupport() == sequence
										.getAbsoluteSupport()
								&& sequence2.contains(sequence)) {
							// note it, then stop scanning this level
							included = true;
							break;
						}
					}
				}
				// if the md sequential pattern is NOT included in another
				// md sequential pattern having the same support
				if (!included) {
					// save the pattern.
					writeToFile(sequence);
					closedCount++;
				}
			}
		}

		// report the number of closed patterns instead of the number of
		// candidates counted while mining
		patternCount = closedCount;
	}
}