package ca.pfv.spmf.algorithms.sequential_rules.trulegrowth_with_strings;

/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import ca.pfv.spmf.algorithms.sequential_rules.trulegrowth.AlgoTRuleGrowth;
import ca.pfv.spmf.algorithms.sequential_rules.trulegrowth.Occurence;
import ca.pfv.spmf.input.sequence_database_list_strings.Sequence;
import ca.pfv.spmf.input.sequence_database_list_strings.SequenceDatabase;
import ca.pfv.spmf.tools.MemoryLogger;

/**
 * This is a modified implementation of the TRULEGROWTH algorithm for mining
 * sequential rules from sequences containing Strings instead of integers.
 * <br/><br/>
 *
 * Fournier-Viger, P., Wu, C.-W., Tseng, V.S., Nkambou, R. (2012).
 * Mining Sequential Rules Common to Several Sequences with the Window Size Constraint.
 * Proceedings of the 25th Canadian Conf. on Artificial Intelligence (AI 2012),
 * Springer, LNAI 7310, pp. 299-304.
 * <br/><br/>
 *
 * In a future version of SPMF, it is planned to remove this class and to
 * provide a more general mechanism for handling strings that would work for
 * all algorithms that take sequences as input.
 *
 * @see AlgoTRuleGrowth
 * @see Sequence
 * @see SequenceDatabase
 * @author Philippe Fournier-Viger
 */
public class AlgoTRuleGrowth_withStrings {

    // statistics
    long timeStart = 0; // start time of the latest execution
    long timeEnd = 0;   // end time of the latest execution

    // A map to record the occurrences of each item in each sequence.
    // KEY: an item
    // VALUE: a map of key: sequence ID  value: occurrences of the item in that sequence
    // (note: an occurrence is an itemset position)
    Map<String, Map<Integer, Occurence>> mapItemCount;

    // PARAMETERS OF THE ALGORITHM
    SequenceDatabase database; // a sequence database
    double minconf;            // minimum confidence
    int minsuppRelative;       // minimum support
    int windowSize = 0;        // window size

    // the number of rules found
    int ruleCount;

    // object to write the output file
    BufferedWriter writer = null;

    // the maximum size of the antecedent of rules (optional)
    int maxAntecedentSize = Integer.MAX_VALUE;
    // the maximum size of the consequent of rules (optional)
    int maxConsequentSize = Integer.MAX_VALUE;

    /**
     * Default constructor
     */
    public AlgoTRuleGrowth_withStrings() {
    }

    /**
     * Run the algorithm.
     * @param minSupport minsup as a percentage (ex: 0.05 = 5 %)
     * @param minConfidence minimum confidence (a value between 0 and 1).
     * @param input the input file path
     * @param output the output file path
     * @param windowSize a window size
     * @throws IOException exception if there is an error reading/writing files
     */
    public void runAlgorithm(double minSupport, double minConfidence,
            String input, String output, int windowSize) throws IOException {
        // load the input file into memory
        try {
            this.database = new SequenceDatabase();
            database.loadFile(input);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // convert the minimum support to a relative minimum support (integer)
        this.minsuppRelative = (int) Math.ceil(minSupport * database.size());
        // run the algorithm
        runAlgorithm(input, output, minsuppRelative, minConfidence, windowSize);
    }

    /**
     * Run the algorithm.
     * @param input the input file path
     * @param output the output file path
     * @param relativeMinSupport minsup as a relative value (an integer)
     * @param minConfidence minimum confidence (a value between 0 and 1).
     * @param windowSize a window size
     * @throws IOException exception if there is an error reading/writing files
     */
    public void runAlgorithm(String input, String output, int relativeMinSupport,
            double minConfidence, int windowSize) throws IOException {
        this.minconf = minConfidence;

        // read the database into memory
        if (database == null) {
            try {
                this.database = new SequenceDatabase();
                database.loadFile(input);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // IMPORTANT: THIS IS A FIX SO THAT THE DEFINITION IS THE SAME AS IN THE ARTICLE!
        this.windowSize = windowSize + 1;

        // if minsup is 0, set it to 1
        this.minsuppRelative = relativeMinSupport;
        if (this.minsuppRelative == 0) { // protection
            this.minsuppRelative = 1;
        }

        // reset the stats for memory usage
        MemoryLogger.getInstance().reset();

        // prepare the object for writing the output file
        writer = new BufferedWriter(new FileWriter(output));

        // save the start time
        timeStart = System.currentTimeMillis(); // for stats

        // remove infrequent items from the database
        removeItemsThatAreNotFrequent(database);

        // put the frequent items in a list "listFrequents"
        List<String> listFrequents = new ArrayList<String>();
        // for each item
        for (Entry<String, Map<Integer, Occurence>> entry : mapItemCount.entrySet()) {
            // if it is frequent
            if (entry.getValue().size() >= minsuppRelative) {
                // add the item to the list
                listFrequents.add(entry.getKey());
            }
        }

        // COMPARE EACH FREQUENT ITEM WITH EACH OTHER FREQUENT ITEM TO
        // TRY TO GENERATE RULES OF SIZE 1-1.
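        // For example (an illustrative case, not taken from any particular input
        // file): if the user gives windowSize = 2 (stored internally as 3) and a
        // sequence contains the itemsets <{a}, {b}, {c}>, then items a and c occur
        // at itemset positions 0 and 2, and since |0 - 2| <= 3, that sequence
        // supports the candidate rule a ==> c.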
        for (int i = 0; i < listFrequents.size(); i++) {
            String intI = listFrequents.get(i);
            Map<Integer, Occurence> occurencesI = mapItemCount.get(intI);

            for (int j = i + 1; j < listFrequents.size(); j++) {
                String intJ = listFrequents.get(j);
                Map<Integer, Occurence> occurencesJ = mapItemCount.get(intJ);

                // (1) Calculate tidsI, tidsJ, tidsI-->J and tidsJ-->I
                Set<Integer> tidsI = new HashSet<Integer>();
                Set<Integer> tidsJ = null;
                Set<Integer> tidsIJ = new HashSet<Integer>();
                Set<Integer> tidsJI = new HashSet<Integer>();

                // for each occurrence of I
                looptid: for (Occurence occI : occurencesI.values()) {
                    // add the sequence ID to tidsI
                    tidsI.add(occI.sequenceID);

                    // if J does not appear in that sequence, continue the loop
                    Occurence occJ = occurencesJ.get(occI.sequenceID);
                    if (occJ == null) {
                        continue looptid;
                    }

                    // make a big loop to check if I appears before J in that
                    // sequence and if J appears before I
                    boolean addedIJ = false;
                    boolean addedJI = false;
                    // for each occurrence of I in that sequence
                    loopIJ: for (Short posI : occI.occurences) {
                        // for each occurrence of J in that sequence
                        for (Short posJ : occJ.occurences) {
                            if (!posI.equals(posJ) && Math.abs(posI - posJ) <= windowSize) {
                                if (posI <= posJ) {
                                    // if I is before J
                                    tidsIJ.add(occI.sequenceID);
                                    addedIJ = true;
                                } else {
                                    // if J is before I
                                    tidsJI.add(occI.sequenceID);
                                    addedJI = true;
                                }
                                // if we have found that I is before J and that
                                // J is before I, we don't need to continue
                                if (addedIJ && addedJI) {
                                    break loopIJ;
                                }
                            }
                        }
                    }
                } // END of the loop over the occurrences of I

                // (2) check if the two items have enough common tids;
                // if not, we don't need to generate a rule for them.

                // create the rule I ==> J
                if (tidsIJ.size() >= minsuppRelative) {
                    // calculate the confidence of I ==> J
                    double confIJ = ((double) tidsIJ.size()) / occurencesI.size();

                    // create the itemsets of the rule I ==> J
                    String[] itemset1 = new String[] { intI };
                    String[] itemset2 = new String[] { intJ };

                    // if the confidence is high enough, save the rule
                    if (confIJ >= minConfidence) {
                        saveRule(tidsIJ, confIJ, itemset1, itemset2);
                    }

                    // calculate tidsJ
                    tidsJ = new HashSet<Integer>();
                    for (Occurence occJ : occurencesJ.values()) {
                        tidsJ.add(occJ.sequenceID);
                    }

                    // recursive calls to try to expand the rule
                    if (itemset1.length < maxAntecedentSize) {
                        expandLeft(itemset1, itemset2, tidsI, tidsIJ);
                    }
                    if (itemset2.length < maxConsequentSize) {
                        expandRight(itemset1, itemset2, tidsI, tidsJ, tidsIJ);
                    }
                }

                // create the rule J ==> I
                if (tidsJI.size() >= minsuppRelative) {
                    double confJI = ((double) tidsJI.size()) / occurencesJ.size();

                    // create the itemsets for that rule
                    String[] itemset1 = new String[] { intI };
                    String[] itemset2 = new String[] { intJ };

                    // if the rule has enough confidence, save it!
                    if (confJI >= minConfidence) {
                        saveRule(tidsJI, confJI, itemset2, itemset1);
                    }

                    // calculate tidsJ if it was not calculated already
                    if (tidsJ == null) {
                        tidsJ = new HashSet<Integer>();
                        for (Occurence occJ : occurencesJ.values()) {
                            tidsJ.add(occJ.sequenceID);
                        }
                    }

                    // recursive calls to try to expand the rule
                    if (itemset1.length < maxConsequentSize) {
                        expandRight(itemset2, itemset1, tidsJ, tidsI, tidsJI);
                    }
                    if (itemset2.length < maxAntecedentSize) {
                        expandLeft(itemset2, itemset1, tidsJ, tidsJI);
                    }
                }
            }
        }

        // save the end time for the execution of the algorithm
        timeEnd = System.currentTimeMillis(); // for stats

        // close the output file
        writer.close();
        database = null;
    }

    /**
     * This method searches for items c for expanding the left side of a rule
     * I --> J. This results in rules of the form I U {c} --> J.
     * The method makes sure that:
     * - c is not already included in I or J
     * - c appears at least minsup times in tidsIJ before the last occurrence of J
     * - c is lexically bigger than all items in I
     * @param itemsetI the left side of the rule (see the paper)
     * @param itemsetJ the right side of the rule (see the paper)
     * @param tidsI the tids set of I
     * @param tidsIJ the tids set of I --> J
     * @throws IOException exception if error while writing the output file
     */
    private void expandLeft(String[] itemsetI, String[] itemsetJ,
            Collection<Integer> tidsI, Collection<Integer> tidsIJ) throws IOException {

        // map-key: item  map-value: set of tids containing the item
        Map<String, Set<Integer>> frequentItemsC = new HashMap<String, Set<Integer>>();

        ////////////////////////////////////////////////////////////////////////
        // for each sequence containing I-->J
        for (Integer tid : tidsIJ) {
            Sequence sequence = database.getSequences().get(tid);

            LinkedHashMap<String, Integer> mapMostLeftFromI = new LinkedHashMap<String, Integer>();
            LinkedHashMap<String, Integer> mapMostLeftFromJ = new LinkedHashMap<String, Integer>();
            LinkedHashMap<String, LinkedList<Integer>> mapMostRightFromJ = new LinkedHashMap<String, LinkedList<Integer>>();

            int lastItemsetScannedForC = Integer.MAX_VALUE;

            // for each itemset starting from the last...
            int k = sequence.size() - 1;
            do {
                final int firstElementOfWindow = k;
                final int lastElementOfWindow = k + windowSize - 1;

                // remove items from J that fall outside the time window
                int previousJSize = mapMostLeftFromJ.size();
                removeElementOutsideWindow(mapMostLeftFromJ, lastElementOfWindow);
                // important: if all of J was there but the map became smaller,
                // we need to clear the map for the items of I
                int currentJSize = mapMostLeftFromJ.size();
                if (previousJSize == itemsetJ.length && previousJSize != currentJSize) {
                    mapMostLeftFromI.clear();
                }

                // remove items from I that fall outside the time window
                removeElementOutsideWindow(mapMostLeftFromI, lastElementOfWindow);

                // for each item of the current itemset
                for (String item : sequence.get(k)) {
                    // record the left-most position seen so far of each item of I or J
                    if (mapMostLeftFromJ.size() == itemsetJ.length && contains(itemsetI, item)) {
                        addToLinked(mapMostLeftFromI, item, k);
                    } else if (contains(itemsetJ, item)) {
                        addToLinked(mapMostLeftFromJ, item, k);
                        LinkedList<Integer> list = mapMostRightFromJ.get(item);
                        if (list == null) {
                            list = new LinkedList<Integer>();
                            addToLinked(mapMostRightFromJ, item, list);
                        }
                        list.add(k);
                    }
                }

                // if all the items of I and J are in the current window
                if (mapMostLeftFromI.size() == itemsetI.length
                        && mapMostLeftFromJ.size() == itemsetJ.length) {

                    // remove from mostRight the items that fall outside the time window;
                    // at the same time, calculate the minimum index for the items of J
                    int minimum = Integer.MAX_VALUE;
                    for (LinkedList<Integer> list : mapMostRightFromJ.values()) {
                        while (true) {
                            Integer last = list.getLast();
                            if (last > lastElementOfWindow) {
                                list.removeLast();
                            } else {
                                if (last < minimum) {
                                    minimum = last - 1;
                                }
                                break;
                            }
                        }
                    }

                    // we need to scan for items c to extend the rule...
                    // Such an item c has to appear in the window before the last
                    // occurrence of J (that is, before "minimum"), and if an itemset
                    // was scanned before, it should not be scanned again.
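                    // (Illustrative example: if the earliest in-window occurrence of
                    // J's items is at itemset position 4, "minimum" becomes 3, so
                    // candidate items c are collected from itemset 3 down to the first
                    // itemset of the window, skipping itemsets already scanned in a
                    // previous iteration.)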
                    int itemsetC = minimum;
                    if (itemsetC >= lastItemsetScannedForC) {
                        itemsetC = lastItemsetScannedForC - 1;
                    }
                    for (; itemsetC >= firstElementOfWindow; itemsetC--) {
                        for (String itemC : sequence.get(itemsetC)) {
                            // skip c if the lexical order is not respected or if c is
                            // already included in the rule
                            if (containsLEXPlus(itemsetI, itemC) || containsLEX(itemsetJ, itemC)) {
                                continue;
                            }
                            Set<Integer> tidsItemC = frequentItemsC.get(itemC);
                            if (tidsItemC == null) {
                                tidsItemC = new HashSet<Integer>();
                                frequentItemsC.put(itemC, tidsItemC);
                            }
                            tidsItemC.add(tid);
                        }
                    }
                    lastItemsetScannedForC = firstElementOfWindow;
                }
                k--;
            } while (k >= 0 && lastItemsetScannedForC > 0);
        }

        ////////////////////////////////////////////////////////////////////////
        // for each item c found, we create a rule
        for (Entry<String, Set<Integer>> entry : frequentItemsC.entrySet()) {
            Set<Integer> tidsIC_J = entry.getValue();

            // if the support is enough: Sup(R) = sup(IC --> J)
            if (tidsIC_J.size() >= minsuppRelative) {
                String itemC = entry.getKey();
                String[] itemsetIC = new String[itemsetI.length + 1];
                System.arraycopy(itemsetI, 0, itemsetIC, 0, itemsetI.length);
                itemsetIC[itemsetI.length] = itemC;

                // ---- CALCULATE ALL THE TIDS CONTAINING IC WITHIN A TIME WINDOW ----
                Set<Integer> tidsIC = new HashSet<Integer>();
                loop1: for (Integer tid : tidsI) {
                    Sequence sequence = database.getSequences().get(tid);

                    // MAP: item : itemset index
                    LinkedHashMap<String, Integer> mapAlreadySeenFromIC = new LinkedHashMap<String, Integer>();

                    // for each itemset
                    for (int k = 0; k < sequence.size(); k++) {
                        // for each item
                        for (String item : sequence.get(k)) {
                            if (contains(itemsetIC, item)) {
                                // record the last position of each item of IC
                                addToLinked(mapAlreadySeenFromIC, item, k);
                            }
                        }
                        // remove the items that fall outside the time window
                        Iterator<Entry<String, Integer>> iter = mapAlreadySeenFromIC.entrySet().iterator();
                        while (iter.hasNext()) {
                            Entry<String, Integer> entryMap = iter.next();
                            if (entryMap.getValue() < k - windowSize + 1) {
                                iter.remove();
                            } else {
                                break;
                            }
                        }
                        // if all the items of IC are inside the current window, record the tid
                        if (mapAlreadySeenFromIC.keySet().size() == itemsetIC.length) {
                            tidsIC.add(tid);
                            continue loop1;
                        }
                    }
                }
                // ----  ----

                // create the rule and calculate its confidence:
                // Conf(r) = sup(I U {c} --> J) / sup(I U {c})
                double confIC_J = ((double) tidsIC_J.size()) / tidsIC.size();

                // if the confidence is high enough, save the rule
                if (confIC_J >= minconf) {
                    saveRule(tidsIC_J, confIC_J, itemsetIC, itemsetJ);
                }

                // recursive call to expand the left side of the rule
                // (itemsetIC is the antecedent, so it is compared with maxAntecedentSize)
                if (itemsetIC.length < maxAntecedentSize) {
                    expandLeft(itemsetIC, itemsetJ, tidsIC, tidsIC_J);
                }
            }
        }
        MemoryLogger.getInstance().checkMemory();
        ////////////////////////////////////////////////////////////////////////
    }

    // This method is to make sure that the insertion order is preserved.
    // It is necessary because when an element is re-inserted in a LinkedHashMap,
    // the access order would otherwise remain that of the first insertion.
    private void addToLinked(LinkedHashMap<String, LinkedList<Integer>> mapMostLeftFromI,
            String key, LinkedList<Integer> value) {
        if (mapMostLeftFromI.containsKey(key)) {
            mapMostLeftFromI.remove(key);
        }
        mapMostLeftFromI.put(key, value);
    }

    private void addToLinked(LinkedHashMap<String, Integer> mapMostLeftFromI,
            String key, Integer value) {
        if (mapMostLeftFromI.containsKey(key)) {
            mapMostLeftFromI.remove(key);
        }
        mapMostLeftFromI.put(key, value);
    }

    /**
     * This method removes the elements that are out of the current window from a
     * hashmap containing the positions of the items at the left of an itemset.
     * key: item  value: an itemset position
     * @param mapMostLeftFromI the map
     * @param lastElementOfWindow the last itemset of the window in terms of
     *        itemset position in the sequence
     */
    private void removeElementOutsideWindow(LinkedHashMap<String, Integer> mapMostLeftFromI,
            final int lastElementOfWindow) {
        // iterate over the elements of the map
        Iterator<Entry<String, Integer>> iter = mapMostLeftFromI.entrySet().iterator();
        while (iter.hasNext()) {
            // if the position is outside the window, remove it
            if (iter.next().getValue() > lastElementOfWindow) {
                iter.remove();
            } else {
                // otherwise, stop
                break;
            }
        }
    }

    /**
     * This method removes the elements that are out of the current window from a
     * hashmap containing the positions of the items at the right of an itemset.
     * key: item  value: an itemset position
     * @param mapMostRightfromI the map
     * @param firstElementOfWindow the first itemset of the window in terms of
     *        itemset position in the sequence
     */
    private void removeElementOutsideWindowER(LinkedHashMap<String, Integer> mapMostRightfromI,
            final int firstElementOfWindow) {
        // iterate over the elements of the map
        Iterator<Entry<String, Integer>> iter = mapMostRightfromI.entrySet().iterator();
        while (iter.hasNext()) {
            // if the position is outside the window, remove it
            Entry<String, Integer> entry = iter.next();
            if (entry.getValue() < firstElementOfWindow) {
                iter.remove();
            } else {
                // otherwise, stop
                break;
            }
        }
    }

    /**
     * This method searches for items c for expanding the right side of a rule
     * I --> J. This results in rules of the form I --> J U {c}.
     * The method makes sure that:
     * - c is not already included in I or J
     * - c appears at least minsup times in tidsIJ after the first occurrence of I
     * - c is lexically bigger than all items in J
     * @param itemsetI the left side of the rule (see the paper)
     * @param itemsetJ the right side of the rule (see the paper)
     * @param tidsI the tids set of I
     * @param tidsJ the tids set of J
     * @param tidsIJ the tids set of I --> J
     * @throws IOException exception if error while writing the output file
     */
    private void expandRight(String[] itemsetI, String[] itemsetJ, Set<Integer> tidsI,
            Collection<Integer> tidsJ, Collection<Integer> tidsIJ) throws IOException {

        // map-key: item  map-value: set of tids containing the item
        Map<String, Set<Integer>> frequentItemsC = new HashMap<String, Set<Integer>>();

        // for each sequence containing I-->J
        for (Integer tid : tidsIJ) {
            Sequence sequence = database.getSequences().get(tid);

            LinkedHashMap<String, Integer> mapMostRightFromI = new LinkedHashMap<String, Integer>();
            LinkedHashMap<String, Integer> mapMostRightFromJ = new LinkedHashMap<String, Integer>();
            LinkedHashMap<String, LinkedList<Integer>> mapMostLeftFromI = new LinkedHashMap<String, LinkedList<Integer>>();

            int lastItemsetScannedForC = Integer.MIN_VALUE;

            // for each itemset starting from the first...
            int k = 0;
            do {
                final int firstElementOfWindow = k - windowSize + 1;
                int lastElementOfWindow = k;

                // remove items from I that fall outside the time window
                int previousISize = mapMostRightFromI.size();
                removeElementOutsideWindowER(mapMostRightFromI, firstElementOfWindow);
                // important: if all of I was there but the map became smaller,
                // we need to clear the map for the items of J
                int currentISize = mapMostRightFromI.size();
                if (previousISize == itemsetI.length && previousISize != currentISize) {
                    mapMostRightFromJ.clear();
                }

                // remove items from J that fall outside the time window
                removeElementOutsideWindowER(mapMostRightFromJ, firstElementOfWindow);

                // for each item of the current itemset
                for (String item : sequence.get(k)) {
                    // record the right-most position seen so far of each item of I or J
                    if (mapMostRightFromI.size() == itemsetI.length && contains(itemsetJ, item)) {
                        addToLinked(mapMostRightFromJ, item, k);
                    } else if (contains(itemsetI, item)) {
                        addToLinked(mapMostRightFromI, item, k);
                        LinkedList<Integer> list = mapMostLeftFromI.get(item);
                        if (list == null) {
                            list = new LinkedList<Integer>();
                            addToLinked(mapMostLeftFromI, item, list);
                        }
                        list.add(k);
                    }
                }

                // if all the items of I and J are in the current window
                if (mapMostRightFromI.size() == itemsetI.length
                        && mapMostRightFromJ.size() == itemsetJ.length) {

                    // remove from mostLeft the items that fall outside the time window;
                    // at the same time, calculate the minimum index for the items of I
                    int minimum = 1;
                    for (LinkedList<Integer> list : mapMostLeftFromI.values()) {
                        while (true) {
                            Integer last = list.getLast();
                            if (last < firstElementOfWindow) {
                                list.removeLast();
                            } else {
                                if (last > minimum) {
                                    minimum = last + 1;
                                }
                                break;
                            }
                        }
                    }

                    // we need to scan for items c to extend the rule...
                    // Such an item c has to appear in the window after the first
                    // occurrence of I (that is, starting at "minimum"), and if an
                    // itemset was scanned before, it should not be scanned again.
                    int itemsetC = minimum;
                    if (itemsetC < lastItemsetScannedForC) {
                        itemsetC = lastItemsetScannedForC + 1;
                    }
                    for (; itemsetC <= lastElementOfWindow; itemsetC++) {
                        for (String itemC : sequence.get(itemsetC)) {
                            // skip c if the lexical order is not respected or if c is
                            // already included in the rule
                            if (containsLEX(itemsetI, itemC) || containsLEXPlus(itemsetJ, itemC)) {
                                continue;
                            }
                            Set<Integer> tidsItemC = frequentItemsC.get(itemC);
                            if (tidsItemC == null) {
                                tidsItemC = new HashSet<Integer>();
                                frequentItemsC.put(itemC, tidsItemC);
                            }
                            tidsItemC.add(tid);
                        }
                    }
                    lastItemsetScannedForC = lastElementOfWindow;
                }
                k++;
            } while (k < sequence.size() && lastItemsetScannedForC < sequence.size() - 1);
        }

        ////////////////////////////////////////////////////////////////////////
        // for each item c found, we create a rule
        for (Entry<String, Set<Integer>> entry : frequentItemsC.entrySet()) {
            Set<Integer> tidsI_JC = entry.getValue();

            // if the support is enough: Sup(R) = sup(I --> JC)
            if (tidsI_JC.size() >= minsuppRelative) {
                String itemC = entry.getKey();
                String[] itemsetJC = new String[itemsetJ.length + 1];
                System.arraycopy(itemsetJ, 0, itemsetJC, 0, itemsetJ.length);
                itemsetJC[itemsetJ.length] = itemC;

                // ---- CALCULATE ALL THE TIDS CONTAINING JC WITHIN A TIME WINDOW ----
                Set<Integer> tidsJC = new HashSet<Integer>();
                loop1: for (Integer tid : tidsJ) {
                    Sequence sequence = database.getSequences().get(tid);

                    // MAP: item : itemset index
                    LinkedHashMap<String, Integer> mapAlreadySeenFromJC = new LinkedHashMap<String, Integer>();

                    // for each itemset
                    for (int k = 0; k < sequence.size(); k++) {
                        // for each item
                        for (String item : sequence.get(k)) {
                            if (contains(itemsetJC, item)) {
                                // record the last position of each item of JC
                                addToLinked(mapAlreadySeenFromJC, item, k);
                            }
                        }
                        // remove the items that fall outside the time window
                        Iterator<Entry<String, Integer>> iter = mapAlreadySeenFromJC.entrySet().iterator();
                        while (iter.hasNext()) {
                            Entry<String, Integer> entryMap = iter.next();
                            if (entryMap.getValue() < k - windowSize + 1) {
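                                // this occurrence lies before the first itemset of the
                                // current window (k - windowSize + 1), so it can no
                                // longer contribute: evict it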
                                iter.remove();
                            } else {
                                break;
                            }
                        }
                        // if all the items of JC are inside the current window, record the tid
                        if (mapAlreadySeenFromJC.keySet().size() == itemsetJC.length) {
                            tidsJC.add(tid);
                            continue loop1;
                        }
                    }
                }
                // ----  ----

                // create the rule and calculate its confidence:
                // Conf(r) = sup(I --> JC) / sup(I)
                double confI_JC = ((double) tidsI_JC.size()) / tidsI.size();

                // if the confidence is high enough, save the rule
                if (confI_JC >= minconf) {
                    saveRule(tidsI_JC, confI_JC, itemsetI, itemsetJC);
                }

                // recursive calls to expand the right and the left side of the rule
                if (itemsetJC.length < maxConsequentSize) {
                    expandRight(itemsetI, itemsetJC, tidsI, tidsJC, tidsI_JC);
                }
                if (itemsetI.length < maxAntecedentSize) {
                    expandLeft(itemsetI, itemsetJC, tidsI, tidsI_JC);
                }
            }
        }
        MemoryLogger.getInstance().checkMemory();
    }

    /**
     * This method calculates the frequency of each item in one database pass.
     * Then it removes all the items that are not frequent.
     * @param database a sequence database
     * @return a map such that key = item
     *         value = a map where key = tid and value = Occurence.
     *         This map allows knowing the frequency of each item and its first
     *         and last occurrence in each sequence.
     */
    private Map<String, Map<Integer, Occurence>> removeItemsThatAreNotFrequent(SequenceDatabase database) {
        // (1) count the support of each item in the database in one database pass
        mapItemCount = new HashMap<String, Map<Integer, Occurence>>(); // <item, Map<tid, occurence>>

        // for each sequence
        for (Sequence sequence : database.getSequences()) {
            // for each itemset
            for (short j = 0; j < sequence.getItemsets().size(); j++) {
                List<String> itemset = sequence.get(j);
                // for each item
                for (int i = 0; i < itemset.size(); i++) {
                    String itemI = itemset.get(i);
                    Map<Integer, Occurence> occurences = mapItemCount.get(itemI);
                    if (occurences == null) {
                        occurences = new HashMap<Integer, Occurence>();
                        mapItemCount.put(itemI, occurences);
                    }
                    Occurence occurence = occurences.get(sequence.getId());
                    if (occurence == null) {
                        occurence = new Occurence(sequence.getId());
                        occurences.put(sequence.getId(), occurence);
                    }
                    occurence.add(j);
                }
            }
        }

        // (2) remove all the items that are not frequent from the database
        for (Sequence sequence : database.getSequences()) {
            int i = 0;
            while (i < sequence.getItemsets().size()) {
                List<String> itemset = sequence.getItemsets().get(i);
                int j = 0;
                while (j < itemset.size()) {
                    double count = mapItemCount.get(itemset.get(j)).size();
                    if (count < minsuppRelative) {
                        itemset.remove(j);
                    } else {
                        j++;
                    }
                }
                i++;
            }
        }
        return mapItemCount;
    }

    /**
     * Save a rule I ==> J to the output file.
     * @param tidsIJ the tids containing the rule
     * @param confIJ the confidence
     * @param itemsetI the left part of the rule
     * @param itemsetJ the right part of the rule
     * @throws IOException exception if error writing the file
     */
    private void saveRule(Set<Integer> tidsIJ, double confIJ, String[] itemsetI,
            String[] itemsetJ) throws IOException {
        ruleCount++;
        StringBuilder buffer = new StringBuilder();

        // write itemset 1
        for (int i = 0; i < itemsetI.length; i++) {
            buffer.append(itemsetI[i]);
            if (i != itemsetI.length - 1) {
                buffer.append(",");
            }
        }
        // write the separator
        buffer.append(" ==> ");
        // write itemset 2
        for (int i = 0; i < itemsetJ.length; i++) {
            buffer.append(itemsetJ[i]);
            if (i != itemsetJ.length - 1) {
                buffer.append(",");
            }
        }
        // write the support
        buffer.append(" #SUP: ");
        buffer.append(tidsIJ.size());
        // write the confidence
        buffer.append(" #CONF: ");
        buffer.append(confIJ);

        writer.write(buffer.toString());
        writer.newLine();
    }

    /**
     * Check if an itemset contains an item.
     * It assumes that the items in the itemset are sorted in lexical order.
     * @param itemset the itemset
     * @param item an item
     * @return true if the item appears in the itemset
     */
    boolean contains(String[] itemset, String item) {
        // for each item in the itemset
        for (int i = 0; i < itemset.length; i++) {
            // if the item is found, return true
            if (itemset[i].equals(item)) {
                return true;
            } else if (itemset[i].compareTo(item) > 0) {
                // if the current item is larger than the item that is searched,
                // return false because of the lexical order
                return false;
            }
        }
        return false;
    }

    /**
     * This method checks if the item "item" is in the itemset.
     * It assumes that the items in the itemset are sorted in lexical order.
     * This version also returns true if the item "item", were it added, would be
     * the largest item in the itemset according to the lexical order.
     * @param itemset an itemset
     * @param item an item
     * @return true if the item is contained in the itemset
     */
    boolean containsLEXPlus(String[] itemset, String item) {
        // for each item in the itemset
        for (int i = 0; i < itemset.length; i++) {
            // check if the current item is equal to the one that is searched
            if (itemset[i].equals(item)) {
                // if yes, return true
                return true;
            } else if (itemset[i].compareTo(item) > 0) {
                // if the current item is larger than the item that is searched,
                // return true because if the item "item" was added, it would be
                // the largest one according to the lexical order
                return true;
            }
        }
        // if the searched item was not found, return false
        return false;
    }

    /**
     * This method checks if the item "item" is in the itemset.
     * It assumes that the items in the itemset are sorted in lexical order.
     * @param itemset an itemset
     * @param item an item
     * @return true if the item is contained in the itemset
     */
    boolean containsLEX(String[] itemset, String item) {
        // for each item in the itemset
        for (int i = 0; i < itemset.length; i++) {
            // check if the current item is equal to the one that is searched
            if (itemset[i].equals(item)) {
                // if yes, return true
                return true;
            } else if (itemset[i].compareTo(item) > 0) {
                // if the current item is larger than the item that is searched,
                // return false because of the lexical order
                return false;
            }
        }
        // if the searched item was not found, return false
        return false;
    }

    /**
     * Set the maximum number of items that a rule antecedent should contain (optional).
     * @param maxAntecedentSize the maximum number of items
     */
    public void setMaxAntecedentSize(int maxAntecedentSize) {
        this.maxAntecedentSize = maxAntecedentSize;
    }

    /**
     * Set the maximum number of items that a rule consequent should contain (optional).
     * @param maxConsequentSize the maximum number of items
     */
    public void setMaxConsequentSize(int maxConsequentSize) {
        this.maxConsequentSize = maxConsequentSize;
    }

    /**
     * Print statistics about the last algorithm execution to System.out.
     */
    public void printStats() {
        System.out.println("=============  TRULEGROWTH - STATS =============");
        System.out.println("Sequential rules count: " + ruleCount);
        System.out.println("Total time: " + (timeEnd - timeStart) + " ms");
        System.out.println("Max memory (mb): " + MemoryLogger.getInstance().getMaxMemory());
        System.out.println("================================================");
    }

    /**
     * Get the total runtime of the last execution.
     * @return the time as a double.
     */
    public double getTotalTime() {
        return timeEnd - timeStart;
    }
}
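// Example usage (a minimal sketch; the file names and parameter values below are
// hypothetical and only illustrate how this class is meant to be called):
//
//     AlgoTRuleGrowth_withStrings algo = new AlgoTRuleGrowth_withStrings();
//     // minsup = 70 % of the sequences, minconf = 0.8, window size = 3
//     algo.runAlgorithm(0.7, 0.8, "input_sequences.txt", "output_rules.txt", 3);
//     algo.printStats();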