package ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.EquivalenceClass;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.Item;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.ItemFactory;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.Itemset;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.Sequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.ItemAbstractionPairCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.patterns.PatternCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.idLists.IDList;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.idLists.creators.IdListCreator;

/**
 * Inspired in SPMF. Implementation of a sequence database. Each sequence should
 * have a unique id. See examples in /test/ directory for the format of input
 * files.
 *
 * Copyright Antonio Gomariz Peñalver 2013
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 *
 * @author agomariz
 */
public class SequenceDatabase {

    /** Creates the default abstraction for 1-patterns and builds the 2-patterns. */
    private final AbstractionCreator abstractionCreator;
    /** Creates id-lists and registers item appearances in them. */
    private final IdListCreator idListCreator;
    /**
     * Equivalence class of every item seen so far. After {@link #loadFile}
     * only the items reaching the minimum support remain. Set to
     * {@code null} by {@link #clear()}.
     */
    private Map<Item, EquivalenceClass> frequentItems = new HashMap<Item, EquivalenceClass>();
    /** Sequences of the database. Set to {@code null} by {@link #clear()}. */
    private List<Sequence> sequences = new LinkedList<Sequence>();
    /** Factory used to obtain the {@link Item} instance for an integer value. */
    private ItemFactory<Integer> itemFactory = new ItemFactory<Integer>();
    private final PatternCreator patternCreator = PatternCreator.getInstance();
    /** Identifier given to the next parsed sequence; identifiers start at 1. */
    private int nSequences = 1;

    public SequenceDatabase(AbstractionCreator abstractionCreator, IdListCreator idListCreator) {
        this.abstractionCreator = abstractionCreator;
        this.idListCreator = idListCreator;
    }

    /**
     * Loads a database from the file at the given path, then discards every
     * item whose support is below the threshold, both from the frequent-item
     * map and from the stored sequences.
     *
     * Empty lines and lines starting with '#' are ignored.
     *
     * @param path Path of the file that contains the database
     * @param minSupport Minimum relative support: it is multiplied by the
     * number of loaded sequences (and rounded up) to obtain the absolute
     * threshold
     * @throws IOException if the file cannot be read, or if a line contains a
     * token that is not a valid number
     */
    public void loadFile(String path, double minSupport) throws IOException {
        BufferedReader myInput = null;
        try {
            FileInputStream fin = new FileInputStream(new File(path));
            myInput = new BufferedReader(new InputStreamReader(fin));
            String thisLine;
            int lineNumber = 0;
            while ((thisLine = myInput.readLine()) != null) {
                lineNumber++;
                // Blank lines and comment lines carry no sequence
                if (thisLine.isEmpty() || thisLine.charAt(0) == '#') {
                    continue;
                }
                try {
                    addSequence(thisLine.split(" "));
                } catch (NumberFormatException e) {
                    // Report malformed input instead of silently leaving the
                    // database half-loaded and unfiltered
                    throw new IOException("Malformed sequence at line " + lineNumber
                            + " of " + path, e);
                }
            }
        } finally {
            if (myInput != null) {
                myInput.close();
            }
        }

        double support = (int) Math.ceil(minSupport * sequences.size());

        // Collect the infrequent items first: the map cannot be modified
        // while it is being iterated
        Set<Item> itemsToRemove = new HashSet<Item>();
        for (Map.Entry<Item, EquivalenceClass> entry : frequentItems.entrySet()) {
            EquivalenceClass equivalenceClass = entry.getValue();
            IDList idList = equivalenceClass.getIdList();
            if (idList.getSupport() < support) {
                itemsToRemove.add(entry.getKey());
            } else {
                idList.setAppearingSequences(equivalenceClass.getClassIdentifier());
            }
        }
        frequentItems.keySet().removeAll(itemsToRemove);

        // And from the original database
        reduceDatabase(frequentItems.keySet());
    }

    /**
     * Parses one sequence given as an array of tokens and registers the
     * appearances of its items in their id-lists.
     *
     * Recognised tokens: {@code <n>} sets the timestamp of the current
     * itemset, {@code -1} closes the current itemset, {@code -2} closes the
     * sequence and stores it in the database, and any other token is parsed
     * as an integer item. Empty tokens (produced by consecutive separators)
     * are skipped. A sequence without a {@code -2} token is not stored.
     *
     * @param integers the tokens of the sequence
     * @throws NumberFormatException if an item or timestamp is not a valid number
     */
    public void addSequence(String[] integers) {
        ItemAbstractionPairCreator pairCreator = ItemAbstractionPairCreator.getInstance();
        long timestamp = -1;
        Sequence sequence = new Sequence(sequences.size());
        Itemset itemset = new Itemset();
        sequence.setID(nSequences);
        nSequences++;

        for (int i = 0; i < integers.length; i++) {
            String token = integers[i];
            if (token.isEmpty()) {
                // split(" ") yields empty strings for repeated spaces
                continue;
            }
            if (token.codePointAt(0) == '<') {
                // Timestamp: strip the enclosing '<' and '>'
                String value = token.substring(1, token.length() - 1);
                timestamp = Long.parseLong(value);
                itemset.setTimestamp(timestamp);
            } else if (token.equals("-1")) {
                // End of an itemset: the next one starts one time unit later
                long time = itemset.getTimestamp() + 1;
                sequence.addItemset(itemset);
                itemset = new Itemset();
                itemset.setTimestamp(time);
                timestamp++;
            } else if (token.equals("-2")) {
                // End of a sequence
                sequences.add(sequence);
            } else {
                // Extract the value for an item
                Item item = itemFactory.getItem(Integer.parseInt(token));
                EquivalenceClass clase = frequentItems.get(item);
                if (clase == null) {
                    // First appearance of the item: create its 1-pattern class
                    IDList newIdList = idListCreator.create();
                    clase = new EquivalenceClass(
                            patternCreator.createPattern(pairCreator.getItemAbstractionPair(
                                    item, abstractionCreator.createDefaultAbstraction())),
                            newIdList);
                    frequentItems.put(item, clase);
                }
                IDList idlist = clase.getIdList();
                if (timestamp < 0) {
                    // No explicit timestamp was given: the first itemset gets 1
                    timestamp = 1;
                    itemset.setTimestamp(timestamp);
                }
                idListCreator.addAppearance(idlist, sequence.getId(), (int) timestamp);
                itemset.addItem(item);
            }
        }
    }

    /**
     * Get the string representation of this SequenceDatabase: one line per
     * sequence, formed by its id, ": " and the sequence itself.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        StringBuilder r = new StringBuilder();
        for (Sequence sequence : sequences) {
            r.append(sequence.getId());
            r.append(": ");
            r.append(sequence.toString());
            r.append('\n');
        }
        return r.toString();
    }

    /**
     * It returns the final number of sequences
     *
     * @return the number of sequences
     */
    public int size() {
        return sequences.size();
    }

    /**
     * It returns the sequences of the database in a list. The returned list
     * is the internal one, not a copy.
     *
     * @return the list of sequences.
     */
    public List<Sequence> getSequences() {
        return sequences;
    }

    /**
     * It returns the equivalence classes associated with the frequent items
     * that we have found, as a new list sorted by their natural ordering.
     *
     * @return the list of equivalence classes
     */
    public List<EquivalenceClass> frequentItems() {
        List<EquivalenceClass> frequentItemClasses =
                new ArrayList<EquivalenceClass>(frequentItems.values());
        Collections.sort(frequentItemClasses);
        return frequentItemClasses;
    }

    /**
     * It returns the map that makes the matching between items and
     * equivalence classes. The returned map is the internal one, not a copy.
     *
     * @return the map
     */
    public Map<Item, EquivalenceClass> getFrequentItems() {
        return frequentItems;
    }

    /**
     * It returns the equivalence classes associated with the frequent
     * 2-patterns that we have found. Each returned class is also added as a
     * member of the equivalence class of its first item.
     *
     * @param minSupport Minimum absolute support
     * @return the list of equivalence classes
     */
    public List<EquivalenceClass> getSize2FrecuentSequences(double minSupport) {
        List<EquivalenceClass> size2Patterns =
                abstractionCreator.getFrequentSize2Sequences(sequences, idListCreator);
        removeInfrequentItems(size2Patterns, minSupport);
        for (EquivalenceClass clase : size2Patterns) {
            clase.getIdList().setAppearingSequences(clase.getClassIdentifier());
        }
        return size2Patterns;
    }

    /**
     * Removes from the given list the 2-patterns whose support is below
     * minSupport, and adds every remaining pattern as a member of the
     * equivalence class of its first item.
     *
     * @param size2Patterns the candidate 2-patterns; modified in place
     * @param minSupport Minimum absolute support
     */
    private void removeInfrequentItems(List<EquivalenceClass> size2Patterns, double minSupport) {
        if (size2Patterns.isEmpty()) {
            return;
        }
        Item currentItem = size2Patterns.get(0).getClassIdentifier().getElements().get(0).getItem();
        EquivalenceClass value = frequentItems.get(currentItem);
        List<Integer> infrequentPositions = new ArrayList<Integer>();

        for (int i = 0; i < size2Patterns.size(); i++) {
            EquivalenceClass pattern = size2Patterns.get(i);
            Item firstItem = pattern.getClassIdentifier().getElements().get(0).getItem();
            // Only look the parent class up again when the first item changes
            if (!firstItem.equals(currentItem)) {
                currentItem = firstItem;
                value = frequentItems.get(currentItem);
            }
            if (pattern.getIdList().getSupport() < minSupport) {
                infrequentPositions.add(i);
            } else {
                value.addClassMember(pattern);
            }
        }

        // Remove from the highest position down so that the remaining
        // recorded positions stay valid
        for (int i = infrequentPositions.size() - 1; i >= 0; i--) {
            size2Patterns.remove(infrequentPositions.get(i).intValue());
        }
    }

    /**
     * It reduces the original database to just frequent items. Itemsets that
     * become empty are removed from their sequence.
     *
     * @param keySet the set of frequent items that should be kept.
     */
    private void reduceDatabase(Set<Item> keySet) {
        for (Sequence sequence : sequences) {
            for (int i = 0; i < sequence.size(); i++) {
                Itemset itemset = sequence.get(i);
                for (int j = 0; j < itemset.size(); j++) {
                    Item item = itemset.get(j);
                    if (!keySet.contains(item)) {
                        sequence.remove(i, j);
                        // Stay on the same position: the next item moved into it
                        j--;
                    }
                }
                if (itemset.size() == 0) {
                    sequence.remove(i);
                    // Stay on the same position: the next itemset moved into it
                    i--;
                }
            }
        }
    }

    /**
     * Empties the sequence list and the frequent-item map and drops the
     * references to them and to the item factory. The instance must not be
     * used for loading or querying afterwards.
     */
    public void clear() {
        if (sequences != null) {
            sequences.clear();
        }
        sequences = null;
        if (frequentItems != null) {
            frequentItems.clear();
        }
        frequentItems = null;
        itemFactory = null;
    }
}