package ca.pfv.spmf.algorithms.sequentialpatterns.clospan_AGP.items; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; /** * Inspired in SPMF. Implementation of a sequence database. Each sequence should * have a unique id. See examples in /test/ directory for the format of input * files. * * Copyright Antonio Gomariz PeƱalver 2013 * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. * * @author agomariz */ public class SequenceDatabase { private Map<Item, BitSet> frequentItems = new HashMap<Item, BitSet>(); private List<Sequence> sequences = new LinkedList<Sequence>(); private ItemFactory<Integer> itemFactory = new ItemFactory<Integer>(); /** * From a file located at the given string path, we create a database * composed of a list of sequences * * @param path File path where we have the database * @param minSupRelative relative Minimum support * @throws IOException */ public void loadFile(String path, double minSupRelative) throws IOException { String thisLine; BufferedReader myInput = null; try { FileInputStream fin = new FileInputStream(new File(path)); myInput = new BufferedReader(new InputStreamReader(fin)); int sequenceID=1; //For each line while ((thisLine = myInput.readLine()) != null) { // If the line is not a comment line if (thisLine.charAt(0) != '#') { // we read it and add it as a sequence addSequence(thisLine.split(" "),sequenceID); sequenceID++; } } double minSupAbsolute = (int) Math.ceil(minSupRelative * sequences.size()); //We get the set of items Set<Item> frequent = frequentItems.keySet(); //And prepare a list to keep the non-frequent ones Set<Item> toRemove = new HashSet<Item>(); for (Item frecuente : frequent) { if ((frequentItems.get(frecuente)).cardinality() < minSupAbsolute) { toRemove.add(frecuente); } } //We remove from the original set those non frequent items for(Item removedItem:toRemove){ frequentItems.remove(removedItem); } } catch (Exception e) { } finally { if (myInput != null) { myInput.close(); } } } /** * It adds a sequence from an array of string that we have to interpret * @param integers * @param sequenceID */ public void addSequence(String[] integers,int sequenceID) { long timestamp = -1; Sequence sequence = new Sequence(sequences.size()); sequence.setID(sequenceID); Itemset itemset = new Itemset(); int inicio = 0; Map<Item, Boolean> counted = new HashMap<Item, Boolean>(); for (int i = inicio; i < integers.length; i++) { if (integers[i].codePointAt(0) == '<') { // Timestamp String value = integers[i].substring(1, integers[i].length() - 1); timestamp = Long.parseLong(value); itemset.setTimestamp(timestamp); } else if (integers[i].equals("-1")) { // end of an itemset long time = itemset.getTimestamp() + 1; sequence.addItemset(itemset); itemset = new Itemset(); itemset.setTimestamp(time); } else if (integers[i].equals("-2")) { // end of a sequence sequences.add(sequence); } else { // extract the value for an item Item item = itemFactory.getItem(Integer.parseInt(integers[i])); if (counted.get(item) == null) { counted.put(item, Boolean.TRUE); BitSet appearances = frequentItems.get(item); if (appearances == null) { appearances = new BitSet(); frequentItems.put(item, appearances); } appearances.set(sequence.getId()); } itemset.addItem(item); } } } /** * Get a string representation of this sequence database * @return the string representation */ @Override public String toString() { StringBuilder r = new StringBuilder(); for (Sequence sequence : sequences) { r.append(sequence.getId()); r.append(": "); r.append(sequence.toString()); r.append('\n'); } return r.toString(); } /** * It returns the number of sequences of the sequence database * @return the number of sequenes */ public int size() { return sequences.size(); } /** * It return the list of sequences of this sequence database * @return the list of sequences */ public List<Sequence> getSequences() { return sequences; } /** * Get the list of ids of sequences in this sequence database * @return the list of sequence ids. */ public Set<Integer> getSequenceIDs() { Set<Integer> set = new HashSet<Integer>(); for (Sequence sequence : getSequences()) { set.add(sequence.getId()); } return set; } /** * It return the frequent items * @return the list of frequent items and the bitsets representing their sequence IDs. */ public Map<Item, BitSet> getFrequentItems() { return frequentItems; } public void clear() { if (frequentItems != null) { frequentItems.clear(); } frequentItems = null; if (sequences != null) { sequences.clear(); } sequences = null; itemFactory = null; } }