package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.kmeans_for_fournier08.Cluster; import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalsequentialpatterns.AlgoSeqDim; /** * Implementation of a sequence database as used by the SeqDim and Fournier-Vier (2008) algorithms. * Each sequence need to hvae a unique ID. * See examples in the /test/ directory for the format of input files. that can be read by this class. * * @see AlgoFournierViger08 * @see AlgoSeqDim * @see Sequence * @author Philippe Fournier-Viger */ public class SequenceDatabase{ /** List of sequences */ private final List<Sequence> sequences = new ArrayList<Sequence>(); /** for the Fournier08 algorithm, the cluster that was used to do the projection that results in this database. */ private Cluster cluster = null; /** * Load a sequence database from an input file * @param path the input file path * @throws IOException exception if error reading file */ public void loadFile(String path) throws IOException { // we will read line by line String thisLine; BufferedReader myInput = null; try { FileInputStream fin = new FileInputStream(new File(path)); myInput = new BufferedReader(new InputStreamReader(fin)); // For each line (sequence) until end of file while ((thisLine = myInput.readLine()) != null) { // if the line is a comment, is empty or is a // kind of metadata if (thisLine.isEmpty() == true || thisLine.charAt(0) == '#' || thisLine.charAt(0) == '%' || thisLine.charAt(0) == '@') { continue; } // process this line (sequence) splitted into tokens processSequence(thisLine.split(" ")); } } catch (Exception e) { e.printStackTrace(); }finally { // Close the input file if(myInput != null){ myInput.close(); } } } /** * Process a line from the input file, splitted into tokens. * @param tokens a list of tokens (String). */ void processSequence(String[] tokens) { // // create a new Sequence Sequence sequence = new Sequence(sequences.size()); // Create an itemset that will be used to store items // from the first itemset and eventually the next itemsets. Itemset itemset = new Itemset(); // for each tokens for(String integer: tokens){ // if this token is a timestamp if(integer.codePointAt(0) == '<'){ // we extract the timestamp and set it as the timestamp // of the current itemset String value = integer.substring(1, integer.length()-1); itemset.setTimestamp(Long.parseLong(value)); }else if(integer.equals("-1")){ // If -1, this indicate the end of the current itemset, // so we add the itemset to the sequence, and // create a new itemset. sequence.addItemset(itemset); itemset = new Itemset(); }else if(integer.equals("-2")){ // If -2, it indicates the end of a sequence // If the last itemset is not empty, it means // that a -1 was missing. if(itemset.size() >0){ // in this case we add the current itemset to the sequence // because it is not empty sequence.addItemset(itemset); itemset = new Itemset(); } // finally, we add the sequence to the sequence database sequences.add(sequence); }else{ // otherwise, check if it is // an item with the format : id(value) where // id is the item ID and value is a value associated with the item // we find the position of the left parenthesis int indexLeftParenthesis = integer.indexOf("("); // if there is a left parenthesis if(indexLeftParenthesis != -1){ // we find the position of the left parenthesis int indexRightParenthesis = integer.indexOf(")"); // we extract the value int value = Integer.parseInt(integer.substring(indexLeftParenthesis+1, indexRightParenthesis)); // we extract the item ID integer = integer.substring(0, indexLeftParenthesis); // We create a new item with the item ID and the value ItemValued item = new ItemValued(Integer.parseInt(integer), value); // The item is then added to the current itemset itemset.addItem(item); }else{ // Otherwise, it is just a regular item without value. //The item ID is extracted ItemSimple item = new ItemSimple(Integer.parseInt(integer)); // If the item is not already in this itemset if(!itemset.getItems().contains(item)){ // we add it to the itemset. itemset.addItem(item); } } } } } /** * Add a sequence to the sequence database * @param sequence the sequence */ public void addSequence(Sequence sequence){ sequences.add(sequence); } /** * Print this sequence database to System.out */ public void print(){ System.out.println("============ Context =========="); // for each sequence for(Sequence sequence : sequences){ // print the sequence System.out.print(sequence.getId() + ": "); sequence.print(); System.out.println(""); } } /** * Get a string representation of this sequence */ public String toString(){ // create a string buffer StringBuilder r = new StringBuilder(); // for each sequence for(Sequence sequence : sequences){ // append the sequence id r.append(sequence.getId()); r.append(": "); // append the itemsets of the sequence r.append(sequence.toString()); r.append('\n'); } // return the string return r.toString(); } /** * Get the number of sequences. * @return an integer */ public int size(){ return sequences.size(); } /** * Get the list of sequences. * @return a List of sequences. */ public List<Sequence> getSequences() { return sequences; } /** * Get the set of sequence IDs. * @return a Set of Integer. */ public Set<Integer> getSequenceIDs() { Set<Integer> set = new HashSet<Integer>(); for(Sequence sequence : getSequences()){ set.add(sequence.getId()); } return set; } /** * Get the cluster that was used to create this projected sequence database. * @return the Cluster or null if none */ Cluster getCluster() { return cluster; } /** * Set the cluster that was used to create this projected sequence database. * @param cluster the Cluster */ void setCluster(Cluster cluster) { this.cluster = cluster; } }