package ca.pfv.spmf.input.sequence_database_list_strings;

/* Copyright (c) 2008-2013 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Implementation of a sequence database as a list of sequences whose items
 * are strings.
 * Each sequence should have a unique id.
 * See examples in /test/ directory for the format of input files.
 *
 * @see Sequence
 * @author Philippe Fournier-Viger
 */
public class SequenceDatabase {

    /**
     * Orders items by ascending {@link String#hashCode()}.
     * <p>
     * The hash codes are compared explicitly instead of being subtracted:
     * the difference of two arbitrary ints can overflow and flip its sign,
     * which breaks the {@link Comparator} contract. For every pair whose
     * difference does not overflow, the result is the same as subtraction.
     */
    private static final Comparator<String> HASH_CODE_ORDER = new Comparator<String>() {
        @Override
        public int compare(String first, String second) {
            int firstHash = first.hashCode();
            int secondHash = second.hashCode();
            if (firstHash < secondHash) {
                return -1;
            }
            return (firstHash == secondHash) ? 0 : 1;
        }
    };

    /** The sequences of this database, in insertion order. */
    private final List<Sequence> sequences = new ArrayList<Sequence>();

    /**
     * Load a sequence database from a text file in SPMF format.
     * <p>
     * Empty lines and lines starting with '#', '%' or '@' (comments and
     * metadata) are skipped. Every other line is split on single spaces and
     * handed to {@link #addSequence(String[])}.
     * <p>
     * Any exception raised while opening, reading or parsing the file is
     * caught, its stack trace is printed, and loading stops; sequences added
     * before the failure remain in the database.
     *
     * @param path the input file path.
     * @throws IOException if closing the reader fails.
     */
    public void loadFile(String path) throws IOException {
        BufferedReader reader = null;
        try {
            FileInputStream fin = new FileInputStream(new File(path));
            reader = new BufferedReader(new InputStreamReader(fin));
            String line;
            // for each line until the end of the file
            while ((line = reader.readLine()) != null) {
                if (isDataLine(line)) {
                    // split this line according to spaces and process it
                    addSequence(line.split(" "));
                }
            }
        } catch (Exception e) {
            // Best-effort loading: report the problem and keep what was read.
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Tell whether a line of the input file holds a sequence.
     *
     * @param line a line read from the input file.
     * @return false if the line is empty or starts with '#', '%' or '@',
     *         true otherwise.
     */
    private static boolean isDataLine(String line) {
        if (line.isEmpty()) {
            return false;
        }
        char first = line.charAt(0);
        return first != '#' && first != '%' && first != '@';
    }

    /**
     * Process the tokens of one line of the input file.
     * <p>
     * Tokens are interpreted as follows:
     * <ul>
     * <li>an empty token (produced by consecutive or leading spaces) is
     * skipped;</li>
     * <li>a token starting with "&lt;" is treated as a timestamp and
     * ignored;</li>
     * <li>"-1" ends the current itemset, which is sorted by item hash code
     * and appended to the sequence;</li>
     * <li>"-2" ends the sequence, which is appended to this database;</li>
     * <li>any other token is an item added to the current itemset.</li>
     * </ul>
     * The new sequence receives the current number of sequences as its id.
     * Items that are not followed by "-1" are not added to the sequence, and
     * a sequence that is not followed by "-2" is not added to the database.
     *
     * @param tokens the tokens from the line (which were separated by spaces
     *               in the original file).
     */
    void addSequence(String[] tokens) {
        Sequence sequence = new Sequence(sequences.size());
        List<String> itemset = new ArrayList<String>();

        for (String item : tokens) {
            if (item.isEmpty()) {
                // Nothing to read; without this check the first-character
                // test below would throw on an empty token.
                continue;
            }
            if (item.charAt(0) == '<') {
                // Timestamp: ignored because algorithms that use this class
                // don't need it.
                continue;
            }
            if (item.equals("-1")) {
                // End of an itemset. Sort it first, since several algorithms
                // rely on a consistent item order.
                Collections.sort(itemset, HASH_CODE_ORDER);
                sequence.addItemset(itemset);
                // start a fresh itemset for the items that follow (if any)
                itemset = new ArrayList<String>();
            } else if (item.equals("-2")) {
                // End of the sequence.
                sequences.add(sequence);
            } else {
                // A regular item.
                itemset.add(item);
            }
        }
    }

    /**
     * Add a sequence to this sequence database.
     *
     * @param sequence A sequence.
     */
    public void addSequence(Sequence sequence) {
        sequences.add(sequence);
    }

    /**
     * Print this sequence database to System.out, one sequence per line,
     * each prefixed by its id.
     */
    public void printDatabase() {
        System.out.println("============ Database ==========");
        // for each sequence
        for (Sequence sequence : sequences) {
            System.out.print(sequence.getId() + ": ");
            sequence.print();
            System.out.println("");
        }
    }

    /**
     * Print some statistics about this sequence database to System.out:
     * the number of sequences and the average sequence size.
     * <p>
     * The average is printed as NaN when the database is empty.
     */
    public void printDatabaseStats() {
        System.out.println("============ STATS ==========");
        System.out.println("Number of sequences : " + sequences.size());

        // Sum the sizes of all sequences.
        long size = 0;
        for (Sequence sequence : sequences) {
            size += sequence.size();
        }
        // Divide in double precision so that the printed mean does not carry
        // single-precision rounding noise.
        double meansize = ((double) size) / sequences.size();
        System.out.println("Average sequence size : " + meansize);
    }

    /**
     * Return a string representation of this sequence database: one line
     * per sequence, made of its id, ": " and its own string representation.
     *
     * @return the string representation.
     */
    @Override
    public String toString() {
        StringBuilder r = new StringBuilder();
        // for each sequence
        for (Sequence sequence : sequences) {
            r.append(sequence.getId());
            r.append(": ");
            r.append(sequence.toString());
            r.append('\n');
        }
        return r.toString();
    }

    /**
     * Get the sequence count in this database.
     *
     * @return the sequence count.
     */
    public int size() {
        return sequences.size();
    }

    /**
     * Get the sequences from this sequence database.
     *
     * @return the list of sequences (Sequence) held by this database; it is
     *         the internal list itself, not a copy.
     */
    public List<Sequence> getSequences() {
        return sequences;
    }

    /**
     * Get the sequence IDs for this database.
     *
     * @return A newly created set containing the IDs of the sequences in
     *         this database.
     */
    public Set<Integer> getSequenceIDs() {
        Set<Integer> set = new HashSet<Integer>();
        for (Sequence sequence : getSequences()) {
            set.add(sequence.getId());
        }
        return set;
    }
}