package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalsequentialpatterns;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.ItemSimple;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.ItemValued;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.Itemset;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.Sequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.SequenceDatabase;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns.MDPattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns.MDPatternsDatabase;
/**
 * Implementation of a "Multi-Dimensional Sequence Database"
 * as used by the SeqDim algorithm (Pinto et al., 2001).
 * <br/><br/>
 *
 * A MD-Sequence database contains a list of MD-Sequences, where
 * each MD-Sequence is composed of an MD-Pattern and a Sequence.
 * <br/><br/>
 *
 * Input file format, as parsed by {@link #loadFile(String)}: one MD-Sequence
 * per line, tokens separated by a space. The dimension values come first
 * ("*" is a wildcard) and are terminated by "-3". Then comes the sequence:
 * an optional timestamp written "&lt;t&gt;", items written "id" or "id(value)",
 * "-1" to end an itemset and "-2" to end the MD-Sequence.
 *
 * @see MDSequence
 * @see MDSequences
 * @see AlgoSeqDim
 * @author Philippe Fournier-Viger
 */
public class MDSequenceDatabase {

    /** List of md-sequences */
    private final List<MDSequence> sequences = new ArrayList<MDSequence>();

    /**
     * The sequence parts of the md-sequences, also kept in a separate
     * sequence database.
     */
    private final SequenceDatabase sequenceDatabase = new SequenceDatabase();

    /** The mdpattern parts of the md-sequences, also kept in a separate database */
    private final MDPatternsDatabase patternDatabase = new MDPatternsDatabase();

    /** The set of item IDs read by {@link #loadFile(String)} */
    private final Set<ItemSimple> itemIDs = new HashSet<ItemSimple>();

    /**
     * The ID that will be given to the next md-sequence read from a file
     * (it is incremented each time a md-sequence is completely read).
     */
    private int sequenceNumber = 0;

    /**
     * Get the number of distinct items in this database.
     * Only the items of md-sequences read with {@link #loadFile(String)}
     * are counted.
     * @return an integer
     */
    public int getItemCount() {
        return itemIDs.size();
    }

    /**
     * Load a MD-Sequence database from a file.
     * Lines that are empty or that start with '#', '%' or '@' are skipped.
     * If a line is malformed (e.g. a token is not a number), the stack trace
     * is printed and the loading stops at that line; what was read before
     * remains in the database.
     * @param path the path of the file
     * @throws IOException exception if error reading the file
     */
    public void loadFile(String path) throws IOException {
        BufferedReader myInput = null;
        try {
            FileInputStream fin = new FileInputStream(new File(path));
            myInput = new BufferedReader(new InputStreamReader(fin));
            String thisLine;
            // for each line until the end of file
            while ((thisLine = myInput.readLine()) != null) {
                // if the line is a comment, is empty or is a
                // kind of metadata
                if (thisLine.isEmpty()
                        || thisLine.charAt(0) == '#'
                        || thisLine.charAt(0) == '%'
                        || thisLine.charAt(0) == '@') {
                    continue;
                }
                // split the MDsequence into tokens
                // and process this MDsequence
                processMDSequence(thisLine.split(" "));
            }
        } catch (RuntimeException e) {
            // Only parsing errors are handled here (same behavior as before:
            // report and stop reading). I/O errors are deliberately NOT caught
            // so that they reach the caller, as declared by this method.
            e.printStackTrace();
        } finally {
            // close the file
            if (myInput != null) {
                myInput.close();
            }
        }
    }

    /**
     * Process a line of the input file representing a MDSequence.
     * Empty tokens (caused by leading or repeated spaces) are ignored.
     * @param tokens a list of tokens from the line as String[]
     */
    private void processMDSequence(String[] tokens) {
        // (1) First, read the MDpattern part of the MDSequence
        MDPattern mdpattern = new MDPattern(sequenceNumber);
        int i = 0;
        // for each token until the end of the mdpattern (-3):
        for (; i < tokens.length; i++) {
            String token = tokens[i];
            if (token.isEmpty()) {
                // produced by consecutive spaces: nothing to read
                continue;
            }
            if (token.equals("-3")) {
                // end of the mdpattern
                break;
            } else if (token.equals("*")) {
                // "*" is a wildcard
                mdpattern.addInteger(MDPattern.WILDCARD);
            } else {
                // otherwise, it is a dimension value
                mdpattern.addInteger(Integer.valueOf(token));
            }
        }

        // (2) Now that the mdpattern has been read, the next step
        // is to read the sequence part of this mdsequence.
        Sequence sequence = new Sequence(sequenceNumber);
        // the itemset that is currently being read
        Itemset itemset = new Itemset();
        // The item IDs seen on this line. They are only copied to the
        // database when the md-sequence is completed (-2), so that a line
        // that fails to parse leaves no trace.
        // NOTE(review): counting *distinct* items relies on ItemSimple
        // defining equals()/hashCode() on the item ID - confirm.
        Set<ItemSimple> lineItemIDs = new HashSet<ItemSimple>();

        // for each token after the "-3" until the last one
        for (i++; i < tokens.length; i++) {
            String token = tokens[i];
            if (token.isEmpty()) {
                continue;
            }
            if (token.charAt(0) == '<') {
                // a timestamp "<t>": remove the first and last characters
                // and set the value as timestamp of the current itemset
                String value = token.substring(1, token.length() - 1);
                itemset.setTimestamp(Long.parseLong(value));
            } else if (token.equals("-1")) {
                // end of the current itemset: add it to the sequence
                // and start a new itemset
                sequence.addItemset(itemset);
                itemset = new Itemset();
            } else if (token.equals("-2")) {
                // end of the MDSequence: create the object with the
                // mdpattern and sequence and register it everywhere
                MDSequence mdsequence = new MDSequence(sequenceNumber, mdpattern, sequence);
                sequences.add(mdsequence);
                sequenceDatabase.addSequence(sequence);
                patternDatabase.addMDPattern(mdpattern);
                itemIDs.addAll(lineItemIDs);
                sequenceNumber++;
            } else {
                // Otherwise, it is an item.
                // An item can have a value between parentheses.
                int indexLeftParenthesis = token.indexOf('(');
                int itemId;
                if (indexLeftParenthesis != -1) {
                    // item with a value: "id(value)"
                    int indexRightParenthesis = token.indexOf(')');
                    int value = Integer.parseInt(
                            token.substring(indexLeftParenthesis + 1, indexRightParenthesis));
                    itemId = Integer.parseInt(token.substring(0, indexLeftParenthesis));
                    ItemValued item = new ItemValued(itemId, value);
                    itemset.addItem(item);
                } else {
                    // simple item: the token is the item ID
                    itemId = Integer.parseInt(token);
                    ItemSimple item = new ItemSimple(itemId);
                    itemset.addItem(item);
                }
                // remember the ID of this item (without its value)
                lineItemIDs.add(new ItemSimple(itemId));
            }
        }
    }

    /**
     * Add an MDSequence to this MDSequence database.
     * The items of the sequence are not added to the set returned by
     * {@link #getItemIDs()}.
     * @param sequence an MDSequence
     */
    public void addSequence(MDSequence sequence) {
        // add it to the list of sequences
        sequences.add(sequence);
        // add the sequence and mdpattern parts to the respective
        // databases.
        sequenceDatabase.addSequence(sequence.getSequence());
        patternDatabase.addMDPattern(sequence.getMdpattern());
    }

    /**
     * Print this database to system.out.
     */
    public void printDatabase() {
        System.out.println(toString());
    }

    /**
     * Get a String representation of this database:
     * a header line followed by one line per md-sequence.
     * @return a string
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("============ MD Sequence Database ==========\n");
        // append each mdsequence on its own line
        for (MDSequence sequence : sequences) {
            out.append(sequence.toString()).append("\n");
        }
        return out.toString();
    }

    /**
     * Get the number of MDsequences.
     * @return a integer value
     */
    public int size() {
        return sequences.size();
    }

    /**
     * Get the list of MDSequences stored in this database.
     * The returned list is the internal list of this database (not a copy).
     * @return a List of MDSequence objects.
     */
    public List<MDSequence> getSequences() {
        return sequences;
    }

    /**
     * Get the i-th MDSequence.
     * @param index the position i.
     * @return the MDSequence
     */
    public MDSequence get(int index) {
        return sequences.get(index);
    }

    /**
     * Get the set of item IDs in this database.
     * It contains the items of the md-sequences read with
     * {@link #loadFile(String)}; sequences added with
     * {@link #addSequence(MDSequence)} are not reflected.
     * The returned set is the internal set of this database (not a copy).
     * @return a set of Item objects.
     */
    public Set<ItemSimple> getItemIDs() {
        return itemIDs;
    }

    /**
     * Get the list of sequences contained in the MDSequences.
     * @return a SequenceDatabase
     */
    public SequenceDatabase getSequenceDatabase() {
        return sequenceDatabase;
    }

    /**
     * Get the list of MDPatterns contained in the MDSequences.
     * @return a MDPatternsDatabase
     */
    public MDPatternsDatabase getPatternDatabase() {
        return patternDatabase;
    }
}