package ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.database;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.Item;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.Itemset;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.Sequence;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.dataStructures.creators.ItemAbstractionPairCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.idlists.IDList;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.idlists.creators.IdListCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.tries.Trie;
import ca.pfv.spmf.algorithms.sequentialpatterns.clasp_AGP.tries.TrieNode;
/**
* Inspired by SPMF. Implementation of a sequence database. Each sequence should
* have a unique id. See examples in /test/ directory for the format of input
* files.
*
* Copyright Antonio Gomariz Peñalver 2013
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*
* @author agomariz
*/
public class SequenceDatabase {

    /** Creates the default abstraction attached to the node of every item read. */
    private final AbstractionCreator abstractionCreator;

    /** Creates the IdLists and records appearances and projection distances. */
    private final IdListCreator idListCreator;

    /**
     * Node associated with each item. While a file is being read this map holds
     * every item seen; {@link #loadFile(String, double)} removes the infrequent
     * ones once the whole file has been read.
     */
    private Map<Item, TrieNode> frequentItems = new HashMap<Item, TrieNode>();

    /** Sequences stored in this database. */
    private List<Sequence> sequences = new ArrayList<Sequence>();

    /** Factory that provides the item object for each integer identifier. */
    private ItemFactory<Integer> itemFactory = new ItemFactory<Integer>();

    /** Identifier given to the next sequence that is read; identifiers start at 1. */
    private int nSequences = 1;

    /**
     * Map where we keep the original length for all the sequences
     */
    private Map<Integer, Integer> sequencesLengths = new HashMap<Integer, Integer>();

    /**
     * Map where, for each sequence id, we keep a list with one entry per itemset.
     * Each entry is the value of {@code sequence.length()} at the moment the
     * itemset was closed.
     */
    private Map<Integer, List<Integer>> sequenceItemsetSize = new HashMap<Integer, List<Integer>>();

    /**
     * For each item, a map indexed by sequence id that is filled by
     * {@link IdListCreator#updateProjectionDistance}. It keeps the values
     * needed by the pruning methods.
     */
    private Map<Item, Map<Integer, List<Integer>>> projectingDistance = new HashMap<Item, Map<Integer, List<Integer>>>();

    /**
     * Standard constructor
     *
     * @param abstractionCreator creator of the abstractions used for the items
     * @param IdListCreator creator of the IdLists used for the items
     */
    public SequenceDatabase(AbstractionCreator abstractionCreator, IdListCreator IdListCreator) {
        this.abstractionCreator = abstractionCreator;
        this.idListCreator = IdListCreator;
    }

    /**
     * Method that loads a database from the file given as parameter, discards
     * the items that are not frequent and initializes the maps of the IdList
     * creator.
     *
     * Empty lines and lines starting with '#', '%' or '@' are skipped; every
     * other line is split on single spaces and handed to
     * {@link #addSequence(String[])}.
     *
     * @param path Path of the file where the database is
     * @param minSupport Minimum relative support; it is multiplied by the
     * number of sequences read to obtain the absolute threshold
     * @return the absolute minimum support, i.e. the ceiling of
     * {@code minSupport * number of sequences}
     * @throws IOException if the file cannot be read, or if a line contains a
     * token that cannot be parsed as a number
     */
    public double loadFile(String path, double minSupport) throws IOException {
        BufferedReader myInput = null;
        try {
            FileInputStream fin = new FileInputStream(new File(path));
            myInput = new BufferedReader(new InputStreamReader(fin));
            String thisLine;
            int lineNumber = 0;
            //For each line
            while ((thisLine = myInput.readLine()) != null) {
                lineNumber++;
                if (!isSequenceLine(thisLine)) {
                    continue;
                }
                try {
                    // we add a new sequence to the sequenceDatabase
                    addSequence(thisLine.split(" "));
                } catch (NumberFormatException e) {
                    // Report the malformed input instead of silently stopping
                    throw new IOException("Malformed sequence at line " + lineNumber + " of " + path, e);
                }
            }
            double support = (int) Math.ceil(minSupport * sequences.size());
            //We collect those items that are not frequent...
            Set<Item> itemsToRemove = new HashSet<Item>();
            for (Map.Entry<Item, TrieNode> entry : frequentItems.entrySet()) {
                TrieNode node = entry.getValue();
                IDList idList = node.getChild().getIdList();
                if (idList.getSupport() < support) {
                    itemsToRemove.add(entry.getKey());
                } else {
                    idList.setAppearingIn(node.getChild());
                }
            }
            //...we remove them from the map of frequent items...
            frequentItems.keySet().removeAll(itemsToRemove);
            //...and from the original database
            reduceDatabase(frequentItems.keySet());
            /*
             * We initialize all the maps
             */
            idListCreator.initializeMaps(frequentItems, projectingDistance, sequencesLengths, sequenceItemsetSize);
            return support;
        } finally {
            if (myInput != null) {
                myInput.close();
            }
        }
    }

    /**
     * Tells whether a line of the input file describes a sequence.
     *
     * @param line a line read from the input file
     * @return false for empty lines and for lines starting with '#', '%' or
     * '@' (comments and metadata), true otherwise
     */
    private static boolean isSequenceLine(String line) {
        if (line.length() == 0) {
            return false;
        }
        char first = line.charAt(0);
        return first != '#' && first != '%' && first != '@';
    }

    /**
     * Method that adds a sequence from an array of strings. The recognized
     * tokens are:
     * <ul>
     * <li>{@code <t>}: timestamp for the current itemset</li>
     * <li>{@code -1}: end of the current itemset</li>
     * <li>{@code -2}: end of the sequence, which is then stored</li>
     * <li>any other token without a parenthesis: an integer item</li>
     * </ul>
     * Empty tokens are skipped and tokens containing '(' are ignored.
     *
     * @param integers tokens that describe the sequence
     * @throws NumberFormatException if an item or a timestamp is not a number
     */
    public void addSequence(String[] integers) {
        ItemAbstractionPairCreator pairCreator = ItemAbstractionPairCreator.getInstance();
        long timestamp = -1;
        Sequence sequence = new Sequence(sequences.size());
        Itemset itemset = new Itemset();
        sequence.setID(nSequences);
        List<Integer> sizeItemsetsList = new ArrayList<Integer>();
        for (int i = 0; i < integers.length; i++) {
            String token = integers[i];
            if (token.length() == 0) {
                // Consecutive separators produce empty tokens; nothing to read
                continue;
            }
            if (token.charAt(0) == '<') { // Timestamp
                // The first and the last characters are dropped
                String value = token.substring(1, token.length() - 1);
                timestamp = Long.parseLong(value);
                itemset.setTimestamp(timestamp);
            } else if (token.equals("-1")) { // End of an Itemset
                // The next itemset starts, by default, one time unit later
                timestamp = itemset.getTimestamp() + 1;
                sequence.addItemset(itemset);
                itemset = new Itemset();
                itemset.setTimestamp(timestamp);
                sizeItemsetsList.add(sequence.length());
            } else if (token.equals("-2")) { // End of a sequence
                sequences.add(sequence);
                nSequences++;
                sequencesLengths.put(sequence.getId(), sequence.length());
                sequenceItemsetSize.put(sequence.getId(), sizeItemsetsList);
            } else if (token.indexOf("(") == -1) { // an item with the format: id
                Item item = itemFactory.getItem(Integer.parseInt(token));
                TrieNode node = frequentItems.get(item);
                if (node == null) {
                    // First appearance of the item: we create its node and IdList
                    IDList newIdList = idListCreator.create();
                    node = new TrieNode(pairCreator.getItemAbstractionPair(item, abstractionCreator.createDefaultAbstraction()), new Trie(null, newIdList));
                    frequentItems.put(item, node);
                }
                IDList idlist = node.getChild().getIdList();
                if (timestamp < 0) {
                    // No timestamp has been given yet: the first itemset gets 1
                    timestamp = 1;
                    itemset.setTimestamp(timestamp);
                }
                itemset.addItem(item);
                idListCreator.addAppearance(idlist, sequence.getId(), (int) timestamp, sequence.length() + itemset.size());
                idListCreator.updateProjectionDistance(projectingDistance, item, sequence.getId(), sequence.size(), sequence.length() + itemset.size());
            }
            // Items with the format id(value) are not handled and are ignored
        }
    }

    /**
     * Adds an already built sequence to the database. Only the list of
     * sequences is updated; the IdLists and the auxiliary maps are not.
     *
     * @param sequence the sequence to add
     */
    public void addSequence(Sequence sequence) {
        sequences.add(sequence);
    }

    /**
     * Get the string representation of this SequenceDatabase
     *
     * @return the string representation, one line per sequence with the
     * format "id: sequence"
     */
    @Override
    public String toString() {
        StringBuilder r = new StringBuilder();
        for (Sequence sequence : sequences) {
            r.append(sequence.getId());
            r.append(": ");
            r.append(sequence.toString());
            r.append('\n');
        }
        return r.toString();
    }

    /**
     * Get the number of sequences stored in the database
     *
     * @return the number of sequences
     */
    public int size() {
        return sequences.size();
    }

    /**
     * Get the sequences of the database
     *
     * @return the internal list of sequences (not a copy)
     */
    public List<Sequence> getSequences() {
        return sequences;
    }

    /**
     * Get the equivalence classes associated with the frequent items
     * that we have found.
     *
     * @return a new, sorted trie whose nodes are those of the frequent items
     */
    public Trie frequentItems() {
        Trie result = new Trie();
        List<TrieNode> frequentItemsNodes = new ArrayList<TrieNode>(frequentItems.values());
        result.setNodes(frequentItemsNodes);
        result.sort();
        return result;
    }

    /**
     * Get the map that makes the matching between items and
     * equivalence classes
     *
     * @return the internal map (not a copy)
     */
    public Map<Item, TrieNode> getFrequentItems() {
        return frequentItems;
    }

    /**
     * It reduces the original database to just frequent items. Itemsets and
     * sequences that become empty are removed as well.
     *
     * @param keySet the items to keep
     */
    private void reduceDatabase(Set<Item> keySet) {
        for (int k = 0; k < sequences.size(); k++) {
            Sequence sequence = sequences.get(k);
            for (int i = 0; i < sequence.size(); i++) {
                Itemset itemset = sequence.get(i);
                for (int j = 0; j < itemset.size(); j++) {
                    Item item = itemset.get(j);
                    if (!keySet.contains(item)) {
                        sequence.remove(i, j);
                        // Revisit the same position after the removal
                        j--;
                    }
                }
                if (itemset.size() == 0) {
                    sequence.remove(i);
                    i--;
                }
            }
            if (sequence.size() == 0) {
                sequences.remove(k);
                k--;
            }
        }
    }

    /**
     * Releases the content of the database. The sequences and the frequent
     * items are emptied and every data structure reference is set to null, so
     * the database must not be used after calling this method.
     */
    public void clear() {
        if (sequences != null) {
            sequences.clear();
        }
        sequences = null;
        if (frequentItems != null) {
            frequentItems.clear();
        }
        frequentItems = null;
        itemFactory = null;
        projectingDistance = null;
        sequenceItemsetSize = null;
        sequencesLengths = null;
    }
}