package ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.creators.ItemAbstractionPairCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.patterns.Pattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.gsp_AGP.items.patterns.PatternCreator;

/**
 * Inspired by SPMF.
 * Implementation of a sequence database: an ordered list of {@link Sequence}
 * objects plus a map from every item seen to the 1-pattern built from it.
 * See examples in /test/ directory for the format of input files.
 *
 * Copyright (c) 2013 Antonio Gomariz Peñalver
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * SPMF is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with SPMF. If not, see <http://www.gnu.org/licenses/>.
 *
 * @author agomariz
 */
public class SequenceDatabase {

    /**
     * Creator used to obtain the default abstraction attached to each
     * 1-pattern built while parsing.
     */
    private AbstractionCreator abstractionCreator;
    /**
     * Map to associate the items with the 1-patterns composed by themselves.
     * While sequences are being added it holds every item seen; once
     * {@link #loadFile(String, double)} has finished it holds only the
     * frequent ones.
     */
    private Map<Item, Pattern> frequentItems = new HashMap<Item, Pattern>();
    /**
     * List of sequences that compose the database
     */
    private List<Sequence> sequences = new ArrayList<Sequence>();
    /**
     * Instance of ItemFactory
     */
    private ItemFactory<Integer> itemFactory = new ItemFactory<Integer>();
    /**
     * Instance of PatternCreator
     */
    private PatternCreator patternCreator = PatternCreator.getInstance();

    /**
     * Creates an empty database.
     * @param abstractionCreator creator used to build the default abstraction
     *        of the 1-patterns registered by {@link #addSequence(String[])}
     */
    public SequenceDatabase(AbstractionCreator abstractionCreator) {
        this.abstractionCreator = abstractionCreator;
    }

    /**
     * It loads the database contained in the file path given as parameter.
     * Besides, all the frequent 1-patterns are identified and the original
     * database is updated by removing the non-frequent items.
     * Empty lines and lines starting with '#' are skipped.
     *
     * @param path File path of the original database
     * @param minSupportAbsolute Support threshold. Despite its name it is used
     *        as a relative value: it is multiplied by the number of loaded
     *        sequences and rounded up to obtain the minimum number of
     *        appearances an item needs in order to be kept.
     * @throws IOException if the file cannot be opened or read, or if a line
     *         contains a token that cannot be parsed as a number
     */
    public void loadFile(String path, double minSupportAbsolute) throws IOException {
        BufferedReader myInput = null;
        try {
            FileInputStream fis = new FileInputStream(new File(path));
            myInput = new BufferedReader(new InputStreamReader(fis));
            String thisLine;
            int lineNumber = 0;
            while ((thisLine = myInput.readLine()) != null) {
                lineNumber++;
                // A blank line carries no sequence (and has no first character
                // to inspect); a leading '#' marks a comment line.
                if (thisLine.isEmpty() || thisLine.charAt(0) == '#') {
                    continue;
                }
                try {
                    addSequence(thisLine.split(" "));
                } catch (NumberFormatException e) {
                    // Report malformed input instead of silently stopping
                    // with a half-loaded, unpruned database.
                    throw new IOException("Malformed sequence at line " + lineNumber
                            + " of " + path + ": " + e.getMessage(), e);
                }
            }
        } finally {
            if (myInput != null) {
                myInput.close();
            }
        }

        // Turn the relative threshold into a number of sequences.
        int minSupportCount = (int) Math.ceil(minSupportAbsolute * sequences.size());

        // Collect first and remove afterwards so that the map is not modified
        // while it is being iterated.
        Set<Item> itemsToRemove = new HashSet<Item>();
        for (Map.Entry<Item, Pattern> entry : frequentItems.entrySet()) {
            if (entry.getValue().getSupport() < minSupportCount) {
                itemsToRemove.add(entry.getKey());
            }
        }
        frequentItems.keySet().removeAll(itemsToRemove);

        shrinkDatabase(frequentItems.keySet());
    }

    /**
     * It creates and adds the sequence found in the array of Strings.
     * Recognised tokens are: {@code <n>} (sets the timestamp of the current
     * itemset to n), {@code -1} (closes the current itemset), {@code -2}
     * (stores the sequence in the database) and any other integer (an item of
     * the current itemset). Empty tokens, as produced by consecutive
     * separators, are ignored.
     *
     * @param integers tokens of one sequence
     * @throws NumberFormatException if an item or timestamp token is not a
     *         valid number
     */
    public void addSequence(String[] integers) {
        ItemAbstractionPairCreator pairCreator = ItemAbstractionPairCreator.getInstance();
        Sequence sequence = new Sequence(sequences.size());
        Itemset itemset = new Itemset();
        for (String token : integers) {
            if (token.isEmpty()) {
                // Two consecutive separators; nothing to parse.
                continue;
            }
            if (token.codePointAt(0) == '<') {
                // Timestamp: strip the first and the last character of the token.
                if (token.length() < 2) {
                    throw new NumberFormatException("Malformed timestamp token: \"" + token + "\"");
                }
                long timestamp = Long.parseLong(token.substring(1, token.length() - 1));
                itemset.setTimestamp(timestamp);
            } else if (token.equals("-1")) {
                // End of an itemset: the next one starts, by default, one time
                // unit after the one just closed.
                long time = itemset.getTimestamp() + 1;
                sequence.addItemset(itemset);
                itemset = new Itemset();
                itemset.setTimestamp(time);
            } else if (token.equals("-2")) {
                // End of the sequence
                sequences.add(sequence);
            } else {
                // Extract the value for an item
                Item item = itemFactory.getItem(Integer.parseInt(token));
                Pattern pattern = frequentItems.get(item);
                if (pattern == null) {
                    // First time this item is seen: create its 1-pattern.
                    pattern = patternCreator.createPattern(
                            pairCreator.getItemAbstractionPair(item, abstractionCreator.CreateDefaultAbstraction()));
                    frequentItems.put(item, pattern);
                }
                pattern.addAppearance(sequence.getId());
                itemset.addItem(item);
            }
        }
    }

    /**
     * Appends an already built sequence to the database. The item map is not
     * updated by this method.
     * @param sequence the sequence to add
     */
    public void addSequence(Sequence sequence) {
        sequences.add(sequence);
    }

    /**
     * Get the string representation of this database: one line per sequence,
     * formed by its id, a colon and the sequence itself.
     * @return the string representation
     */
    @Override
    public String toString() {
        StringBuilder r = new StringBuilder();
        for (Sequence sequence : sequences) {
            r.append(sequence.getId());
            r.append(": ");
            r.append(sequence.toString());
            r.append('\n');
        }
        return r.toString();
    }

    /**
     * @return the number of sequences in the database
     */
    public int size() {
        return sequences.size();
    }

    /**
     * @return the internal list of sequences (not a copy)
     */
    public List<Sequence> getSequences() {
        return sequences;
    }

    /**
     * It returns the 1-patterns currently held in the item map, sorted by
     * their natural ordering.
     * @return a new sorted list with the patterns
     */
    public List<Pattern> frequentItems() {
        List<Pattern> frequentItemPatterns = new ArrayList<Pattern>(frequentItems.values());
        Collections.sort(frequentItemPatterns);
        return frequentItemPatterns;
    }

    /**
     * It returns the map that associates each item with the 1-pattern
     * composed by itself.
     * @return the internal map (not a copy)
     */
    public Map<Item, Pattern> getFrequentItems() {
        return frequentItems;
    }

    /**
     * Empties the sequence list and the item map and then drops the
     * references to them and to the item factory. Since those fields become
     * null, the other methods of this object must not be called afterwards.
     */
    public void clear() {
        if (sequences != null) {
            sequences.clear();
        }
        sequences = null;
        if (frequentItems != null) {
            frequentItems.clear();
        }
        frequentItems = null;
        itemFactory = null;
    }

    /**
     * Reduce the original database, keeping only the items contained in the
     * given set: every other item is removed, and so is every itemset that
     * has no items left.
     * @param keySet the items to keep
     */
    private void shrinkDatabase(Set<Item> keySet) {
        for (Sequence sequence : sequences) {
            for (int i = 0; i < sequence.size(); i++) {
                Itemset itemset = sequence.get(i);
                for (int j = 0; j < itemset.size(); j++) {
                    Item item = itemset.get(j);
                    if (!keySet.contains(item)) {
                        sequence.remove(i, j);
                        // Stay on the same index: the next item has moved into it.
                        j--;
                    }
                }
                if (itemset.size() == 0) {
                    sequence.remove(i);
                    // Stay on the same index: the next itemset has moved into it.
                    i--;
                }
            }
        }
    }
}