package ca.pfv.spmf.input.sequence_database_array_integers; /* Copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; /** * Implementation of a sequence database, where each sequence is implemented * as an array of integers and should have a unique id. * * @see Sequence * @author Philipe-Fournier-Viger */ public class SequenceDatabase { /** smallest item in this database*/ public int minItem = Integer.MAX_VALUE; /** largest item in this database */ public int maxItem = 0; /** the number of sequences in this database */ public int tidsCount =0; /** variable that contains the sequences of this database */ private final List<Sequence> sequences = new ArrayList<Sequence>(); /** * Method to load a sequence database from a text file in SPMF format. * @param path the input file path. * @throws IOException exception if error while reading the file. */ public void loadFile(String path) throws IOException { String thisLine; // variable to read each line. BufferedReader myInput = null; try { FileInputStream fin = new FileInputStream(new File(path)); myInput = new BufferedReader(new InputStreamReader(fin)); // for each line until the end of the file while ((thisLine = myInput.readLine()) != null) { // if the line is not a comment, is not empty or is not other // kind of metadata if (thisLine.isEmpty() == false && thisLine.charAt(0) != '#' && thisLine.charAt(0) != '%' && thisLine.charAt(0) != '@') { // split this line according to spaces and process the line addSequence(thisLine.split(" ")); } } } catch (Exception e) { e.printStackTrace(); } finally { if (myInput != null) { myInput.close(); } } } // /** * Method to process a line from the input file * @param tokens A list of tokens from the line (which were separated by spaces in the original file). */ public void addSequence(String[] tokens) { // create a new Sequence to store the sequence Sequence sequence = new Sequence(); // create a list of strings for the first itemset. List<Integer> itemset = new ArrayList<Integer>(); // for each token in this line for (String token : tokens) { // if the token start with "<", it indicates a timestamp. // We just ignore it because algorithms that use this class // don't need it. if (token.codePointAt(0) == '<') { // we just ignore } // if the token is -1, it means that we reached the end of an itemset. else if (token.equals("-1")) { // add the current itemset to the sequence sequence.addItemset(itemset.toArray()); // create a new itemset itemset = new ArrayList<Integer>(); } // if the token is -2, it means that we reached the end of // the sequence. else if (token.equals("-2")) { // we add it to the list of sequences sequences.add(sequence); } else { // Otherwise it is an item. // We parse it as an integer. Integer item = Integer.parseInt(token); // we update the maximum item for statistics if(item >= maxItem){ maxItem = item; } // we update the minimum item for statistics if(item < minItem){ minItem = item; } // we add the item to the current itemset itemset.add(item); } } // tidsCount++; } /** * Method to add a sequence to this sequence database * @param sequence A sequence of type "Sequence". */ public void addSequence(Sequence sequence) { sequences.add(sequence); } /** * Print this sequence database to System.out. */ public void print() { System.out.println("============ CONTEXTE =========="); for (int i=0 ; i < sequences.size(); i++) { // pour chaque objet System.out.print(i + ": "); sequences.get(i).print(); System.out.println(""); } } /** * Print statistics about this database. */ public void printDatabaseStats() { System.out.println("============ STATS =========="); System.out.println("Number of sequences : " + sequences.size()); System.out.println("Min item:" + minItem); System.out.println("Max item:" + maxItem); // Calculate the average size of sequences in this database long size = 0; for(Sequence sequence : sequences){ size += sequence.size(); } double meansize = ((float)size) / ((float)sequences.size()); System.out.println("mean size" + meansize); } /** * Return a string representation of this sequence database. */ public String toString() { StringBuilder r = new StringBuilder(); // for each sequence for (int i=0 ; i < sequences.size(); i++) { r.append(i); r.append(": "); r.append(sequences.get(i).toString()); r.append('\n'); } return r.toString(); } /** * Get the sequence count in this database. * @return the sequence count. */ public int size() { return sequences.size(); } /** * Get the sequences from this sequence database. * @return A list of sequences (Sequence). */ public List<Sequence> getSequences() { return sequences; } // // public void printDatabaseStats() { // System.out.println("============ STATS =========="); // System.out.println("Number of sequences : " + sequences.size()); // System.out.println("Min item:" + minItem); // System.out.println("Max item:" + maxItem); // // average size of sequence // long size = 0; // long sizeItems = 0; // ArrayList<Integer> listItems = new ArrayList<Integer>(); // for(Sequence sequence : sequences){ // int itemCount = 0; // for(Integer[] array : sequence.getItemsets()){ // itemCount += array.length; // } // sizeItems += itemCount; // listItems.add(itemCount); // size += sequence.size(); // } // double meansizeItems = ((float)sizeItems) / ((float)sequences.size()); // // standard deviation // double std =0; // for(Integer elementList : listItems){ // std += Math.abs((meansizeItems - elementList) / ((float)sequences.size())); // } // std = Math.sqrt(std); // // System.out.println("mean item count" + meansizeItems + " std : " + std); //// System.out.println("mean itemset count" + meansize); // } // public void loadFile(String path, int maxlineCount) throws IOException { // String thisLine; // BufferedReader myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(path)); // myInput = new BufferedReader(new InputStreamReader(fin)); // int i=0; // while ((thisLine = myInput.readLine()) != null) { // // si la ligne n'est pas un commentaire // if (thisLine.charAt(0) != '#') { // // ajoute une s�quence // addSequence(thisLine.split(" ")); // i++; // if(i == maxlineCount){ // break; // } // } // } // } catch (Exception e) { // e.printStackTrace(); // } finally { // if (myInput != null) { // myInput.close(); // } // } // } // // ---------------------- Pour le fomat g�n�r� par seq_data_generator // public void loadFileBinaryFormat(String path, int maxcount) { // // TODO Auto-generated method stub // DataInputStream myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(path)); // myInput = new DataInputStream(fin); // // Sequence sequence = new Sequence(); // List<Integer> itemset = new ArrayList<Integer>(); // while (true) { // int value = INT_little_endian_TO_big_endian(myInput.readInt()); // // // System.out.println(value); // if (value == -1) { // sequence.addItemset(itemset.toArray()); // itemset = new ArrayList<Integer>(); // } else if (value == -2) { // sequences.add(sequence); // if (sequences.size() == maxcount) { // break; // } // sequence = new Sequence(); // } else { // itemset.add(value); // } // // } // } catch (Exception e) { // e.printStackTrace(); // } // } // // // 4-byte number this function was taken from the internet (by Anghel // // Leonard) // int INT_little_endian_TO_big_endian(int i) { // return ((i & 0xff) << 24) + ((i & 0xff00) << 8) + ((i & 0xff0000) >> 8) // + ((i >> 24) & 0xff); // } // // public void loadFileKosarakFormat(String filepath, int nblinetoread) // throws IOException { //String thisLine; //BufferedReader myInput = null; //try { // FileInputStream fin = new FileInputStream(new File(filepath)); // myInput = new BufferedReader(new InputStreamReader(fin)); // int i = 0; // while ((thisLine = myInput.readLine()) != null) { // // ajoute une s�quence // String[] split = thisLine.split(" "); // i++; // if (nblinetoread == i) { // break; // } // Sequence sequence = new Sequence(); // for (String value : split) { // List<Integer> itemset = new ArrayList<Integer>(); // Integer item = Integer.parseInt(value); // if(item >= maxItem){ // maxItem = item; // } // if(item < minItem){ // minItem = item; // } // itemset.add(item); // sequence.addItemset(itemset.toArray()); // } // sequences.add(sequence); // // } //} catch (Exception e) { // e.printStackTrace(); //} finally { // if (myInput != null) { // myInput.close(); // } //} //} // // // public void loadFileKosarakFormatV2(String filepath, int nblinetoread) // throws IOException { // String thisLine; // BufferedReader myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(filepath)); // myInput = new BufferedReader(new InputStreamReader(fin)); // int i = 0; // while ((thisLine = myInput.readLine()) != null) { // // ajoute une s�quence // String[] split = thisLine.split(" "); // i++; //// if (nblinetoread == i) { //// break; //// } // if(split.length < 25){ // continue; // } // Sequence sequence = new Sequence(); // int count=0; // List<Integer> itemset = new ArrayList<Integer>(); // for (String value : split) { // Integer item = Integer.parseInt(value); // itemset.add(item); // if(item >= maxItem){ // maxItem = item; // } // if(item < minItem){ // minItem = item; // } // count++; // if(count == 3){ // sequence.addItemset(itemset.toArray()); // itemset = new ArrayList<Integer>(); // count =0; // } // } // if(sequence.size() >10 ){ // sequences.add(sequence); // } // // } // } catch (Exception e) { // e.printStackTrace(); // } finally { // if (myInput != null) { // myInput.close(); // } // } // } // // public void loadFileWebViewFOrmat(String filepath, int nbLine) { // String thisLine; // BufferedReader myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(filepath)); // myInput = new BufferedReader(new InputStreamReader(fin)); // int realID = 0; // int lastId = 0; // Sequence sequence = null; // while ((thisLine = myInput.readLine()) != null) { // // ajoute une s�quence // String[] split = thisLine.split(" "); // int id = Integer.parseInt(split[0]); // int val = Integer.parseInt(split[1]); // // if(lastId != id){ // if(lastId!=0 ){ //&& sequence.size() >=2 // sequences.add(sequence); // realID++; // } // sequence = new Sequence(); // lastId = id; // } // List<Integer> itemset = new ArrayList<Integer>(); // itemset.add(val); // if(val >= maxItem){ // maxItem = val; // } // if(val < minItem){ // minItem = val; // } // sequence.addItemset(itemset.toArray()); // } // } catch (Exception e) { // e.printStackTrace(); // } // } // // public void loadFileWebViewFOrmatV2(String filepath, int nbLine) { // String thisLine; // BufferedReader myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(filepath)); // myInput = new BufferedReader(new InputStreamReader(fin)); // int realID = 0; // int lastId = 0; // Sequence sequence = null; // while ((thisLine = myInput.readLine()) != null) { // // ajoute une s�quence // String[] split = thisLine.split(" "); // int id = Integer.parseInt(split[0]); // int val = Integer.parseInt(split[1]); // // if(lastId != id){ // if(lastId!=0 && sequence.size() >=5){ // sequences.add(sequence); // realID++; // } // sequence = new Sequence(); // lastId = id; // } // List<Integer> itemset = new ArrayList<Integer>(); // itemset.add(val); // if(val >= maxItem){ // maxItem = val; // } // if(val < minItem){ // minItem = val; // } // sequence.addItemset(itemset.toArray()); // } // } catch (Exception e) { // e.printStackTrace(); // } // } // // public void loadSnakeDataset(String filepath, int nbLine) { // String thisLine; // BufferedReader myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(filepath)); // myInput = new BufferedReader(new InputStreamReader(fin)); // while ((thisLine = myInput.readLine()) != null) { // if(thisLine.length() >= 50){ // Sequence sequence = new Sequence(); // for(int i=0; i< thisLine.length(); i++){ // List<Integer> itemset = new ArrayList<Integer>(); // int character = thisLine.toCharArray()[i] - 65; // System.out.println(thisLine.toCharArray()[i] + " " + character); // itemset.add(character); // if(character >= maxItem){ // maxItem = character; // } // if(character < minItem){ // minItem = character; // } // sequence.addItemset(itemset.toArray()); // } // sequences.add(sequence); // } // } // } catch (Exception e) { // e.printStackTrace(); // } // } // // public void loadFileSignLanguage(String fileToPath, int i) { // String thisLine; // BufferedReader myInput = null; // try { // FileInputStream fin = new FileInputStream(new File(fileToPath)); // myInput = new BufferedReader(new InputStreamReader(fin)); // String oldUtterance = "-1"; // Sequence sequence = null; // while ((thisLine = myInput.readLine()) != null) { // if(thisLine.length() >= 1 && thisLine.charAt(0) != '#'){ // String []tokens = thisLine.split(" "); // String currentUtterance = tokens[0]; // if(!currentUtterance.equals(oldUtterance)){ // if(sequence != null){ // sequences.add(sequence); // } // sequence = new Sequence(); // oldUtterance = currentUtterance; // } // for(int j=1; j< tokens.length; j++){ // int character = Integer.parseInt(tokens[j]); // if(character == -11 || character == -12){ // continue; // } // if(character >= maxItem){ // maxItem = character; // } // if(character < minItem){ // minItem = character; // } // sequence.addItemset(new Object[]{character}); // } // } // } // sequences.add(sequence); // System.out.println(sequence.toString()); // } catch (Exception e) { // e.printStackTrace(); // } // } // Sequence sequence = new Sequence(); // for(int i=0; i< thisLine.length(); i++){ // List<Integer> itemset = new ArrayList<Integer>(); // int character = thisLine.toCharArray()[i] - 65; // System.out.println(thisLine.toCharArray()[i] + " " + character); // itemset.add(character); // if(character >= maxItem){ // maxItem = character; // } // if(character < minItem){ // minItem = character; // } // sequence.addItemset(itemset.toArray()); // } // sequences.add(sequence); }