package ca.pfv.spmf.tools.dataset_stats; /* This file is copyright (c) 2008-2012 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import ca.pfv.spmf.input.sequence_database_list_integers.Sequence; /** * This class read a sequence database and calculates statistics * about this sequence database, then it prints the statistics. * <br/><br/> * In this version this class reads the database into memory before calculating the * statistics. It could be optimized to calculate statistics without * reading the database in memory because a single pass is required. It * was done like that because the code is simpler and easier to understand. * @author Philippe Fournier-Viger */ public class SequenceStatsGenerator { /** * This method generates statistics for a sequence database (a file) * @param path the path to the file * @throws IOException exception if there is a problem while reading the file. */ public void getStats(String path) throws IOException { ///////////////////////////////////// // (1) First we will read the sequence database into memory. // (actually, we don't really need to read it into memory because it // just require a single pass, but the code is more simple like that // - it could be optimized, if necessary). /////////////////////////////////// List<Sequence> sequences = new ArrayList<Sequence>(); // A sequence database is stored as a list of sequences int maxItem = 0; // the largest id for items in the database String thisLine; // a temporary variable to read each line from the file BufferedReader myInput = null; try { // we read the file line by line FileInputStream fin = new FileInputStream(new File(path)); myInput = new BufferedReader(new InputStreamReader(fin)); int i=0; // used to count the lines. // for each line until the end of the file while ((thisLine = myInput.readLine()) != null) { // we split the line according to spaces into tokens String tokens[] = thisLine.split(" "); // we create a new sequence object to store the sequence that correspond to this line. Sequence sequence = new Sequence(i++); // we create a list of integer to store the current itemset from the sequence // that correspond to this line. List<Integer> itemset = new ArrayList<Integer>(); // For each token for (String token : tokens) { //if the token starts with "<" it means that it is a timestamp if (token.codePointAt(0) == '<') { // we just ignore it for statistics.. } // if the token is "-1" it means that it is the end of an itemset else if (token.equals("-1")) { // we add the itemset to the sequence sequence.addItemset(itemset); // we reset the variable itemset to read the next itemset itemset = new ArrayList<Integer>(); } // if the token is "-2", it indicates the end of this sequence and the // end of the line else if (token.equals("-2")) { // we add the sequence to the list of sequences sequences.add(sequence); } // otherwise, it means that the token is an item else { // we convert to an integer Integer item = Integer.parseInt(token); // we check if it has the largest value because we // want to keep this information if (item >= maxItem) { maxItem = item; } // we add the item to the current itemset. itemset.add(item); } } } } catch (Exception e) { e.printStackTrace(); } finally { if (myInput != null) { myInput.close(); } } ///////////////////////////////////// // We finished reading the database into memory. // We will calculate statistics on this sequence database. /////////////////////////////////// System.out.println("============ SEQUENCE DATABASE STATS =========="); System.out.println("Number of sequences : " + sequences.size()); // we initialize some variables that we will use to generate the statistics java.util.Set<Integer> items = new java.util.HashSet<Integer>(); // the set of all items List<Integer> sizes = new ArrayList<Integer>(); // the lengths of each sequence List<Integer> itemsetsizes = new ArrayList<Integer>(); // the lengths of each itemset List<Integer> differentitems = new ArrayList<Integer>(); // the number of different item for each sequence List<Integer> appearXtimesbySequence = new ArrayList<Integer>(); // the average number of times that items appearing in a sequence, appears in this sequence. // Loop on sequences from the database for (Sequence sequence : sequences) { // we add the size of this sequence to the list of sizes sizes.add(sequence.size()); // this map is used to calculate the number of times that each item // appear in this sequence. // the key is an item // the value is the number of occurences of the item until now for this sequence HashMap<Integer, Integer> mapIntegers = new HashMap<Integer, Integer>(); // Loop on itemsets from this sequence for (List<Integer> itemset : sequence.getItemsets()) { // we add the size of this itemset to the list of itemset sizes itemsetsizes.add(itemset.size()); // Loop on items from this itemset for (Integer item : itemset) { // If the item is not in the map already, we set count to 0 Integer count = mapIntegers.get(item); if (count == null) { count = 0; } // otherwise we set the count to count +1 count = count + 1; mapIntegers.put(item, count); // finally, we add the item to the set of items items.add(item); } } // we add all items found in this sequence to the global list // of different items for the database differentitems.add(mapIntegers.entrySet().size()); // for each item appearing in this sequence, // we put the number of times in a global list "appearXtimesbySequence" // previously described. for (Entry<Integer, Integer> entry : mapIntegers.entrySet()) { appearXtimesbySequence.add(entry.getValue()); } } // we print the statistics System.out.println("File " + path); System.out.println("Number of distinct items: " + items.size()); System.out.println("Largest item id: " + maxItem); System.out.println("Average number of itemsets per sequence : " + calculateMean(sizes) + " standard deviation: " + calculateStdDeviation(sizes) + " variance: " + calculateVariance(sizes)); System.out.println("Average number of distinct item per sequence : " + calculateMean(differentitems) + " standard deviation: " + calculateStdDeviation(differentitems) + " variance: " + calculateVariance(differentitems)); System.out .println("Average number of occurences in a sequence for each item appearing in a sequence : " + calculateMean(appearXtimesbySequence) + " standard deviation: " + calculateStdDeviation(appearXtimesbySequence) + " variance: " + calculateVariance(appearXtimesbySequence)); System.out.println("Average number of items per itemset : " + calculateMean(itemsetsizes) + " standard deviation: " + calculateStdDeviation(itemsetsizes) + " variance: " + calculateVariance(itemsetsizes)); System.out.println(items.size()); } /** * This method calculate the mean of a list of integers * @param list the list of integers * @return the mean */ private static double calculateMean(List<Integer> list) { double sum = 0; for (Integer val : list) { sum += val; } return sum / list.size(); } /** * This method calculate the standard deviation of a list of integers * @param list the list of integers * @return the standard deviation */ private static double calculateStdDeviation(List<Integer> list) { double deviation = 0; double mean = calculateMean(list); for (Integer val : list) { deviation += Math.pow(mean - val, 2); } return Math.sqrt(deviation / list.size()); } /** * This method calculate the mean of a list of doubles * @param list the list of doubles * @return the mean */ private static double calculateMeanD(List<Double> list) { double sum = 0; for (Double val : list) { sum += val; } return sum / list.size(); } /** * This method calculate the standard deviation of a list of doubles * @param list the list of doubles * @return the standard deviation */ private static double calculateStdDeviationD(List<Double> list) { double deviation = 0; double mean = calculateMeanD(list); for (Double val : list) { deviation += Math.pow(mean - val, 2); } return Math.sqrt(deviation / list.size()); } /** * This method calculate the variance of a list of integers * @param list the list of integers * @return the variance */ private static double calculateVariance(List<Integer> list) { double deviation = 0; double mean = calculateMean(list); for (Integer val : list) { deviation += Math.pow(mean - val, 2); } return Math.pow(Math.sqrt(deviation / list.size()), 2); } /** * This method return the smallest integer from a list of integers * @param list the list of integers * @return the smallest integer */ private static int calculateMinValue(List<Integer> list) { int min = Integer.MIN_VALUE; for (Integer val : list) { if (val <= min) { min = val; } } return min; } /** * This method return the largest integer from a list of integers * @param list the list of integers * @return the largest integer */ private static int calculateMaxValue(List<Integer> list) { int max = 0; for (Integer val : list) { if (val >= max) { max = val; } } return max; } }