package ca.pfv.spmf.tools.dataset_stats;
/* This file is copyright (c) 2008-2014 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
/**
* This class reads a transaction database and calculates statistics
* about this transaction database, then it prints the statistics to the console.
* <br/><br/>
* In this version, this class reads the database into memory before calculating the
* statistics. It could be optimized to calculate the statistics without
* loading the database into memory, because only a single pass is required. It
* was done this way because the code is simpler and easier to understand, and performance
* is not an issue for this kind of task.
* @author Philippe Fournier-Viger
*/
public class TransactionStatsGenerator {

    /**
     * Generates statistics for a transaction database stored in a file and
     * prints them to the console (standard output).
     * <br/><br/>
     * The printed statistics are: the number of transactions, the number of
     * distinct items, the smallest and largest item ids, the mean / standard
     * deviation / variance of the transaction lengths, and the mean / standard
     * deviation / variance / minimum / maximum of the item supports (the number
     * of occurrences of each item in the database).
     * <br/><br/>
     * If the database contains no item, the smallest and largest ids are
     * printed as {@code Integer.MAX_VALUE} and {@code Integer.MIN_VALUE}
     * respectively, and the averages are printed as {@code NaN}.
     *
     * @param path the path to the file
     * @throws IOException exception if there is a problem while reading the file.
     */
    public void getStats(String path) throws IOException {
        /////////////////////////////////////
        // (1) First, read the transaction database into memory.
        // (A single pass over the file would be enough, so this could be
        // optimized if necessary; loading it keeps the code simple.)
        /////////////////////////////////////
        TransactionDatabase database = new TransactionDatabase();
        database.loadFile(path);

        /////////////////////////////////////
        // (2) The database is in memory: calculate the statistics.
        /////////////////////////////////////
        System.out.println("============ TRANSACTION DATABASE STATS ==========");
        System.out.println("Number of transactions : " + database.size());

        // Sentinels: any real item id replaces them on first comparison.
        int minItem = Integer.MAX_VALUE; // the smallest id for items in the database
        int maxItem = Integer.MIN_VALUE; // the largest id for items in the database
        // the length of each transaction
        List<Integer> sizes = new ArrayList<Integer>();
        // This map stores the number of times that each item appears in the
        // database (key: an item, value: its number of occurrences).
        // Its key set is also the set of distinct items.
        HashMap<Integer, Integer> mapItemSupport = new HashMap<Integer, Integer>();

        // Loop on the transactions from the database
        for (List<Integer> transaction : database.getTransactions()) {
            // record the size of this transaction
            sizes.add(transaction.size());

            // Loop on the items from this transaction
            for (int item : transaction) {
                maxItem = Math.max(maxItem, item);
                minItem = Math.min(minItem, item);

                // increase the occurrence count of this item (starting from 0
                // if the item has not been seen yet)
                Integer count = mapItemSupport.get(item);
                mapItemSupport.put(item, count == null ? 1 : count + 1);
            }
        }

        // put the supports of the items into a list
        List<Integer> listSupportOfItems = new ArrayList<Integer>(mapItemSupport.values());

        // print the statistics
        System.out.println("File " + path);
        System.out.println("Number of distinct items: " + mapItemSupport.size());
        System.out.println("Smallest item id: " + minItem);
        System.out.println("Largest item id: " + maxItem);
        System.out.println("Average number of items per transaction: "
                + calculateMean(sizes) + " standard deviation: "
                + calculateStdDeviation(sizes) + " variance: "
                + calculateVariance(sizes));
        System.out.println("Average item support in the database: "
                + calculateMean(listSupportOfItems) + " standard deviation: "
                + calculateStdDeviation(listSupportOfItems) + " variance: "
                + calculateVariance(listSupportOfItems)
                + " min value: " + calculateMinValue(listSupportOfItems)
                + " max value: " + calculateMaxValue(listSupportOfItems));
    }

    /**
     * This method calculates the mean of a list of integers.
     *
     * @param list the list of integers
     * @return the mean ({@code NaN} if the list is empty)
     */
    private static double calculateMean(List<Integer> list) {
        double sum = 0;
        for (Integer val : list) {
            sum += val;
        }
        return sum / list.size();
    }

    /**
     * This method calculates the (population) standard deviation of a list of
     * integers, i.e. the square root of the variance.
     *
     * @param list the list of integers
     * @return the standard deviation ({@code NaN} if the list is empty)
     */
    private static double calculateStdDeviation(List<Integer> list) {
        return Math.sqrt(calculateVariance(list));
    }

    /**
     * This method calculates the mean of a list of doubles.
     * (Currently not called from within this class.)
     *
     * @param list the list of doubles
     * @return the mean ({@code NaN} if the list is empty)
     */
    private static double calculateMeanD(List<Double> list) {
        double sum = 0;
        for (Double val : list) {
            sum += val;
        }
        return sum / list.size();
    }

    /**
     * This method calculates the (population) standard deviation of a list of
     * doubles. (Currently not called from within this class.)
     *
     * @param list the list of doubles
     * @return the standard deviation ({@code NaN} if the list is empty)
     */
    private static double calculateStdDeviationD(List<Double> list) {
        double mean = calculateMeanD(list);
        double sumOfSquares = 0;
        for (Double val : list) {
            double diff = mean - val;
            sumOfSquares += diff * diff;
        }
        return Math.sqrt(sumOfSquares / list.size());
    }

    /**
     * This method calculates the (population) variance of a list of integers:
     * the mean of the squared differences to the mean (divided by the number
     * of values, not by the number of values minus one).
     *
     * @param list the list of integers
     * @return the variance ({@code NaN} if the list is empty)
     */
    private static double calculateVariance(List<Integer> list) {
        double mean = calculateMean(list);
        double sumOfSquares = 0;
        for (Integer val : list) {
            double diff = mean - val;
            sumOfSquares += diff * diff;
        }
        // Returned directly: taking the square root and squaring it again
        // would only add floating-point rounding error.
        return sumOfSquares / list.size();
    }

    /**
     * This method returns the smallest integer from a list of integers.
     *
     * @param list the list of integers
     * @return the smallest integer, or {@code Integer.MAX_VALUE} if the list is empty
     */
    private static int calculateMinValue(List<Integer> list) {
        int min = Integer.MAX_VALUE;
        for (Integer val : list) {
            if (val < min) {
                min = val;
            }
        }
        return min;
    }

    /**
     * This method returns the largest integer from a list of integers.
     *
     * @param list the list of integers
     * @return the largest integer, or {@code Integer.MIN_VALUE} if the list is empty
     */
    private static int calculateMaxValue(List<Integer> list) {
        // Start from the smallest possible int so that lists containing only
        // negative values are handled correctly.
        int max = Integer.MIN_VALUE;
        for (Integer val : list) {
            if (val > max) {
                max = val;
            }
        }
        return max;
    }
}