package ca.pfv.spmf.tools.dataset_generator;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
 * Converts a transaction database into a transaction database with utility
 * values.
 * <p>
 * Every distinct item is given a random external utility, and every occurrence
 * of an item in a transaction is given a random quantity. The utility of an
 * item in a transaction is the product of these two numbers.
 *
 * @author Philippe Fournier-Viger, 2010
 */
public class TransactionDatasetUtilityGenerator {

    /**
     * Reads a transaction database and writes the same transactions annotated
     * with randomly generated utility values.
     * <p>
     * The input is read twice: a first pass assigns an external utility to each
     * distinct item and gathers statistics, a second pass draws the quantities
     * and writes one output line per transaction in the form
     * {@code item1 item2 ...:transactionUtility:utility1 utility2 ...}.
     * Lines that are blank or whose first non-blank character is {@code #},
     * {@code %} or {@code @} are ignored. Items may be separated by any run of
     * whitespace. Item count, transaction count and average transaction length
     * are printed to standard output at the end.
     *
     * @param input the input file path (a transaction database in SPMF format)
     * @param output the output file path
     * @param maxQuantity the maximum quantity of each item in a transaction
     *        (quantities are drawn from 1 to maxQuantity inclusive)
     * @param externalUtilityFactor the external utility of items generated by
     *        Random.nextGaussian() will be multiplied by this value
     * @throws IOException if an error occurs while reading/writing files.
     * @throws NumberFormatException if an item is not a valid integer
     * @throws IllegalArgumentException if maxQuantity is smaller than 1
     */
    public void convert(String input, String output, int maxQuantity, double externalUtilityFactor) throws NumberFormatException, IOException {
        // Fail before touching any file: Random.nextInt(bound) rejects
        // non-positive bounds, which would otherwise only surface in pass 2.
        if (maxQuantity < 1) {
            throw new IllegalArgumentException("maxQuantity must be at least 1 but was " + maxQuantity);
        }

        // for stats
        Set<Integer> items = new HashSet<Integer>();
        long totalLength = 0;
        long transactionCount = 0;

        Random randomGenerator = new Random(System.currentTimeMillis());
        Map<Integer, Integer> externalUtilities = new HashMap<Integer, Integer>();

        BufferedWriter writer = new BufferedWriter(new FileWriter(output));
        try {
            // PASS 1: give each distinct item an external utility and collect stats.
            BufferedReader firstPass = openReader(input);
            try {
                String line;
                while ((line = firstPass.readLine()) != null) {
                    String transaction = line.trim();
                    if (isIgnoredLine(transaction)) {
                        continue;
                    }
                    int[] transactionItems = parseItems(transaction);
                    transactionCount++;
                    totalLength += transactionItems.length;
                    for (int item : transactionItems) {
                        items.add(item);
                        if (!externalUtilities.containsKey(item)) {
                            externalUtilities.put(item, randomExternalUtility(randomGenerator, externalUtilityFactor));
                        }
                    }
                }
            } finally {
                firstPass.close();
            }

            // PASS 2: draw a quantity for each item occurrence and write the result.
            BufferedReader secondPass = openReader(input);
            try {
                String line;
                while ((line = secondPass.readLine()) != null) {
                    String transaction = line.trim();
                    if (isIgnoredLine(transaction)) {
                        continue;
                    }
                    int[] transactionItems = parseItems(transaction);
                    long[] itemUtilities = new long[transactionItems.length];
                    long transactionUtility = 0;
                    for (int i = 0; i < transactionItems.length; i++) {
                        int quantity = randomGenerator.nextInt(maxQuantity) + 1;
                        int externalUtility = externalUtilities.get(transactionItems[i]);
                        // long arithmetic: the product and the sum can exceed the int range
                        itemUtilities[i] = (long) externalUtility * quantity;
                        transactionUtility += itemUtilities[i];
                    }
                    writeTransaction(writer, transactionItems, transactionUtility, itemUtilities);
                }
            } finally {
                secondPass.close();
            }
        } finally {
            // Always release the output file, even when reading or parsing failed.
            writer.close();
        }

        System.out.println("item count " + items.size());
        System.out.println("transaction count " + transactionCount);
        System.out.println("transaction avg length " + (totalLength / (double) transactionCount));
    }

    /** Opens the input file for line-by-line reading (platform default charset). */
    private static BufferedReader openReader(String path) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(new File(path))));
    }

    /** Tells whether a trimmed line is blank, a comment or a kind of metadata. */
    private static boolean isIgnoredLine(String trimmedLine) {
        if (trimmedLine.isEmpty()) {
            return true;
        }
        char first = trimmedLine.charAt(0);
        return first == '#' || first == '%' || first == '@';
    }

    /** Splits a trimmed, non-blank transaction on whitespace and parses each item. */
    private static int[] parseItems(String trimmedLine) {
        String[] tokens = trimmedLine.split("\\s+");
        int[] parsed = new int[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            parsed[i] = Integer.parseInt(tokens[i]);
        }
        return parsed;
    }

    /** Draws an external utility of at least 1 from a scaled Gaussian value. */
    private static int randomExternalUtility(Random random, double factor) {
        double magnitude = Math.abs(random.nextGaussian() * factor);
        // Clamp before the cast so that the "+ 1" can never wrap around.
        return (int) Math.min(magnitude, Integer.MAX_VALUE - 1) + 1;
    }

    /** Writes one line: the items, the transaction utility, then the item utilities. */
    private static void writeTransaction(BufferedWriter writer, int[] transactionItems,
            long transactionUtility, long[] itemUtilities) throws IOException {
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < transactionItems.length; i++) {
            if (i > 0) {
                buffer.append(' ');
            }
            buffer.append(transactionItems[i]);
        }
        buffer.append(':').append(transactionUtility).append(':');
        for (int i = 0; i < itemUtilities.length; i++) {
            if (i > 0) {
                buffer.append(' ');
            }
            buffer.append(itemUtilities[i]);
        }
        writer.write(buffer.toString());
        writer.newLine();
    }
}