package ca.pfv.spmf.tools.other_dataset_tools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* This tool allows to fix some common problems in a transaction database file in SPMF format.
* In particular: (1) the tool removes items that appears more than once in a transaction.
* (2) it sort transactions according to the lexicographical ordering.
* The reason for performing this is that many itemset and association rule mining
* algorithms assumes that items cannot appear more than once in a transaction
* and that transactions are sorted.
*
* @author Philippe Fournier-Viger, 2014
*/
public class FixTransactionDatabaseTool {
/**
* Fix the transaction database
* @param input the input file path (a transaction database in SPMF format)
* @param output the output file path (the fixed trnasactoin database in SPMF format)
* @throws IOException if an error while reading/writting files.
* @throws NumberFormatException
*/
public void convert(String input, String output) throws NumberFormatException, IOException {
// for stats
BufferedWriter writer = new BufferedWriter(new FileWriter(output));
BufferedReader myInput = new BufferedReader(new InputStreamReader( new FileInputStream(new File(input))));
// for each line (transaction) until the end of file
String thisLine;
while ((thisLine = myInput.readLine()) != null) {
// if the line is empty we skip it
if (thisLine.isEmpty() == true) {
continue;
// if the line is some kind of metadata we just write the line as it is
}else if(thisLine.charAt(0) == '#' || thisLine.charAt(0) == '%'
|| thisLine.charAt(0) == '@') {
writer.write(thisLine + " ");
writer.newLine();
continue;
}
// Otherwise
// split the transaction according to the white space separator
String [] split = thisLine.split(" ");
// This will store the current transaction in memory
// so that we can sort it
List<Integer> transaction = new ArrayList<Integer>();
// This is to remember items that we have already seen in the current transaction.
Set<Integer> alreadySeen = new HashSet<Integer>();
for(int i=0; i <split.length; i++){
// convert item to integer
Integer item = Integer.parseInt(split[i]);
// if the item is appearing for the first time in the transaction
// we add the item to the transaction
if(alreadySeen.contains(item) == false) {
// we add the item
transaction.add(item);
// we remember that we have seen this item
alreadySeen.add(item);
}
}
// Sort the transaction
Collections.sort(transaction);
// Then write the transaction to the file
for(int i = 0; i < transaction.size(); i++) {
Integer item = transaction.get(i);
writer.write(String.valueOf(item));
if(i != transaction.size()-1) {
writer.write(" ");
}
}
// write a new line
writer.newLine();
}
myInput.close();
writer.close();
}
}