package ca.pfv.spmf.tools.dataset_converter;

/* This file is copyright (c) 2008-2012 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * This class is for converting sequence databases from various formats
 * to the SPMF format.
 * <p>
 * In the output, every item (or itemset) is followed by the itemset
 * separator "-1" and every sequence is terminated by "-2".
 *
 * @see Formats
 * @author Philippe Fournier-Viger
 */
public class SequenceDatabaseConverter {

	/** the path of the input file */
	String input;
	/** the path of the file to be written to disk in SPMF format */
	String output;
	/** the number of sequences after which the conversion stops */
	int lineCount = 0;
	/** to write the output file */
	BufferedWriter writer;

	/**
	 * This method converts a sequence database from a given format to the SPMF format.
	 * <p>
	 * Errors that occur while reading or parsing the input file are printed
	 * with {@code printStackTrace()} and stop the conversion; whatever was
	 * written before the error is kept in the output file.
	 *
	 * @param input the path of the input file
	 * @param output the path of the file to be written to disk in SPMF format
	 * @param inputFileformat the format of the input file
	 * @param lineCount the number of sequences from the input file that should be
	 *        converted; the conversion stops as soon as this many sequences have
	 *        been written
	 * @throws IOException if the output file cannot be opened or closed, or if
	 *         closing the input file fails
	 */
	public void convert(String input, String output, Formats inputFileformat, int lineCount) throws IOException {
		// we save the parameters in the class fields
		this.input = input;
		this.output = output;
		this.lineCount = lineCount;

		// we create an object for writing the output file
		writer = new BufferedWriter(new FileWriter(output));
		try {
			// we call the appropriate method for converting a database
			// according to the format of the input file
			if (inputFileformat.equals(Formats.IBMGenerator)) {
				convertIBMGenerator();
			} else if (inputFileformat.equals(Formats.Kosarak)) {
				convertKosarak();
			} else if (inputFileformat.equals(Formats.CSV_INTEGER)) {
				convertCSV();
			} else if (inputFileformat.equals(Formats.BMS)) {
				convertBMS();
			} else if (inputFileformat.equals(Formats.Snake)) {
				convertSnake();
			} else if (inputFileformat.equals(Formats.SPMF_TRANSACTION_DB)) {
				convertTransactionDB();
			}
		} finally {
			// we close the output file even if the conversion failed,
			// so that the file handle is never leaked
			writer.close();
		}
	}

	/**
	 * Open the input file for reading it line by line.
	 *
	 * @return a buffered reader on the input file
	 * @throws IOException if the input file cannot be opened
	 */
	private BufferedReader openInputReader() throws IOException {
		FileInputStream fin = new FileInputStream(new File(input));
		return new BufferedReader(new InputStreamReader(fin));
	}

	/**
	 * Convert a text file where each line is a list of integers to the SPMF
	 * format. Each integer becomes an item followed by "-1" and each line
	 * becomes a sequence terminated by "-2". This is the shared implementation
	 * for the transaction database, Kosarak and CSV formats, which only differ
	 * by their separator.
	 *
	 * @param separator the separator between integers on a line (it is passed to
	 *        {@link String#split(String)} and is thus interpreted as a regular
	 *        expression)
	 * @throws IOException if closing the input file fails
	 */
	private void convertDelimitedLines(String separator) throws IOException {
		BufferedReader myInput = null;
		try {
			myInput = openInputReader();
			int count = 0; // to count the number of lines
			String thisLine; // variable to read a line
			// we read the file line by line until the end of the file
			while ((thisLine = myInput.readLine()) != null) {
				// for each value on this line
				for (String value : thisLine.split(separator)) {
					// we convert to integer and write the item
					// with an itemset separator
					int item = Integer.parseInt(value);
					writer.write(item + " -1 ");
				}
				writer.write("-2"); // write end of line
				count++; // increase the number of sequences
				// if we have read enough sequences, we stop.
				if (count == lineCount) {
					break;
				}
				writer.newLine(); // create new line
			}
		} catch (Exception e) {
			// the error is reported and the conversion stops here
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
	}

	/**
	 * This method converts a transaction database in SPMF format to a sequence database in SPMF format
	 *
	 * @throws IOException if closing the input file fails
	 */
	private void convertTransactionDB() throws IOException {
		// items are separated by single spaces
		convertDelimitedLines(" ");
	}

	/**
	 * This method converts a file from the SNAKE format to SPMF format
	 *
	 * @throws IOException if closing the input file fails
	 */
	private void convertSnake() throws IOException {
		BufferedReader myInput = null;
		try {
			myInput = openInputReader();
			int count = 0; // to count the number of lines
			String thisLine; // variable to read a line
			// we read the file line by line until the end of the file
			while ((thisLine = myInput.readLine()) != null) {
				// if the line contains at least 11 characters
				// (we use this to filter smaller lines)
				if (thisLine.length() >= 11) {
					// for each character on this line, we consider that it is an item
					for (int i = 0; i < thisLine.length(); i++) {
						// we subtract 65 (the code of 'A') to get the item number
						int item = thisLine.charAt(i) - 'A';
						// we write the item with an itemset separator
						writer.write(item + " -1 ");
					}
					// we write the end of the line
					writer.write("-2");
				}
				// note: a filtered line still counts as a line that was read
				count++;
				// if we have read enough lines, we stop.
				if (count == lineCount) {
					break;
				}
				// start a new line
				writer.newLine();
			}
		} catch (Exception e) {
			// the error is reported and the conversion stops here
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
	}

	/**
	 * This method converts a file from the BMS format to SPMF format.
	 * <p>
	 * Each input line is a user id followed by a webpage, separated by a
	 * space. Consecutive lines having the same user id form one sequence.
	 * Unlike the line-based formats, the limit is checked before an item is
	 * written, so a limit of 0 produces no output.
	 *
	 * @throws IOException if closing the input file fails
	 */
	private void convertBMS() throws IOException {
		BufferedReader myInput = null;
		try {
			myInput = openInputReader();

			// In the BMS format, the sequence of webpages of a user
			// is separated on several lines.
			// We use these variables to remember the id of the current user
			// that we are reading, and whether a line was read already
			// (an explicit flag is used so that 0 remains a valid user id).
			int lastId = 0;
			boolean firstLine = true;
			// to remember if the current sequence still needs its "-2"
			boolean sequenceOpen = false;
			int count = 0; // to count the number of sequences
			String thisLine; // variable to read a line

			// we read the file line by line until the end of the file
			while ((thisLine = myInput.readLine()) != null) {
				// we split the line according to spaces
				String[] split = thisLine.split(" ");
				// each line is a user id with a webpage
				int id = Integer.parseInt(split[0]); // id of the user on this line
				int val = Integer.parseInt(split[1]); // webpage viewed by this user

				// if the id of the current user is not the same as the previous line
				if (firstLine || lastId != id) {
					// and it is not the first line
					if (!firstLine) {
						count++; // increase sequence count
						// write the end of line
						writer.write("-2");
						writer.newLine();
						sequenceOpen = false;
					}
					lastId = id; // remember the current user id for the next line
					firstLine = false;
				}
				// if we have read enough sequences, we stop.
				if (count == lineCount) {
					break;
				}
				// after each webpage we write an itemset separator "-1"
				writer.write(val + " -1 ");
				sequenceOpen = true;
			}
			// the end of the file also ends the sequence of the last user
			if (sequenceOpen) {
				writer.write("-2");
			}
		} catch (Exception e) {
			// the error is reported and the conversion stops here
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
	}

	/**
	 * This method converts a file from the CSV format to SPMF format
	 *
	 * @throws IOException if closing the input file fails
	 */
	private void convertCSV() throws IOException {
		// values are separated by commas
		convertDelimitedLines(",");
	}

	/**
	 * This method converts a file from the KOSARAK format to SPMF format
	 *
	 * @throws IOException if closing the input file fails
	 */
	private void convertKosarak() throws IOException {
		// items are separated by single spaces
		convertDelimitedLines(" ");
	}

	/**
	 * This method converts a file from the IBM GENERATOR format to SPMF format
	 *
	 * @throws IOException if closing the input file fails
	 */
	private void convertIBMGenerator() throws IOException {
		DataInputStream myInput = null;
		try {
			// Objects to read the input file in binary format
			// (buffered, because the file is read four bytes at a time)
			FileInputStream fin = new FileInputStream(new File(input));
			myInput = new DataInputStream(new BufferedInputStream(fin));

			// Variable to remember if we have written -1 after a group of items or not
			// (because in the binary format, at the end of a line there is no -1 before the -2
			// but in spmf format there is one).
			boolean lastMinus1 = false;
			int count = 0; // to count the number of sequences

			// we read the file integer by integer until the end of the file
			while (myInput.available() != 0) {
				// we read the next 32 bits and reverse their byte order
				int value = INT_little_endian_TO_big_endian(myInput.readInt());

				if (value == -1) {
					// it is "-1", the end of an itemset:
					// we write the same thing as output
					writer.write("-1 ");
					lastMinus1 = true; // to remember that we have written -1
				} else if (value == -2) {
					// it is "-2", the end of a sequence:
					// check if the last "-1" was not written
					if (!lastMinus1) {
						writer.write("-1 "); // write "-1"
					}
					writer.write("-2 "); // write end of line
					count++; // increase the number of sequences
					// if we have read enough sequences, we stop.
					if (count == lineCount) {
						break;
					}
					writer.newLine(); // create new line
				} else {
					// else it is an item: we write the item
					writer.write(value + " ");
					lastMinus1 = false; // to remember that we need to write a -1
				}
			}
		} catch (Exception e) {
			// the error is reported and the conversion stops here
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
	}

	/**
	 * This method converts integer values from little endian to big endian
	 * by reversing the order of their four bytes.
	 *
	 * @param i an integer in little endian
	 * @return the integer converted to big endian
	 */
	int INT_little_endian_TO_big_endian(int i) {
		return Integer.reverseBytes(i);
	}
}