package ca.pfv.spmf.tools.dataset_converter;

/* This file is copyright (c) 2008-2012 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import ca.pfv.spmf.input.sequence_database_array_integers.Sequence;
import ca.pfv.spmf.input.sequence_database_array_integers.SequenceDatabase;

/**
 * This class is for converting transaction databases from various formats
 * to the SPMF format.
 * @see Formats
 * @author Philippe Fournier-Viger
 */
public class TransactionDatabaseConverter {

	String input;      // the path of the input file
	String output;     // the path of the file to be written to disk in SPMF format
	int lineCount = 0; // the number of lines from the input file that should be converted

	/**
	 * This method converts a transaction database from a given format to the SPMF format.
	 * @param input the path of the input file
	 * @param output the path of the file to be written to disk in SPMF format
	 * @param inputFileformat the format of the input file
	 * @param lineCount the number of lines from the input file that should be converted
	 * @throws IOException if an error occurs while reading/writing the files
	 */
	public void convert(String input, String output, Formats inputFileformat, int lineCount) throws IOException {
		// we save the parameters in the class fields
		this.input = input;
		this.output = output;
		this.lineCount = lineCount;

		// we call the appropriate method for converting the database
		// according to the format of the input file
		if (inputFileformat.equals(Formats.CSV_INTEGER)) {
			convertCSV();
		} else if (inputFileformat.equals(Formats.ARFF)) {
			convertARFF(true, false);
		} else if (inputFileformat.equals(Formats.ARFF_WITH_MISSING_VALUES)) {
			convertARFF(false, false);
		} else if (inputFileformat.equals(Formats.SPMF_SEQUENCE_DB)) {
			convertSequenceDB();
		}
	}
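	// Usage sketch (illustrative only, not part of the original class): the two public entry
	// points are convert() and convertARFFandReturnMap(). The file names, and the use of
	// Integer.MAX_VALUE to mean "convert the whole file", are assumptions for this example.
	//
	//   TransactionDatabaseConverter converter = new TransactionDatabaseConverter();
	//   // convert a CSV file of integers to the SPMF transaction database format
	//   converter.convert("input.csv", "output_spmf.txt", Formats.CSV_INTEGER, Integer.MAX_VALUE);
	//   // convert an ARFF file and keep the itemID -> "attribute=value" mapping in memory
	//   Map<Integer, String> map
	//       = converter.convertARFFandReturnMap("input.arff", "output_spmf.txt", Integer.MAX_VALUE);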
	/**
	 * This method converts a transaction database in ARFF format to the SPMF format and
	 * returns a map of key = item id, value = corresponding attribute value. This
	 * method is used by the GUI version of SPMF, which needs to keep the mapping
	 * between item IDs and attribute values in memory to avoid an extra database scan.
	 * @param inputFile the path of the file to be converted
	 * @param outputFile the path for saving the converted file
	 * @param lineCount the number of lines of the input file to be converted
	 * @return a map of entries (key: itemID, value: attribute-value) if the input format is ARFF
	 * @throws IOException if an error occurs while reading/writing the files
	 */
	public Map<Integer, String> convertARFFandReturnMap(String inputFile, String outputFile, int lineCount) throws IOException {
		// we save the parameters in the class fields
		this.input = inputFile;
		this.output = outputFile;
		this.lineCount = lineCount;

		return convertARFF(true, true);
	}

	/**
	 * This method converts a file from the ARFF format to the SPMF format.
	 *
	 * @param ignoreMissingValues if true, missing values (denoted by "?") are skipped
	 * @param returnMapItemIDValue if true, the mapping from item IDs to attribute values is returned
	 * @throws IOException if an error occurs while reading/writing the files
	 * @return a map where an entry indicates, for an item (key), the corresponding attribute value (value)
	 */
	private Map<Integer, String> convertARFF(boolean ignoreMissingValues, boolean returnMapItemIDValue) throws IOException {
		// This map will be used to store the mapping from item id (key) to attribute value (value).
		// It is used only if returnMapItemIDValue is set to true. This is used by the GUI of SPMF,
		// which needs to keep this information in memory to avoid an extra database scan after an
		// algorithm is applied.
		Map<Integer, String> mapItemsIDsToAttributeValues = null;
		if(returnMapItemIDValue){
			mapItemsIDsToAttributeValues = new HashMap<Integer, String>();
		}

		// object for writing the output file
		BufferedWriter writer = new BufferedWriter(new FileWriter(output));
		BufferedReader myInput = null;
		try {
			// Objects to read the file
			FileInputStream fin = new FileInputStream(new File(input));
			myInput = new BufferedReader(new InputStreamReader(fin));

			int count = 0;          // to count the number of data instance lines
			int attributeCount = 0; // to count the number of attributes

			// the last item ID used in the output file
			int lastItemAdded = 0;

			// A list that stores a map for each attribute.
			// An entry in the map is :
			//   key = String (attribute value)
			//   value = Integer (item id)
			List<Map<String, Integer>> mapAttributeValuesItemsID = null;
			List<String> listAttributeNames = new ArrayList<String>();

			String thisLine; // variable to read a line

			// we read the file line by line until the end of the file
			while ((thisLine = myInput.readLine()) != null) {
				// if the line is too short (e.g. empty lines), skip it
				if(thisLine.length() < 2){
					continue;
				}
				// if the line starts with a comment
				if(thisLine.startsWith("%")){
					continue;
				}
				// check if the line contains a comment later in the line
				int indexComment = thisLine.indexOf('%');
				// if yes, then remove it
				if(indexComment >= 0){
					thisLine = thisLine.substring(0, indexComment);
				}

				// if the line is the relation name
				// (e.g. " @RELATION 'sunburn' ")
				if(thisLine.startsWith("@RELATION") || thisLine.startsWith("@relation")){
					String relationName = thisLine.split(" ")[1];
					// if the name is between quotes, we remove them
					if(relationName.contains("'")){
						relationName = relationName.split("'")[1];
					}
					if(returnMapItemIDValue == false){
						writer.write("@CONVERTED_FROM_ARFF");
						writer.newLine();
						writer.write("@RELATION_NAME=");
						writer.write(relationName + "=");
						writer.newLine();
					}
					continue;
				}

				// if the line is an attribute definition
				// For example:
				//   @ATTRIBUTE 'hair' {blonde, brown, red}
				//   @attribute class {positive,negative}
				//   @attribute col_17 INTEGER
				//   @attribute col_18 {0,1,2,3,4,5}
				//   @ATTRIBUTE petalwidth NUMERIC
				//   @data
				if(thisLine.startsWith("@ATTRIBUTE") || thisLine.startsWith("@attribute") ){
					// increase the number of attributes
					attributeCount++;

					if(returnMapItemIDValue == false){
						writer.write("@ATTRIBUTE=");
					}

					// get the first position of the attribute name, after the space before it
					int firstPositionOfAttributeName = thisLine.indexOf(' ') + 1;
					// if the first character is a quote
					boolean useQuotes = false;
					if(thisLine.charAt(firstPositionOfAttributeName) == '\''){
						useQuotes = true;
						firstPositionOfAttributeName++;
					}
					// remove the part of the string before the attribute name
					thisLine = thisLine.substring(firstPositionOfAttributeName);
					// if there are extra spaces, we remove them just in case
					thisLine = thisLine.trim();

					// If quotes are used
					if(useQuotes){
						// get the position of the character just before the second quote
						int quotePosition = thisLine.indexOf('\'');
						// write the attribute name
						String attributeName = thisLine.substring(0, quotePosition);
						if(returnMapItemIDValue == false){
							writer.write(attributeName + "=");
						}
						listAttributeNames.add(attributeName);
						// cut the string to remove the attribute name
						thisLine = thisLine.substring(quotePosition + 1);
					}else{
						// get the position of the character just before the space after the attribute name
						int spacePosition = thisLine.indexOf(' ');
						// write the attribute name
						String attributeName = thisLine.substring(0, spacePosition);
						if(returnMapItemIDValue == false){
							writer.write(attributeName + "=");
						}
						listAttributeNames.add(attributeName);
						// cut the string to remove the attribute name
						thisLine = thisLine.substring(spacePosition + 1);
					}

					// remove spaces before or after what is remaining in this line
					thisLine = thisLine.trim();
					// System.out.println(thisLine);

					// WRITE TYPE
					String type = thisLine;
					if(type.startsWith("{")){
						if(returnMapItemIDValue == false){
							writer.write("ENUMERATION=");
						}
						// Remove the brackets {}
						thisLine = thisLine.substring(1, thisLine.length() - 1);
						// NEED TO READ THE ENUMERATION VALUES
						for (String token : thisLine.split(",")) {
							// remove spaces if there are any
							token = token.trim();
							// write the enumeration value
							if(returnMapItemIDValue == false){
								writer.write(token + "=");
							}
						}
					}else{
						// this is not an enumeration, so we don't need
						// to write enumeration values.
						if(returnMapItemIDValue == false){
							writer.write(type + "=");
						}
					}
					if(returnMapItemIDValue == false){
						writer.newLine();
					}
					continue;
				}

				// if the line is the data separator
				if(thisLine.startsWith("@data") || thisLine.startsWith("@DATA")){
					// System.out.println("DATA");
					// initialize the map for storing attribute values
					// by creating an empty hashmap for each attribute.
					mapAttributeValuesItemsID = new ArrayList<Map<String, Integer>>(attributeCount);
					for(int i = 0; i < attributeCount; i++){
						mapAttributeValuesItemsID.add(new HashMap<String, Integer>());
					}
					continue;
				}

				// ===== NOW WE WILL PROCESS THE DATA INSTANCES IN THE FILE ====

				// Create a list to store the items of this transaction
				List<Integer> transaction = new ArrayList<Integer>();

				// Create a temporary StringBuilder for storing the definitions of
				// attribute values that have not been seen before
				StringBuilder unseenAttributeValues = new StringBuilder();

				// IF SPARSE DATA
				// For example:
				//   {2 W, 4 "class B"}
				// where each entry is a pair indicating the attribute number and the value.
				// Omitted values mean the value 0.
				// Unknown values are represented by ?
				if(thisLine.startsWith("{")){
					// System.out.println(thisLine);
					// remove the brackets
					thisLine = thisLine.substring(1).trim();
					thisLine = thisLine.substring(0, thisLine.length() - 1).trim();
					// System.out.println(thisLine);

					// we will use a HashSet<Integer> to remember which attribute
					// positions are included and which ones are not.
					// This is important because if an attribute is omitted,
					// the value 0 should be used according to the ARFF specification.
					Set<Integer> positionProcessed = new HashSet<Integer>();
					// System.out.println(thisLine);

					// for each entry
					for(String entry : thisLine.split(",")){
						entry = entry.trim();
						// separate the entry into position + value
						int indexOfFirstSpace = entry.indexOf(' ');
						// extract the attribute number
						int i = Integer.parseInt(entry.substring(0, indexOfFirstSpace));
						// extract the attribute value
						// System.out.println(entry.substring(indexOfFirstSpace+1));
						String val = entry.substring(indexOfFirstSpace + 1);
						positionProcessed.add(i);

						// if the user wants to ignore missing values,
						// we skip the value
						if("?".equals(val) && ignoreMissingValues){
							continue;
						}

						// get the corresponding item id
						Map<String, Integer> mapValueToItemID = mapAttributeValuesItemsID.get(i);
						Integer itemID = mapValueToItemID.get(val);
						if(itemID == null){
							// if it is the first time that we see this attribute value,
							// increase the last item ID.
							itemID = ++lastItemAdded;
							// record the itemID that is given for this value
							mapValueToItemID.put(val, itemID);
							if(mapItemsIDsToAttributeValues != null){
								mapItemsIDsToAttributeValues.put(itemID, listAttributeNames.get(i) + "=" + val);
							}
							// add the unseen attribute value to the string of
							// unseen attribute values.
							unseenAttributeValues.append("@ITEM=" + itemID + "=" + listAttributeNames.get(i) + "=" + val + "\n");
						}
						// USE THE ITEM ID
						transaction.add(itemID);
					}

					// We will put the value 0 for all positions that have not been seen.
					for(int i = 0; i < attributeCount; i++){
						// if the attribute i has not been processed yet
						if(positionProcessed.contains(i) == false){
							String val = "0";

							// if the user wants to ignore missing values,
							// we skip the value
							if("?".equals(val) && ignoreMissingValues){
								continue;
							}

							// get the corresponding item id
							Map<String, Integer> mapValueToItemID = mapAttributeValuesItemsID.get(i);
							Integer itemID = mapValueToItemID.get(val);
							if(itemID == null){
								// if it is the first time that we see this attribute value,
								// increase the last item ID.
								itemID = ++lastItemAdded;
								// record the itemID that is given for this value
								mapValueToItemID.put(val, itemID);
								if(mapItemsIDsToAttributeValues != null){
									mapItemsIDsToAttributeValues.put(itemID, listAttributeNames.get(i) + "=" + val);
								}
								// add the unseen attribute value to the string of
								// unseen attribute values.
								unseenAttributeValues.append("@ITEM=" + itemID + "=" + listAttributeNames.get(i) + "=" + val + "\n");
							}
							// USE THE ITEM ID
							transaction.add(itemID);
						}
					}
				}else{
					// IF NOT SPARSE DATA
					// For example:
					//   0, X, 0, Y, "class A"
					// Values are separated by "," and spaces

					// we split the line according to the comma separator
					String[] split = thisLine.split(",");
					for(int i = 0; i < attributeCount; i++){
						String val = split[i].trim();

						// if the user wants to ignore missing values,
						// we skip the value
						if("?".equals(val) && ignoreMissingValues){
							continue;
						}

						// get the corresponding item id
						Map<String, Integer> mapValueToItemID = mapAttributeValuesItemsID.get(i);
						Integer itemID = mapValueToItemID.get(val);
						if(itemID == null){
							// if it is the first time that we see this attribute value,
							// increase the last item ID.
							itemID = ++lastItemAdded;
							// record the itemID that is given for this value
							mapValueToItemID.put(val, itemID);
							if(mapItemsIDsToAttributeValues != null){
								mapItemsIDsToAttributeValues.put(itemID, listAttributeNames.get(i) + "=" + val);
							}
							// add the unseen attribute value to the string of
							// unseen attribute values.
							unseenAttributeValues.append("@ITEM=" + itemID + "=" + listAttributeNames.get(i) + "=" + val + "\n");
						}
						// USE THE ITEM ID
						transaction.add(itemID);
					}
				}

				// sort the transaction in ascending order
				Collections.sort(transaction);

//				if(returnMapItemIDValue == false){
					writer.write(unseenAttributeValues.toString());
					// for each item, we output it
					for (int i = 0; i < transaction.size(); i++) {
						if(i != transaction.size() - 1){
							// if not the last item, write the item followed by a space
							writer.write(transaction.get(i) + " ");
						}else{
							// if the last item, write the item
							writer.write(transaction.get(i) + "");
						}
					}
					writer.newLine();
//				}

				count++; // increase the number of data instances converted

				// if we have converted enough lines, we stop.
				if(count == lineCount){
					break;
				}
			}

			// close the output file
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
		return mapItemsIDsToAttributeValues;
	}
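	// Illustration of the ARFF conversion above (a hypothetical example; the attribute names,
	// values and resulting item IDs are assumptions, not taken from a real dataset).
	// Given a header declaring two attributes:
	//   @ATTRIBUTE outlook {sunny, rainy}
	//   @ATTRIBUTE windy {TRUE, FALSE}
	// these attribute lines are rewritten in the output header as:
	//   @ATTRIBUTE=outlook=ENUMERATION=sunny=rainy=
	//   @ATTRIBUTE=windy=ENUMERATION=TRUE=FALSE=
	// A first data instance "sunny,TRUE" then produces an @ITEM declaration for each value
	// seen for the first time, followed by the transaction itself:
	//   @ITEM=1=outlook=sunny
	//   @ITEM=2=windy=TRUE
	//   1 2
	// A later instance "rainy,TRUE" reuses item 2 and only declares the new item:
	//   @ITEM=3=outlook=rainy
	//   2 3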
	/**
	 * This method converts a file from the CSV format to the SPMF format.
	 * @throws IOException if an error occurs while reading/writing the files
	 */
	private void convertCSV() throws IOException {
		BufferedReader myInput = null;
		try {
			// we create an object for writing the output file
			BufferedWriter writer = new BufferedWriter(new FileWriter(output));
			// Objects to read the file
			FileInputStream fin = new FileInputStream(new File(input));
			myInput = new BufferedReader(new InputStreamReader(fin));

			int count = 0;   // to count the number of lines
			String thisLine; // variable to read a line

			// we read the file line by line until the end of the file
			while ((thisLine = myInput.readLine()) != null) {
				// if not the first line, we create a new line
				if(count != 0){
					writer.newLine(); // create new line
				}
				// we split the line according to the comma separator
				String[] split = thisLine.split(",");

				// we use a set to store the values to avoid duplicates
				// because they are not allowed in a transaction
				Set<Integer> values = new HashSet<Integer>();
				for(int i = 0; i < split.length; i++){
					values.add(Integer.parseInt(split[i]));
				}

				// sort the transaction in ascending order
				List<Integer> listValues = new ArrayList<Integer>(values);
				Collections.sort(listValues);

				// for each item, we output it
				for (int i = 0; i < listValues.size(); i++) {
					if(i != listValues.size() - 1){
						// if not the last item, write the item followed by a space
						writer.write(listValues.get(i) + " ");
					}else{
						// if the last item, write the item
						writer.write(listValues.get(i) + "");
					}
				}

				count++; // increase the number of transactions converted

				// if we have converted enough lines, we stop.
				if(count == lineCount){
					break;
				}
			}
			// close the output file
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
	}
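	// Illustration of the CSV conversion above (hypothetical input): an input line such as
	//   3,1,2,3
	// is written as the transaction
	//   1 2 3
	// since duplicate items are removed and the remaining items are sorted in ascending order.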
	/**
	 * This method converts a file from the SPMF sequence database format
	 * to the SPMF transaction database format.
	 * Note that this code could be further optimized if performance is really an issue.
	 * @throws IOException if an error occurs while reading/writing the files
	 */
	private void convertSequenceDB() throws IOException {
		SequenceDatabase database = new SequenceDatabase();
		database.loadFile(input);

		BufferedReader myInput = null;
		try {
			// we create an object for writing the output file
			BufferedWriter writer = new BufferedWriter(new FileWriter(output));

			for(int i = 0; i < database.getSequences().size(); i++) {
				Sequence sequence = database.getSequences().get(i);

				// ==== Read the sequence and keep all distinct items ======
				// Create a set to remember which items have been seen already
				Set<Integer> itemsInSequence = new HashSet<Integer>();
				// Create a list of integers to store the transaction corresponding to this sequence
				List<Integer> transaction = new ArrayList<Integer>();
				// for each itemset in this sequence
				for(Integer[] itemset : sequence.getItemsets()) {
					for(Integer item : itemset) {
						// if we have not seen this item yet, add it to the transaction
						if(itemsInSequence.contains(item) == false) {
							transaction.add(item);
							itemsInSequence.add(item);
						}
					}
				}

				// === Sort the items of the transaction in ascending order
				Collections.sort(transaction);

				// ==== write the transaction
				for(int j = 0; j < transaction.size(); j++) {
					writer.write(transaction.get(j) + " ");
				}

				// if we have converted enough sequences, we stop.
				if(i + 1 == lineCount){
					break;
				}
				// if not the last sequence, we move to the next line
				if(i != database.size() - 1) {
					writer.newLine();
				}
			}
			// close the output file
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (myInput != null) {
				myInput.close();
			}
		}
	}
}
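// Illustration of the sequence database conversion above (hypothetical input, assuming the
// usual SPMF sequence format in which itemsets are separated by -1 and a sequence ends with -2):
// the input sequence
//   1 3 -1 2 -1 3 -2
// is written as the transaction
//   1 2 3
// since each distinct item of the sequence is kept once and the items are sorted in ascending order.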