package ca.pfv.spmf.tools.dataset_converter; /* This file is copyright (c) 2008-2012 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URL; /** * Example of how to convert a transaction database from the ARFF format to the * SPMF format. * <br/><br/> * * The ARFF format is a format used by other data mining software (rapid-miner, * etc.). It allows representing a relational table as a text file. In SPMF, a * good effort has been made to support the conversion from the ARFF format to a * transaction database in SPMF format. All features of the ARFF format * specification are supported (see * http://weka.wikispaces.com/ARFF+%28stable+version%29 for the full ARFF * specification), except that the character "=" is forbidden and that escape * characters are not supported. * <br/><br/> * * For example, the following input file (example.arff) is in ARFF format and * can be converted to SPMF format by the tool. This file defines a table named * "sunburn" having 6 attributes. The first five attributes have a limited set * of possible values. For example, the attribute "weight" is an enumeration * that can only take "blonde", "brown" or "red" as value, or "?" if the value * is unknown. The instances in this table are represented by each line after * the @data instruction. In this example, there is 9 such instances. The first * 8 instances are in regular ARFF format: each attribute value is separated by * a comma. The last instance is in sparse ARFF format. This means that some * attributes having a value of "0" are ommitted. Finally, note that lines * starting with "%" are comments". For additional details about the ARFF * format, please refer to the official specification. * * % % SUNBURN DATA % This file is a modified version of * http://www.hakank.org/weka/sunburn.arf % additional * * @RELATION 'sunburn'% THIS IS A COMMENT * @ATTRIBUTE 'weight' {blonde, brown, red} * @ATTRIBUTE 'height' {short, average, tall} * @ATTRIBUTE weight {light, average, heavy} * @ATTRIBUTE 'lotion' {yes,no} * @ATTRIBUTE 'burned' {burned, none} % THIS IS A COMMENT * @attribute col_17 INTEGER * @DATA% THIS IS A TEST ?, average,light, no, burned, 1 % THIS IS A COMMENT * blonde, tall, average,yes, none, 2% THIS IS A COMMENT brown, short, * average,yes, none, 3 blonde, short, average,no, burned, 4 red, * average,heavy, no, burned, 5 brown, tall, heavy, no, none, 6 brown, * average,heavy, no, none, 7 blonde, short, light, yes, none, 4 {1 * blonde, 2 average, 3, heavy, 4 none} % THIS IS A SPARSE DATA INSTANCE * SPECIFICATION * * Note that according to the ARFF format, an unknown value for an * attribute is represented by the character "?". In SPMF, if you choose * the format "ARFF", the unknown values will be ommitted during the * conversion. If you want to keep the unknown values, then choose the * alternative format "ARFF_WITH_MISSING_VALUES" for conversion, which * will keep the unknown values. * * The result of the conversion of the previous ARFF file is a file in * SPMF format: * @CONVERTED_FROM_ARFF * @RELATION_NAME=sunburn= * @ATTRIBUTE=weight=ENUMERATION=blonde=brown=red= * @ATTRIBUTE=height=ENUMERATION=short=average=tall= * @ATTRIBUTE=weight=ENUMERATION=light=average=heavy= * @ATTRIBUTE=lotion=ENUMERATION=yes=no= * @ATTRIBUTE=burned=ENUMERATION=burned=none= * @ATTRIBUTE=col_17=INTEGER= * @ITEM=1=height=average * @ITEM=2=weight=light * @ITEM=3=lotion=no * @ITEM=4=burned=burned * @ITEM=5=col_17=1 1 2 3 4 5 * @ITEM=6=weight=blonde * @ITEM=7=height=tall * @ITEM=8=weight=average * @ITEM=9=lotion=yes * @ITEM=10=burned=none * @ITEM=11=col_17=2 6 7 8 9 10 11 * @ITEM=12=weight=brown * @ITEM=13=height=short * @ITEM=14=col_17=3 8 9 10 12 13 14 * @ITEM=15=col_17=4 3 4 6 8 13 15 * @ITEM=16=weight=red * @ITEM=17=weight=heavy * @ITEM=18=col_17=5 1 3 4 16 17 18 * @ITEM=19=col_17=6 3 7 10 12 17 19 * @ITEM=20=col_17=7 1 3 10 12 17 20 2 6 9 10 13 15 * * The first line indicates that this file was obtained by a * conversion from ARFF to SPMF format. The second line * indicates the name of the original relational table * specified in the ARFF file. Each line starting with * "@attribute" defines an attribute in the table, its type * and the possible values that the attribute can take if the * type is "ENUMERATION". For example, the attribute "weight" * is an enumeration that can take a value from "blonde", * "brown" or "red". The lines starting with * "@ITEM= indicates a mapping between a unique ID an an attribute value. For example, the line " * @ITEM=1=height=average * " means that the ID 1 represents the value " * average" for the attribute "height * ". Finally, the lines that are a list of integers separated by spaces each represents a data instances. For example, the line " * 1 2 3 4 * 5" represents the data instance with the value "average * " for the attribute " * height", the value "light" for the attribute " * weight", the value " * no" for the attribute "lotion", the value " * burned" for the attribute "burned" and the value 1" for the * attribute "col_17". * * Note that all lines starting with "@" are metadata that are * not used by the algorithms in SPMF". But this data is kept * so that the results found by the algorithms can be * interpreted. */ class MainTestConvertTransactionDatabaseARFFtoSPMF { public static void main(String[] arg) throws IOException { String inputFile = fileToPath("example.arff"); // the file to be converted in ARFF format String outputFile = ".//output.txt"; // the resulting converted file in SPMF format Formats inputFileformat = Formats.ARFF; // the format of the input file (ARFF) int transactionCount = Integer.MAX_VALUE; // the number of transaction from the input file to be converted // Create a converter TransactionDatabaseConverter converter = new TransactionDatabaseConverter(); // Call the method to convert the input file from ARFF to the SPMF format converter.convert(inputFile, outputFile, inputFileformat, transactionCount); } public static String fileToPath(String filename) throws UnsupportedEncodingException { URL url = MainTestConvertTransactionDatabaseARFFtoSPMF.class .getResource(filename); return java.net.URLDecoder.decode(url.getPath(), "UTF-8"); } }