/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * ConverterCLUS.java * Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece */ package mulan.data; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.Instance; import weka.core.Instances; /** * <p>Class that converts a dataset that is originally in the format of the * <a href="http://www.cs.kuleuven.be/~dtai/clus/">Clus system</a> to a format * that is suitable for Mulan. An arff and an xml file are created.</p> * <p>The arff file contains the original dataset with all labels converted to * separate attributes and properly converted instances. The xml file contains * the hierarchy of the labels.</p> * * @author George Saridis * @author Grigorios Tsoumakas */ public class ConverterCLUS { /** * Converts the original dataset to mulan compatible dataset. * * @param sourceFilename the source file name * @param arffFilename the converted arff name * @param xmlFilename the xml name * @throws java.lang.Exception */ public static void convert(String sourceFilename, String arffFilename, String xmlFilename) throws Exception { String line; try { BufferedReader brInput = new BufferedReader(new FileReader(sourceFilename)); String relationName = null; ArrayList<Attribute> attInfo = new ArrayList<Attribute>(); Instances data = null; int numAttributes = 0; String[] labelNames = null; while ((line = brInput.readLine()) != null) { if (line.startsWith("@RELATION")) { relationName = line.replace("@RELATION ", "").replaceAll("'", "").trim(); continue; } if (line.startsWith("@ATTRIBUTE ")) { String tokens[] = line.split("\\s+"); Attribute att; if (line.startsWith("@ATTRIBUTE class")) { labelNames = tokens[3].split(","); for (int i = 0; i < labelNames.length; i++) { ArrayList<String> labelValues = new ArrayList<String>(); labelValues.add("0"); labelValues.add("1"); att = new Attribute(labelNames[i], labelValues); attInfo.add(att); } } else { numAttributes++; if (tokens[2].equals("numeric")) { att = new Attribute(tokens[1]); } else { ArrayList<String> nominalValues = new ArrayList<String>(); tokens[2].substring(1, tokens[2].length() - 1); String[] nominalTokens = tokens[2].substring(1, tokens[2].length() - 1).split(","); for (int i = 0; i < nominalTokens.length; i++) { nominalValues.add(nominalTokens[i]); } att = new Attribute(tokens[1], nominalValues); } attInfo.add(att); } continue; } if (line.toLowerCase().startsWith("@data")) { data = new Instances(relationName, attInfo, 0); while ((line = brInput.readLine()) != null) { // fill data String[] tokens = line.split(","); double[] values = new double[attInfo.size()]; for (int i = 0; i < numAttributes; i++) { Attribute att = (Attribute) attInfo.get(i); if (att.isNumeric()) { values[i] = Double.parseDouble(tokens[i]); } else { values[i] = att.indexOfValue(tokens[i]); } } String[] labels = tokens[numAttributes].split("@"); // fill class values for (int j = 0; j < labels.length; j++) { String[] splitedLabels = labels[j].split("/"); String attrName = splitedLabels[0]; Attribute att = data.attribute(attrName); values[attInfo.indexOf(att)] = 1; for (int k = 1; k < splitedLabels.length; k++) { attrName = attrName + "/" + splitedLabels[k]; att = data.attribute(attrName); values[attInfo.indexOf(att)] = 1; } } Instance instance = new DenseInstance(1, values); data.add(instance); } } } BufferedWriter writer; writer = new BufferedWriter(new FileWriter(arffFilename)); writer.write(data.toString()); writer.close(); // write xml file writer = new BufferedWriter(new FileWriter(xmlFilename)); writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"); writer.write("<labels xmlns=\"http://mulan.sourceforge.net/labels\">\n"); writer.write("<label name=\"" + labelNames[0] + "\">"); int depth = 0; for (int i = 1; i < labelNames.length; i++) { int difSlashes = countSlashes(labelNames[i]) - countSlashes(labelNames[i - 1]); // child if (difSlashes == 1) { depth++; writer.write("\n"); for (int j = 0; j < depth; j++) { writer.write("\t"); } writer.write("<label name=\"" + labelNames[i] + "\">"); } // sibling if (difSlashes == 0) { writer.write("</label>\n"); for (int j = 0; j < depth; j++) { writer.write("\t"); } writer.write("<label name=\"" + labelNames[i] + "\">"); } // ancestor if (difSlashes < 0) { writer.write("</label>\n"); for (int j = 0; j < Math.abs(difSlashes); j++) { depth--; for (int k = 0; k < depth; k++) { writer.write("\t"); } writer.write("</label>\n"); } for (int j = 0; j < depth; j++) { writer.write("\t"); } writer.write("<label name=\"" + labelNames[i] + "\">"); } } writer.write("</label>\n"); while (depth > 0) { for (int k = 0; k < depth; k++) { writer.write("\t"); } writer.write("</label>\n"); depth--; } writer.write("</labels>"); writer.close(); } catch (IOException ioEx) { ioEx.printStackTrace(); } } private static int countSlashes(String label) { int counter = 0; for (int i = 0; i < label.length(); i++) { if (label.charAt(i) == '/') { counter++; } } return counter; } }