/* * Copyright 2004-2010 Information & Software Engineering Group (188/1) * Institute of Software Technology and Interactive Systems * Vienna University of Technology, Austria * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package at.tuwien.ifs.somtoolbox.data; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.logging.Logger; import cern.colt.matrix.DoubleMatrix1D; import at.tuwien.ifs.somtoolbox.SOMToolboxException; import at.tuwien.ifs.somtoolbox.input.SOMLibFileFormatException; import at.tuwien.ifs.somtoolbox.util.FileUtils; import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter; import at.tuwien.ifs.somtoolbox.util.StringUtils; /** * This class gathers methods to write certain {@link InputData}, {@link TemplateVector} and * {@link SOMLibClassInformation} in a certain number of file formats, such as SOMLib, WEKA ARFF, SOMPak and ESOM. * * @author Rudolf Mayer * @version $Id: InputDataWriter.java 3848 2010-10-12 12:29:26Z mayer $ */ public class InputDataWriter { /** Supported Output File Format Types */ public static final String[] OUTPUT_FILE_FORMAT_TYPES = { SOMLibSparseInputData.getFormatName(), ARFFFormatInputData.getFormatName(), RandomAccessFileSOMLibInputData.getFormatName(), ESOMInputData.getFormatName(), SOMPAKInputData.getFormatName(), "Orange", "CSV" }; /** * Writes the data to <a href="http://databionic-esom.sourceforge.net/user.html#File_formats">ESOM lrn/cls * format</a>. */ public static void writeAsESOM(InputData data, String fileName) throws IOException, SOMLibFileFormatException { String fileNameLrn = StringUtils.appendExtension(fileName, ".lrn"); // write the header, see http://databionic-esom.sourceforge.net/user.html#File_formats Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as ESOM file to '" + fileNameLrn + "'."); PrintWriter writer = FileUtils.openFileForWriting("ESOM lrn", fileNameLrn, false); if (org.apache.commons.lang.StringUtils.isNotBlank(data.getDataSource())) { writer.println("# Converted from " + data.getDataSource() + "."); } writer.println("% " + data.numVectors()); writer.println("% " + (data.dim() + 1)); writer.println("% 9" + StringUtils.repeatString(data.dim(), "\t 1")); writer.println("% Key\t" + StringUtils.interleave(data.templateVector().getLabels(), "\t")); for (int i = 0; i < data.numVectors(); i++) { writer.print(String.valueOf(i + 1)); // index in the lrn file will start with 1, make sure this is in synch // with ESOMMapOutputter for (int j = 0; j < data.dim(); j++) { writer.print("\t" + data.getValue(i, j)); } writer.println(); } writer.close(); // write the names file String fileNameNames = StringUtils.appendOrReplaceExtension(fileName, ".lrn", ".names"); Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing names as ESOM file to '" + fileNameNames + "'."); writer = FileUtils.openFileForWriting("ESOM names", fileNameNames, false); if (org.apache.commons.lang.StringUtils.isNotBlank(data.getDataSource())) { writer.println("# Converted from " + data.getDataSource() + "."); } writer.println("% " + data.numVectors()); for (int i = 0; i < data.numVectors(); i++) { // index in the names file starts at 1, make sure this is in synch with lrn file and ESOMMapOutputter writer.println(String.valueOf(i + 1) + "\t" + data.getLabel(i)); } writer.close(); if (data.classInformation() != null) { // guess a good filename String fileNameCls = StringUtils.appendOrReplaceExtension(fileName, ".lrn", ".cls"); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Writing class info as ESOM file to '" + fileNameCls + "'."); writeAsESOM(data.classInformation(), fileNameCls); } } /** * Writes the class information as <a href="http://databionic-esom.sourceforge.net/user.html#File_formats">ESOM * cls</a> file. */ public static void writeAsESOM(SOMLibClassInformation classInfo, String fileName) throws IOException, SOMLibFileFormatException { PrintWriter writer = FileUtils.openFileForWriting("ESOM class info", fileName); writer.println("% " + classInfo.numData); // write class index => class name mapping in header for (int i = 0; i < classInfo.numClasses(); i++) { writer.println("% " + i + " " + classInfo.getClassName(i)); } for (String element : classInfo.getDataNames()) { writer.println(element + "\t" + classInfo.getClassIndexForInput(element)); } writer.flush(); writer.close(); } public static void writeAsSOMLib(InputData data, String fileName) throws IOException { PrintWriter writer = FileUtils.openFileForWriting("Input vector file", fileName, true); InputDataWriter.writeHeaderToFile(writer, data.numVectors(), data.dim()); for (int i = 0; i < data.numVectors(); i++) { InputDatum inputDatum = data.getInputDatum(i); InputDataWriter.writeInputDatumToFile(writer, inputDatum); } writer.flush(); writer.close(); } /** Writes the class information to a file. */ public static void writeAsSOMLib(InputData data, TemplateVector templateVector, SOMLibClassInformation classInformation, boolean tabSeparatedClassFile, String basicFileName) throws IOException, SOMLibFileFormatException { writeAsSOMLib(data, basicFileName + ".vec"); if (templateVector != null) { InputDataWriter.writeAsSOMLib(templateVector, basicFileName + ".tv"); } if (classInformation != null) { if (tabSeparatedClassFile) { InputDataWriter.writeToFileTabSeparated(classInformation, basicFileName + ".cls"); } else { InputDataWriter.writeAsSOMLib(classInformation, basicFileName + ".cls"); } } } /** Writes the class information to a file. */ public static void writeAsSOMLib(InputData data, TemplateVector templateVector, SOMLibClassInformation classInformation, String basicFileName) throws IOException, SOMLibFileFormatException { writeAsSOMLib(data, templateVector, classInformation, false, basicFileName); } /** Writes the class information to a file in SOMLib format. */ public static void writeAsSOMLib(SOMLibClassInformation classInfo, String fileName) throws IOException, SOMLibFileFormatException { PrintWriter writer = FileUtils.openFileForWriting("SOMLib class info", fileName); writer.println("$TYPE class_information"); writer.println("$NUM_CLASSES " + classInfo.numClasses()); writer.write("$CLASS_NAMES "); for (int i = 0; i < classInfo.numClasses(); i++) { writer.write(classInfo.getClassName(i)); if (i + 1 < classInfo.numClasses()) { writer.write(" "); } } writer.println(); writer.println("$XDIM 2"); writer.println("$YDIM " + classInfo.numData); for (String element : classInfo.getDataNames()) { writer.println(element + " " + classInfo.getClassName(element)); } writer.flush(); writer.close(); } /** Writes the class information to a file in SOMLib format. */ public static void writeAsSOMLib(HashMap<String, String> classInfo, HashSet<String> classNames, String fileName) throws IOException, SOMLibFileFormatException { ArrayList<String> classNamesList = new ArrayList<String>(classNames); Collections.sort(classNamesList); PrintWriter writer = FileUtils.openFileForWriting("SOMLib class info", fileName); writer.println("$TYPE class_information"); writer.println("$NUM_CLASSES " + classNames.size()); writer.println("$CLASS_NAMES " + StringUtils.toString(classNamesList, "", "", " ")); writer.println("$XDIM 2"); writer.println("$YDIM " + classInfo.size()); for (String key : classInfo.keySet()) { writer.println(key + " " + classNamesList.indexOf(classInfo.get(key))); } writer.flush(); writer.close(); } public static void writeAsSOMLib(TemplateVector tv, String fileName) throws IOException { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Start writing new template vector to '" + fileName + "'."); PrintWriter writer = FileUtils.openFileForWriting("Template Vector", fileName, fileName.endsWith(".gz")); writeTempplateHeaderToFile(writer, fileName, tv.numVectors(), tv.dim(), tv.numinfo()); for (int i = 0; i < tv.dim(); i++) { writeElementToFile(writer, i, tv.getElement(i)); } writer.flush(); writer.close(); Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Finished."); } public static void writeElementToFile(PrintWriter writer, int i, TemplateVectorElement e) { StringBuffer b = new StringBuffer(); int numinfo = e.getTemplateVector().numinfo(); if (numinfo > 2) { b.append(" ").append(e.getDocumentFrequency()); } if (numinfo > 3) { b.append(" ").append(e.getCollectionTermFrequency()); } if (numinfo > 4) { b.append(" ").append(e.getMinimumTermFrequency()); } if (numinfo > 5) { b.append(" ").append(e.getMaximumTermFrequency()); } if (numinfo > 6) { b.append(" ").append(e.getMeanTermFrequency()); } if (e.getComment() != null) { b.append(" ").append(e.getComment()); } writer.println(i + " " + e.getLabel() + b.toString()); } /** * Writes input data in the SOMPAK format (see * http://www.cis.hut.fi/projects/somtoolbox/package/docs2/som_read_data.html) */ public static void writeAsSOMPAK(InputData data, String fileName) throws IOException { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as SOMPAK file to '" + fileName + "'."); PrintWriter writer = FileUtils.openFileForWriting("SOMPAK data", fileName, false); // number of dimensions first, simply the number writer.println(data.dim()); // now all component names TemplateVector tv = data.templateVector(); if (tv == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Template vector not loaded - creating a generic one."); tv = new SOMLibTemplateVector(data.numVectors(), data.dim()); } writer.println(SOMPAKInputData.INDICATOR_COMPONENTS + " " + StringUtils.toString(tv.getLabels(), "", "", " ")); // now all data, appended by the class name for (int i = 0; i < data.numVectors(); i++) { for (int j = 0; j < data.dim(); j++) { writer.print(data.getValue(i, j)); if (j + 1 < data.dim()) { writer.print(" "); } } if (data.classInformation() != null) { writer.print(" " + data.classInformation().getClassName(i)); } writer.println(); } writer.close(); } /** Writes the data to <a href="http://www.cs.waikato.ac.nz/~ml/weka/arff.html">Weka ARFF format</a>. */ public static void writeAsWekaARFF(InputData data, String fileName, boolean writeInstanceNames, boolean skipInputsWithoutClass) throws IOException, SOMToolboxException { if (data.classInformation() == null) { throw new SOMToolboxException("Class Information File needed for WEKA ARFF writing"); } fileName = StringUtils.ensureExtension(fileName, ".arff"); Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as ARFF file to '" + fileName + "'."); PrintWriter writer = FileUtils.openFileForWriting("Weka ARFF", fileName, false); TemplateVector tv = data.templateVector(); if (tv == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Template vector not loaded - creating a generic one."); tv = new SOMLibTemplateVector(data.numVectors(), data.dim()); } String relation = fileName.substring(0, fileName.length() - 4); writer.println("@RELATION " + relation + "\n"); for (int i = 0; i < tv.dim(); i++) { writer.println("@ATTRIBUTE " + tv.getLabel(i) + " NUMERIC"); } if (writeInstanceNames) { writer.println("@ATTRIBUTE instanceName STRING"); } writer.println(getWekaClassHeader(data.classInformation().classNames())); writer.println("@DATA"); int skipCounter = 0; StdErrProgressWriter progress = new StdErrProgressWriter(data.numVectors(), "Writing vector ", data.numVectors() / 10); for (int i = 0; i < data.numVectors(); i++) { InputDatum inputDatum = data.getInputDatum(i); if (skipInputsWithoutClass && !data.classInformation().hasClassAssignmentForName(inputDatum.getLabel())) { skipCounter++; Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Skipping datum '" + inputDatum.getLabel() + "', as it has no class assigned; skipped " + skipCounter + " so far."); continue; } DoubleMatrix1D vector = inputDatum.getVector(); for (int j = 0; j < data.dim(); j++) { writer.print(vector.get(j) + ","); } if (writeInstanceNames) { writer.print("'" + StringUtils.escapeForWeka(inputDatum.getLabel()) + "',"); } writer.println("'" + data.classInformation().getClassName(inputDatum.getLabel()) + "'"); progress.progress(); } writer.flush(); writer.close(); } public static String getWekaClassHeader(String[] classNames) { String classNamesString = ""; for (String className : classNames) { if (classNamesString.length() > 0) { classNamesString += ","; } classNamesString += "'" + StringUtils.escapeClassNameForWeka(className) + "'"; } String x = "@ATTRIBUTE class {" + classNamesString + "}"; return x; } /** * Writes input data in the tab-separated format used by the Orange data mining toolkit (see * http://www.ailab.si/orange/) */ public static void writeAsOrange(InputData data, String fileName) throws IOException { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as Orange file to '" + fileName + "'."); PrintWriter writer = FileUtils.openFileForWriting("Orange data", fileName, false); TemplateVector tv = data.templateVector(); if (tv == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Template vector not loaded - creating a generic one."); tv = new SOMLibTemplateVector(data.numVectors(), data.dim()); } SOMLibClassInformation classInformation = data.classInformation(); boolean haveClassInfo = classInformation != null; /* - first the tab-separated names of the features - then the types of the features - and then the indicator whether a feature is the class assignment e.g. for IRIS: sepallength sepalwidth petallength petalwidth class continuous continuous continuous continuous discrete class */ // row 1: tab-separated label names writer.print(StringUtils.toString(tv.getLabels(), "", "", "\t")); if (haveClassInfo) { // and optionally the class writer.print("\tclass"); } writer.println(); // row 2: data types for (int i = 0; i < tv.dim(); i++) { writer.print("continuous"); // all are continuous if (i + 1 < tv.dim()) { writer.print("\t"); } } if (haveClassInfo) { writer.print("\tdiscrete"); // just the class is discrete } writer.println(); // row 3: indicating options to the attributes writer.print(StringUtils.repeatString(tv.dim() - 1, "\t")); if (haveClassInfo) { writer.print("\tclass"); // the class attribute } writer.println(); // now the data, tab separated, and optionally with the class assignment // now all data, appended by the class name for (int i = 0; i < data.numVectors(); i++) { for (int j = 0; j < data.dim(); j++) { writer.print(data.getValue(i, j)); if (j + 1 < data.dim()) { writer.print("\t"); } } if (haveClassInfo) { writer.print("\t" + classInformation.getClassName(i)); } writer.println(); } writer.close(); } public static void writeAsCSV(InputData data, String fileName) throws IOException { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as CVS to '" + fileName + "'."); PrintWriter writer = FileUtils.openFileForWriting("CVS", fileName, false); TemplateVector tv = data.templateVector(); if (tv == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Template vector not loaded - creating a generic one."); tv = new SOMLibTemplateVector(data.numVectors(), data.dim()); } SOMLibClassInformation classInformation = data.classInformation(); boolean haveClassInfo = classInformation != null; String separator = "\t"; // header: tab-separated label names writer.print(StringUtils.toString(tv.getLabels(), "", "", separator, "\"")); if (haveClassInfo) { // and optionally the class writer.print(separator + "class"); } writer.println(); // data: tab separated, optionally with the class assignment for (int i = 0; i < data.numVectors(); i++) { for (int j = 0; j < data.dim(); j++) { writer.print(data.getValue(i, j)); if (j + 1 < data.dim()) { writer.print(separator); } } if (haveClassInfo) { writer.print(separator + classInformation.getClassName(i)); } writer.println(); } writer.close(); } public static void writeHeaderToFile(PrintWriter writer, int numVectors, int dim) throws IOException { writer.println("$TYPE vec"); writer.println("$XDIM " + numVectors); writer.println("$YDIM 1"); writer.println("$VEC_DIM " + dim); } public static void writeInputDatumToFile(PrintWriter writer, InputDatum inputDatum) throws IOException { InputDataWriter.writeInputDatumToFile(writer, inputDatum.getLabel(), inputDatum.getVector()); } public static void writeInputDatumToFile(PrintWriter writer, String label, DoubleMatrix1D vector) throws IOException { for (int i = 0; i < vector.size(); i++) { if (!Double.isNaN(vector.get(i))) { writer.write(vector.get(i) + " "); } else { writer.write("? "); } } writer.println(label); } public static void writeTempplateHeaderToFile(PrintWriter writer, String fileName, final int numVectors, final int dim, final int numInfo) throws IOException { writer.println("$TYPE template"); writer.println("$XDIM " + numInfo); writer.println("$YDIM " + numVectors); writer.println("$VEC_DIM " + dim); } /** Writes the class information to a tab-separated file. */ public static void writeToFileTabSeparated(SOMLibClassInformation classInfo, String fileName) throws IOException, SOMLibFileFormatException { PrintWriter writer = FileUtils.openFileForWriting("Tab-separated class info", fileName); for (String element : classInfo.getDataNames()) { writer.println(element + "\t" + classInfo.getClassName(element)); } writer.flush(); writer.close(); } public static void write(String fName, InputData data, String outputFormat, boolean tabSeparatedClassFile, boolean skipInstanceNames, boolean skipInputsWithoutClass) throws IOException, SOMToolboxException { Logger logger = Logger.getLogger("at.tuwien.ifs.somtoolbox"); if (outputFormat.equals(AbstractSOMLibSparseInputData.getFormatName())) { logger.info("Writing SOMLib Data Format."); InputDataWriter.writeAsSOMLib(data, data.templateVector(), data.classInformation(), tabSeparatedClassFile, fName + ".vec"); } else if (outputFormat.equals(RandomAccessFileSOMLibInputData.getFormatName())) { logger.info("Writing Random Access Binary Data Format."); RandomAccessFileSOMLibInputData.write(data, fName); } else if (outputFormat.equals(ARFFFormatInputData.getFormatName())) { logger.info("Writing ARFF Data Format, skipping instance names: " + skipInstanceNames); InputDataWriter.writeAsWekaARFF(data, fName, !skipInstanceNames, skipInputsWithoutClass); } else if (outputFormat.equals(ESOMInputData.getFormatName())) { logger.info("Writing ESOM Data Format"); InputDataWriter.writeAsESOM(data, fName); } else if (outputFormat.equals(SOMPAKInputData.getFormatName())) { logger.info("Writing SOMPAK Data Format"); InputDataWriter.writeAsSOMPAK(data, fName); } else if (outputFormat.equals("Orange")) { logger.info("Writing Orange Data Format"); InputDataWriter.writeAsOrange(data, fName); } else if (outputFormat.equals("CSV")) { logger.info("Writing CSV Format"); InputDataWriter.writeAsCSV(data, fName); } else { // check for logical programming mistakes, basically throw new SOMToolboxException("Didn't write format of type '" + outputFormat + "', most likely a programming error."); } } }