/* * Copyright 2004-2010 Information & Software Engineering Group (188/1) * Institute of Software Technology and Interactive Systems * Vienna University of Technology, Austria * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package at.tuwien.ifs.somtoolbox.data.distance; import java.io.IOException; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.util.logging.Logger; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import at.tuwien.ifs.somtoolbox.SOMToolboxException; import at.tuwien.ifs.somtoolbox.apps.SOMToolboxApp; import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory; import at.tuwien.ifs.somtoolbox.data.InputData; import at.tuwien.ifs.somtoolbox.data.InputDatum; import at.tuwien.ifs.somtoolbox.data.SOMLibSparseInputData; import at.tuwien.ifs.somtoolbox.input.SOMLibMapDescription; import at.tuwien.ifs.somtoolbox.layers.metrics.AbstractMetric; import at.tuwien.ifs.somtoolbox.layers.metrics.DistanceMetric; import at.tuwien.ifs.somtoolbox.layers.metrics.MetricException; import at.tuwien.ifs.somtoolbox.util.FileUtils; import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter; import at.tuwien.ifs.somtoolbox.util.StringUtils; /** * Writes the input matrix for a given data set to an ASCII or binary format. A distance matrix is of size * <code>n*n</code>, where <code>n</code> is the number of elements. The matrix is symmetric, i.e. the upper-right and * lower-left halves contain the same values. The diagonal contains the distances of one element to itself, and is thus * always 0. <br/> * Supported formats are: * <ul> * <li>Plain ASCII, containing all values in the matrix</li> * <li>SOMlib ASCII, containing just the upper-right half of the symmetric matrix, skipping the diagonal (can be read by * {@link AbstractMemoryInputVectorDistanceMatrix})</li> * <li>Orange (http://www.ailab.si/orange/) ASCII, containing the diagonal and the lower-left half</li> * <li>Binary, storing the same information as SOMLib ASCII, but in a binary format for random access (can be read by * {@link RandomAccessFileInputVectorDistanceMatrix})</li> * </ul> * * @author Rudolf Mayer * @version $Id: DistanceMatrixWriter.java 3869 2010-10-21 15:56:09Z mayer $ */ public class DistanceMatrixWriter implements SOMToolboxApp { private static final FlaggedOption OPT_CLASS_INFORMATION_FILE = OptionFactory.getOptClassInformationFile(false); /** Format for the Orange data mining tool (http://www.ailab.si/orange/) */ private static final String ORANGE = "Orange"; /** Binary format (for {@link RandomAccessFile} */ private static final String BINARY = "Binary"; /** SOMLib format (half-matrix) */ private static final String SOM_LIB = "SOMLib"; private static final String PLAIN = "plain"; public static String[] OUTPUT_FORMATS = { SOM_LIB, PLAIN, BINARY, ORANGE }; public static final Type APPLICATION_TYPE = Type.Helper; public static final String DESCRIPTION = "Writes a distance matrix for the given data"; public static final String LONG_DESCRIPTION = "Writes a distance matrix for the given data, in ASCII or binary format";; public static final Parameter[] OPTIONS = new Parameter[] { OptionFactory.getOptInputVectorFile(true), OPT_CLASS_INFORMATION_FILE, OptionFactory.getOptMetric(false), OptionFactory.getOptMetricParams(false), OptionFactory.getOptOutputFileName(true), OptionFactory.getOptOutputFormat(false, SOM_LIB, DistanceMatrixWriter.OUTPUT_FORMATS) }; public static void main(String[] args) throws SOMToolboxException, IOException { JSAPResult config = OptionFactory.parseResults(args, OPTIONS); String inputVectorFileName = OptionFactory.getFilePath(config, "inputVectorFile"); String classInfoFile = OptionFactory.getFilePath(config, "classInformationFile"); String outputFileName = OptionFactory.getFilePath(config, "output"); String metricName = config.getString("metric"); String metricParams = config.getString("metricParams"); DistanceMetric metric = AbstractMetric.instantiateNice(metricName); metric.setMetricParams(metricParams); String outputFormat = config.getString("outputFormat"); InputData data = new SOMLibSparseInputData(inputVectorFileName, null, classInfoFile); if (outputFormat.equals(SOM_LIB)) { writeSOMLibFileInputVectorDistanceMatrix(data, outputFileName, metric); } else if (outputFormat.equals(ORANGE)) { if (classInfoFile == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "No class information present - writing Orange Distance Matrix files with input labels"); } writeOrangeFileInputVectorDistanceMatrix(data, outputFileName, metric); } else if (outputFormat.equals(PLAIN)) {// full format, no headers writePlainFileInputVectorDistanceMatrix(data, outputFileName, metric); } else { // binary writeRandomAccessFileInputVectorDistanceMatrix(data, outputFileName, metric); } } /** Write input distance matrix to ASCII file, computing distances on the fly. */ public static void writeSOMLibFileInputVectorDistanceMatrix(InputData data, String fileName, DistanceMetric metric) throws IOException, MetricException { writeSOMLibFileInputVectorDistanceMatrix(data, fileName, metric, true); } /** Write input distance matrix to ASCII file, computing distances on the fly. */ public static void writeSOMLibFileInputVectorDistanceMatrix(InputData data, String fileName, DistanceMetric metric, boolean gzip) throws IOException, MetricException { int numVec = data.numVectors(); PrintWriter out = printSOMLibHeader(numVec, fileName, metric, gzip); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Storing input distance matrix with metric " + metric + " to ASCII file " + fileName); StdErrProgressWriter progress = new StdErrProgressWriter(numVec, "Calculating distances for vector ", 1); for (int i = 0; i < numVec; i++) { for (int j = i + 1; j < numVec; j++) { out.print(metric.distance(data.getInputDatum(i), data.getInputDatum(j))); if (j + 1 < numVec) { out.print(" "); } } if (i + 1 < numVec) { // only print newline if lines are not empty out.println(); } progress.progress(); } out.flush(); out.close(); } /** Write input distance matrix to an ASCII file in plain format, computing distances on the fly. */ public static void writePlainFileInputVectorDistanceMatrix(InputData data, String fileName, DistanceMetric metric) throws IOException, MetricException { int numVec = data.numVectors(); PrintWriter out = FileUtils.openFileForWriting(AbstractMemoryInputVectorDistanceMatrix.FILE_TYPE, fileName); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Storing input distance matrix with metric " + metric + " to plain distance file " + fileName); StdErrProgressWriter progress = new StdErrProgressWriter(numVec, "Calculating distances for vector ", 1); for (int i = 0; i < numVec; i++) { InputDatum datum = data.getInputDatum(i); for (int j = 0; j < numVec; j++) { if (i == j) { out.print("0"); } else { out.print(StringUtils.format(metric.distance(datum, data.getInputDatum(j)), 8)); } if (j + 1 < numVec) { out.print(" "); } } out.println(); progress.progress(); } out.flush(); out.close(); } /** * Write input distance matrix to an ASCII file for the Orange data mining toolkit ((http://www.ailab.si/orange/), * computing distances on the fly. */ public static void writeOrangeFileInputVectorDistanceMatrix(InputData data, String fileName, DistanceMetric metric) throws IOException, MetricException { int numVec = data.numVectors(); PrintWriter out = FileUtils.openFileForWriting(AbstractMemoryInputVectorDistanceMatrix.FILE_TYPE, fileName); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Storing input distance matrix with metric " + metric + " to Orange distance file " + fileName); StdErrProgressWriter progress = new StdErrProgressWriter(numVec, "Calculating distances for vector ", 1); out.println(data.numVectors() + " labeled"); for (int i = 0; i < numVec; i++) { InputDatum datum = data.getInputDatum(i); if (data.classInformation() != null) { out.print(data.classInformation().getClassName(i)); } else { out.print(data.getLabel(i)); } out.print("\t"); for (int j = 0; j < i; j++) { out.print(StringUtils.format(metric.distance(datum, data.getInputDatum(j)), 8)); if (j + 1 < numVec) { out.print("\t"); } } out.print("0.000000\n"); progress.progress(); } out.flush(); out.close(); } /** Write pre-calculated input distance matrix to an ASCII file in SOMLib format. */ public static void writeSOMLibFileInputVectorDistanceMatrix(double[][] distances, String fileName, DistanceMetric metric, boolean gzip) throws IOException, MetricException { int numVec = distances[0].length; PrintWriter out = printSOMLibHeader(distances.length, fileName, metric, gzip); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Storing pre-calculated input distance matrix with metric " + metric + " to ASCII file " + fileName); StdErrProgressWriter progress = new StdErrProgressWriter(numVec, "Calculating distances for vector ", 1); for (int i = 0; i < numVec; i++) { for (int j = i + 1; j < distances[i].length; j++) { out.print(distances[i][j]); if (j + 1 < distances[i].length) { out.print(" "); } } if (i + 1 < distances[i].length) { // only print newline if lines are not empty out.println(); } progress.progress(); } out.flush(); out.close(); } private static PrintWriter printSOMLibHeader(int numVectors, String fileName, DistanceMetric metric, boolean gzip) throws IOException { final PrintWriter out = FileUtils.openFileForWriting(AbstractMemoryInputVectorDistanceMatrix.FILE_TYPE, fileName, gzip); out.println("$NUM_VECTORS " + numVectors); out.println(SOMLibMapDescription.METRIC + " " + metric.getClass().getCanonicalName()); return out; } /** Write input distance matrix to a binary file, computing distances on the fly. */ public static void writeRandomAccessFileInputVectorDistanceMatrix(InputData data, String fileName, DistanceMetric metric) throws IOException, MetricException { int numVec = data.numVectors(); RandomAccessFile file = new RandomAccessFile(fileName, "rw"); file.writeInt(numVec); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Storing input distance matrix with metric " + metric + " to BINARY file " + fileName); StdErrProgressWriter progress = new StdErrProgressWriter(numVec, "Calculating distances for vector ", 1); for (int i = 0; i < numVec; i++) { for (int j = i + 1; j < numVec; j++) { file.writeDouble(metric.distance(data.getInputDatum(i), data.getInputDatum(j))); } progress.progress(); } file.writeChars(metric.getClass().getCanonicalName().trim()); file.close(); } /** Write pre-calculated input distance matrix to a binary file. */ public static void writeRandomAccessFileInputVectorDistanceMatrix(double[][] distances, String fileName, DistanceMetric metric) throws IOException, MetricException { int numVec = distances[0].length; RandomAccessFile file = new RandomAccessFile(fileName, "rw"); Logger.getLogger("at.tuwien.ifs.somtoolbox").info( "Storing pre-calculated input distance matrix with metric " + metric + " to file " + fileName); StdErrProgressWriter progress = new StdErrProgressWriter(numVec, "Calculating distances for vector ", 1); file.writeInt(numVec); for (int i = 0; i < numVec; i++) { for (int j = i + 1; j < distances[j].length; j++) { file.writeDouble(distances[i][j]); } progress.progress(); } file.writeChars(metric.getClass().getCanonicalName().trim()); file.close(); } }