/*
 * Copyright 2004-2010 Information & Software Engineering Group (188/1)
 *                     Institute of Software Technology and Interactive Systems
 *                     Vienna University of Technology, Austria
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.tuwien.ifs.somtoolbox.apps.helper;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import com.martiansoftware.jsap.JSAPResult;

import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.util.StringUtils;

/**
 * Generates a data info file from a vector file in SOMLib format. The output starts with a
 * <code>$TYPE data_info</code> and a <code>$BASE_DIR</code> header, repeats the <code>$XDIM</code> and
 * <code>$YDIM</code> headers of the input, and then contains one line per input vector holding the URL-encoded
 * label, display name and file name of that vector.
 * <p>
 * This class can be run in standalone mode. If the input file is gzip-compressed, the output will also be written
 * gzip-compressed. The .gz suffix has to be specified manually in order not to alter filenames to something other than
 * intended by the user.
 * </p>
 * <p>
 * <i>Created on Mar 16, 2004</i>
 * </p>
 *
 * @author Michael Dittenbach
 * @version $Id: SOMLibDataInfoGenerator.java 3589 2010-05-21 10:42:01Z mayer $
 */
public class SOMLibDataInfoGenerator {

    /** Name of the logger used for all messages of this class. */
    private static final String LOGGER_NAME = "at.tuwien.ifs.somtoolbox";

    /** Character encoding used for URL-encoding and -decoding of labels and paths. */
    private static final String URL_ENCODING = "UTF-8";

    /**
     * Static method for standalone invocation. The arguments are parsed by {@link OptionFactory} using
     * {@link OptionFactory#OPTIONS_DATA_INFO_GENERATOR}; the values read are the base directory, the common part to be
     * removed from the labels, the input vector file and the data info file. Any exception raised during generation is
     * logged and terminates the JVM with exit code -1.
     *
     * @param args command line arguments, see {@link OptionFactory#OPTIONS_DATA_INFO_GENERATOR}
     */
    public static void main(String[] args) {
        // -b base directory, mand.
        // -r common part to be removed, opt.
        // input vector file
        // data info file

        // register and parse all options
        JSAPResult config = OptionFactory.parseResults(args, OptionFactory.OPTIONS_DATA_INFO_GENERATOR);

        String baseDir = config.getString("baseDir");
        String removeDir = config.getString("removeDir");
        String inputVectorFileName = config.getString("inputVectorFile");
        String dataInfoFileName = config.getString("dataInfoFile");

        try {
            Logger.getLogger(LOGGER_NAME).info("starting data info file generation");
            generateDataInfo(inputVectorFileName, dataInfoFileName, baseDir, removeDir);
        } catch (Exception e) {
            Logger.getLogger(LOGGER_NAME).severe(e.getMessage());
            e.printStackTrace();
            System.exit(-1);
        }
        Logger.getLogger(LOGGER_NAME).info("finished data info file generation");
    }

    /**
     * Static method taking input and output filename as argument. File handling is done in this method and throws a
     * FileNotFoundException if the input file can not be found and an IOException if some other file handling error
     * occurs. If the input file is gzip-compressed, the output will also be written gzip-compressed. The .gz suffix has
     * to be specified manually in order not to alter filenames to something other than intended by the user.
     * <p>
     * Streams opened by this method are released again if a later step of the set-up fails.
     * </p>
     *
     * @param inFileName Name of input file.
     * @param outFileName Name of output file.
     * @param baseDir base directory written to the <code>$BASE_DIR</code> header.
     * @param removeDir common leading part to be removed from the labels to obtain the file names; may be
     *            <code>null</code> if nothing is to be removed.
     * @throws FileNotFoundException if the file does not exist, is a directory rather than a regular file, or for some
     *             other reason cannot be opened for reading.
     * @throws IOException if the output file can not be opened.
     */
    public static void generateDataInfo(String inFileName, String outFileName, String baseDir, String removeDir)
            throws FileNotFoundException, IOException {
        Logger logger = Logger.getLogger(LOGGER_NAME);
        BufferedReader inFile = null;
        boolean gzipped = false;

        FileInputStream rawIn;
        try {
            rawIn = new FileInputStream(inFileName);
        } catch (FileNotFoundException e) {
            throw inputNotFound(inFileName, e);
        }

        try {
            // the GZIPInputStream constructor reads the header and fails for non-gzip content
            inFile = new BufferedReader(new InputStreamReader(new GZIPInputStream(rawIn)));
            logger.info(inFileName + " is gzip compressed. Trying compressed read. Creating compressed output.");
            gzipped = true;
        } catch (IOException e) {
            // the probe stream is of no further use; release its file handle before re-opening the file
            closeQuietly(rawIn);
            logger.info(inFileName
                    + " is not gzip compressed. Trying uncompressed read. Creating uncompressed output.");
            try {
                inFile = new BufferedReader(new FileReader(inFileName));
            } catch (FileNotFoundException e2) {
                throw inputNotFound(inFileName, e2);
            }
        }

        BufferedWriter outFile = null;
        try {
            outFile = openWriter(outFileName, gzipped);
        } catch (IOException e) {
            // do not leave the already opened input file behind
            closeQuietly(inFile);
            IOException wrapped = new IOException("Can not open data info file " + outFileName + ". Aborting.");
            wrapped.initCause(e);
            throw wrapped;
        }

        generateDataInfo(inFile, outFile, baseDir, removeDir);
    }

    /**
     * Static method taking a BufferedReader and BufferedWriter as argument. Both are closed before the method returns.
     * The method exits the program (<code>System.exit(-1)</code>) if the file format is corrupt, if the number of
     * vectors read does not match <code>$XDIM * $YDIM</code>, or if an I/O problem occurs. TODO: This should be
     * weakened in future by throwing a SOMLibVectorFileFormatException.
     * <p>
     * For every line not starting with <code>$</code>, the label is the URL-decoded text after the last space, the
     * display name is the part of the label after its last path separator, and the file name is the part of the label
     * following <code>removeDir</code>. If <code>removeDir</code> is <code>null</code> or does not occur in the label,
     * the label itself is used as file name.
     * </p>
     *
     * @param inReader BufferedReader reading the vector file.
     * @param outWriter BufferedWriter writing the data info file.
     * @param baseDir base directory written to the <code>$BASE_DIR</code> header; a trailing separator is appended if
     *            missing.
     * @param removeDir common part to be removed from the labels; a trailing separator is appended if missing. May be
     *            <code>null</code>.
     */
    public static void generateDataInfo(BufferedReader inReader, BufferedWriter outWriter, String baseDir,
            String removeDir) {
        Logger logger = Logger.getLogger(LOGGER_NAME);

        baseDir = withTrailingSeparator(baseDir);
        if (removeDir != null) {
            removeDir = withTrailingSeparator(removeDir);
        }

        int numVectors = 0; // expected number of vectors, $XDIM * $YDIM
        int vectorsRead = 0; // number of vector lines actually processed

        try {
            outWriter.write("$TYPE data_info");
            outWriter.newLine();
            outWriter.write("$BASE_DIR " + URLEncoder.encode(baseDir, URL_ENCODING));
            outWriter.newLine();

            String line;
            while ((line = inReader.readLine()) != null) {
                if (line.startsWith("$")) {
                    // write fields XDIM and YDIM through without change, all other headers are dropped
                    if (line.startsWith("$XDIM")) {
                        outWriter.write(line);
                        outWriter.newLine();
                        numVectors = Integer.parseInt(headerValue(line));
                    } else if (line.startsWith("$YDIM")) {
                        outWriter.write(line);
                        outWriter.newLine();
                        numVectors *= Integer.parseInt(headerValue(line));
                    } else if (line.startsWith("$VEC_DIM") || line.startsWith("$VECDIM")) {
                        // the dimension itself is not needed, only the presence of a value is verified
                        headerValue(line);
                    }
                } else {
                    String label = URLDecoder.decode(line.substring(line.lastIndexOf(' ') + 1), URL_ENCODING);
                    char sep = getSeparatorFromString(label);
                    String displayName = label.substring(label.lastIndexOf(sep) + 1);
                    String fileName = stripCommonPart(label, removeDir);

                    outWriter.write(URLEncoder.encode(label, URL_ENCODING) + " "
                            + URLEncoder.encode(displayName, URL_ENCODING) + " "
                            + URLEncoder.encode(fileName, URL_ENCODING));
                    outWriter.newLine();
                    vectorsRead++;
                }
            }

            if (vectorsRead != numVectors) {
                logger.severe("Input vector file format corrupt. Incorrect number of vectors - header: " + numVectors
                        + ", read " + vectorsRead + " . Aborting.");
                System.exit(-1);
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Problem writing data info file. Aborting.", e);
            System.exit(-1);
        } finally {
            // close both streams independently so that a failure on the reader does not leave the writer unflushed
            IOException closeFailure = null;
            try {
                inReader.close();
            } catch (IOException e) {
                closeFailure = e;
            }
            try {
                outWriter.close();
            } catch (IOException e) {
                if (closeFailure == null) {
                    closeFailure = e;
                }
            }
            if (closeFailure != null) {
                logger.log(Level.SEVERE, "Problem closing vector files. Aborting.", closeFailure);
                System.exit(-1);
            }
        }
    }

    /**
     * Opens the data info file for writing, gzip-compressed if requested.
     *
     * @param outFileName name of the file to write.
     * @param gzipped whether the output is to be gzip-compressed.
     * @return a writer on the file.
     * @throws IOException if the file can not be opened.
     */
    private static BufferedWriter openWriter(String outFileName, boolean gzipped) throws IOException {
        if (!gzipped) {
            return new BufferedWriter(new FileWriter(outFileName));
        }
        FileOutputStream rawOut = new FileOutputStream(outFileName);
        try {
            return new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(rawOut)));
        } catch (IOException e) {
            closeQuietly(rawOut);
            throw e;
        }
    }

    /**
     * Creates the exception reporting a missing input vector file, keeping the original exception as cause.
     *
     * @param inFileName name of the input file that could not be opened.
     * @param cause the exception raised when opening the file.
     * @return the exception to be thrown.
     */
    private static FileNotFoundException inputNotFound(String inFileName, FileNotFoundException cause) {
        FileNotFoundException notFound = new FileNotFoundException("Input vector file " + inFileName
                + " not found. Aborting.");
        notFound.initCause(cause);
        return notFound;
    }

    /**
     * Closes the given resource during error handling, ignoring any exception raised by the close itself.
     *
     * @param resource the resource to close, may be <code>null</code>.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // a more relevant exception is already being reported by the caller
        }
    }

    /**
     * Returns the second element of a header line split at {@link StringUtils#REGEX_SPACE_OR_TAB}. Logs an error and
     * exits the program if the line has no second element.
     *
     * @param headerLine a line of the vector file starting with <code>$</code>.
     * @return the second element of the line.
     */
    private static String headerValue(String headerLine) {
        String[] lineElements = headerLine.split(StringUtils.REGEX_SPACE_OR_TAB);
        if (lineElements.length <= 1) {
            Logger.getLogger(LOGGER_NAME).severe("Input vector file format corrupt. Aborting.");
            System.exit(-1);
        }
        return lineElements[1];
    }

    /**
     * Removes everything up to and including the first occurrence of <code>removeDir</code> from the label.
     *
     * @param label the decoded label of a vector.
     * @param removeDir the part to remove, may be <code>null</code>.
     * @return the remainder of the label, or the unchanged label if <code>removeDir</code> is <code>null</code> or not
     *         contained in the label.
     */
    private static String stripCommonPart(String label, String removeDir) {
        if (removeDir == null) {
            return label;
        }
        int pos = label.indexOf(removeDir);
        if (pos == -1) {
            return label;
        }
        return label.substring(pos + removeDir.length());
    }

    /**
     * Appends the separator used in the given directory name, unless the name already ends with it.
     *
     * @param dir the directory name.
     * @return the directory name ending with its separator.
     */
    private static String withTrailingSeparator(String dir) {
        String sep = String.valueOf(getSeparatorFromString(dir));
        return dir.endsWith(sep) ? dir : dir + sep;
    }

    /**
     * @param path the path to calculate the separator of.
     * @return the separator character: '\\' if the path contains a backslash but no forward slash, '/' otherwise.
     */
    private static char getSeparatorFromString(String path) {
        if (path.indexOf('/') == -1 && path.indexOf('\\') != -1) {
            return '\\';
        }
        return '/';
    }

}