/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.data;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import at.tuwien.ifs.somtoolbox.apps.SOMToolboxApp;
import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.util.StringUtils;
/**
* Handles the removal of zero vectors in vector files in SOMLib format. This class can be run in standalone mode taking
* two arguments, i.e. input and output file. If the input file is gzip-compressed, the output will also be written
* gzip-compressed. The .gz suffix has to be specified manually in order not to alter filenames to something other than
* intended by the user.
* <p>
* <i>Created on Mar 16, 2004</i>
* </p>
*
* @author Michael Dittenbach
* @version $Id: SOMLibZeroVectorRemover.java 3682 2010-07-15 09:12:22Z frank $
*/
public class SOMLibZeroVectorRemover implements SOMToolboxApp {

    public static final Parameter[] OPTIONS = new Parameter[] { OptionFactory.getOptInputFileName(),
            OptionFactory.getOptOutputVector() };

    private static final String ERROR_FILE_FORMAT_CORRUPT = "Input vector file format corrupt. Aborting.";

    public static final String DESCRIPTION = "Removes zero vectors (i.e. with 0 in all their components) from files in SOMLib format";

    public static final String LONG_DESCRIPTION = DESCRIPTION;

    public static final Type APPLICATION_TYPE = Type.Helper;

    /** Name of the logger all messages of this class are written to. */
    private static final String LOGGER_NAME = "at.tuwien.ifs.somtoolbox";

    /** Prefix of the temporary file that buffers the retained vectors. */
    private static final String TMP_FILE_PREFIX = "somtoolboxzerotmp.";

    /**
     * Static method for standalone invocation. Parses the arguments against {@link #OPTIONS}, reads the values
     * registered as "input" and "output" and delegates to {@link #removeZeroVectors(String, String)}.
     * 
     * @param args Usage: input-filename output-filename
     * @throws IllegalArgumentException if the removal fails for any reason; the original exception is attached as
     *             cause.
     */
    public static void main(String[] args) {
        // register and parse all options
        JSAPResult config = OptionFactory.parseResults(args, OPTIONS);
        String inputFileName = config.getString("input");
        String outputFileName = config.getString("output");
        try {
            log().info("starting removal of vectors with zeros only");
            removeZeroVectors(inputFileName, outputFileName);
        } catch (Exception e) {
            log().severe(e.getMessage());
            // keep the original exception as cause so the stack trace is not lost
            throw new IllegalArgumentException(e.getMessage(), e);
        }
        log().info("finished removal of vectors with zeros only");
    }

    /**
     * Static method taking input and output filename as argument. File handling is done in this method. If the input
     * file is gzip-compressed, the output will also be written gzip-compressed. The .gz suffix has to be specified
     * manually in order not to alter filenames to something other than intended by the user.
     * <p>
     * Both files are closed before this method returns, also if processing fails.
     * </p>
     * 
     * @param inFileName Name of input file.
     * @param outFileName Name of output file.
     * @throws FileNotFoundException if the input file can not be found.
     * @throws IOException if the output file can not be opened.
     * @throws IllegalArgumentException if the input is corrupt, if reading or writing fails while processing, or if
     *             closing the files fails.
     */
    public static void removeZeroVectors(String inFileName, String outFileName) throws FileNotFoundException,
            IOException {
        // first try to read the input as gzip; a null result means "readable, but not gzip"
        BufferedReader inFile = openGzipReader(inFileName);
        boolean gzipped = inFile != null;
        if (gzipped) {
            log().info(inFileName + " is gzip compressed. Trying compressed read. Creating compressed output.");
        } else {
            log().info(inFileName + " is not gzip compressed. Trying uncompressed read. Creating uncompressed output.");
            try {
                inFile = new BufferedReader(new FileReader(inFileName));
            } catch (FileNotFoundException e) {
                throw inputNotFound(inFileName, e);
            }
        }

        BufferedWriter outFile = null;
        boolean completed = false;
        try {
            try {
                outFile = openWriter(outFileName, gzipped);
            } catch (IOException e) {
                IOException wrapped = new IOException("Can not open output vector file " + outFileName
                        + ". Aborting.");
                wrapped.initCause(e);
                throw wrapped;
            }
            removeZeroVectors(inFile, outFile);
            completed = true;
        } finally {
            if (!completed) {
                // a failure is already propagating; release the files without masking it
                closeQuietly(outFile);
                closeQuietly(inFile);
            }
        }

        // Regular path: close both files, even if closing the first one fails, and report the first problem.
        IOException closeFailure = null;
        try {
            inFile.close();
        } catch (IOException e) {
            closeFailure = e;
        }
        try {
            outFile.close();
        } catch (IOException e) {
            if (closeFailure == null) {
                closeFailure = e;
            }
        }
        if (closeFailure != null) {
            String errorMessage = "Problem closing vector files. Aborting.";
            log().severe(errorMessage);
            throw new IllegalArgumentException(errorMessage, closeFailure);
        }
    }

    /**
     * Static method taking a BufferedReader and BufferedWriter as argument. Lines starting with "$" are treated as
     * header lines, of which only $TYPE, $XDIM, $YDIM and $VEC_DIM (or $VECDIM) are evaluated; all other header lines
     * are dropped. Every other line is parsed as a vector followed by its label. Vectors having at least one
     * component different from zero are buffered in a temporary file and finally written to the output after a
     * header with the adjusted number of vectors.
     * <p>
     * The number of vectors expected in the input is $XDIM * $YDIM. If vectors are removed and $YDIM is not 1, the
     * output header is written as $XDIM = number of remaining vectors and $YDIM = 1, since subtracting the number of
     * removed vectors from $XDIM alone would not describe the remaining vectors correctly.
     * </p>
     * Neither the reader nor the writer is closed by this method; the writer is flushed on success.
     * 
     * @param inReader BufferedReader reading the file containing the vectors to be filtered.
     * @param outWriter BufferedWriter writing the vector file without zero vectors.
     * @throws IllegalArgumentException if the file format is corrupt, the temporary file can not be created or
     *             written, or reading the input or writing the output fails. TODO: This should be weakened in future
     *             by throwing a SOMLibVectorFileFormatException.
     */
    public static void removeZeroVectors(BufferedReader inReader, BufferedWriter outWriter) {
        // createTempFile guarantees a fresh, unique file; a timestamp based name could collide between runs
        File tmpFile;
        try {
            tmpFile = File.createTempFile(TMP_FILE_PREFIX, null);
        } catch (IOException ioe) {
            String errorMessage = "Could not write temporary file in " + System.getProperty("java.io.tmpdir")
                    + ". Aborting";
            log().severe(errorMessage);
            throw new IllegalArgumentException(errorMessage, ioe);
        }

        BufferedWriter tmpOutWriter = null;
        BufferedReader tmpInReader = null;
        try {
            try {
                tmpOutWriter = new BufferedWriter(new FileWriter(tmpFile));
            } catch (IOException ioe) {
                String errorMessage = "Could not write temporary file " + tmpFile.getPath() + ". Aborting";
                log().severe(errorMessage);
                throw new IllegalArgumentException(errorMessage, ioe);
            }

            try {
                String type = "";
                int xDim = 0;
                int yDim = 0;
                int vectorDim = 0;
                int numVectors = 0;
                int index = 1; // 1-based number of the vector currently read
                int numZeroVecs = 0;

                String line;
                while ((line = inReader.readLine()) != null) {
                    if (line.startsWith("$")) {
                        if (line.startsWith("$TYPE")) {
                            type = line;
                        } else if (line.startsWith("$XDIM")) {
                            xDim = parseHeaderValue(line);
                            numVectors = xDim;
                        } else if (line.startsWith("$YDIM")) {
                            yDim = parseHeaderValue(line);
                            numVectors *= yDim;
                        } else if (line.startsWith("$VEC_DIM") || line.startsWith("$VECDIM")) {
                            vectorDim = parseHeaderValue(line);
                        }
                        continue;
                    }

                    String rewritten = rewriteIfNonZero(line, vectorDim, index);
                    if (rewritten != null) {
                        tmpOutWriter.write(rewritten);
                        tmpOutWriter.newLine();
                    } else {
                        numZeroVecs++;
                    }
                    index++;
                }

                if (index - 1 != numVectors) {
                    String errorMessage = "Input vector file format corrupt. Incorrect number of vectors. Aborting.";
                    log().severe(errorMessage);
                    throw new IllegalArgumentException(errorMessage);
                }
                log().info("Number of vectors removed: " + numZeroVecs);

                // the buffered vectors have to be on disk completely before they are read back
                tmpOutWriter.close();
                tmpOutWriter = null;

                int outXDim = xDim - numZeroVecs;
                int outYDim = yDim;
                if (numZeroVecs > 0 && yDim != 1) {
                    // (xDim - removed) * yDim is not the number of remaining vectors; flatten the layout instead
                    outXDim = numVectors - numZeroVecs;
                    outYDim = 1;
                }

                outWriter.write(type);
                outWriter.newLine();
                outWriter.write("$XDIM " + outXDim);
                outWriter.newLine();
                outWriter.write("$YDIM " + outYDim);
                outWriter.newLine();
                outWriter.write("$VEC_DIM " + vectorDim);
                outWriter.newLine();

                // append the retained vectors after the header
                tmpInReader = new BufferedReader(new FileReader(tmpFile));
                char[] cbuf = new char[1024];
                int charsRead;
                while ((charsRead = tmpInReader.read(cbuf)) != -1) {
                    outWriter.write(cbuf, 0, charsRead);
                }
                outWriter.flush();
            } catch (IOException e) {
                String errorMessage = "Problem writing output vector file. Aborting.";
                log().severe(errorMessage);
                throw new IllegalArgumentException(errorMessage, e);
            }
        } finally {
            // release the temporary file on every path, successful or not
            closeQuietly(tmpOutWriter);
            closeQuietly(tmpInReader);
            if (!tmpFile.delete()) {
                tmpFile.deleteOnExit();
            }
        }
    }

    /** Returns the logger used by this class. */
    private static Logger log() {
        return Logger.getLogger(LOGGER_NAME);
    }

    /**
     * Opens the input file as gzip-compressed text.
     * 
     * @return the reader, or <code>null</code> if the file exists but can not be read as gzip; the underlying file
     *         stream is closed in that case.
     * @throws FileNotFoundException if the file can not be opened for reading.
     */
    private static BufferedReader openGzipReader(String inFileName) throws FileNotFoundException {
        FileInputStream rawIn;
        try {
            rawIn = new FileInputStream(inFileName);
        } catch (FileNotFoundException e) {
            throw inputNotFound(inFileName, e);
        }
        try {
            return new BufferedReader(new InputStreamReader(new GZIPInputStream(rawIn)));
        } catch (IOException e) {
            // not gzip: do not leak the stream that was opened for probing
            closeQuietly(rawIn);
            return null;
        }
    }

    /** Opens the output file, gzip-compressed if requested, without leaking the file stream on failure. */
    private static BufferedWriter openWriter(String outFileName, boolean gzipped) throws IOException {
        if (!gzipped) {
            return new BufferedWriter(new FileWriter(outFileName));
        }
        FileOutputStream rawOut = new FileOutputStream(outFileName);
        try {
            return new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(rawOut)));
        } catch (IOException e) {
            closeQuietly(rawOut);
            throw e;
        }
    }

    /** Creates the exception reported for a missing input file, keeping the original exception as cause. */
    private static FileNotFoundException inputNotFound(String inFileName, FileNotFoundException cause) {
        FileNotFoundException wrapped = new FileNotFoundException("Input vector file " + inFileName
                + " not found. Aborting.");
        wrapped.initCause(cause);
        return wrapped;
    }

    /** Closes the resource if it is not <code>null</code>, ignoring any failure (best-effort cleanup). */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // only used while cleaning up; the primary error, if any, is reported by the caller
        }
    }

    /** Parses the integer that follows the field name of a header line such as "$XDIM 100". */
    private static int parseHeaderValue(String line) {
        String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
        if (lineElements.length <= 1) {
            log().severe(ERROR_FILE_FORMAT_CORRUPT);
            throw new IllegalArgumentException(ERROR_FILE_FORMAT_CORRUPT);
        }
        return Integer.parseInt(lineElements[1]);
    }

    /**
     * Parses one vector line consisting of <code>vectorDim</code> numbers followed by a label.
     * 
     * @param line the line to parse.
     * @param vectorDim the number of components expected.
     * @param index the 1-based number of the vector, used in error messages only.
     * @return the line to be written to the output (components separated by a single space, zero components written
     *         as "0"), or <code>null</code> if all components are zero.
     * @throws IllegalArgumentException if the number of elements is wrong or a component is not a number.
     */
    private static String rewriteIfNonZero(String line, int vectorDim, int index) {
        String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
        if (lineElements.length != vectorDim + 1) {
            String errorMessage = "Input vector file format corrupt in vector number " + index + ". Aborting.";
            log().severe(errorMessage);
            throw new IllegalArgumentException(errorMessage);
        }
        StringBuilder rewritten = new StringBuilder(line.length() + 1);
        boolean nonZero = false;
        for (int ve = 0; ve < vectorDim; ve++) {
            double value;
            try {
                value = Double.parseDouble(lineElements[ve]);
            } catch (NumberFormatException e) {
                String errorMessage = "Input vector number format corrupt in vector number " + index + ". Aborting.";
                log().severe(errorMessage);
                throw new IllegalArgumentException(errorMessage, e);
            }
            // negative components count as non-zero, too
            if (value != 0) {
                nonZero = true;
                rewritten.append(value);
            } else {
                rewritten.append('0');
            }
            rewritten.append(' ');
        }
        if (!nonZero) {
            return null;
        }
        return rewritten.append(lineElements[vectorDim]).toString();
    }
}