/* * Copyright 2004-2010 Information & Software Engineering Group (188/1) * Institute of Software Technology and Interactive Systems * Vienna University of Technology, Austria * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package at.tuwien.ifs.somtoolbox.apps.helper; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import org.apache.commons.lang.StringUtils; import com.martiansoftware.jsap.JSAPResult; import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory; import at.tuwien.ifs.somtoolbox.data.InputDataFileFormatConverter; import at.tuwien.ifs.somtoolbox.data.RandomAccessFileSOMLibInputData; import at.tuwien.ifs.somtoolbox.data.SOMLibSparseInputData; /** * Converts an input file to a binary/random access input file. This class customises the handling of data read from the * file by storing it in a Random Access File. * <p> * This is a specific, memory saving implementation, that could otherwise be handled with * {@link InputDataFileFormatConverter} * </p> * . 
*
 * <p>
 * The binary file is created next to the input: a trailing {@code .gz} is removed from the input file name and
 * {@code .bin} is appended. Any previous content of that file is discarded.
 * </p>
 *
 * @author Rudolf Mayer
 * @version $Id: VectorFileToRandomAccessFileConverter.java 3589 2010-05-21 10:42:01Z mayer $
 */
public class VectorFileToRandomAccessFileConverter extends SOMLibSparseInputData {

    /** Output file that receives the header, the vector values and, at the very end, the vector labels. */
    private final RandomAccessFile file;

    /**
     * Holds the value returned by the header-writing call; while it is <code>false</code>, the header is (re-)attempted
     * before the next line is stored.
     */
    boolean headerWritten = false;

    /**
     * Converts the given vector file; the whole conversion happens inside this constructor.
     *
     * @param inputVectorFile name of the vector file to read; the output name is derived from it by removing a
     *            trailing <code>.gz</code> and appending <code>.bin</code>
     * @throws IOException If the input vector file can't be read, or the output binary file can't be written.
     */
    public VectorFileToRandomAccessFileConverter(String inputVectorFile) throws IOException {
        String outputFile = StringUtils.chomp(inputVectorFile, ".gz") + ".bin";

        File previousOutput = new File(outputFile);
        if (previousOutput.exists()) {
            // Best effort only: if the deletion fails, the truncation below still discards the old content.
            previousOutput.delete();
        }

        file = new RandomAccessFile(outputFile, "rw");
        boolean finished = false;
        try {
            // Mode "rw" does not truncate an existing file; make sure no stale bytes survive behind the new data.
            file.setLength(0);

            // write all the vectors
            readVectorFile(inputVectorFile, false);

            // write the vector labels in the end of the file
            RandomAccessFileSOMLibInputData.writeVectorLabels(file, dataNames);
            finished = true;
        } finally {
            // Always release the file handle, also when reading or writing failed half-way.
            closeOutput(finished);
        }
    }

    /**
     * Closes the output file. After a successful conversion a failing close is reported to the caller; after a failed
     * conversion it is suppressed so that the exception which aborted the conversion is the one that propagates.
     */
    private void closeOutput(boolean finished) throws IOException {
        if (finished) {
            file.close();
            return;
        }
        try {
            file.close();
        } catch (IOException ignored) {
            // the conversion already failed; do not mask that original exception
        }
    }

    /**
     * Stores the information read in the random access file.
     * <p>
     * The header is written before the first vector. The first <code>dim</code> elements of the line are parsed as
     * doubles and appended to the file; the element at index <code>dim</code> is trimmed and kept as the label of the
     * vector.
     * </p>
     *
     * @param documentIndex index of the vector, used as position of its label in <code>dataNames</code>
     * @param lineElements the elements of one input line: <code>dim</code> values followed by the label
     * @throws Exception if a value can't be parsed as a double, or writing to the output file fails
     */
    @Override
    protected void processLine(int documentIndex, String[] lineElements) throws Exception {
        if (!headerWritten) {
            headerWritten = RandomAccessFileSOMLibInputData.writeHeader(file, numVectors(), dim());
        }

        String label = lineElements[dim].trim();
        dataNames[documentIndex] = label;

        for (int termIndex = 0; termIndex < dim; termIndex++) {
            file.writeDouble(Double.parseDouble(lineElements[termIndex]));
        }
    }

    /**
     * Intentionally empty: the values go straight to the output file, so no in-memory matrix is needed.
     */
    @Override
    protected void initMatrix(boolean sparse) {
        // do nothing, we don't need to store anything
    }

    /**
     * Starts the conversion
     *
     * @param args Needed program arguments:
     *            <ul>
     *            <li>-v inputVectorFile, mandatory</li>
     *            </ul>
     * @throws IOException If the input vector file can't be read, or the output binary file can't be written.
     */
    public static void main(String[] args) throws IOException {
        JSAPResult config = OptionFactory.parseResults(args, OptionFactory.getOptInputVectorFile(true));
        String inputVectorFile = config.getString("inputVectorFile");
        // the constructor performs the complete conversion
        new VectorFileToRandomAccessFileConverter(inputVectorFile);
    }
}