/*
 * Copyright 2004-2010 Information & Software Engineering Group (188/1)
 *                     Institute of Software Technology and Interactive Systems
 *                     Vienna University of Technology, Austria
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.tuwien.ifs.somtoolbox.data;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.logging.Logger;

import at.tuwien.ifs.somtoolbox.util.FileUtils;
import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter;
import at.tuwien.ifs.somtoolbox.util.StringUtils;

/**
 * Implements a {@link TemplateVector} based on a SOMLib <a
 * href="http://olymp.ifs.tuwien.ac.at/somtoolbox/doc/somlibFileFormat.html#template_vector">Template Vector File</a>.
 * See also a <a href="../../../../../ssd.tv">sample TemplateVector file</a>.
 * <p>
 * <i>Created on May 14, 2004</i>
 * </p>
 *
 * @author Michael Dittenbach
 * @author Rudolf Mayer
 * @version $Id: SOMLibTemplateVector.java 3583 2010-05-21 10:07:41Z mayer $
 */
public class SOMLibTemplateVector extends AbstractSOMLibTemplateVector {

    /** Shared logger; same logger name as used throughout this class before. */
    private static final Logger LOG = Logger.getLogger("at.tuwien.ifs.somtoolbox");

    /** Number of the line most recently read from the template vector file (1-based); used in error messages. */
    private int lineNumber = 0;

    /**
     * Creates an empty instance.
     */
    protected SOMLibTemplateVector() {
    }

    /**
     * Constructor intended to be used when generating data. All attributes will be called "component_x", where x is
     * the zero-based index of the attribute. The detail level <code>numInfo</code> is set to 2.
     *
     * @param numVectors the number of vectors
     * @param dim the dimensionality, i.e. the number of elements to create
     */
    public SOMLibTemplateVector(int numVectors, int dim) {
        this.dim = dim;
        this.numInfo = 2;
        this.numVectors = numVectors;
        LOG.info("Initializing template vector with " + dim + " dimensions.");
        elements = new TemplateVectorElement[dim];
        for (int i = 0; i < dim; i++) {
            elements[i] = new TemplateVectorElement(this, "component_" + i, i);
            // register by label, consistent with the name-based constructor and processLine()
            elementMap.put(elements[i].getLabel(), elements[i]);
        }
    }

    /**
     * Creates an instance with the given component names, and the specified detail level numInfo.
     *
     * @param numVectors the number of vectors
     * @param componentNames the labels of the elements; the dimensionality is the length of this array
     * @param numInfo the detail level (number of columns per element)
     * @throws IOException declared in the signature; not thrown by the code in this constructor body
     */
    public SOMLibTemplateVector(int numVectors, String[] componentNames, int numInfo) throws IOException {
        this.dim = componentNames.length;
        this.numInfo = numInfo;
        this.numVectors = numVectors;
        elements = new TemplateVectorElement[dim];
        for (int i = 0; i < dim; i++) {
            elements[i] = new TemplateVectorElement(this, componentNames[i], i);
            elementMap.put(elements[i].getLabel(), elements[i]);
        }
    }

    /**
     * Creates an instance with the given component names, using a detail level (numInfo) of 7.
     *
     * @param numVectors the number of vectors
     * @param componentNames the labels of the elements
     * @throws IOException declared in the signature of the delegate constructor
     */
    public SOMLibTemplateVector(int numVectors, String[] componentNames) throws IOException {
        this(numVectors, componentNames, 7);
    }

    /**
     * Creates a new {@link TemplateVector} by reading from the given file.
     *
     * @param templateFileName the file to read from
     * @throws IOException in case of problems reading the file
     */
    public SOMLibTemplateVector(String templateFileName) throws IOException {
        this.templateFileName = templateFileName;
        readTemplateVectorFile(templateFileName);
    }

    /**
     * Reads the {@link TemplateVector} information from the given filename. The file format has to follow the
     * specification given in the <a
     * href="http://olymp.ifs.tuwien.ac.at/somtoolbox/doc/somlibFileFormat.html#template_vector">SOMLib Data Files
     * specification</a>. See also a <a href="../../../../../doc/examples/ssd.tv">sample TemplateVector file</a>.
     * <p>
     * After the header, empty lines and lines starting with <code>#</code> are skipped; every other line must
     * consist of exactly <code>numInfo</code> elements, and exactly <code>dim</code> such lines must be present.
     * The underlying reader is closed before this method returns, whether reading succeeded or failed.
     * </p>
     *
     * @param templateFileName the file to read from
     * @throws IOException if the file cannot be read, or its contents do not match the header information
     * @throws NumberFormatException if a numeric column of an element line cannot be parsed
     */
    public void readTemplateVectorFile(String templateFileName) throws IOException {
        LOG.info("Start reading template vector.");
        BufferedReader br = FileUtils.openFile("Template vector file", templateFileName);
        try {
            // the header reader returns the first non-header line (or null at end of file)
            String line = readTemplateVectorFileHeader(br);
            elements = new TemplateVectorElement[dim];
            int index = 0;
            StdErrProgressWriter progressWriter = new StdErrProgressWriter(dim, "Reading feature ", 10);
            while (line != null) {
                // skip comment lines and empty lines
                String trimmed = line.trim();
                if (trimmed.length() == 0 || trimmed.startsWith("#")) {
                    // keep the line counter in sync so that later error messages report the right line
                    lineNumber++;
                    line = br.readLine();
                    continue;
                }
                progressWriter.progress(index + 1);
                String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
                if (lineElements.length != numInfo) {
                    throw new IOException("Template vector file corrupt in element number " + index + ": expected "
                            + numInfo + " line elements, found " + lineElements.length + " "
                            + getErrorDetails(line, lineNumber));
                }
                // vector format ok. checking number format and creating Vector of element.
                // check index
                if (index >= dim) {
                    throw new IOException("Template vector file corrupt, attempting to read element #" + (index + 1)
                            + ", specified dimensionality is " + dim + ". Aborting.");
                }
                try {
                    processLine(index, lineElements);
                } catch (NumberFormatException e) {
                    // raised by processLine() when one of the numeric columns is malformed;
                    // re-thrown with the element number added, keeping the original cause and stack trace
                    NumberFormatException ex = new NumberFormatException(
                            "Template vector number format corrupt in vector number " + index + ": "
                                    + e.getMessage());
                    ex.initCause(e);
                    ex.setStackTrace(e.getStackTrace());
                    throw ex;
                }
                index++;
                lineNumber++;
                line = br.readLine();
            }
            if (index != dim) {
                throw new IOException("Template vector file corrupt. Incorrect number of dimensions(index=" + index
                        + ", dim=" + dim + ").");
            }
        } finally {
            try {
                br.close();
            } catch (IOException ignored) {
                // a failure to close a reader must not mask a parsing error or a successful read
            }
        }
        LOG.info("Template vector file correctly loaded.");
    }

    /**
     * Processes the header of a template vector file: an arbitrary number of comment lines (starting with
     * <code>#</code>) and header lines (starting with <code>$</code>). Recognised header lines are
     * <code>$TYPE</code> (ignored), <code>$XDIM</code> (stored as numInfo, must be at least 2), <code>$YDIM</code>
     * (stored as numVectors) and <code>$VEC_DIM</code> / <code>$VECDIM</code> (stored as dim). Other lines starting
     * with <code>$</code> are skipped.
     *
     * @param br the reader positioned at the start of the file
     * @return the first line that is neither a comment nor a header line, or <code>null</code> if the end of the
     *         file was reached
     * @throws IOException if reading fails, or a recognised header line has a missing or invalid parameter
     */
    protected String readTemplateVectorFileHeader(BufferedReader br) throws IOException {
        String line = null;
        lineNumber = 0;
        // PROCESS HEADER with arbitrary number of comment lines & lines starting with $
        while ((line = br.readLine()) != null) {
            lineNumber++;
            if (line.startsWith("#")) { // ignore comments
                continue;
            }
            if (!line.startsWith("$")) {
                break;
            }

            if (line.startsWith("$TYPE")) {
                // ignore
            } else if (line.startsWith("$XDIM")) {
                numInfo = parseHeaderInt(line, "$XDIM requires a numeric parameter.",
                        "$XDIM requires a valid numeric parameter.");
                if (numInfo < 2) {
                    throw new IOException(
                            "Template vector file format corrupt. At least 2 columns (number, label) required.");
                }
            } else if (line.startsWith("$YDIM")) {
                numVectors = parseHeaderInt(line, "$YDIM requires a numeric parameter.",
                        "$YDIM requires a valid numeric parameter.");
            } else if (line.startsWith("$VEC_DIM") || line.startsWith("$VECDIM")) {
                dim = parseHeaderInt(line, "$VEC_DIM requires a numeric parameter.",
                        "$VEC_DIM requires a numeric parameter.");
            }
        }
        return line;
    }

    /**
     * Extracts the integer parameter (second element) of a header line.
     *
     * @param line the complete header line
     * @param missingDetail error detail used when the line has no parameter
     * @param invalidDetail error detail used when the parameter is not a valid integer
     * @return the parsed parameter
     * @throws IOException if the parameter is missing or not a valid integer
     */
    private int parseHeaderInt(String line, String missingDetail, String invalidDetail) throws IOException {
        String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
        if (lineElements.length <= 1) {
            throw new IOException(getErrorMessage(missingDetail, line, lineNumber));
        }
        try {
            return Integer.parseInt(lineElements[1]);
        } catch (NumberFormatException e) {
            IOException ex = new IOException(getErrorMessage(invalidDetail, line, lineNumber));
            ex.initCause(e);
            throw ex;
        }
    }

    /** Builds a "file corrupt" message from the given detail, followed by the line number and line content. */
    private String getErrorMessage(String messageDetail, String line, int lineNumber) {
        return "Template vector file corrupt: " + messageDetail + " " + getErrorDetails(line, lineNumber);
    }

    /** Formats the line number and line content for inclusion in an error message. */
    private String getErrorDetails(String line, int lineNumber) {
        return "(in line " + lineNumber + ": '" + line + "')";
    }

    /**
     * Parse information from one single line in the {@link TemplateVector} file, representing one attribute. The
     * second element is used as the label; depending on <code>numInfo</code>, the elements at positions 2 to 7 are
     * stored as document frequency, collection term frequency, minimum, maximum and mean term frequency, and
     * comment. The created element is also registered under its label.
     *
     * @param index the index (number) of this attribute
     * @param lineElements the elements of this line, split by the delimiter
     * @throws NumberFormatException if one of the numeric elements cannot be parsed
     */
    protected void processLine(int index, String[] lineElements) {
        elements[index] = new TemplateVectorElement(this, lineElements[1].trim(), index);
        elementMap.put(elements[index].getLabel(), elements[index]);
        if (numInfo > 2) {
            elements[index].setDocumentFrequency(Integer.parseInt(lineElements[2]));
        }
        if (numInfo > 3) {
            elements[index].setCollectionTermFrequency(Integer.parseInt(lineElements[3]));
        }
        if (numInfo > 4) {
            elements[index].setMinimumTermFrequency(Integer.parseInt(lineElements[4]));
        }
        if (numInfo > 5) {
            elements[index].setMaximumTermFrequency(Integer.parseInt(lineElements[5]));
        }
        if (numInfo > 6) {
            elements[index].setMeanTermFrequency(Double.parseDouble(lineElements[6]));
        }
        if (numInfo > 7) {
            elements[index].setComment(lineElements[7]);
        }
    }

    /**
     * Sets the names of the vector elements by replacing the first <code>componentNames.length</code> elements with
     * newly created ones. The element array must already be allocated and be at least as long as the given array.
     *
     * @param componentNames the new labels, in element order
     */
    public void setComponentNames(String[] componentNames) {
        // NOTE(review): unlike the constructors, the new elements are not registered in elementMap - confirm
        // whether label-based lookups are expected to see the new names.
        for (int i = 0; i < componentNames.length; i++) {
            elements[i] = new TemplateVectorElement(this, componentNames[i], i);
        }
    }

    /**
     * Changes the label of the element at the given index.
     *
     * @param index the index of the element to rename
     * @param label the new label
     */
    public void setLabel(int index, String label) {
        // NOTE(review): elementMap is not updated here - confirm whether that is intended.
        elements[index].setLabel(label);
    }

}