/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.data;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.LinkedHashMap;
import java.util.Random;
import java.util.logging.Logger;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.impl.DenseDoubleMatrix1D;
import cern.colt.matrix.impl.DenseDoubleMatrix2D;
import cern.colt.matrix.impl.SparseDoubleMatrix2D;
import cern.jet.math.Functions;
import com.martiansoftware.jsap.JSAPResult;
import at.tuwien.ifs.somtoolbox.SOMToolboxException;
import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.layers.metrics.DistanceMetric;
import at.tuwien.ifs.somtoolbox.layers.metrics.MetricException;
import at.tuwien.ifs.somtoolbox.util.FileUtils;
import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter;
import at.tuwien.ifs.somtoolbox.util.StringUtils;
/**
* Implements {@link InputData} based on a SOMLib <a
* href="http://olymp.ifs.tuwien.ac.at/somtoolbox/doc/somlibFileFormat.html#input_vectors">Input Vector File</a>.
*
* @author Michael Dittenbach
* @version $Id: SOMLibSparseInputData.java 3971 2010-12-15 13:18:39Z mayer $
*/
public class SOMLibSparseInputData extends AbstractSOMLibSparseInputData {
/** Prefix of the error message raised for a malformed vector line; the 1-based vector number is appended. */
public static final String INPUT_VECTOR_FILE_FORMAT_CORRUPT_MESSAGE = "Input vector file corrupt in vector number ";
/** Normalisation flag passed on by the constructors that take no explicit <code>norm</code> argument. */
public static final boolean DEFAULT_NORMALISED = true;
/** Number of cache blocks passed on by the constructors that take no explicit <code>numCacheBlocks</code> argument. */
public static final int DEFAULT_NUM_CACHE_BLOCKS = 1;
/** Random seed passed on by the constructors that take no explicit <code>seed</code> argument. */
public static final int DEFAULT_RANDOM_SEED = 7;
/** Sparsity flag passed on by the constructors that take no explicit <code>sparse</code> argument. */
public static final boolean DEFAULT_SPARSE = true;
/** Set to <code>true</code> once {@link #parseDouble(String)} has seen a <code>?</code> value. */
private boolean containsMissingValues = false;
/** Counts how many of the feature values are not zero; stores an int value for each vector in the input data. */
protected int[] nonZeros;
/** The sparsity flag as handed to {@link #init(boolean, boolean, long)}. */
protected boolean sparse;
/**
 * The actual data. Each row in the matrix represents one vector.
 */
protected DoubleMatrix2D data = null;
/** Value of the <code>$YDIM</code> header line; the vector count from <code>$XDIM</code> is multiplied by it. */
private int ydim = 1;
/**
 * Constructor intended for generated synthetic data.
 * <p>
 * Copies the vectors of the given input datums into a dense matrix, takes the labels as data names and records
 * for every vector how many of its values are not zero. The data is flagged as not normalised, a fresh unseeded
 * {@link Random} is used and no template vector is set.
 *
 * @param inputData the input datums to wrap; must contain at least one element, the first of which determines the
 *            dimensionality
 * @param classInfo the class information to attach, handed on unchanged
 * @throws IllegalArgumentException if <code>inputData</code> is empty
 */
public SOMLibSparseInputData(InputDatum[] inputData, SOMLibClassInformation classInfo) {
    if (inputData.length == 0) {
        // the dimensionality is taken from the first datum, so there has to be one
        throw new IllegalArgumentException("Cannot create input data from an empty array of input datums.");
    }
    String[] dataNames = new String[inputData.length];
    DenseDoubleMatrix2D data = new DenseDoubleMatrix2D(inputData.length, inputData[0].getDim());
    int[] nonZeroCounts = new int[inputData.length];
    for (int i = 0; i < inputData.length; i++) {
        dataNames[i] = inputData[i].getLabel();
        DoubleMatrix1D vector = inputData[i].getVector();
        for (int j = 0; j < vector.size(); j++) {
            double value = vector.getQuick(j);
            data.setQuick(i, j, value);
            if (value != 0.0d) {
                // same criterion as setMatrixValue(int, int, double)
                nonZeroCounts[i]++;
            }
        }
    }
    initFromExistingData(data, dataNames, false, new Random(), null, classInfo);
    nonZeros = nonZeroCounts;
}
/**
 * Constructor intended for subset generation: wraps an already populated matrix together with its labels.
 * <p>
 * Note that {@link #nonZeros} is not set here; {@link #subset(String[])} assigns it after construction.
 *
 * @param data the matrix holding one vector per row
 * @param dataNames the labels of the rows
 * @param norm whether the data is to be flagged as normalised
 * @param rand the random number generator to use
 * @param tv the template vector to attach
 * @param clsInfo the class information to attach
 */
protected SOMLibSparseInputData(
        DoubleMatrix2D data,
        String[] dataNames,
        boolean norm,
        Random rand,
        TemplateVector tv,
        SOMLibClassInformation clsInfo) {
    this.initFromExistingData(data, dataNames, norm, rand, tv, clsInfo);
}
/**
 * Initialises this instance from an already populated matrix.
 * <p>
 * Takes over the given references, derives the dimensionality from the column count and the number of vectors
 * from the number of labels, rebuilds the label-to-index cache and computes the mean vector over all rows.
 *
 * @param data the matrix holding one vector per row
 * @param dataNames the labels of the rows; its length defines the number of vectors
 * @param norm whether the data is to be flagged as normalised
 * @param rand the random number generator to use
 * @param tv the template vector to attach
 * @param clsInfo the class information to attach
 */
private void initFromExistingData(DoubleMatrix2D data, String[] dataNames, boolean norm, Random rand,
        TemplateVector tv, SOMLibClassInformation clsInfo) {
    this.data = data;
    this.dataNames = dataNames;
    this.dim = data.columns();
    this.numVectors = dataNames.length;
    this.isNormalized = norm;
    this.rand = rand;
    this.templateVector = tv;
    this.classInfo = clsInfo;

    nameCache = new LinkedHashMap<String, Integer>();
    meanVector = new DenseDoubleMatrix1D(dim);
    for (int i = 0; i < dataNames.length; i++) {
        meanVector.assign(data.viewRow(i), Functions.plus); // accumulate the row into the running sum
        nameCache.put(dataNames[i], Integer.valueOf(i)); // valueOf instead of the deprecated Integer constructor
    }
    if (numVectors > 0) {
        // turn the sum into the mean; skipped for an empty data set, where dividing by zero would yield NaN
        meanVector.assign(Functions.div(numVectors));
    }
}
/**
 * Reads the given input vector file using the default settings: sparsity {@link #DEFAULT_SPARSE} (
 * <code>true</code>), normalisation {@link #DEFAULT_NORMALISED} (<code>true</code>), cache blocks
 * {@link #DEFAULT_NUM_CACHE_BLOCKS} (<code>1</code>) and seed {@link #DEFAULT_RANDOM_SEED} (<code>7</code>).
 *
 * @param vectorFileName the name of the input vector file
 */
public SOMLibSparseInputData(String vectorFileName) {
    this(
            vectorFileName,
            DEFAULT_SPARSE,
            DEFAULT_NORMALISED,
            DEFAULT_NUM_CACHE_BLOCKS,
            DEFAULT_RANDOM_SEED);
}
/**
 * Reads the given input vector file with explicit settings.
 * <p>
 * Remembers the file name as the data source, initialises the instance via
 * {@link #init(boolean, boolean, long)} and then parses the file via {@link #readVectorFile(String, boolean)}.
 *
 * @param vectorFileName the name of the input vector file
 * @param sparse whether a sparse matrix shall be used to hold the data
 * @param norm whether the data is to be flagged as normalised
 * @param numCacheBlocks not referenced by this constructor
 * @param seed the seed for the random number generator
 */
public SOMLibSparseInputData(String vectorFileName, boolean sparse, boolean norm, int numCacheBlocks, long seed) {
    this.source = vectorFileName;
    this.init(sparse, norm, seed);
    this.readVectorFile(vectorFileName, sparse);
    // TODO: use SVD infos for map size determination, write standalone class
    // (idea: run a SingularValueDecomposition on data.viewDice() and inspect its singular values)
}
/**
 * Sets up the basic state needed before a vector file is read: stores the sparsity and normalisation flags,
 * creates an empty label-to-index cache and a random number generator seeded with the given value.
 *
 * @param sparse the sparsity flag to store
 * @param norm whether the data is to be flagged as normalised
 * @param seed the seed for the random number generator
 */
public void init(boolean sparse, boolean norm, long seed) {
    this.sparse = sparse;
    this.isNormalized = norm;
    this.rand = new Random(seed);
    this.nameCache = new LinkedHashMap<String, Integer>();
}
/**
 * Reads the given input vector and template vector files using the default settings for sparsity,
 * normalisation, cache blocks and seed.
 *
 * @param vectorFileName the name of the input vector file
 * @param templateFileName the name of the template vector file
 */
public SOMLibSparseInputData(String vectorFileName, String templateFileName) {
    this(
            vectorFileName,
            templateFileName,
            DEFAULT_SPARSE,
            DEFAULT_NORMALISED,
            DEFAULT_NUM_CACHE_BLOCKS,
            DEFAULT_RANDOM_SEED);
}
/**
 * Reads the given input vector file and attaches a template vector to it.
 * <p>
 * If <code>templateFileName</code> is <code>null</code>, a default template vector is created from the number of
 * vectors and the dimensionality of the data; otherwise the template vector is read from the given file. The
 * dimensionality of the template vector has to match the one of the input vectors.
 *
 * @param vectorFileName the name of the input vector file
 * @param templateFileName the name of the template vector file, or <code>null</code> for a default template vector
 * @param sparse whether a sparse matrix shall be used to hold the data
 * @param norm whether the data is to be flagged as normalised
 * @param numCacheBlocks handed on to the vector-file constructor
 * @param seed the seed for the random number generator
 * @throws IllegalArgumentException if the template vector cannot be initialised, or if its dimensionality
 *             differs from the one of the input vectors
 */
public SOMLibSparseInputData(String vectorFileName, String templateFileName, boolean sparse, boolean norm,
        int numCacheBlocks, long seed) {
    this(vectorFileName, sparse, norm, numCacheBlocks, seed);
    try {
        if (templateFileName == null) {
            templateVector = new SOMLibTemplateVector(numVectors, dim); // initialize new default Template Vector
        } else {
            templateVector = new SOMLibTemplateVector(templateFileName);
        }
    } catch (IOException e) {
        // fail right away instead of continuing without a template vector and hitting a NullPointerException below
        String errorMessage = "Could not initialise template vector"
                + (templateFileName != null ? " from file '" + templateFileName + "'" : "") + ": "
                + e.getMessage();
        Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(errorMessage);
        throw new IllegalArgumentException(errorMessage, e);
    }
    if (dim != templateVector.dim()) {
        String errorMessage = "Dimensionalities in input vector file and template vector file differ (" + dim
                + " != " + templateVector.dim() + "). Aborting.";
        Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(errorMessage);
        throw new IllegalArgumentException(errorMessage);
    }
}
/**
 * Reads the given input vector, template vector and class information files using the default settings for
 * sparsity, normalisation, cache blocks and seed.
 *
 * @param vectorFileName the name of the input vector file
 * @param templateFileName the name of the template vector file
 * @param classInfoFileName the name of the class information file
 * @throws SOMToolboxException declared by the constructor this one delegates to
 */
public SOMLibSparseInputData(String vectorFileName, String templateFileName, String classInfoFileName)
        throws SOMToolboxException {
    this(
            vectorFileName,
            templateFileName,
            classInfoFileName,
            DEFAULT_SPARSE,
            DEFAULT_NORMALISED,
            DEFAULT_NUM_CACHE_BLOCKS,
            DEFAULT_RANDOM_SEED);
}
/**
 * Reads the given input vector and template vector files and, if a class information file name is given, the
 * class information as well.
 *
 * @param vectorFileName the name of the input vector file
 * @param templateFileName the name of the template vector file
 * @param classInfoFileName the name of the class information file; no class information is loaded if
 *            <code>null</code>
 * @param sparse whether a sparse matrix shall be used to hold the data
 * @param norm whether the data is to be flagged as normalised
 * @param numCacheBlocks handed on to the delegate constructor
 * @param seed the seed for the random number generator
 * @throws SOMToolboxException declared for the creation of the class information
 */
public SOMLibSparseInputData(String vectorFileName, String templateFileName, String classInfoFileName,
        boolean sparse, boolean norm, int numCacheBlocks, long seed) throws SOMToolboxException {
    this(vectorFileName, templateFileName, sparse, norm, numCacheBlocks, seed);
    if (classInfoFileName == null) {
        // nothing to load
        return;
    }
    this.classInfo = new SOMLibClassInformation(classInfoFileName);
}
protected SOMLibSparseInputData() {
super();
}
/**
 * Builds an {@link InputDatum} from the label, the matrix row and the non-zero count stored at the given index.
 *
 * @param index the index of the vector
 * @return a new input datum for that vector
 */
@Override
public InputDatum getInputDatum(int index) {
    final String label = dataNames[index];
    final DoubleMatrix1D row = data.viewRow(index);
    final int nonZeroCount = nonZeros[index];
    return new InputDatum(label, row, nonZeroCount);
}
/**
 * Returns the values of the vector stored in the given row as an array.
 *
 * @param d the index of the vector
 * @return the values of row <code>d</code> of the data matrix
 */
@Override
public double[] getInputVector(int d) {
    final DoubleMatrix1D row = data.viewRow(d);
    return row.toArray();
}
/**
 * Returns a single cell of the data matrix.
 *
 * @param x the row, i.e. the index of the vector
 * @param y the column, i.e. the index of the feature
 * @return the value stored at row <code>x</code>, column <code>y</code>
 */
@Override
public double getValue(int x, int y) {
    final double cell = data.get(x, y);
    return cell;
}
/**
 * Returns the sum of the distances between the mean vector and every input vector, computed with the given
 * metric. The value is computed on the first call and cached afterwards; <code>-1</code> in the cache field
 * marks "not yet calculated".
 * <p>
 * The sum is accumulated in a local variable and only stored once it is complete, so that a failing metric does
 * not leave a partial sum behind that later calls would return as if it were valid.
 *
 * @param metric the distance metric to use
 * @return the (possibly cached) sum of distances to the mean vector
 * @throws IllegalArgumentException if the metric fails; the {@link MetricException} is attached as cause
 */
@Override
public double mqe0(DistanceMetric metric) {
    if (mqe0 == -1) { // mqe0 for data was not yet calculated
        double sum = 0;
        try {
            for (int i = 0; i < numVectors; i++) {
                sum += metric.distance(meanVector, data.viewRow(i));
            }
        } catch (MetricException e) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(e.getMessage());
            throw new IllegalArgumentException(e.getMessage(), e);
        }
        mqe0 = sum;
    }
    return mqe0;
}
/**
 * Reads the input data from the given file, which has to follow the <a
 * href="http://olymp.ifs.tuwien.ac.at/somtoolbox/doc/somlibFileFormat.html#input_vectors">Input Vector File</a>
 * specification. Additionally calculates the {@link AbstractSOMLibSparseInputData#meanVector} and creates the
 * {@link AbstractSOMLibSparseInputData#nameCache} for faster index search.
 * <p>
 * The header consists of lines starting with <code>$</code>; comment lines (starting with <code>#</code>) and
 * empty lines in the header are skipped. The header keys <code>$TYPE</code>, <code>$DATA_TYPE</code>,
 * <code>$DATA_DIM</code>, <code>$XDIM</code>, <code>$YDIM</code> and <code>$VEC_DIM</code>/<code>$VECDIM</code>
 * are recognised, any other key is logged and ignored. The number of vectors is <code>$XDIM * $YDIM</code>.
 * Every following non-blank line is handed to {@link #processLine(int, String[])}. The reader is closed before
 * the method returns, regardless of whether reading succeeded.
 *
 * @param vectorFileName the name of the input vector file.
 * @param sparse whether a sparse matrix shall be used; overridden with <code>false</code> if the header declares
 *            the content type <code>audio</code>
 * @throws IllegalArgumentException if the file cannot be opened, read or parsed; the original exception is
 *             attached as cause
 */
protected void readVectorFile(String vectorFileName, boolean sparse) {
    final Logger log = Logger.getLogger("at.tuwien.ifs.somtoolbox");
    BufferedReader br = openFile(vectorFileName);
    String line = null;
    int lineNumber = 0;
    try {
        // PROCESS HEADER with arbitrary number of comment lines & lines starting with $
        while ((line = br.readLine()) != null) {
            lineNumber++;
            line = line.trim();
            if (line.startsWith("#") || line.equals("")) { // ignore comments and empty lines
                continue;
            }
            if (!line.startsWith("$")) {
                break; // first data line reached; it is still held in 'line' for the loop below
            }
            String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
            if (lineElements.length < 2) {
                String msg = "Header in input vector file corrupt in line #" + lineNumber
                        + ": less than two elements!";
                log.severe(msg);
                throw new IOException(msg);
            }
            String header = lineElements[0];
            String content = lineElements[1];
            if (header.equals("$TYPE")) {
                // do nothing
            } else if (header.equals("$DATA_TYPE")) {
                // determine type of vector data, if data is audio set sparsity to false
                String[] subtypes = content.split("-", 2);
                content_type = subtypes[0];
                if (subtypes.length > 1) {
                    content_subtype = subtypes[1];
                }
                if (content_type.equals("audio")) {
                    log.info("Content type = audio. Setting sparsity to false.");
                    sparse = false;
                }
            } else if (header.equals("$DATA_DIM")) {
                String[] strDataDim = content.split("x", 2);
                featureMatrixRows = Integer.parseInt(strDataDim[0]);
                featureMatrixCols = Integer.parseInt(strDataDim[1]);
            } else if (header.equals("$XDIM")) {
                numVectors = Integer.parseInt(content);
            } else if (header.equals("$YDIM")) {
                ydim = Integer.parseInt(content);
            } else if (header.equals("$VEC_DIM") || header.equals("$VECDIM")) {
                dim = Integer.parseInt(content);
            } else {
                log.warning("Unkown Header line '" + line + "', ingoring.");
            }
        }
        numVectors *= ydim;

        // PROCESS REMAINDER OF FILE
        initDataStructures(sparse);
        int index = 0;
        StdErrProgressWriter progressWriter = new StdErrProgressWriter(numVectors, "Reading input datum ", 10);
        while (line != null) {
            // Lines read in this loop are not trimmed, so test the trimmed form: a line consisting only of
            // whitespace (e.g. at the end of the file) is blank and must not be parsed as a vector.
            if (line.trim().length() > 0) {
                // sanity check for numVectors
                if (index >= numVectors) {
                    String errorMessage = "Input vector file corrupt. Incorrect number of vectors: header says "
                            + numVectors + ", but already reading vector " + (index + 1) + ". Aborting.";
                    log.severe(errorMessage);
                    throw new IOException(errorMessage);
                }
                String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB, dim + 1);
                if (lineElements.length != dim + 1) {
                    String msg = INPUT_VECTOR_FILE_FORMAT_CORRUPT_MESSAGE + (index + 1)
                            + ": dimension specified is " + dim + ", found " + (lineElements.length - 1)
                            + ". Aborting.";
                    log.severe(msg);
                    throw new IOException(msg);
                }
                // vector syntax ok. checking number format and calculating meanVector.
                try {
                    processLine(index, lineElements);
                } catch (NumberFormatException e) {
                    String msg = INPUT_VECTOR_FILE_FORMAT_CORRUPT_MESSAGE + (index + 1) + " (line #"
                            + lineNumber + "): " + e.getMessage() + ". Aborting.";
                    log.severe(msg);
                    throw new IOException(msg);
                }
                progressWriter.progress();
                index++;
            }
            line = br.readLine();
            lineNumber++;
        }
        if (containsMissingValues) {
            System.out.println("\n\n");
            log.warning("Input data file contained missing values - be sure to handle them correctly in sub-sequent steps!\n\n");
        }
        if (index != numVectors) {
            String errorMessage = "Input vector file corrupt. Incorrect number of vectors: header says "
                    + numVectors + ", but read " + index + ". Aborting.";
            log.severe(errorMessage);
            throw new IOException(errorMessage);
        }
        // file is sane
        meanVector.assign(Functions.div(numVectors)); // calculating mean vector
    } catch (Exception e) {
        log.severe(ERROR_MESSAGE_FILE_FORMAT_CORRUPT);
        throw new IllegalArgumentException(e.getMessage(), e);
    } finally {
        // always release the file handle, also when parsing failed
        if (br != null) {
            try {
                br.close();
            } catch (IOException closeError) {
                log.warning("Could not close input vector file '" + vectorFileName + "': "
                        + closeError.getMessage());
            }
        }
    }
    log.info("Input vector file format seems to be correct. Riding on ...");
}
/**
 * Allocates everything needed to hold the vectors: the data matrix (via {@link #initMatrix(boolean)}), the
 * label array, the non-zero counters and a zero-filled mean vector. Relies on the number of vectors and the
 * dimensionality having been set before.
 *
 * @param sparse whether a sparse matrix shall be created
 */
protected void initDataStructures(boolean sparse) {
    this.initMatrix(sparse);

    // per-vector bookkeeping
    this.dataNames = new String[this.numVectors];
    this.nonZeros = new int[this.numVectors];

    // running sum / mean over all vectors
    this.meanVector = new DenseDoubleMatrix1D(this.dim);
}
/**
 * Creates the data matrix with one row per vector and one column per feature.
 *
 * @param sparse if <code>true</code> a {@link SparseDoubleMatrix2D} is created, otherwise a
 *            {@link DenseDoubleMatrix2D}
 */
protected void initMatrix(boolean sparse) {
    data = sparse
            ? new SparseDoubleMatrix2D(numVectors, dim)
            : new DenseDoubleMatrix2D(numVectors, dim);
}
/**
 * Opens the given input vector file for reading.
 *
 * @param vectorFileName the name of the input vector file
 * @return a reader on the file, as provided by {@link FileUtils#openFile(String, String)}
 * @throws IllegalArgumentException if the file cannot be found; the {@link FileNotFoundException} is attached as
 *             cause
 */
protected static BufferedReader openFile(String vectorFileName) {
    try {
        return FileUtils.openFile("Input vector file", vectorFileName);
    } catch (FileNotFoundException e) {
        // keep the original exception as cause so the stack trace is not lost
        throw new IllegalArgumentException(e.getMessage(), e);
    }
}
/**
 * Processes a single vector line of the input vector file: the first <code>dim</code> elements are parsed as
 * feature values and written to the data matrix, the element after them is registered as the label.
 *
 * @param index the index of the vector, i.e. the row in the data matrix
 * @param lineElements the elements of the line, split by the delimiters
 * @throws Exception declared for overriding implementations; this implementation may raise a
 *             {@link NumberFormatException} for unparseable values
 */
protected void processLine(int index, String[] lineElements) throws Exception {
    int column = 0;
    while (column < dim) {
        final double value = parseDouble(lineElements[column]);
        setMatrixValue(index, column, value);
        column++;
    }
    final String label = lineElements[dim];
    addInstance(index, label);
}
/**
 * Parses a single feature value. A value of <code>?</code> (ignoring surrounding whitespace) denotes a missing
 * value; in that case the instance is flagged as containing missing values and <code>MISSING_VALUE</code> is
 * returned.
 *
 * @param s the textual value
 * @return the parsed value, or <code>MISSING_VALUE</code> for <code>?</code>
 * @throws NumberFormatException if the text is neither <code>?</code> nor a parseable double
 */
protected double parseDouble(String s) {
    final boolean missing = s.trim().equals("?");
    if (!missing) {
        return Double.parseDouble(s);
    }
    containsMissingValues = true;
    return MISSING_VALUE;
}
/**
 * Stores a value in the data matrix and counts it towards the non-zero values of its row if it differs from
 * zero.
 *
 * @param row the row, i.e. the index of the vector
 * @param column the column, i.e. the index of the feature
 * @param value the value to store
 */
protected void setMatrixValue(int row, int column, double value) {
    data.setQuick(row, column, value);
    if (value == 0.0d) {
        return; // zeros are not counted
    }
    nonZeros[row] += 1;
}
/**
 * Registers the label of the vector at the given index and adds the vector to the running sum kept in the mean
 * vector. The row of the data matrix has to be filled before this method is called.
 * <p>
 * The label is stored without leading or trailing whitespace, both in the label array and as key of the
 * label-to-index cache, so that a lookup by the stored label always finds the vector.
 *
 * @param index the index of the vector, i.e. the row in the data matrix
 * @param label the label of the vector as found in the file
 */
protected void addInstance(int index, String label) {
    // avoid heading or trailing spaces --> can create problems with DB driven vectors that do not store those
    // spaces
    final String name = label.trim();
    dataNames[index] = name;
    // insert into nameCache, using the very same (trimmed) name that is stored in dataNames
    nameCache.put(name, Integer.valueOf(index));
    meanVector.assign(data.viewRow(index), Functions.plus); // add to mean vector
}
/**
 * Creates a new input data object containing only the vectors with the given labels, in the given order. The
 * rows are copied into a new sparse matrix; normalisation flag, random number generator, template vector and
 * class information are shared with this instance.
 * <p>
 * An unknown label is detected by an explicit lookup check rather than by catching a
 * {@link NullPointerException}, so unrelated errors are no longer silently turned into a <code>null</code>
 * result.
 *
 * @param names the labels of the vectors to include
 * @return the subset, or <code>null</code> if one of the labels is <code>null</code> or not known
 */
@Override
public InputData subset(String[] names) {
    SparseDoubleMatrix2D newData = new SparseDoubleMatrix2D(names.length, dim);
    int[] nonZerosNew = new int[names.length];
    for (int i = 0; i < names.length; i++) {
        Integer position = names[i] == null ? null : nameCache.get(names[i]);
        if (position == null) {
            return null; // unknown label: no subset can be built
        }
        int index = position.intValue();
        newData.viewRow(i).assign(data.viewRow(index));
        nonZerosNew[i] = this.nonZeros[index];
    }
    SOMLibSparseInputData res = new SOMLibSparseInputData(newData, names, isNormalized, rand, templateVector,
            classInfo);
    res.nonZeros = nonZerosNew;
    return res;
}
/**
 * Method for stand-alone execution, prints useful information about the input data: reads the input vector file
 * given on the command line and prints every entry returned by <code>getFeatureDensities()</code> as
 * <code>key: value</code>, ordered by ascending key.
 *
 * @param args the command line arguments, expected to contain the input vector file option
 * @throws Exception if parsing the options or reading the data fails
 */
public static void main(String[] args) throws Exception {
    // register and parse the command line options (only the input vector file is needed)
    JSAPResult config = OptionFactory.parseResults(args, OptionFactory.getOptInputVectorFile(true));
    SOMLibSparseInputData inputData = new SOMLibSparseInputData(config.getString("inputVectorFile"));

    Hashtable<Integer, Integer> densities = inputData.getFeatureDensities();
    ArrayList<Integer> sortedKeys = new ArrayList<Integer>(densities.keySet());
    Collections.sort(sortedKeys);
    for (Integer key : sortedKeys) {
        System.out.println(key + ": " + densities.get(key));
    }
}
/**
 * Reads only the header of the given input vector file and returns the total number of values it announces,
 * i.e. <code>$XDIM * $YDIM * $VEC_DIM</code>. Header keys that are missing keep their defaults (0 vectors, a
 * y-dimension of 1, a vector dimension of 0).
 * <p>
 * The product is computed in <code>long</code> arithmetic so that large data sets do not overflow, and the
 * reader is closed before the method returns.
 *
 * @param vectorFileName the name of the input vector file
 * @return the number of vectors multiplied by the vector dimension
 * @throws IllegalArgumentException if the file cannot be opened or its header cannot be parsed; the original
 *             exception is attached as cause
 */
public static long getDimensionality(String vectorFileName) {
    BufferedReader br = openFile(vectorFileName);
    String line = null;
    int numVectors = 0;
    int ydim = 1;
    int dim = 0;
    try {
        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (line.startsWith("#") || line.equals("")) { // ignore comments and empty lines
                continue;
            }
            if (!line.startsWith("$")) {
                break; // end of header
            }
            String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
            if (lineElements.length < 2) {
                Logger.getLogger("at.tuwien.ifs.somtoolbox").severe("Header in input vector file corrupt!");
                throw new IOException("Header in input vector file corrupt!");
            }
            String content = lineElements[1];
            if (lineElements[0].equals("$XDIM")) {
                numVectors = Integer.parseInt(content);
            } else if (lineElements[0].equals("$YDIM")) {
                ydim = Integer.parseInt(content);
            } else if (lineElements[0].startsWith("$VEC_DIM") || lineElements[0].startsWith("$VECDIM")) {
                dim = Integer.parseInt(content);
            }
        }
    } catch (Exception e) {
        Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(ERROR_MESSAGE_FILE_FORMAT_CORRUPT);
        throw new IllegalArgumentException(e.getMessage(), e);
    } finally {
        // always release the file handle, also when parsing failed
        if (br != null) {
            try {
                br.close();
            } catch (IOException closeError) {
                Logger.getLogger("at.tuwien.ifs.somtoolbox").warning(
                        "Could not close input vector file '" + vectorFileName + "': " + closeError.getMessage());
            }
        }
    }
    // widen before multiplying; an int product would overflow before being converted to long
    return (long) numVectors * ydim * dim;
}
/**
 * Replaces the label stored for the vector at the given index.
 * <p>
 * NOTE(review): only the label array is updated here; the label-to-index cache filled when the data was read is
 * left untouched, so lookups by the new label presumably do not find this vector - confirm whether that is
 * intended.
 *
 * @param index the index of the vector
 * @param name the new label
 */
public void setLabel(int index, String name) {
    this.dataNames[index] = name;
}
}