// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import marmot.core.ArrayFloatFeatureVector;
import marmot.core.DenseArrayFloatFeatureVector;
import marmot.core.FloatFeatureVector;
import marmot.util.Converter;
import marmot.util.LineIterator;
import marmot.util.SymbolTable;
/**
 * Dictionary that maps word forms to float feature vectors read from a text
 * file.
 *
 * <p>
 * Two layouts are handled: a sparse one, where every line is a form followed
 * by {@code key:value} pairs, and a dense one, where every line is a form
 * followed by a fixed number of values. Which one is read is decided by
 * {@link MorphDictionaryOptions#getDense()}.
 */
public class FloatHashDictionary implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Vector stored for each word form. */
    private Map<String, FloatFeatureVector> index_map_;

    /** Maps column names to integer column indexes. */
    private SymbolTable<String> column_table_;

    /** Options handed to {@link #init(MorphDictionaryOptions)}. */
    private MorphDictionaryOptions options_;

    /**
     * Reads sparse vectors: each non-empty line is a form followed by
     * {@code key:value} tokens. Tokens that do not split into exactly two
     * parts, or whose value is not a parsable double, are reported on stderr
     * and skipped. A form that occurs on several lines keeps the vector of
     * its last line.
     *
     * @param iterator source of tokenized lines
     */
    protected void readSparseVector(LineIterator iterator) {
        while (iterator.hasNext()) {
            List<String> line = iterator.next();
            if (line.isEmpty()) {
                continue;
            }

            String form = line.get(0);
            List<Integer> indexes = new LinkedList<Integer>();
            List<Double> values = new LinkedList<Double>();

            for (int i = 1; i < line.size(); i++) {
                String pair_string = line.get(i);
                String[] key_value = pair_string.split(":");

                Double value = null;
                if (key_value.length == 2) {
                    try {
                        value = Double.parseDouble(key_value[1]);
                    } catch (NumberFormatException ignored) {
                        // Deliberately ignored: value stays null and the
                        // token is reported as skipped below.
                    }
                }

                if (value == null) {
                    System.err.println("Skipping pair string: "
                            + pair_string);
                    continue;
                }

                // Presumably the 'true' flag registers unseen keys as new
                // columns -- confirm against SymbolTable.
                int index = column_table_.toIndex(key_value[0], true);
                assert (index >= 0);
                indexes.add(index);
                values.add(value);
            }

            FloatFeatureVector vector = new ArrayFloatFeatureVector(
                    Converter.toIntArray(indexes),
                    Converter.toDoubleArray(values), 0);
            index_map_.put(form, vector);
        }
    }

    /**
     * Reads dense vectors: each non-empty line is a form followed by the
     * same number of values. Two-token lines seen before the first data line
     * are treated as a possible word2vec header and skipped. The first data
     * line fixes the dimension; a later line with a different number of
     * values prints an error and terminates the JVM.
     *
     * @param iterator source of tokenized lines
     * @throws NumberFormatException if a value is not a parsable double
     */
    public void readDenseVector(LineIterator iterator) {
        Logger logger = Logger.getLogger(getClass().getName());
        int dim = -1;

        while (iterator.hasNext()) {
            List<String> line = iterator.next();

            if (dim == -1 && line.size() == 2) {
                // Data is in word2vec text format.
                logger.info(String.format("Skipping possible file header: %s", line));
                continue;
            }

            if (line.isEmpty()) {
                continue;
            }

            if (dim < 0) {
                // First data line: fix the dimension and register one
                // column per position, named by its index.
                dim = line.size() - 1;
                for (int i = 0; i < dim; i++) {
                    column_table_.toIndex(Integer.toString(i), true);
                }
            }

            String form = line.get(0);

            if (dim != line.size() - 1) {
                System.err
                        .format("Dimension was expected to be %d, but is %d\n Line : %s\n",
                                dim, line.size() - 1, line.toString());
                System.exit(1);
            }

            double[] weights = new double[dim];
            for (int i = 1; i < dim + 1; i++) {
                weights[i - 1] = Double.parseDouble(line.get(i));
            }

            index_map_.put(form, new DenseArrayFloatFeatureVector(weights));
        }
    }

    /**
     * Loads the dictionary from the file named by the options, replacing any
     * previously loaded content.
     *
     * <p>
     * After a sparse read, every vector that assigns a weight to each column
     * exactly once is replaced by a dense vector whose weights are ordered by
     * column index. All other sparse vectors are kept as they were read.
     *
     * @param options supplies the file name and the dense/sparse switch
     */
    public void init(MorphDictionaryOptions options) {
        options_ = options;
        LineIterator iterator = new LineIterator(options_.getFilename());
        column_table_ = new SymbolTable<String>();
        index_map_ = new HashMap<String, FloatFeatureVector>();

        if (options_.getDense()) {
            readDenseVector(iterator);
            return;
        }

        readSparseVector(iterator);

        int dim = column_table_.size();
        for (Map.Entry<String, FloatFeatureVector> entry : index_map_
                .entrySet()) {
            if (entry.getValue() instanceof ArrayFloatFeatureVector) {
                ArrayFloatFeatureVector vec = (ArrayFloatFeatureVector) entry
                        .getValue();
                double[] dense = toDenseWeights(vec, dim);
                if (dense != null) {
                    entry.setValue(new DenseArrayFloatFeatureVector(dense));
                }
            }
        }
    }

    /**
     * Scatters the weights of a sparse vector into an array indexed by
     * column.
     *
     * <p>
     * The entries of a sparse vector are stored in the order they appeared
     * on the input line, which need not be column order, so the weights
     * array cannot be reused directly as a dense vector.
     *
     * @param vec sparse vector to convert
     * @param dim total number of columns
     * @return weights ordered by column index, or {@code null} when the
     *         vector does not cover every column exactly once
     */
    private static double[] toDenseWeights(ArrayFloatFeatureVector vec,
            int dim) {
        int length = vec.getWeights().length;
        if (length != dim) {
            return null;
        }

        double[] weights = new double[dim];
        boolean[] seen = new boolean[dim];
        for (int k = 0; k < length; k++) {
            int column = vec.getFeatures()[k];
            if (seen[column]) {
                // A repeated column means another column is missing, so
                // the vector is not actually dense.
                return null;
            }
            seen[column] = true;
            weights[column] = vec.getWeights()[k];
        }
        return weights;
    }

    /**
     * Looks up the vector stored for a form.
     *
     * @param form word form to look up
     * @return the stored vector, or {@code null} if the form is unknown
     */
    public FloatFeatureVector getVector(String form) {
        return index_map_.get(form);
    }

    /**
     * @return the number of columns registered while reading the file
     */
    public int getDimension() {
        return column_table_.size();
    }

    /**
     * @return the number of forms that have a vector
     */
    public int numEntries() {
        return index_map_.size();
    }
}