package edu.stanford.nlp.semparse.open.ling;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;

import fig.basic.LogInfo;
import fig.basic.Option;
/**
 * Lazily-loaded table of word embeddings read from a text file.
 *
 * <p>Expected file format: a header line {@code "<numWords> <numDimensions>"},
 * followed by one line per word: the word itself, then its vector components,
 * all space-separated.
 *
 * <p>Loading happens on first use (see {@link #initModels()}); if no filename
 * is configured, the table stays unloaded and {@link #getVector(String)}
 * returns {@code null} for every word.
 */
public class WordVectorTable {
  public static class Options {
    @Option public String wordVectorFilename = null;
    @Option(gloss = "vector to use for UNKNOWN words (-1 = don't use any vector)")
    public int wordVectorUNKindex = 0;
  }
  public static Options opts = new Options();

  // Populated by initModels(); all remain null/0 when no filename is configured.
  public static Map<String, Integer> wordToIndex;
  public static double[][] wordVectors;
  public static int numWords, numDimensions;

  /**
   * Loads the word vectors from {@code opts.wordVectorFilename} if they have
   * not been loaded yet. A no-op when vectors are already loaded or when no
   * filename is configured. On I/O or format errors, delegates to
   * {@code LogInfo.fails}.
   */
  public static void initModels() {
    if (wordVectors != null || opts.wordVectorFilename == null || opts.wordVectorFilename.isEmpty()) return;
    Path dataPath = Paths.get(opts.wordVectorFilename);
    LogInfo.logs("Reading word vectors from %s", dataPath);
    try (BufferedReader in = Files.newBufferedReader(dataPath, StandardCharsets.UTF_8)) {
      // Header: "<numWords> <numDimensions>". Guard against an empty file,
      // which would otherwise surface as a NullPointerException.
      String header = in.readLine();
      if (header == null)
        throw new IOException("empty word vector file");
      String[] headerTokens = header.split(" ");
      numWords = Integer.parseInt(headerTokens[0]);
      numDimensions = Integer.parseInt(headerTokens[1]);
      wordToIndex = new HashMap<>();
      wordVectors = new double[numWords][numDimensions];
      for (int i = 0; i < numWords; i++) {
        String line = in.readLine();
        if (line == null)
          throw new IOException("word vector file truncated: expected " + numWords + " words, got " + i);
        String[] tokens = line.split(" ");
        wordToIndex.put(tokens[0], i);
        for (int j = 0; j < numDimensions; j++) {
          wordVectors[i][j] = Double.parseDouble(tokens[j + 1]);
        }
      }
      LogInfo.logs("Neural network vectors: %s words; %s dimensions per word", numWords, numDimensions);
    } catch (IOException e) {
      LogInfo.fails("Cannot load neural network vectors from %s", dataPath);
    }
  }

  /**
   * Returns the vector for {@code word}, the UNK vector (index
   * {@code opts.wordVectorUNKindex}) for out-of-vocabulary words, or
   * {@code null} when no vectors are loaded or UNK is disabled (index < 0).
   *
   * <p>Note: the returned array is the internal storage; callers must not
   * mutate it.
   */
  public static double[] getVector(String word) {
    initModels();
    // If no filename was configured (or loading failed before the table was
    // built), there is nothing to look up — previously this threw an NPE.
    if (wordToIndex == null) return null;
    Integer index = wordToIndex.get(word);
    if (index == null) {
      index = opts.wordVectorUNKindex;
      if (index < 0) return null;
    }
    return wordVectors[index];
  }
}