package edu.stanford.nlp.semparse.open.ling;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import fig.basic.LogInfo;
import fig.basic.Option;
/**
 * Loads a word list from a tab-separated file and exposes, for each configured
 * amount N, the set of words found on the first N lines of that file.
 *
 * NOTE(review): the file is presumably ordered by descending frequency, so the
 * first N lines are the N most frequent words -- confirm against the data file.
 */
public class FrequencyTable {
  /** Command-line configurable settings for {@link FrequencyTable}. */
  public static class Options {
    /** Path of the word list file; when null or empty, {@link #initModels()} does nothing. */
    @Option public String frequencyFilename = null;
    /** Sizes of the top-word sets to build; each value becomes a key of {@link #topWordsLists}. */
    @Option public List<Integer> frequencyAmounts = Arrays.asList(30, 300, 3000);
  }
  public static Options opts = new Options();

  /**
   * Maps each configured amount to the set of words read from the first
   * {@code amount} lines of the file. Stays {@code null} until
   * {@link #initModels()} has loaded the data.
   */
  public static Map<Integer, Set<String>> topWordsLists;

  /**
   * Reads the word list named by {@code opts.frequencyFilename} and fills
   * {@link #topWordsLists}.
   *
   * Does nothing when the table is already loaded or no filename is configured.
   * Only the first tab-separated column of each line is used. If the file holds
   * fewer lines than a requested amount, the corresponding set contains every
   * word that was read instead of failing with an index error.
   */
  public static void initModels() {
    if (topWordsLists != null || opts.frequencyFilename == null || opts.frequencyFilename.isEmpty()) return;
    Path dataPath = Paths.get(opts.frequencyFilename);
    LogInfo.logs("Reading word frequency from %s", dataPath);
    List<String> words = new ArrayList<>();
    try (BufferedReader in = Files.newBufferedReader(dataPath, Charset.forName("UTF-8"))) {
      String line;
      while ((line = in.readLine()) != null) {
        // The word is the first tab-separated field; any remaining columns are ignored.
        String[] tokens = line.split("\t");
        words.add(tokens[0]);
      }
    } catch (IOException e) {
      // NOTE(review): if LogInfo.fails returns normally, execution continues with
      // whatever was read so far (possibly nothing) -- confirm its behavior.
      LogInfo.fails("Cannot load word frequency from %s", dataPath);
    }
    // Build into a local map and publish it only once complete, so a failure
    // part-way through never leaves a half-filled table marked as loaded.
    Map<Integer, Set<String>> lists = new HashMap<>();
    for (int amount : opts.frequencyAmounts) {
      // Clamp to the number of words available: subList throws when the upper
      // bound exceeds the list size.
      int limit = Math.min(amount, words.size());
      lists.put(amount, new HashSet<>(words.subList(0, limit)));
    }
    topWordsLists = lists;
  }
}