package edu.stanford.nlp.semparse.open.ling;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;

import fig.basic.LogInfo;
import fig.basic.Option;

/**
 * Static lookup table mapping words to cluster identifiers and frequencies.
 *
 * <p>The table is loaded lazily from the file named by
 * {@link Options#brownClusterFilename}. Each non-blank line of that file is split on
 * tab characters and read as {@code cluster <TAB> word <TAB> frequency}.
 *
 * <p>When no filename is configured the table stays unloaded; lookups then behave as
 * if every word were unknown instead of failing.
 */
public class BrownClusterTable {

  /** Command-line configurable options. */
  public static class Options {
    /** Path of the cluster file; {@code null} or empty disables loading. */
    @Option public String brownClusterFilename = null;
  }

  public static Options opts = new Options();

  /** Word -> cluster identifier; {@code null} until a file has been loaded successfully. */
  public static Map<String, String> wordClusterMap;

  /** Word -> frequency read from the file; {@code null} until a file has been loaded successfully. */
  public static Map<String, Integer> wordFrequencyMap;

  /** Prefix lengths used by the {@code getDefaultClusterPrefixes} methods. */
  public static final int[] DEFAULT_PREFIXES = {4, 6, 10, 20};

  /**
   * Loads the cluster file on first use.
   *
   * <p>Does nothing when the table is already loaded or when no filename is configured.
   * The maps are published only after the whole file has been read, so a failed read
   * never leaves a partially filled table behind.
   */
  public static void initModels() {
    if (wordClusterMap != null) {
      return;
    }
    String filename = opts.brownClusterFilename;
    if (filename == null || filename.isEmpty()) {
      return;
    }
    Path dataPath = Paths.get(filename);
    LogInfo.logs("Reading Brown clusters from %s", dataPath);
    Map<String, String> clusters = new HashMap<>();
    Map<String, Integer> frequencies = new HashMap<>();
    try (BufferedReader in = Files.newBufferedReader(dataPath, Charset.forName("UTF-8"))) {
      String line;
      while ((line = in.readLine()) != null) {
        // Blank lines (e.g. a trailing one) carry no entry; skip them rather than crash.
        if (line.trim().isEmpty()) {
          continue;
        }
        String[] tokens = line.split("\t");
        // Cluster identifiers repeat across many words, so intern them to share storage.
        clusters.put(tokens[1], tokens[0].intern());
        frequencies.put(tokens[1], Integer.parseInt(tokens[2]));
      }
    } catch (IOException e) {
      // Include the cause so the reason for the failure is not lost.
      LogInfo.fails("Cannot load Brown cluster from %s: %s", dataPath, e);
      // NOTE(review): LogInfo.fails presumably aborts; if it returns, leave the table unloaded.
      return;
    }
    // wordClusterMap doubles as the "loaded" flag, so assign it last.
    wordFrequencyMap = frequencies;
    wordClusterMap = clusters;
  }

  /**
   * Returns the cluster identifier of a word.
   *
   * @param word the word to look up
   * @return the cluster identifier, or {@code null} if the word is unknown or no table is loaded
   */
  public static String getCluster(String word) {
    initModels();
    if (wordClusterMap == null) {
      return null;
    }
    return wordClusterMap.get(word);
  }

  /**
   * Returns the leading characters of a word's cluster identifier.
   *
   * @param word the word to look up
   * @param length maximum number of characters to keep
   * @return the identifier truncated to at most {@code length} characters, or {@code null}
   *     if the word is unknown or no table is loaded
   */
  public static String getClusterPrefix(String word, int length) {
    String cluster = getCluster(word);
    if (cluster == null) {
      return null;
    }
    return prefix(cluster, length);
  }

  /**
   * Builds one feature string per entry of {@link #DEFAULT_PREFIXES}, formatted as
   * {@code "[length]prefix"}.
   *
   * @param cluster a cluster identifier, may be {@code null}
   * @return the prefix strings; empty when {@code cluster} is {@code null}
   */
  public static List<String> getDefaultClusterPrefixes(String cluster) {
    List<String> answer = new ArrayList<>();
    if (cluster == null) {
      return answer;
    }
    for (int length : DEFAULT_PREFIXES) {
      answer.add("[" + length + "]" + prefix(cluster, length));
    }
    return answer;
  }

  /**
   * Looks up the cluster of {@code word} and returns its default prefix strings.
   *
   * @param word the word to look up
   * @return the prefix strings; empty when the word has no known cluster
   */
  public static List<String> getDefaultClusterPrefixesFromWord(String word) {
    return getDefaultClusterPrefixes(getCluster(word));
  }

  /**
   * Builds one {@code "prefix1|prefix2"} string per entry of {@link #DEFAULT_PREFIXES}.
   *
   * @param cluster1 first cluster identifier, may be {@code null}
   * @param cluster2 second cluster identifier, may be {@code null}
   * @return the paired prefix strings; empty when either argument is {@code null},
   *     matching the null handling of the single-cluster overload
   */
  public static List<String> getDefaultClusterPrefixes(String cluster1, String cluster2) {
    List<String> answer = new ArrayList<>();
    if (cluster1 == null || cluster2 == null) {
      return answer;
    }
    for (int length : DEFAULT_PREFIXES) {
      answer.add(prefix(cluster1, length) + "|" + prefix(cluster2, length));
    }
    return answer;
  }

  /**
   * Returns the add-one smoothed frequency of a word.
   *
   * @param word the word to look up
   * @return the stored frequency plus one, or {@code 1} if the word is unknown or no
   *     table is loaded
   */
  public static int getSmoothedFrequency(String word) {
    initModels();
    if (wordFrequencyMap == null) {
      return 1;
    }
    Integer frequency = wordFrequencyMap.get(word);
    return frequency == null ? 1 : frequency + 1;
  }

  /** Returns the first {@code length} characters of {@code cluster}, or all of it if shorter. */
  private static String prefix(String cluster, int length) {
    return cluster.substring(0, Math.min(length, cluster.length()));
  }
}