/** * Title: StanfordMaxEnt<p> * Description: A Maximum Entropy Toolkit<p> * Copyright: Copyright (c) Kristina Toutanova<p> * Company: Stanford University<p> */ package edu.stanford.nlp.tagger.maxent; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.stats.IntCounter; import edu.stanford.nlp.util.Generics; import java.io.IOException; import java.io.DataInputStream; import java.io.DataOutputStream; import java.util.Map; /** Maintains a map from words to tags and their counts. * * @author Kristina Toutanova * @version 1.0 */ public class Dictionary { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(Dictionary.class); private final Map<String,TagCount> dict = Generics.newHashMap(); private final Map<Integer,CountWrapper> partTakingVerbs = Generics.newHashMap(); private static final String naWord = "NA"; private static final boolean VERBOSE = false; public Dictionary() { } void fillWordTagCounts(Map<String, IntCounter<String>> wordTagCounts) { for (String word : wordTagCounts.keySet()) { TagCount count = new TagCount(wordTagCounts.get(word)); dict.put(word, count); } } /* public void release() { dict.clear(); } public void addVPTaking(String verb, String tag, String partWord) { int h = verb.hashCode(); Integer i = Integer.valueOf(h); if (tag.startsWith("RP")) { if (this.partTakingVerbs.containsKey(i)) { this.partTakingVerbs.get(i).incPart(partWord); } else { this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 0, 0, 0)); this.partTakingVerbs.get(i).incPart(partWord); } } else if (tag.startsWith("RB")) { if (this.partTakingVerbs.containsKey(i)) { this.partTakingVerbs.get(i).incRB(partWord); } else { this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 0, 0, 0)); this.partTakingVerbs.get(i).incRB(partWord); } } else if (tag.startsWith("IN")) { if (this.partTakingVerbs.containsKey(i)) { this.partTakingVerbs.get(i).incIn(partWord); } else { this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 0, 0, 0)); this.partTakingVerbs.get(i).incIn(partWord); } } } */ protected void addVThatTaking(String verb) { int i = verb.hashCode(); if (this.partTakingVerbs.containsKey(i)) { this.partTakingVerbs.get(i).incThat(); } else { this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 1, 0, 0)); } } protected int getCountPart(String verb) { int i = verb.hashCode(); if (this.partTakingVerbs.containsKey(i)) { return this.partTakingVerbs.get(i).getCountPart(); } return 0; } protected int getCountThat(String verb) { int i = verb.hashCode(); if (this.partTakingVerbs.containsKey(i)) { return this.partTakingVerbs.get(i).getCountThat(); } return 0; } protected int getCountIn(String verb) { int i = verb.hashCode(); if (this.partTakingVerbs.containsKey(i)) { return this.partTakingVerbs.get(i).getCountIn(); } return 0; } protected int getCountRB(String verb) { int i = verb.hashCode(); if (this.partTakingVerbs.containsKey(i)) { return this.partTakingVerbs.get(i).getCountRB(); } return 0; } protected int getCount(String word, String tag) { TagCount count = dict.get(word); if (count == null) { return 0; } else { return count.get(tag); } } protected String[] getTags(String word) { TagCount count = get(word); if (count == null) { return null; } return count.getTags(); } protected TagCount get(String word) { return dict.get(word); } String getFirstTag(String word) { TagCount count = dict.get(word); if (count != null) { return count.getFirstTag(); } return null; } protected int sum(String word) { TagCount count = dict.get(word); if (count != null) { return count.sum(); } return 0; } boolean isUnknown(String word) { return ! dict.containsKey(word); } /* public void save(String filename) { try { DataOutputStream rf = IOUtils.getDataOutputStream(filename); save(rf); rf.close(); } catch (Exception e) { e.printStackTrace(); } } */ void save(DataOutputStream file) { String[] arr = dict.keySet().toArray(new String[dict.keySet().size()]); try { file.writeInt(arr.length); log.info("Saving dictionary of " + arr.length + " words ..."); for (String word : arr) { TagCount count = get(word); file.writeUTF(word); count.save(file); } Integer[] arrverbs = this.partTakingVerbs.keySet().toArray(new Integer[partTakingVerbs.keySet().size()]); file.writeInt(arrverbs.length); for (Integer iO : arrverbs) { CountWrapper tC = this.partTakingVerbs.get(iO); file.writeInt(iO.intValue()); tC.save(file); } } catch (Exception e) { e.printStackTrace(); } } private void read(DataInputStream rf, String filename) throws IOException { // Object[] arr=dict.keySet().toArray(); int maxNumTags = 0; int len = rf.readInt(); if (VERBOSE) { log.info("Reading Dictionary of " + len + " words from " + filename + '.'); } for (int i = 0; i < len; i++) { String word = rf.readUTF(); TagCount count = TagCount.readTagCount(rf); int numTags = count.numTags(); if (numTags > maxNumTags) { maxNumTags = numTags; } this.dict.put(word, count); if (VERBOSE) { log.info(" " + word + " [idx=" + i + "]: " + count); } } if (VERBOSE) { log.info("Read dictionary of " + len + " words; max tags for word was " + maxNumTags + '.'); } } private void readTags(DataInputStream rf) throws IOException { // Object[] arr=dict.keySet().toArray(); int maxNumTags = 0; int len = rf.readInt(); if (VERBOSE) { log.info("Reading Dictionary of " + len + " words."); } for (int i = 0; i < len; i++) { String word = rf.readUTF(); TagCount count = TagCount.readTagCount(rf); int numTags = count.numTags(); if (numTags > maxNumTags) { maxNumTags = numTags; } this.dict.put(word, count); if (VERBOSE) { log.info(" " + word + " [idx=" + i + "]: " + count); } } if (VERBOSE) { log.info("Read dictionary of " + len + " words; max tags for word was " + maxNumTags + '.'); } } protected void read(String filename) { try { DataInputStream rf = IOUtils.getDataInputStream(filename); read(rf, filename); int len1 = rf.readInt(); for (int i = 0; i < len1; i++) { int iO = rf.readInt(); CountWrapper tC = new CountWrapper(); tC.read(rf); this.partTakingVerbs.put(iO, tC); } rf.close(); } catch (IOException e) { e.printStackTrace(); } } protected void read(DataInputStream file) { try { readTags(file); int len1 = file.readInt(); for (int i = 0; i < len1; i++) { int iO = file.readInt(); CountWrapper tC = new CountWrapper(); tC.read(file); this.partTakingVerbs.put(iO, tC); } } catch (IOException e) { e.printStackTrace(); } } /* public void printAmbiguous() { String[] arr = dict.keySet().toArray(new String[dict.keySet().size()]); try { int countAmbiguous = 0; int countUnAmbiguous = 0; int countAmbDisamb = 0; for (String word : arr) { if (word.indexOf('|') == -1) { continue; } TagCount count = get(word); if (count.numTags() > 1) { System.out.print(word); countAmbiguous++; tC.print(); System.out.println(); } else { String wordA = word.substring(0, word.indexOf('|')); if (get(wordA).numTags() > 1) { System.out.print(word); countAmbDisamb++; countUnAmbiguous++; tC.print(); System.out.println(); } else { countUnAmbiguous++; } }// else } System.out.println(" ambg " + countAmbiguous + " unambg " + countUnAmbiguous + " disamb " + countAmbDisamb); } catch (Exception e) { e.printStackTrace(); } } */ /** * This makes ambiguity classes from all words in the dictionary and remembers * their classes in the TagCounts */ protected void setAmbClasses(AmbiguityClasses ambClasses, int veryCommonWordThresh, TTags ttags) { for (Map.Entry<String,TagCount> entry : dict.entrySet()) { String w = entry.getKey(); TagCount count = entry.getValue(); int ambClassId = ambClasses.getClass(w, this, veryCommonWordThresh, ttags); count.setAmbClassId(ambClassId); } } protected int getAmbClass(String word) { if (word.equals(naWord)) { return -2; } if (get(word) == null) { return -1; } return get(word).getAmbClassId(); } public static void main(String[] args) { String s = "word"; String tag = "tag"; Dictionary d = new Dictionary(); System.out.println(d.getCount(s, tag)); System.out.println(d.getFirstTag(s)); } }