package syntaxLearner.corpus; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; import java.util.Map.Entry; import syntaxLearner.UI.Console; /** * * @author Omer Shapira * This class is meant to keep track of vocabulary operations, * including a data structure for words, hash functions, word * affinity in a language, and other helper functions. */ public class Vocabulary { /* Debug*/ private boolean _DEBUG=false; /* Class Variables */ private final Corpus corpus; private boolean indexedOnce = false; private boolean updated = false; private int numOfWords = 0; //number of individual words private int wordIDCounter = 0; /* Data Structures */ private Map<String,Integer> wordIndices; private Map<Integer,Word> words; private SortedSet<Integer> wordHierarchy; private Map<Integer, Integer> ranks; private Set<Integer> wordsUpdated; /* Class Constants */ public final Word START_SYMBOL, END_SYMBOL; public Vocabulary(Corpus c){ this.corpus = c; wordIndices = new TreeMap<String,Integer>(); words = new HashMap<Integer, Word>(); wordHierarchy = new TreeSet<Integer>( new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return -words.get(o1).compareTo(words.get(o2)); } } ); wordsUpdated = new TreeSet<Integer>(); this.add("$START"); this.add("$END"); START_SYMBOL = words.get(wordIndices.get("$START")); END_SYMBOL = words.get(wordIndices.get("$END")); } public void add(String s){ if (wordIndices.containsKey(s)){ words.get(wordIndices.get(s)).increase(1); if (_DEBUG) Console.line("Vocab #"+wordIndices.get(s)+" : "+ s+" = "+words.get(wordIndices.get(s)).frequency); } else { Word w = new Word(s, this); wordIndices.put(s, w.ID); words.put(w.ID, w); if (_DEBUG) Console.line("Vocab : NEW "+s); numOfWords++; } updated = false; } /** * Updates the word */ private void update(){ wordHierarchy.addAll(words.keySet()); indexedOnce = true; updated = true; generateRanks(); } public SortedSet<Integer> getWordHierarchy(){ if (!updated) { update(); return wordHierarchy; }else { return wordHierarchy; } } /** * INNEFICIENT - Generates each time * @return */ public void generateRanks(){ if (!updated){ update(); } ranks = new HashMap<Integer,Integer>(); Iterator<Integer> iter = wordHierarchy.iterator(); int i = 1; while (iter.hasNext()){ ranks.put(iter.next(), i++); } } public int getRank(int index){ if (!updated){ generateRanks(); } return ranks.get(index); } public int newID(){ return wordIDCounter++; } public int getIndex(String s){ return wordIndices.get(s); } public Word getWord(int i){ return words.get(i); } /** * * @return The number of individual words in the vocabulary */ public int getNumOfWords(){ return numOfWords; } /** * * @param threshold - Typically the "Rare word threshold" * @return The number of words below a certain threshold. */ public int countWordsBelowThreshold(int threshold){ int i=0; for (Word w: words.values()){ if (w.frequency < threshold){ i++; } } return i; } /** * * @return */ public Corpus getCorpus(){ return corpus; } /** * * @param i */ protected void registerWordUpdate(int i){ wordsUpdated.add(i); } /** * @param i The word index * @return True if the word is registered as updated */ protected boolean isWordUpdated(int i ){ return wordsUpdated.contains(i); } /** * Resets the state of updated words */ public void purgeUpdatedWords(){ wordsUpdated.clear(); } public Set<Entry<String,Integer>> getWordIndicesEntrySet(){ return wordIndices.entrySet(); } public Set<Entry<Integer,Word>> getWordEntrySet(){ SortedMap<Integer,Word> m = new TreeMap<Integer,Word>(words); return m.entrySet(); } }