Vocabulary.java example

Explorer

Syntactic-master
- src
  - main
    - java
      - syntaxLearner
        Cluster.java
        ClusterContext.java
        Learner.java
        LearnerMain.java
        Recorder.java
        - UI
          Console.java
          Report.java
        - corpus
          Context.java
          Corpus.java
          Vocabulary.java
          VocabularyContext.java
          Word.java
        - source
          CorpusSource.java
          PlainTextFile.java
          WikiDump.java

package syntaxLearner.corpus;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Map.Entry;

import syntaxLearner.UI.Console;

/**
 * Keeps track of vocabulary operations, including a data structure for
 * words, word-to-ID lookup, an ordering ("hierarchy") of the words with
 * ranks derived from it, and other helper functions.
 *
 * <p>The hierarchy and the ranks are built lazily: {@link #add(String)}
 * marks them stale and the next call to {@link #getWordHierarchy()},
 * {@link #generateRanks()} or {@link #getRank(int)} rebuilds them.
 *
 * <p>This class performs no synchronization.
 *
 * @author Omer Shapira
 */
public class Vocabulary {

	/* Debug*/
	private boolean _DEBUG=false;

	/* Class Variables */
	private final Corpus corpus;
	// Set the first time the hierarchy is built; not read inside this class.
	private boolean indexedOnce = false;
	// True while wordHierarchy and ranks reflect every add() made so far.
	private boolean updated = false;
	private int numOfWords = 0; //number of individual words
	private int wordIDCounter = 0;

	/* Data Structures */
	private Map<String,Integer> 	wordIndices;   // word string -> word ID
	private Map<Integer,Word> 		words;         // word ID -> Word
	private SortedSet<Integer> 	wordHierarchy; // word IDs, ordered by the comparator below
	private Map<Integer, Integer> ranks;         // word ID -> 1-based position in wordHierarchy
	private Set<Integer>			wordsUpdated;  // IDs flagged through registerWordUpdate

	/* Class Constants */
	public final Word START_SYMBOL, END_SYMBOL;


	/**
	 * Creates a vocabulary that initially holds only the two marker words
	 * {@code "$START"} and {@code "$END"}.
	 *
	 * @param c the corpus this vocabulary belongs to; stored as-is and
	 *          returned by {@link #getCorpus()}
	 */
	public Vocabulary(Corpus c){
		this.corpus = c;
		wordIndices = new TreeMap<String,Integer>();
		words = new HashMap<Integer, Word>();
		wordHierarchy = new TreeSet<Integer>( 
				new Comparator<Integer>() {
					/**
					 * Orders word IDs by the reverse of the natural order of
					 * their Word objects, falling back to ascending ID.
					 */
					@Override
					public int compare(Integer o1, Integer o2) {
						int byWord = words.get(o1).compareTo(words.get(o2));
						if (byWord != 0) {
							// Reverse by sign rather than by negation:
							// -Integer.MIN_VALUE overflows back to itself.
							return byWord < 0 ? 1 : -1;
						}
						// NOTE(review): Word.compareTo is not visible here. If it
						// can return 0 for two different words (e.g. equal
						// frequency), the set would treat them as duplicates and
						// drop one; the ID tie-break keeps both. If compareTo
						// already separates distinct words this line is only
						// reached for o1 == o2.
						return o1.compareTo(o2);
					}
				}
		);
		wordsUpdated = new TreeSet<Integer>();
		this.add("$START");
		this.add("$END");
		START_SYMBOL = words.get(wordIndices.get("$START"));
		END_SYMBOL = words.get(wordIndices.get("$END"));
	}




	/**
	 * Records one occurrence of a word. A string seen before has its
	 * {@link Word} increased by 1; a new string gets a new {@link Word}
	 * and is counted in {@link #getNumOfWords()}. Either way the hierarchy
	 * and ranks are marked stale.
	 *
	 * @param s the word string
	 */
	public void add(String s){
		Integer id = wordIndices.get(s);
		if (id != null){
			Word known = words.get(id);
			known.increase(1);
			if (_DEBUG) Console.line("Vocab #"+id+" : "+
					s+" = "+known.frequency);
		} else {
			Word w = new Word(s, this);
			wordIndices.put(s, w.ID);
			words.put(w.ID, w);
			if (_DEBUG) Console.line("Vocab : NEW "+s);
			numOfWords++;
		}
		updated = false;
	}

	/**
	 * Rebuilds the word hierarchy and the ranks from the current words.
	 */
	private void update(){
		// The set must be emptied first: its comparator reads Word state that
		// add() changes, and a sorted set does not re-position elements whose
		// ordering changed after insertion. Re-adding into the stale tree can
		// miss existing IDs and leave the order wrong.
		wordHierarchy.clear();
		wordHierarchy.addAll(words.keySet());
		indexedOnce = true;
		updated  = true;
		rebuildRanks();
	}

	/**
	 * Assigns rank 1 to the first ID in the hierarchy, 2 to the next, and
	 * so on, replacing any previous rank map.
	 */
	private void rebuildRanks(){
		Map<Integer,Integer> fresh = new HashMap<Integer,Integer>();
		int rank = 1;
		for (Integer id : wordHierarchy){
			fresh.put(id, rank++);
		}
		ranks = fresh;
	}

	/**
	 * Returns the word IDs in hierarchy order, rebuilding the hierarchy
	 * first if words were added since the last build.
	 *
	 * @return the internal sorted set (not a copy); a later rebuild clears
	 *         and refills this same object
	 */
	public SortedSet<Integer> getWordHierarchy(){
		if (!updated) {
			update();
		}
		return wordHierarchy;
	}

	/**
	 * Makes sure the ranks reflect every word added so far. Does nothing
	 * when they are already current.
	 */
	public void generateRanks(){
		if (!updated){
			// update() rebuilds the hierarchy and the ranks together.
			update();
		}
	}

	/**
	 * Returns the 1-based position of a word in the hierarchy.
	 *
	 * @param index the word ID
	 * @return the rank of that word; 1 is the first word in the hierarchy
	 * @throws NullPointerException if no word with that ID has a rank
	 */
	public int getRank(int index){
		if (!updated){
			generateRanks();
		}
		return ranks.get(index);
	}

	/**
	 * Hands out the next unused word ID and advances the counter.
	 *
	 * @return a new ID; the first call returns 0
	 */
	public int newID(){
		return wordIDCounter++;
	}

	/**
	 * Looks up the ID of a word string.
	 *
	 * @param s the word string
	 * @return the ID registered for {@code s}
	 * @throws NullPointerException if {@code s} is not in the vocabulary
	 */
	public int getIndex(String s){
		return wordIndices.get(s);
	}

	/**
	 * Looks up a word by ID.
	 *
	 * @param i the word ID
	 * @return the word, or {@code null} if no word has that ID
	 */
	public Word getWord(int i){
		return words.get(i);
	}

	/**
	 * 
	 * @return The number of individual words in the vocabulary, including
	 *         the two marker words added by the constructor
	 */
	public int getNumOfWords(){
		return numOfWords;
	}

	/**
	 * Counts the words whose frequency is strictly less than the threshold.
	 *
	 * @param threshold - Typically the "Rare word threshold"
	 * @return The number of words below the threshold. 
	 */
	public int countWordsBelowThreshold(int threshold){
		int below = 0;
		for (Word w: words.values()){
			if (w.frequency < threshold){
				below++;
			}
		}
		return below;
	}

	/**
	 * 
	 * @return the corpus passed to the constructor
	 */
	public Corpus getCorpus(){
		return corpus;
	}

	/**
	 * Flags a word as updated until {@link #purgeUpdatedWords()} is called.
	 *
	 * @param i The word index
	 */
	protected void registerWordUpdate(int i){
		wordsUpdated.add(i);
	}
	/**
	 * @param i The word index
	 * @return True if the word is registered as updated
	 */
	protected boolean isWordUpdated(int i ){
		return wordsUpdated.contains(i);
	}
	/**
	 * Resets the state of updated words
	 */
	public void purgeUpdatedWords(){
		wordsUpdated.clear();
	}

	/**
	 * @return the entry-set view of the internal string-to-ID map, ordered
	 *         by word string (not a copy)
	 */
	public Set<Entry<String,Integer>> getWordIndicesEntrySet(){
		return wordIndices.entrySet();
	}

	/**
	 * @return the ID-to-word entries, ordered by ascending ID; the entries
	 *         come from a fresh copy of the map, so structural changes to
	 *         the result do not affect this vocabulary (the Word objects
	 *         themselves are shared)
	 */
	public Set<Entry<Integer,Word>> getWordEntrySet(){
		SortedMap<Integer,Word> m = new TreeMap<Integer,Word>(words);
		return m.entrySet();
	}
}