package edu.berkeley.nlp.lm;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * Enumerates words in the vocabulary of a language model. Stores a two-way
 * mapping between integers and words.
 *
 * @author adampauls
 *
 * @param <W>
 *            A type representing words in the language. Can be a
 *            <code>String</code>, or something more complex if needed
 */
public interface WordIndexer<W> extends Serializable
{

    /**
     * Gets the index for a word, adding the word to the vocabulary if
     * necessary.
     *
     * @param word
     *            the word to look up (and to add if it is absent)
     * @return the index of the word
     */
    public int getOrAddIndex(W word);

    /**
     * Gets the index for a word given in string form, adding it to the
     * vocabulary if necessary. String-based counterpart of
     * {@code getOrAddIndex}; how a string is mapped to a word of type
     * {@code W} is up to the implementation.
     *
     * @param word
     *            string form of the word to look up (and to add if it is
     *            absent)
     * @return the index of the word
     */
    public int getOrAddIndexFromString(String word);

    /**
     * Gets the index for a word without modifying the vocabulary.
     * Implementations should never add to the vocabulary here, and should
     * return the index of the unk symbol (see {@link #getUnkSymbol()}) if the
     * word is not in the vocabulary.
     *
     * @param word
     *            the word to look up
     * @return the index of the word, or the index of the unk symbol if the
     *         word is not in the vocabulary
     */
    public int getIndexPossiblyUnk(W word);

    /**
     * Gets the word object for an index.
     *
     * @param index
     *            an index previously handed out by this indexer
     * @return the word associated with the index
     */
    public W getWord(int index);

    /**
     * Number of words that have been added so far.
     *
     * @return the number of words added so far
     */
    public int numWords();

    /**
     * Returns the start symbol (usually something like {@literal <s>}).
     *
     * @return the start symbol
     */
    public W getStartSymbol();

    /**
     * Sets the start symbol.
     *
     * @param sym
     *            the word to use as the start symbol
     */
    public void setStartSymbol(W sym);

    /**
     * Returns the end symbol (usually something like {@literal </s>}).
     *
     * @return the end symbol
     */
    public W getEndSymbol();

    /**
     * Sets the end symbol.
     *
     * @param sym
     *            the word to use as the end symbol
     */
    public void setEndSymbol(W sym);

    /**
     * Returns the unk symbol (usually something like {@literal <unk>}).
     *
     * @return the unk symbol
     */
    public W getUnkSymbol();

    /**
     * Sets the unk symbol.
     *
     * @param sym
     *            the word to use as the unk symbol
     */
    public void setUnkSymbol(W sym);

    /**
     * Informs the implementation that no more words can be added to the
     * vocabulary. Implementations may perform some space optimization, and
     * should trigger an error if an attempt is made to add a word after this
     * point.
     */
    public void trimAndLock();

    /**
     * Stateless helpers for converting between word-object and int
     * representations of word sequences using a {@link WordIndexer}.
     */
    public static class StaticMethods
    {

        /**
         * Converts an object representation to an int array. Does not add to
         * the indexer: each word is looked up with
         * {@link WordIndexer#getIndexPossiblyUnk(Object)}.
         *
         * @param <W>
         *            the word type
         * @param wordIndexer
         *            indexer used to look up each word
         * @param list
         *            the words to convert, in order
         * @return an array of the same length as {@code list} holding the
         *         index of each word
         */
        public static <W> int[] toArray(final WordIndexer<W> wordIndexer, final List<W> list) {
            final int[] ret = new int[list.size()];
            int pos = 0;
            // Iterate rather than call list.get(i): positional access is
            // linear-time per call on lists such as LinkedList.
            for (final W word : list) {
                ret[pos++] = wordIndexer.getIndexPossiblyUnk(word);
            }
            return ret;
        }

        /**
         * Converts a string representation to an int array, adding to the
         * indexer: each string is passed to
         * {@link WordIndexer#getOrAddIndexFromString(String)}.
         *
         * @param <W>
         *            the word type
         * @param wordIndexer
         *            indexer used to look up (and possibly extend with) each
         *            word
         * @param list
         *            the string forms of the words to convert, in order
         * @return an array of the same length as {@code list} holding the
         *         index of each word
         */
        public static <W> int[] toArrayFromStrings(final WordIndexer<W> wordIndexer, final List<String> list) {
            final int[] ret = new int[list.size()];
            int pos = 0;
            // Iterate rather than call list.get(i): positional access is
            // linear-time per call on lists such as LinkedList.
            for (final String word : list) {
                ret[pos++] = wordIndexer.getOrAddIndexFromString(word);
            }
            return ret;
        }

        /**
         * Converts an int representation of an n-gram to a list. Converts only
         * the range of the array specified by [startPos,endPos).
         *
         * @param <W>
         *            the word type
         * @param wordIndexer
         *            indexer used to map each index back to its word
         * @param intNgram
         *            array of word indexes
         * @param startPos
         *            first position in {@code intNgram} to convert (inclusive)
         * @param endPos
         *            position in {@code intNgram} at which to stop (exclusive)
         * @return a new list holding the words for positions
         *         [startPos,endPos), in order
         */
        public static <W> List<W> toList(final WordIndexer<W> wordIndexer, final int[] intNgram, final int startPos, final int endPos) {
            final List<W> l = new ArrayList<W>(endPos - startPos);
            for (int i = startPos; i < endPos; ++i) {
                l.add(wordIndexer.getWord(intNgram[i]));
            }
            return l;
        }

        /**
         * Converts an int representation of an n-gram to a list, converting
         * the entire array.
         *
         * @param <W>
         *            the word type
         * @param wordIndexer
         *            indexer used to map each index back to its word
         * @param intNgram
         *            array of word indexes
         * @return a new list holding the word for every position of
         *         {@code intNgram}, in order
         */
        public static <W> List<W> toList(final WordIndexer<W> wordIndexer, final int[] intNgram) {
            return toList(wordIndexer, intNgram, 0, intNgram.length);
        }
    }
}