/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.vocab;

import java.util.Collection;

/**
 * Represents a symbol table capable of mapping between strings and
 * symbols.
 * <p>
 * Terminals (words) and nonterminals share one integer identifier
 * space; nonterminals are represented by negative integers.
 *
 * @author Lane Schwartz
 * @author Zhifei Li
 * @version $LastChangedDate: 2009-11-24 23:07:43 -0600 (Tue, 24 Nov 2009) $
 */
public interface SymbolTable {

    //TODO Remove all hard-coded references to nonterminals

    /**
     * Integer identifier reserved for the unknown
     * (out-of-vocabulary) word.
     * <p>
     * Negative IDs are reserved for nonterminals.
     * <p>
     * NOTE(review): earlier revisions of this comment stated both
     * that the unknown word's ID equals the size of the vocabulary
     * (and is therefore volatile until the vocabulary is fixed) and
     * that zero is reserved as the unknown word. Neither statement
     * matches the value declared here, which is fixed at 1. Code
     * that needs the ID actually used by a particular table should
     * call {@link #getUnknownWordID()}; confirm whether
     * implementations are required to return this constant.
     */
    int UNKNOWN_WORD = 1;

    /** String representation for out-of-vocabulary words. */
    String UNKNOWN_WORD_STRING = "<unk>";

    /**
     * Integer representation of the bare (non-indexed) nonterminal X,
     * which represents a wild-card gap in a phrase.
     * <p>
     * All nonterminals are guaranteed to be represented by negative
     * integers.
     */
    int X = -1;

    /**
     * String representation of the bare (non-indexed) nonterminal X,
     * which represents a wild-card gap in a phrase.
     */
    String X_STRING = "[X]";

    /**
     * String representation of the nonterminal X with index 1,
     * which represents a wild-card gap in a phrase.
     */
    String X1_STRING = "[X,1]";

    /**
     * String representation of the nonterminal X with index 2,
     * which represents a wild-card gap in a phrase.
     */
    String X2_STRING = "[X,2]";

    /**
     * Integer representation of the nonterminal S.
     * <p>
     * All nonterminals are guaranteed to be represented by negative
     * integers.
     */
    int S = -4;

    /** String representation of the nonterminal S. */
    String S_STRING = "[S]";

    /**
     * Integer representation of the nonterminal S with index 1.
     * <p>
     * All nonterminals are guaranteed to be represented by negative
     * integers.
     */
    int S1 = -5;

    /** String representation of the nonterminal S with index 1. */
    String S1_STRING = "[S,1]";

    /**
     * Gets a unique integer identifier for the nonterminal.
     * <p>
     * The integer returned is guaranteed to be a negative number.
     * <p>
     * If the nonterminal is {@link #X_STRING}, then the value
     * returned must be {@link #X}.
     * <p>
     * Otherwise, the value returned must be a negative number
     * whose value is less than {@link #X}.
     *
     * @param nonterminal Nonterminal symbol
     * @return a unique integer identifier for the nonterminal
     */
    int addNonterminal(String nonterminal);

    /**
     * Gets a unique integer identifier for the terminal.
     *
     * @param terminal Terminal symbol
     * @return a unique integer identifier for the terminal
     */
    int addTerminal(String terminal);

    /**
     * Gets the unique integer identifiers for the words.
     *
     * @param words Array of symbols
     * @return the unique integer identifiers for the words
     */
    int[] addTerminals(String[] words);

    /**
     * Gets the unique integer identifiers for the words
     * in the sentence.
     *
     * @param sentence Space-delimited string of symbols
     * @return the unique integer identifiers for the words
     *         in the sentence
     */
    int[] addTerminals(String sentence);

    /**
     * Gets an integer identifier for the word.
     * <p>
     * If the word is in the vocabulary, the integer returned
     * will uniquely identify that word.
     * <p>
     * If the word is not in the vocabulary, the integer returned
     * by <code>getUnknownWordID</code> may be returned.
     * <p>
     * Alternatively, implementations may, if they choose, add
     * unknown words and assign them a symbol ID instead of
     * returning <code>getUnknownWordID</code>.
     *
     * @see #getUnknownWordID
     * @param wordString Word whose identifier is requested
     * @return the unique integer identifier for wordString,
     *         or the result of <code>getUnknownWordID</code>
     *         if wordString is not in the vocabulary
     */
    int getID(String wordString);

    /**
     * Gets the integer identifiers for all words in the provided
     * sentence.
     * <p>
     * The sentence will be split (on spaces) into words, then
     * the integer identifier for each word will be retrieved
     * using <code>getID</code>.
     *
     * @see #getID(String)
     * @param sentence String of words, separated by spaces.
     * @return Array of integer identifiers for each word in
     *         the sentence
     */
    int[] getIDs(String sentence);

    /**
     * Gets the String that corresponds to the specified integer
     * identifier.
     * <p>
     * If the identifier is in the symbol vocabulary, the String
     * returned will correspond to that identifier.
     * <p>
     * Otherwise, the String returned by <code>getUnknownWord</code>
     * will be returned.
     *
     * @see #getUnknownWord
     * @param wordID Integer identifier of a terminal
     * @return the String that corresponds to the specified
     *         integer identifier, or the result of
     *         <code>getUnknownWord</code> if the identifier
     *         does not correspond to a word in the vocabulary
     */
    String getTerminal(int wordID);

    /**
     * Gets the String that corresponds to the specified integer
     * identifier.
     * <p>
     * This method can be called for terminals or nonterminals.
     *
     * @param tokenID Integer identifier
     * @return the String that corresponds to the specified
     *         integer identifier
     */
    String getWord(int tokenID);

    /**
     * Gets the String that corresponds to the sequence of
     * specified integer identifiers.
     *
     * @param ids Sequence of integer identifiers
     * @return the String that corresponds to the sequence of
     *         specified integer identifiers
     */
    String getWords(int[] ids);

    /**
     * Gets the String that corresponds to the sequence of
     * specified integer identifiers.
     * <p>
     * NOTE(review): the original declaration carried no
     * description. Presumably this is the terminal-only
     * counterpart of {@link #getWords(int[])}, in the way that
     * {@link #getTerminal(int)} relates to {@link #getWord(int)};
     * confirm against the implementations.
     *
     * @param wordIDs Sequence of integer identifiers
     * @return the String that corresponds to the sequence of
     *         specified integer identifiers
     */
    String getTerminals(int[] wordIDs);

    /**
     * Gets a collection over all symbol identifiers for the
     * vocabulary.
     *
     * @return a collection over all symbol identifiers for the
     *         vocabulary
     */
    Collection<Integer> getAllIDs();

    /**
     * Gets the collection of all words represented by this
     * vocabulary.
     *
     * @return the collection of all words represented by this
     *         vocabulary
     */
    Collection<String> getWords();

    /**
     * Gets the number of unique words in the vocabulary.
     *
     * @return the number of unique words in the vocabulary.
     */
    int size();

    /**
     * Gets the integer symbol representation of the unknown
     * word.
     *
     * @return the integer symbol representation of the unknown
     *         word.
     */
    int getUnknownWordID();

    /**
     * Gets the string representation of the unknown word.
     *
     * @return the string representation of the unknown word.
     */
    String getUnknownWord();

    /**
     * Returns <code>true</code> if the symbol id represents a
     * nonterminal, <code>false</code> otherwise.
     *
     * @param id Integer symbol identifier to test
     * @return <code>true</code> if the symbol id represents a
     *         nonterminal, <code>false</code> otherwise.
     */
    boolean isNonterminal(int id);

    /**
     * Gets the lowest-valued allowable terminal symbol id in
     * this table.
     *
     * @return the lowest-valued allowable terminal symbol id
     *         in this table.
     */
    int getLowestID();

    /**
     * Gets the highest-valued allowable terminal symbol id in
     * this table.
     * <p>
     * NOTE: This may or may not return the same value as
     * <code>size</code>.
     *
     * @return the highest-valued allowable terminal symbol id
     *         in this table.
     */
    int getHighestID();

    /**
     * Gets the target nonterminal index for the symbol with the
     * specified integer identifier.
     * <p>
     * Implementations are expected to first convert the id to its
     * String mapping, then delegate to
     * {@link #getTargetNonterminalIndex(String)}.
     *
     * @param id Integer symbol identifier
     * @return the target nonterminal index for the symbol
     */
    int getTargetNonterminalIndex(int id);

    /**
     * Gets the target nonterminal index for the specified symbol
     * string.
     * <p>
     * NOTE(review): the original declaration carried no
     * description. Presumably the index is the number embedded in
     * an indexed nonterminal such as {@link #X1_STRING}; the exact
     * numbering, and the result for strings that carry no index,
     * are not stated here. Confirm against the implementations.
     *
     * @param word String representation of a symbol
     * @return the target nonterminal index for the symbol
     */
    int getTargetNonterminalIndex(String word);

    /**
     * Gets the String that corresponds to the sequence of
     * specified integer identifiers.
     * <p>
     * NOTE(review): the original declaration carried no
     * description. The flag presumably controls how nonterminal
     * indices are numbered in the returned string; confirm against
     * the implementations.
     *
     * @param wordIDs Sequence of integer identifiers
     * @param ntIndexIncrements flag affecting how nonterminal
     *        indices are rendered (exact semantics not stated here)
     * @return the String that corresponds to the sequence of
     *         specified integer identifiers
     */
    String getWords(int[] wordIDs, boolean ntIndexIncrements);
}