/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.suffix_array;

import java.util.List;

import joshua.corpus.Corpus;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.Phrase;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.ff.tm.Rule;
import joshua.util.Cache;

/**
 * A representation of the suffixes in a corpus.
 *
 * @author Lane Schwartz
 * @author Chris Callison-Burch
 */
public interface Suffixes {

    //===============================================================
    // Constants
    //===============================================================

    /**
     * The maximum length suffix to consider during sorting.
     */
    int MAX_COMPARISON_LENGTH = 20;

    /**
     * Maximum number of items that can be stored in the cache
     * of patterns and hierarchical phrases.
     */
    int DEFAULT_CACHE_CAPACITY = 100000;

    //===============================================================
    // Methods
    //===============================================================

    /**
     * Gets the symbol table for this object.
     *
     * @return the symbol table for this object
     */
    SymbolTable getVocabulary();

    /**
     * Gets the corpus for this object.
     *
     * @return the corpus for this object
     */
    Corpus getCorpus();

    /**
     * Creates a collection of trivially hierarchical phrases
     * (i.e., they are really just contiguous phrases, but we
     * will want to perform some of the hierarchical phrase
     * operations on them). Sorts the positions and adds the
     * results to the cache.
     * <p>
     * The construction of more complex hierarchical phrases
     * is handled within the prefix tree.
     *
     * @param startPositions an unsorted list of the positions
     *                in the corpus where the matched phrases begin
     * @param pattern a contiguous phrase
     * @param vocab   symbol table to use for the phrases.
     *                NOTE(review): presumably the same table that
     *                {@link #getVocabulary()} returns; confirm
     *                against the implementations
     * @return the trivially hierarchical phrases that match
     *         the pattern
     */
    MatchedHierarchicalPhrases createTriviallyHierarchicalPhrases(int[] startPositions, Pattern pattern, SymbolTable vocab);

    /**
     * Gets all locations in the corpus
     * of the specified hierarchical pattern,
     * subject to the specified span constraints.
     * <p>
     * This method exists to provide an easy mechanism
     * for getting all instances of arbitrary hierarchical phrases.
     *
     * @param pattern Pattern of terminals and (optionally) nonterminals.
     * @param minNonterminalSpan Minimum number of terminals
     *                that a nonterminal is allowed to represent
     * @param maxPhraseSpan Maximum length in the corpus
     *                that an extracted phrase may represent
     * @return the locations in the corpus of the specified
     *         pattern that satisfy the span constraints
     */
    MatchedHierarchicalPhrases createHierarchicalPhrases(Pattern pattern, int minNonterminalSpan, int maxPhraseSpan);

    /**
     * Returns the number of suffixes in the suffix array, which
     * is identical to the length of the corpus.
     *
     * @return the number of suffixes in the suffix array
     */
    int size();

    /**
     * Gets the position in the corpus corresponding to the
     * specified index in the suffix array.
     *
     * @param suffixIndex an index in the suffix array
     * @return the position in the corpus corresponding to the
     *         specified index in the suffix array
     */
    int getCorpusIndex(int suffixIndex);

    /**
     * Gets the sentence number of the word at the specified
     * position in the corpus.
     *
     * @param corpusIndex Position of a word in the corpus
     * @return the sentence number of the word at the specified
     *         position in the corpus
     */
    int getSentenceIndex(int corpusIndex);

    /**
     * Gets the position in the corpus of the first word of
     * the specified sentence. If {@code sentenceIndex} is
     * outside of the bounds of the sentences, then it
     * returns the last position in the corpus + 1.
     *
     * @param sentenceIndex the number of a sentence in the corpus
     * @return the position in the corpus of the first word of
     *         the specified sentence. If {@code sentenceIndex}
     *         is outside of the bounds of the sentences, then
     *         it returns the last position in the corpus + 1.
     */
    int getSentencePosition(int sentenceIndex);

    /**
     * Finds a phrase in the suffix array.
     * <p>
     * NOTE(review): the bounded overload documents a
     * {@code null} result when the phrase is not found;
     * presumably the same holds here, but this contract does
     * not say so. Confirm against the implementations.
     *
     * @param phrase the search phrase
     * @return a tuple containing the (inclusive) start and the
     *         (inclusive) end bounds in the suffix array for
     *         the phrase
     */
    int[] findPhrase(Phrase phrase);

    /**
     * Finds a phrase in the suffix array. The phrase is extracted
     * from the sentence given the start and end points. This
     * version of the method allows bounds to be specified in
     * the suffix array, which is useful when searching for
     * increasingly longer sub-phrases in a sentence.
     *
     * @param sentence    the sentence/super-phrase to draw the
     *                    search phrase from
     * @param phraseStart the start of the phrase in the sentence
     *                    (inclusive)
     * @param phraseEnd   the end of the phrase in the sentence
     *                    (exclusive)
     * @param lowerBound  the first index in the suffix array
     *                    that will bound the search
     * @param upperBound  the last index in the suffix array
     *                    that will bound the search
     * @return a tuple containing the (inclusive) start and the
     *         (inclusive) end bounds in the suffix array for
     *         the phrase, or null if the phrase is not found.
     */
    int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd, int lowerBound, int upperBound);

    /**
     * Gets the hierarchical phrases that match the pattern
     * if they are already cached, or null if the pattern is not
     * in the cache.
     *
     * @param pattern the pattern to look up in the cache
     * @return the hierarchical phrases that match the
     *         pattern if they are already cached, or null if
     *         the pattern is not in the cache
     */
    MatchedHierarchicalPhrases getMatchingPhrases(Pattern pattern);

    /**
     * Caches the matching hierarchical phrases for the pattern.
     *
     * @param matchings Hierarchical phrases located in the corpus
     *                that match a common pattern.
     */
    void cacheMatchingPhrases(MatchedHierarchicalPhrases matchings);

    /**
     * Gets all of the positions in the corpus for the bounds
     * in the suffix array. The corpus positions are sorted.
     *
     * @param bounds Inclusive bounds in the suffix array
     * @return all positions in the corpus for the specified
     *         bounds
     */
    int[] getAllPositions(int[] bounds);

    /**
     * Gets the hierarchical phrase objects cached by this
     * suffix array.
     *
     * @return the hierarchical phrase objects cached by this
     *         suffix array
     */
    Cache<Pattern,MatchedHierarchicalPhrases> getCachedHierarchicalPhrases();

    /**
     * Gets the list of rule objects cached by this
     * suffix array.
     *
     * @return the list of rule objects cached by this
     *         suffix array
     */
    Cache<Pattern,List<Rule>> getCachedRules();
}