/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus;
import joshua.corpus.vocab.SymbolTable;
/**
* Corpus is an interface that contains methods for accessing the
* information within a monolingual corpus.
*
* @author Chris Callison-Burch
* @since 7 February 2005
* @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
*/
public interface Corpus { //extends Externalizable {
//===============================================================
// Attribute definitions
//===============================================================
/**
* Gets the integer identifier of the word at the specified
* position in the corpus.
*
* @param position Index into the corpus
* @return the integer representation of the Word at the
* specified position in the corpus.
*/
int getWordID(int position);
/**
* Gets the sentence index associated with the specified
* position in the corpus.
*
* @param position Index into the corpus
* @return the sentence index associated with the specified
* position in the corpus.
*/
int getSentenceIndex(int position);
/**
* Gets the sentence index of each specified position.
*
* @param positions Indices into the corpus
* @return array of the sentence indices associated
* with the specified positions in the corpus.
*/
int[] getSentenceIndices(int[] positions);
/**
* Gets the position in the corpus of the first word of
* the specified sentence. If the sentenceID is
* outside of the bounds of the sentences, then it
* returns the last position in the corpus + 1.
*
* @param sentenceID Index of the sentence whose start
* position is requested
* @return the position in the corpus of the first word of
* the specified sentence. If the sentenceID is
* outside of the bounds of the sentences, then it
* returns the last position in the corpus + 1.
*/
int getSentencePosition(int sentenceID);
/**
* Gets the exclusive end position of a sentence in the
* corpus.
*
* @param sentenceID Index of the sentence whose end
* position is requested
* @return the position in the corpus one past the last
* word of the specified sentence. If the sentenceID
* is outside of the bounds of the sentences, then
* it returns one past the last position in the
* corpus.
*/
int getSentenceEndPosition(int sentenceID);
/**
* Gets the specified sentence as a phrase.
*
* @param sentenceIndex Zero-based sentence index
* @return the sentence, or null if the specified sentence
* number doesn't exist
*/
Phrase getSentence(int sentenceIndex);
/**
* Gets the number of words in the corpus.
*
* @return the number of words in the corpus.
*/
int size();
/**
* Gets the number of sentences in the corpus.
*
* @return the number of sentences in the corpus.
*/
int getNumSentences();
//===========================================================
// Methods
//===========================================================
/**
* Compares the phrase that starts at position
* <code>corpusStart</code> in the corpus with the subphrase
* of <code>phrase</code> delimited by
* <code>phraseStart</code> and <code>phraseEnd</code>.
* <p>
* NOTE(review): this interface does not state which operand
* (corpus or phrase) is treated as the first argument of the
* comparison; confirm against the implementations.
*
* @param corpusStart the point in the corpus where the
* comparison begins
* @param phrase the superphrase that the comparison
* phrase is drawn from
* @param phraseStart the point in the phrase where the
* comparison begins (inclusive)
* @param phraseEnd the point in the phrase where the
* comparison ends (exclusive)
* @return an int that follows the conventions of
* java.util.Comparator.compare(): negative,
* zero, or positive
*/
int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd);
/**
* Compares the phrase that starts at position
* <code>corpusStart</code> in the corpus with the phrase
* passed in. Compares the entire phrase.
* <p>
* NOTE(review): the return convention is not documented
* here; presumably it matches the four-argument overload
* of this method. Confirm against the implementations.
*
* @param corpusStart the point in the corpus where the
* comparison begins
* @param phrase the phrase to compare against, in
* its entirety
* @return an int result of the comparison
*/
int comparePhrase(int corpusStart, Phrase phrase);
/**
* Gets the symbol table associated with this corpus.
*
* @return the symbol table associated with this corpus
*/
SymbolTable getVocabulary();
/**
* Compares the suffixes starting at positions
* <code>position1</code> and <code>position2</code>.
*
* @param position1 the position in the corpus where the
* first suffix begins
* @param position2 the position in the corpus where the
* second suffix begins
* @param maxComparisonLength a cutoff point to stop the
* comparison (presumably a number of words;
* TODO confirm)
* @return an int that follows the conventions of
* java.util.Comparator.compare(): negative,
* zero, or positive
*/
int compareSuffixes(int position1, int position2, int maxComparisonLength);
/**
* Gets the phrase of the corpus delimited by the two
* specified positions.
* <p>
* NOTE(review): this interface does not state whether
* <code>endPosition</code> is inclusive or exclusive;
* confirm against the implementations.
*
* @param startPosition Index into the corpus where the
* phrase begins
* @param endPosition Index into the corpus where the
* phrase ends
* @return the contiguous phrase delimited by the specified
* positions
*/
ContiguousPhrase getPhrase(int startPosition, int endPosition);
/**
* Gets an object capable of iterating
* over all positions in the corpus, in order.
*
* @return An object capable of iterating
* over all positions in the corpus, in order.
*/
Iterable<Integer> corpusPositions();
// void write(String corpusFilename, String vocabFilename, String charset) throws IOException;
}