Corpus.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus;

import joshua.corpus.vocab.SymbolTable;


/**
 * Corpus is an interface that contains methods for accessing the
 * information within a monolingual corpus.
 * <p>
 * Words are addressed by integer positions in the corpus and are
 * exposed as integer IDs (see {@link #getWordID(int)}); the symbol
 * table associated with the corpus is available from
 * {@link #getVocabulary()}. Sentences are addressed by integer
 * sentence indices.
 *
 * @author Chris Callison-Burch
 * @since  7 February 2005
 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
 */

public interface Corpus { //extends Externalizable {

//===============================================================
// Attribute definitions
//===============================================================
	
	/**
	 * Gets the integer representation of the word at the
	 * specified position in the corpus.
	 *
	 * @param position Index into the corpus
	 * @return the integer representation of the word at the
	 *         specified position in the corpus.
	 */
	int getWordID(int position);
	
	
	/**
	 * Gets the sentence index associated with the specified
	 * position in the corpus.
	 * 
	 * @param position Index into the corpus
	 * @return the sentence index associated with the specified
	 *         position in the corpus.
	 */
	int getSentenceIndex(int position);
	
	
	/**
	 * Gets the sentence index of each specified position.
	 * 
	 * @param positions Indices into the corpus
	 * @return array of the sentence indices associated
	 *         with the specified positions in the corpus.
	 *         NOTE(review): presumably element i of the result
	 *         corresponds to element i of {@code positions} --
	 *         confirm against implementations.
	 */
	int[] getSentenceIndices(int[] positions);
	
	/**
	 * Gets the position in the corpus of the first word of
	 * the specified sentence.
	 * 
	 * @param sentenceID Index of the sentence
	 *        (NOTE(review): presumably zero-based, like the
	 *        argument of {@link #getSentence(int)} -- confirm)
	 * @return the position in the corpus of the first word of
	 *         the specified sentence.  If the sentenceID is
	 *         outside of the bounds of the sentences, then it
	 *         returns the last position in the corpus + 1.
	 */
	int getSentencePosition(int sentenceID);
	
	/**
	 * Gets the exclusive end position of a sentence in the
	 * corpus.
	 *
	 * @param sentenceID Index of the sentence
	 *        (NOTE(review): presumably zero-based, like the
	 *        argument of {@link #getSentence(int)} -- confirm)
	 * @return the position in the corpus one past the last
	 *         word of the specified sentence. If the sentenceID
	 *         is outside of the bounds of the sentences, then
	 *         it returns one past the last position in the
	 *         corpus.
	 */
	int getSentenceEndPosition(int sentenceID);
	
	/** 
	 * Gets the specified sentence as a phrase.
	 * 
	 * @param sentenceIndex Zero-based sentence index
	 * @return the sentence, or <code>null</code> if the
	 *         specified sentence number doesn't exist
	 */
	Phrase getSentence(int sentenceIndex);
	
	
	/**
	 * Gets the number of words in the corpus.
	 * 
	 * @return the number of words in the corpus.
	 */
	int size();
	
	
	/**
	 * Gets the number of sentences in the corpus.
	 * 
	 * @return the number of sentences in the corpus.
	 */
	int getNumSentences();
	
	
	//===========================================================
	// Methods
	//===========================================================
	
	
	/**
	 * Compares the phrase in the corpus that starts at position
	 * <code>corpusStart</code> with the subphrase of
	 * <code>phrase</code> delimited by <code>phraseStart</code>
	 * and <code>phraseEnd</code>.
	 *
	 * @param corpusStart the point in the corpus where the
	 *                    comparison begins
	 * @param phrase      the superphrase that the comparison
	 *                    phrase is drawn from
	 * @param phraseStart the point in the phrase where the
	 *                    comparison begins (inclusive)
	 * @param phraseEnd   the point in the phrase where the
	 *                    comparison ends (exclusive)
	 * @return an int that follows the conventions of
	 *         {@link java.util.Comparator#compare(Object, Object)}:
	 *         negative, zero, or positive.
	 *         NOTE(review): presumably the corpus phrase is the
	 *         first operand of the comparison -- confirm against
	 *         implementations.
	 */
	int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd);
	
	
	/**
	 * Compares the phrase in the corpus that starts at position
	 * <code>corpusStart</code> with the phrase passed in.
	 * Compares the entire phrase.
	 * 
	 * @param corpusStart the point in the corpus where the
	 *                    comparison begins
	 * @param phrase      the phrase to compare against, in its
	 *                    entirety
	 * @return NOTE(review): presumably an int following the same
	 *         convention as
	 *         {@link #comparePhrase(int, Phrase, int, int)} --
	 *         confirm against implementations.
	 */
	int comparePhrase(int corpusStart, Phrase phrase);
	
	/**
	 * Gets the symbol table associated with this corpus.
	 * 
	 * @return the symbol table associated with this corpus
	 */
	SymbolTable getVocabulary();
	
	
	/** 
	 * Compares the suffixes starting at positions
	 * <code>position1</code> and <code>position2</code>.
	 *
	 * @param position1 the position in the corpus where the
	 *                  first suffix begins
	 * @param position2 the position in the corpus where the
	 *                  second suffix begins
	 * @param maxComparisonLength a cutoff point to stop the
	 *                            comparison
	 *                            (NOTE(review): presumably a
	 *                            number of words -- confirm)
	 * @return an int that follows the conventions of
	 *         {@link java.util.Comparator#compare(Object, Object)}:
	 *         negative, zero, or positive.
	 */
    int compareSuffixes(int position1, int position2, int maxComparisonLength);
	
	/**
	 * Gets the phrase of the corpus delimited by the specified
	 * positions.
	 * <p>
	 * NOTE(review): the bounds convention is not stated here;
	 * presumably <code>startPosition</code> is inclusive and
	 * <code>endPosition</code> is exclusive, matching
	 * {@link #getSentenceEndPosition(int)} -- confirm against
	 * implementations.
	 * 
	 * @param startPosition position in the corpus where the
	 *                      phrase begins
	 * @param endPosition   position in the corpus where the
	 *                      phrase ends
	 * @return the phrase delimited by the specified positions
	 */
	ContiguousPhrase getPhrase(int startPosition, int endPosition);
	
	/**
	 * Gets an object capable of iterating 
	 * over all positions in the corpus, in order.
	 * 
	 * @return An object capable of iterating 
	 *         over all positions in the corpus, in order.
	 */
	Iterable<Integer> corpusPositions();
	
//	void write(String corpusFilename, String vocabFilename, String charset) throws IOException;
}