/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus;
import java.util.Iterator;
import java.util.NoSuchElementException;
import joshua.corpus.vocab.SymbolTable;
/**
* This class provides a mostly-complete implementation of the
* <code>Corpus</code> interface, designed to minimize the effort
* required to build a concrete implementation of a corpus array
* data structure.
*
* @author Lane Schwartz
* @author Chris Callison-Burch
*/
public abstract class AbstractCorpus<Vocab extends SymbolTable> implements Corpus {

    /**
     * Symbol table for the corpus, responsible for mapping
     * between tokens in the corpus and the integer representations
     * of those tokens.
     */
    protected Vocab symbolTable;

    /**
     * Constructs an abstract corpus with the specified symbol
     * table.
     *
     * @param symbolTable Symbol table for the corpus, responsible
     *                    for mapping between tokens in the corpus and
     *                    the integer representations of those tokens
     */
    public AbstractCorpus(Vocab symbolTable) {
        this.symbolTable = symbolTable;
    }

    /**
     * Compares the words of the corpus, starting at
     * <code>corpusStart</code>, against the words of
     * <code>phrase</code> in the half-open range
     * [<code>phraseStart</code>, <code>phraseEnd</code>).
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param corpusStart corpus position at which the comparison begins
     * @param phrase      phrase to compare against the corpus
     * @param phraseStart index of the first phrase word to compare
     * @param phraseEnd   index one past the last phrase word to compare
     * @return 0 if every compared word ID matches (including when
     *         the range is empty); -1 if the corpus ends before the
     *         whole range has been compared; otherwise the corpus
     *         word ID minus the phrase word ID at the first mismatch
     */
    public int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd) {

        int size = size();
        int length = phraseEnd - phraseStart;

        for (int i = 0; i < length; i++) {

            // The corpus ran out of words before the phrase did.
            if (i + corpusStart >= size) {
                return -1;
            }

            int diff = getWordID(i + corpusStart) - phrase.getWordID(i + phraseStart);
            if (diff != 0) {
                return diff;
            }
        }

        return 0;
    }

    /**
     * Compares the words of the corpus, starting at
     * <code>corpusStart</code>, against the entire phrase.
     * Equivalent to calling
     * {@link #comparePhrase(int, Phrase, int, int)} with the range
     * [0, <code>phrase.size()</code>).
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param corpusStart corpus position at which the comparison begins
     * @param phrase      phrase to compare against the corpus
     * @return the result of the four-argument comparison
     */
    public int comparePhrase(int corpusStart, Phrase phrase) {
        return comparePhrase(corpusStart, phrase, 0, phrase.size());
    }

    /**
     * Compares the corpus suffixes that start at
     * <code>position1</code> and <code>position2</code>, looking at
     * no more than <code>maxComparisonLength</code> words.
     * <p>
     * If exactly one of the suffixes runs past the end of the corpus
     * before a differing word is found, the suffix that still has
     * words left is treated as the greater one. If both suffixes run
     * past the end at the same offset they are treated as equal.
     * No word ID is ever requested for a position at or beyond
     * {@link #size()}.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param position1           corpus position where the first suffix starts
     * @param position2           corpus position where the second suffix starts
     * @param maxComparisonLength maximum number of words to compare
     * @return 0 if no difference is found within the compared words;
     *         1 if only the second suffix runs out of words;
     *         -1 if only the first suffix runs out of words;
     *         otherwise the first suffix's word ID minus the second
     *         suffix's word ID at the first mismatch
     */
    public int compareSuffixes(int position1, int position2,
            int maxComparisonLength) {

        int size = size();

        for (int i = 0; i < maxComparisonLength; i++) {

            boolean firstInBounds = position1 + i < size;
            boolean secondInBounds = position2 + i < size;

            if (!firstInBounds || !secondInBounds) {
                if (firstInBounds) {
                    // Only the second suffix is exhausted.
                    return 1;
                }
                if (secondInBounds) {
                    // Only the first suffix is exhausted.
                    return -1;
                }
                // Both suffixes are exhausted without any difference.
                return 0;
            }

            int diff = getWordID(position1 + i) - getWordID(position2 + i);
            if (diff != 0) {
                return diff;
            }
        }

        return 0;
    }

    /* See Javadoc for Corpus interface. */
    public abstract int getNumSentences();

    /**
     * Creates a phrase backed by this corpus.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param startPosition start position handed to the
     *                      <code>ContiguousPhrase</code> constructor
     * @param endPosition   end position handed to the
     *                      <code>ContiguousPhrase</code> constructor
     * @return a new <code>ContiguousPhrase</code> over this corpus
     */
    public ContiguousPhrase getPhrase(int startPosition, int endPosition) {
        return new ContiguousPhrase(startPosition, endPosition, this);
    }

    /**
     * Gets the sentence with the specified index as a phrase.
     * <p>
     * The phrase starts at the sentence's start position and ends
     * at the start position of the following sentence or, for the
     * last sentence, at {@link #size()}.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param sentenceIndex zero-based index of the sentence
     * @return the requested sentence, or <code>null</code> if
     *         <code>sentenceIndex</code> is negative or is not less
     *         than {@link #getNumSentences()}
     */
    public Phrase getSentence(int sentenceIndex) {

        int numSentences = getNumSentences();

        if (sentenceIndex < 0 || sentenceIndex >= numSentences) {
            return null;
        }

        int start = getSentencePosition(sentenceIndex);

        // The last sentence has no successor whose start could serve
        // as its end, so it extends to the end of the corpus.
        int end = (sentenceIndex == numSentences - 1)
                ? size()
                : getSentencePosition(sentenceIndex + 1);

        return getPhrase(start, end);
    }

    /**
     * Gets the position at which the specified sentence ends,
     * using the same convention as {@link #getSentence(int)}: the
     * start position of the following sentence or, for the last
     * sentence, {@link #size()}.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param sentenceId zero-based index of the sentence
     * @return the end position of the sentence
     */
    public int getSentenceEndPosition(int sentenceId) {

        // There is no following sentence to ask for a start position.
        if (sentenceId == getNumSentences() - 1) {
            return size();
        }

        return getSentencePosition(sentenceId + 1);
    }

    /* See Javadoc for Corpus interface. */
    public abstract int getSentenceIndex(int position);

    /* See Javadoc for Corpus interface. */
    public abstract int getSentencePosition(int sentenceId);

    /**
     * Maps each of the given positions through
     * {@link #getSentenceIndex(int)}.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @param positions positions to look up
     * @return a new array, of the same length as
     *         <code>positions</code>, whose i-th element is the
     *         sentence index reported for <code>positions[i]</code>
     */
    public int[] getSentenceIndices(int[] positions) {

        int[] sentenceNumber = new int[positions.length];

        for (int i = 0; i < positions.length; i++) {
            sentenceNumber[i] = getSentenceIndex(positions[i]);
        }

        return sentenceNumber;
    }

    /**
     * Gets the symbol table supplied to this corpus.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @return the symbol table of this corpus
     */
    public SymbolTable getVocabulary() {
        return symbolTable;
    }

    /* See Javadoc for Corpus interface. */
    public abstract int getWordID(int position);

    /* See Javadoc for Corpus interface. */
    public abstract int size();

    /**
     * Gets an iterable over the positions 0 (inclusive) through
     * {@link #size()} (exclusive), in increasing order.
     * <p>
     * The size is read once, when this method is called; every
     * iterator obtained from the returned iterable uses that value.
     * The iterators do not support removal.
     * <p>
     * See also the Javadoc for the Corpus interface.
     *
     * @return an iterable over the positions of this corpus
     */
    public Iterable<Integer> corpusPositions() {

        final int size = size();

        return new Iterable<Integer>() {
            public Iterator<Integer> iterator() {
                return new Iterator<Integer>() {

                    /** Next position to be returned. */
                    int position = 0;

                    public boolean hasNext() {
                        return position < size;
                    }

                    public Integer next() {
                        // Honor the Iterator contract instead of
                        // handing out positions beyond the corpus.
                        if (position >= size) {
                            throw new NoSuchElementException();
                        }
                        int result = position;
                        position += 1;
                        return result;
                    }

                    public void remove() {
                        throw new UnsupportedOperationException();
                    }
                };
            }
        };
    }
}