/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus; import joshua.corpus.suffix_array.SuffixArray; import joshua.corpus.suffix_array.SuffixArrayFactory; import joshua.corpus.vocab.ExternalizableSymbolTable; import joshua.corpus.vocab.SymbolTable; import joshua.corpus.vocab.Vocabulary; import joshua.util.io.BinaryOut; import java.io.Externalizable; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; import java.util.Arrays; /** * A compact int[] based representation of a corpus. The class keeps * all of the words in their int form in a single array. It also * maintains a separate int[] array that lists the start index for * each sentence in the corpus. This second array allows us to * quickly determine the source sentence of any given position in * the corpus using a binary search. * * @author Josh Schroeder * @since 29 Dec 2004 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $ */ public class CorpusArray extends AbstractCorpus<ExternalizableSymbolTable> implements Corpus, Externalizable { //=============================================================== // Constants //=============================================================== //=============================================================== // Member variables //=============================================================== /** * Stores an integer based representation of each word in * the corpus. */ protected int[] corpus; /** * Keeps the starting position in the corpus array for each * of the sentences. The length of the sentences array is * equal to the number of sentences in the corpus. */ protected int[] sentences; /** * The alphabetized vocabulary which maps between the String * and int representation of words in the corpus. */ // protected SymbolTable symbolTable; //=============================================================== // Constructor(s) //=============================================================== /** * Constructs an empty corpus. * <p> * NOTE: Primarily needed for Externalizable interface. */ public CorpusArray() { super(new Vocabulary()); // this.symbolTable = new Vocabulary(); this.sentences = new int[]{}; this.corpus = new int[]{}; } /** * Protected constructor takes in the already prepared * member variables. * * @see SuffixArrayFactory#createCorpusArray */ public CorpusArray (int[] corpus, int[] sentences, ExternalizableSymbolTable vocab) { super(vocab); this.corpus = corpus; this.sentences = sentences; // this.symbolTable = vocab; } //=============================================================== // Public //=============================================================== //=========================================================== // Accessor methods (set/get) //=========================================================== /** * @return the integer representation of the Word at the * specified position in the corpus. */ public int getWordID(int position) { return corpus[position]; } /** * @return the sentence index associated with the specified * position in the corpus. */ public int getSentenceIndex(int position) { int index = Arrays.binarySearch(sentences, position); // if index is positive, then the position searched // for is the first word of a sentence. we return // the exact value. if (index >= 0) { return index; } else { // otherwise, we are given an negative version of // the first number higher than our position. that // is the position of where this would be inserted // if it was its own sentence, so we make the number // positive and subtract 2 (one since since it is // by ith element instead of position, one to get // the previous index) return (index*(-1))-2; } } /** * @return the position in the corpus of the first word of * the specified sentence. If the sentenceID is * outside of the bounds of the sentences, then it * returns the last position in the corpus + 1. */ public int getSentencePosition(int sentenceID) { if (sentenceID >= sentences.length) { return corpus.length; } return sentences[sentenceID]; } /** * Gets the exclusive end position of a sentence in the * corpus. * * @return the position in the corpus one past the last * word of the specified sentence. If the sentenceID * is outside of the bounds of the sentences, then * it returns one past the last position in the * corpus. */ public int getSentenceEndPosition(int sentenceID) { if (sentenceID >= sentences.length-1) { return corpus.length; } return sentences[sentenceID+1]; } /** * Gets the sentence at the specified index (starting from * zero). * * @return the sentence, or null if the specified sentence * number doesn't exist */ public Phrase getSentence(int sentenceIndex) { if (sentenceIndex >= sentences.length) { return null; } else if (sentenceIndex == sentences.length - 1) { return getPhrase(sentences[sentenceIndex], corpus.length); } else { return getPhrase(sentences[sentenceIndex], sentences[sentenceIndex+1]); } } /** * @return the number of words in the corpus. */ public int size() { return corpus.length; } /** * @return the number of sentences in the corpus. */ public int getNumSentences() { return sentences.length; } /** * Sets the symbol table to the provided object, and changes * migrates all internal data to use the new mappings * provided by that object. */ public void setSymbolTable(ExternalizableSymbolTable vocab) { SymbolTable oldVocab = this.symbolTable; for (int i=0; i<corpus.length; i++) { int oldID = corpus[i]; String word = oldVocab.getWord(oldID); int newID = vocab.getID(word); corpus[i] = newID; } this.symbolTable = vocab; oldVocab = null; } //=========================================================== // Methods //=========================================================== /** * Compares the phrase that starts at position start with * the subphrase indicated by the start and end points of * the phrase. * * @param corpusStart the point in the corpus where the * comparison begins * @param phrase the superphrase that the comparsion * phrase is drawn from * @param phraseStart the point in the phrase where the * comparison begins (inclusive) * @param phraseEnd the point in the phrase where the * comparison ends (exclusive) * @return an int that follows the conventions of * java.util.Comparator.compareTo() */ public int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd) { int diff = -1; for (int i = 0; i < phraseEnd-phraseStart; i++) { if (i + corpusStart >= corpus.length) { return -1; } diff = corpus[i+corpusStart] - phrase.getWordID(i+phraseStart); if (diff != 0) { return diff; } } return 0; } /** * compares the phrase that starts at position start with * the phrase passed in. Compares the entire phrase. */ public int comparePhrase(int corpusStart, Phrase phrase) { return comparePhrase(corpusStart, phrase, 0, phrase.size()); } public SymbolTable getVocabulary() { return symbolTable; } /** * Compares the suffixes starting a positions index1 and * index2. * * @param position1 the position in the corpus where the * first suffix begins * @param position2 the position in the corpus where the * second suffix begins * @param maxComparisonLength a cutoff point to stop the * comparison * @return an int that follows the conventions of * java.util.Comparator.compareTo() */ public int compareSuffixes(int position1, int position2, int maxComparisonLength){ for (int i = 0; i < maxComparisonLength; i++) { if (position1 + i < (corpus.length) && position2 + i >= (corpus.length)) { return 1; } if (position2 + i < (corpus.length) && position1 + i >= (corpus.length)) { return -1; } int diff; try { diff = corpus[position1 + i] - corpus[position2 + i]; } catch (ArrayIndexOutOfBoundsException e) { throw new Error("Bug in CorpusArray method compareSuffixes: " + e.getMessage()); } if (diff != 0) { return diff; } } return 0; } public void write(String corpusFilename, String vocabFilename, String charset) throws IOException { ObjectOutput vocabOut = new BinaryOut(new FileOutputStream(vocabFilename), true); // new ObjectOutputStream(new FileOutputStream(vocabFilename)); symbolTable.setExternalizableEncoding(charset); symbolTable.writeExternal(vocabOut); vocabOut.flush(); BinaryOut corpusOut = new BinaryOut(new FileOutputStream(corpusFilename), false); this.writeExternal(corpusOut); corpusOut.flush(); } public ContiguousPhrase getPhrase(int startPosition, int endPosition) { return new ContiguousPhrase(startPosition, endPosition, this); } //=============================================================== // Private //=============================================================== //=============================================================== // Methods //=============================================================== //=============================================================== // Static //=============================================================== //=============================================================== // Main //=============================================================== public static void main(String[] args) throws Exception { if (args.length < 4) { System.err.println("Usage: java " + SuffixArray.class.getName() + " corpus vocab.jbin corpus.bin"); System.exit(0); } String corpusFileName = args[0]; String binaryVocabFilename = args[1]; String binaryCorpusFilename = args[2]; String charset = (args.length > 3) ? args[3] : "UTF-8"; Vocabulary symbolTable = new Vocabulary(); int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true); CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]); corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset); } public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { // Read the vocabulary symbolTable.readExternal(in); int numSentences = in.readInt(); this.sentences = new int[numSentences]; for (int i=0; i<numSentences; i++) { this.sentences[i] = in.readInt(); } int numWords = in.readInt(); this.corpus = new int[numWords]; for (int i=0; i<numWords; i++) { this.corpus[i] = in.readInt(); } } public void writeExternal(ObjectOutput out) throws IOException { // Write the vocabulary out.writeObject(symbolTable); out.writeInt(sentences.length); for (int sentencePosition : sentences) { out.writeInt(sentencePosition); } out.writeInt(corpus.length); for (int word : corpus) { out.writeInt(word); } } // static final long serialVersionUID = 1L; }