/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.mm; import joshua.corpus.AbstractCorpus; import joshua.corpus.vocab.SymbolTable; import joshua.corpus.vocab.Vocabulary; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.IntBuffer; import java.nio.channels.FileChannel; /** * * * @author Lane Schwartz */ public class MemoryMappedCorpusArray extends AbstractCorpus<SymbolTable> { private final IntBuffer binaryCorpusBuffer; private final IntBuffer binarySentenceBuffer; private final int numberOfWords; private final int numberOfSentences; /** * Constructs a corpus array from a binary file. * <p> * The binary file contains both the vocabulary, which is * read in first, and the corpus array data. * * @param binaryFileName * @throws IOException * @throws ClassNotFoundException */ public MemoryMappedCorpusArray(String binaryFileName, String vocabFileName) throws IOException, ClassNotFoundException { this(Vocabulary.readExternal(vocabFileName), binaryFileName); } /** * Constructs a corpus array from a binary file. * <p> * The binary file may or may not contain a vocabulary. The * first integer in the file specifies a header length. If * no vocabulary is contained, this value should be zero. * <p> * Even if the binary file contains a vocabulary, it is * ignored, and the symbol table provided to the constructor * is used instead. * * @param symbolTable * @param binaryFileName * @throws IOException */ public MemoryMappedCorpusArray( SymbolTable symbolTable, String binaryFileName ) throws IOException { super(symbolTable); IntBuffer tmp; RandomAccessFile binaryFile = new RandomAccessFile( binaryFileName, "r" ); FileChannel binaryChannel = binaryFile.getChannel(); int headerSize = 0; tmp = binaryChannel.map( FileChannel.MapMode.READ_ONLY, headerSize, 4).asIntBuffer().asReadOnlyBuffer(); this.numberOfSentences = tmp.get(); this.binarySentenceBuffer = binaryChannel.map( FileChannel.MapMode.READ_ONLY, (headerSize+4), 4*numberOfSentences ).asIntBuffer().asReadOnlyBuffer(); tmp = binaryChannel.map( FileChannel.MapMode.READ_ONLY, (headerSize + 4 + 4*numberOfSentences), 4).asIntBuffer().asReadOnlyBuffer(); this.numberOfWords = tmp.get(); this.binaryCorpusBuffer = binaryChannel.map( FileChannel.MapMode.READ_ONLY, (headerSize + 4 + 4*numberOfSentences + 4), 4*numberOfWords ).asIntBuffer().asReadOnlyBuffer(); } @Override public int getNumSentences() { return numberOfSentences; } @Override public int getSentenceIndex(int position) { int index = binarySearch(position); // if index is positive, then the position searched // for is the first word of a sentence. we return // the exact value. if (index >= 0) { return index; } else { // otherwise, we are given an negative version of // the first number higher than our position. that // is the position of where this would be inserted // if it was its own sentence, so we make the number // positive and subtract 2 (one since since it is // by ith element instead of position, one to get // the previous index) return (index*(-1))-2; } } private int binarySearch(int value) { int low = 0; int high = numberOfSentences - 1; while (low <= high) { int mid = (low + high) >>> 1; int midValue = binarySentenceBuffer.get(mid); if (midValue < value) { low = mid + 1; } else if (midValue > value) { high = mid -1; } else { return mid; } } return -(low+1); } @Override public int getSentencePosition(int sentenceID) { if (sentenceID >= numberOfSentences) { return numberOfWords; } return binarySentenceBuffer.get(sentenceID); } @Override public int getWordID(int position) { return binaryCorpusBuffer.get(position); } @Override public int size() { return numberOfWords; } }