/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.alignment.mm;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.IntBuffer;
import java.nio.ShortBuffer;
import java.nio.channels.FileChannel;

import joshua.corpus.Corpus;
import joshua.corpus.alignment.AbstractAlignmentGrids;
import joshua.corpus.alignment.AlignmentGrid;

/**
 * Memory-mapped list of alignment grids representing all alignment
 * data for an aligned parallel corpus.
 * <p>
 * Instances of this class are created from binary alignment files,
 * which are typically created using
 * the {@link joshua.corpus.alignment.AlignmentGrids#writeExternal} method.
 * <p>
 * The binary file is read in this order: one int holding the number
 * of grids, one int per grid for the widths, one int per grid for
 * the heights, <code>size + 1</code> ints of cumulative alignment
 * point counts, the alignment points as shorts, and finally the
 * reverse alignment points as shorts.
 *
 * @author Lane Schwartz
 */
public class MemoryMappedAlignmentGrids extends AbstractAlignmentGrids {

	/** Number of bytes used to encode one int in the binary file. */
	private static final int BYTES_PER_INT = 4;

	/** Number of bytes used to encode one short in the binary file. */
	private static final int BYTES_PER_SHORT = 2;

	/** Number of alignment grids in the aligned parallel corpus. */
	private final int size;

	/**
	 * Memory-mapped buffer containing the width of each alignment
	 * grid in the aligned parallel corpus.
	 * <p>
	 * The number of integers in this buffer should be equal
	 * to the number of sentences in the aligned parallel corpus.
	 */
	private final IntBuffer widths;

	/**
	 * Memory-mapped buffer containing the height of each
	 * alignment grid in the aligned parallel corpus.
	 * <p>
	 * The number of integers in this buffer should be equal
	 * to the number of sentences in the aligned parallel corpus.
	 */
	private final IntBuffer heights;

	/**
	 * Memory-mapped buffer containing the cumulative number of
	 * alignment points preceding each alignment grid in the
	 * aligned parallel corpus.
	 * <p>
	 * This buffer holds <code>size + 1</code> integers. The points
	 * of sentence <code>i</code> occupy the index range from
	 * <code>pointCounts.get(i)</code> (inclusive) to
	 * <code>pointCounts.get(i+1)</code> (exclusive) of the point
	 * buffers, and the final entry is the total number of
	 * alignment points.
	 */
	private final IntBuffer pointCounts;

	/**
	 * Memory-mapped buffer containing all alignment points in
	 * the aligned parallel corpus. Each alignment point is
	 * encoded as a short.
	 * <p>
	 * The number of shorts in this buffer should be equal to
	 * the number of alignment points in the aligned parallel
	 * corpus.
	 *
	 * @see joshua.corpus.alignment.AlignmentGrid#getKey
	 * @see joshua.corpus.alignment.AlignmentGrid#getLocation
	 */
	private final ShortBuffer alignmentPoints;

	/**
	 * Memory-mapped buffer containing all reverse alignment
	 * points in the aligned parallel corpus. Each reverse
	 * alignment point is encoded as a short.
	 * <p>
	 * The number of shorts in this buffer should be equal to
	 * the number of alignment points in the aligned parallel
	 * corpus.
	 *
	 * @see joshua.corpus.alignment.AlignmentGrid#getKey
	 * @see joshua.corpus.alignment.AlignmentGrid#getLocation
	 */
	private final ShortBuffer reverseAlignmentPoints;

	/**
	 * Constructs memory-mapped alignment grids from the specified
	 * binary file and the specified source and target corpora.
	 * <p>
	 * The object returned by this constructor will require
	 * tight spans.
	 *
	 * @param binaryAlignmentsFilename Name of binary file
	 *                     containing encoded alignment points
	 * @param sourceCorpus Source language corpus
	 * @param targetCorpus Target language corpus
	 * @throws IOException Any I/O exception that was encountered
	 * @see joshua.corpus.alignment.AlignmentGrids#writeExternal
	 */
	public MemoryMappedAlignmentGrids(String binaryAlignmentsFilename, Corpus sourceCorpus, Corpus targetCorpus) throws IOException {
		this(binaryAlignmentsFilename, sourceCorpus, targetCorpus, true);
	}

	/**
	 * Constructs memory-mapped alignment grids from the specified
	 * binary file and the specified source and target corpora.
	 * <p>
	 * The file is closed before this constructor returns; the
	 * memory-mapped buffers remain usable afterwards.
	 *
	 * @param binaryAlignmentsFilename Name of binary file
	 *                     containing encoded alignment points
	 * @param sourceCorpus Source language corpus
	 * @param targetCorpus Target language corpus
	 * @param requireTightSpans Indicates whether tight alignment
	 *                     spans are required
	 * @throws IOException Any I/O exception that was encountered,
	 *                     or if the file declares a negative number
	 *                     of grids or of alignment points
	 * @see joshua.corpus.alignment.AlignmentGrids#writeExternal
	 */
	public MemoryMappedAlignmentGrids(String binaryAlignmentsFilename, Corpus sourceCorpus, Corpus targetCorpus, boolean requireTightSpans) throws IOException {

		super(sourceCorpus, targetCorpus, requireTightSpans);

		RandomAccessFile binaryFile = new RandomAccessFile(binaryAlignmentsFilename, "r");
		try {
			FileChannel binaryChannel = binaryFile.getChannel();

			// Offsets and lengths are tracked as longs so that large
			// corpora cannot silently wrap around in int arithmetic.
			long start = 0;
			long length = BYTES_PER_INT;

			// Read the number of alignment grids
			this.size = mapInts(binaryChannel, start, length).get(0);
			if (size < 0) {
				throw new IOException("Invalid number of alignment grids (" + size + ") in " + binaryAlignmentsFilename);
			}

			// Memory map the widths of all grids
			start += length;
			length = (long) BYTES_PER_INT * size;
			this.widths = mapInts(binaryChannel, start, length);

			// Memory map the heights of all grids
			start += length;
			length = (long) BYTES_PER_INT * size;
			this.heights = mapInts(binaryChannel, start, length);

			// Memory map the cumulative counts for alignment points
			start += length;
			length = (long) BYTES_PER_INT * ((long) size + 1);
			this.pointCounts = mapInts(binaryChannel, start, length);

			// The last cumulative count is the total number of points
			int totalPoints = pointCounts.get(size);
			if (totalPoints < 0) {
				throw new IOException("Invalid number of alignment points (" + totalPoints + ") in " + binaryAlignmentsFilename);
			}

			// Memory map the alignment points
			start += length;
			length = (long) BYTES_PER_SHORT * totalPoints;
			this.alignmentPoints = mapShorts(binaryChannel, start, length);

			// Memory map the reverse alignment points
			start += length;
			length = (long) BYTES_PER_SHORT * totalPoints;
			this.reverseAlignmentPoints = mapShorts(binaryChannel, start, length);

		} finally {
			// Closing the file also closes its channel. Established
			// mappings do not depend on the channel staying open.
			binaryFile.close();
		}
	}

	/**
	 * Maps a region of the channel as a read-only buffer of ints.
	 *
	 * @param channel   Channel of the binary alignments file
	 * @param position  Byte offset at which the region starts
	 * @param byteCount Length of the region, in bytes
	 * @return read-only int view of the mapped region
	 * @throws IOException Any I/O exception that was encountered
	 */
	private static IntBuffer mapInts(FileChannel channel, long position, long byteCount) throws IOException {
		return channel.map(FileChannel.MapMode.READ_ONLY, position, byteCount).asIntBuffer().asReadOnlyBuffer();
	}

	/**
	 * Maps a region of the channel as a read-only buffer of shorts.
	 *
	 * @param channel   Channel of the binary alignments file
	 * @param position  Byte offset at which the region starts
	 * @param byteCount Length of the region, in bytes
	 * @return read-only short view of the mapped region
	 * @throws IOException Any I/O exception that was encountered
	 */
	private static ShortBuffer mapShorts(FileChannel channel, long position, long byteCount) throws IOException {
		return channel.map(FileChannel.MapMode.READ_ONLY, position, byteCount).asShortBuffer().asReadOnlyBuffer();
	}

	/**
	 * Copies the encoded points of one sentence out of a point buffer.
	 * <p>
	 * The read goes through a duplicate of the buffer, so the
	 * position of the shared memory-mapped buffer is never modified.
	 *
	 * @param buffer     Buffer of encoded points to read from
	 * @param sentenceId Index of the sentence whose points are wanted
	 * @return newly allocated array holding that sentence's points
	 */
	private short[] readPoints(ShortBuffer buffer, int sentenceId) {
		int start = pointCounts.get(sentenceId);
		int end = pointCounts.get(sentenceId + 1);

		short[] points = new short[end - start];

		ShortBuffer view = buffer.duplicate();
		view.position(start);
		view.get(points);

		return points;
	}

	/* See Javadoc for AbstractAlignmentGrids. */
	@Override
	protected int[] getSourcePoints(int sentenceId, int targetSpanStart, int targetSpanEnd) {
		short[] reversePoints = readPoints(reverseAlignmentPoints, sentenceId);

		return AlignmentGrid.getPoints(targetSpanStart, targetSpanEnd, widths.get(sentenceId), reversePoints);
	}

	/* See Javadoc for AbstractAlignmentGrids. */
	@Override
	protected int[] getTargetPoints(int sentenceId, int sourceSpanStart, int sourceSpanEnd) {
		short[] points = readPoints(alignmentPoints, sentenceId);

		return AlignmentGrid.getPoints(sourceSpanStart, sourceSpanEnd, heights.get(sentenceId), points);
	}

	/* See Javadoc for Alignments. */
	public int size() {
		return this.size;
	}

}