/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.alignment; import joshua.corpus.Span; import joshua.corpus.suffix_array.HierarchicalPhrases; import java.util.logging.Level; import java.util.logging.Logger; /** * AlignmentArray is an auxiliary class which stores alignment * information for a parallel corpus. For each source word it stores * the minimum and maximum index of aligned words in the target * corpus, and for each target word it stores the min and max indexed * of aligned words in the source corpus. The intent is to increase * the speed of the phrase extraction. * * This class was inspired by a conversation with Adam Lopez. * * @author Chris Callison-Burch * @since 13 May 2008 * @author Lane Schwartz * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $ */ public class AlignmentArray extends AbstractAlignments { //=============================================================== // Member variables //=============================================================== /** * Stores the indices of all aligned target words for each * word in the source corpus. */ protected final int[][] alignedTargetIndices; /** * Stores the indices of all aligned source words for each * word in the target corpus. */ protected final int[][] alignedSourceIndices; /** Logger for this class. */ private static final Logger logger = Logger.getLogger(AlignmentArray.class.getName()); protected final int numSentences; //=============================================================== // Constructor(s) //=============================================================== /** * This protected constructor is used by the * SuffixArrayFactory.loadAlignmentArray and * SuffixArrayFactory.createAlignmentArray methods. * @param numSentences TODO */ public AlignmentArray(int[][] alignedTargetIndices, int[][] alignedSourceIndices, int numSentences) { this.alignedTargetIndices = alignedTargetIndices; this.alignedSourceIndices = alignedSourceIndices; this.numSentences = numSentences; } //=============================================================== // Public //=============================================================== //=========================================================== // Accessor methods (set/get) //=========================================================== /** * This method looks up target span for the given source * span. * * @param startSourceIndex the staring position in the * source corpus (inclusive) * @param endSourceIndex the end position in the source * corpus (exclusive) * @return a tuple containing the min and max indices in * the target corpus, if the span is unaligned the * value will be <UNALIGNED, undefined> */ public Span getAlignedTargetSpan(int startSourceIndex, int endSourceIndex) { return getAlignedSpan(startSourceIndex, endSourceIndex, alignedTargetIndices); } public Span getAlignedTargetSpan(Span sourceSpan) { return getAlignedSpan(sourceSpan.start, sourceSpan.end, alignedTargetIndices); } /** * Gets the indices of all source words aligned with a * particular location in the target corpus. * * @param targetIndex Index into the target corpus * @return The indices of all source words aligned with * the given location in the target corpus. */ public int[] getAlignedSourceIndices(int targetIndex) { return alignedSourceIndices[targetIndex]; } /** * Gets the indices of all target words aligned with a * particular location in the source corpus. * * @param sourceIndex Index into the source corpus * @return The indices of all target words aligned with * the given location in the source corpus. */ public int[] getAlignedTargetIndices(int sourceIndex) { return alignedTargetIndices[sourceIndex]; } /** * This method looks up source span for the given target * span. * * @param startTargetIndex the staring position in the * target corpus (inclusive) * @param endTargetIndex the end position in the target * corpus (exclusive) * @return a tuple containing the min and max indices in * the source corpus, if the span is unaligned the * value will be <UNALIGNED, undefined> */ public Span getAlignedSourceSpan(int startTargetIndex, int endTargetIndex) { return getAlignedSpan(startTargetIndex, endTargetIndex, alignedSourceIndices); } /** * This method determines whether there is a consistent * word alignment for the specified source phrase. * ccb - debugging */ public boolean hasConsistentAlignment(int startSourceIndex, int endSourceIndex) { Span targetSpan = getAlignedTargetSpan(startSourceIndex, endSourceIndex); if (targetSpan.start == UNALIGNED) return false; // check back to see what sourceSpan the targetSpan // aligns back to, so that we can check that it's // within bounds Span sourceSpan = getAlignedSourceSpan(targetSpan.start, targetSpan.end); return ! (sourceSpan.start < startSourceIndex || sourceSpan.end > endSourceIndex); } /** * Determines if any terminal in the source phrase aligns * with the provided index into the target corpus. * * @param targetIndex * @param sourcePhrases * @param sourcePhraseIndex * @return <code>true</code> if any terminal in the source phrase * aligns with the provided index into the target corpus, * <code>false</code> otherwise */ public boolean hasAlignedTerminal(int targetIndex, HierarchicalPhrases sourcePhrases, int sourcePhraseIndex) { int phraseLength = sourcePhrases.getNumberOfTerminalSequences(); if (alignedSourceIndices[targetIndex]!=null) { for (int alignedSourceIndex : alignedSourceIndices[targetIndex]) { for (int i = 0; i < phraseLength; i++) { int sourceStart = sourcePhrases.getStartPosition(sourcePhraseIndex, i); //int sourceStart = sourcePhrases.terminalSequenceStartIndices[sourcePhraseIndex*(sourcePhrases.terminalSequenceLengths.length)+i]; int sourceEnd = sourcePhrases.getEndPosition(sourcePhraseIndex, i); if (alignedSourceIndex >= sourceStart && alignedSourceIndex < sourceEnd) { if (logger.isLoggable(Level.FINEST)) logger.finest("Target index " + targetIndex + ", source index " + alignedSourceIndex + " is in source phrase at range [" + sourceStart + "-" + sourceEnd + ")"); return true; } } } } if (logger.isLoggable(Level.FINEST)) logger.warning("No aligned point"); return false; } //=========================================================== // Methods //=========================================================== //=============================================================== // Protected //=============================================================== //=============================================================== // Methods //=============================================================== //=============================================================== // Private //=============================================================== //=============================================================== // Methods //=============================================================== /** * This method looks up the minimum and maximum aligned * indices for the span. * * @param startIndex the staring word (inclusive) * @param endIndex the end word (exclusive) * @return a tuple containing the min (inclusive) and max * (exclusive) aligned indices, if the span is * unaligned the value will be <UNALIGNED, ?> */ private Span getAlignedSpan(int startIndex, int endIndex, int[][] alignedIndices) { int lowestHighestMin = UNALIGNED; int lowestHighestMax = -1; for(int i = startIndex; i < endIndex; i++) { if (alignedIndices[i] != null) { lowestHighestMin = ( alignedIndices[i][0] < lowestHighestMin) ? alignedIndices[i][0] : lowestHighestMin; //Math.min(lowestAlignedIndex[i], lowestHighestMin); lowestHighestMax = (alignedIndices[i][alignedIndices[i].length-1] > lowestHighestMax) ? alignedIndices[i][alignedIndices[i].length-1] : lowestHighestMax; //Math.max(highestAlignedIndex[i], lowestHighestMax); } else if (requireTightSpans && (i==startIndex || i==endIndex-1)) { //XXX Is this the correct way to ensure tight spans? // If requiring tight spans return new Span(UNALIGNED, UNALIGNED); } } lowestHighestMax++; return new Span(lowestHighestMin,lowestHighestMax); } public int size() { return this.numSentences; } //=============================================================== // Static //=============================================================== //=============================================================== // Main //=============================================================== }