/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.suffix_array; import java.util.Arrays; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import joshua.corpus.Corpus; import joshua.corpus.MatchedHierarchicalPhrases; import joshua.corpus.Phrase; import joshua.corpus.vocab.SymbolTable; import joshua.decoder.ff.tm.Rule; import joshua.util.Cache; /** * This class provides a mostly-complete implementation of the * <code>Suffixes</code> interface, designed to minimize the effort * required to build a concrete implementation of a suffix array * data structure. * <p> * To implement a concrete suffix array, the programmer need only * implement the <code>getCorpusIndex(int suffixIndex)</code> and * <code>size()</code> methods. * * @author Lane Schwartz * @author Chris Callison-Burch */ public abstract class AbstractSuffixArray implements Suffixes { /** Logger for this class. */ private static Logger logger = Logger.getLogger(AbstractSuffixArray.class.getName()); /** * Maps from patterns to lists of hierarchical phrases that * match the corresponding pattern in the corpus. * <p> * This cache is a most-recently accessed map, so commonly * accessed patterns will remain in the cache, while rare * patterns will eventually drop out of the cache. */ protected final Cache<Pattern,MatchedHierarchicalPhrases> hierarchicalPhraseCache; /** * Maps from patterns to lists of hierarchical phrases that * match the corresponding pattern in the corpus. * <p> * This cache is a most-recently accessed map, so commonly * accessed patterns will remain in the cache, while rare * patterns will eventually drop out of the cache. */ protected final Cache<Pattern,List<Rule>> ruleCache; /** * Integer array representation of the corpus for this * suffix array. */ protected final Corpus corpus; /** * Constructs an abstract suffix array based on the provided * corpus. * * The specified cache will be used to store matched * hierarchical phrases for frequently accessed patterns. * * @param corpus Corpus upon which this suffix array is based. * @param hierarchicalPhraseCache Cache to store matched * hierarchical phrases for frequently accessed * patterns */ public AbstractSuffixArray( Corpus corpus, Cache<Pattern,MatchedHierarchicalPhrases> hierarchicalPhraseCache, Cache<Pattern,List<Rule>> ruleCache) { this.hierarchicalPhraseCache = hierarchicalPhraseCache; this.ruleCache = ruleCache; this.corpus = corpus; } /* See Javadoc for Suffixes interface.*/ public Cache<Pattern,MatchedHierarchicalPhrases> getCachedHierarchicalPhrases() { return hierarchicalPhraseCache; } /* See Javadoc for Suffixes interface.*/ public Cache<Pattern,List<Rule>> getCachedRules() { return this.ruleCache; } /* See Javadoc for Suffixes interface.*/ public MatchedHierarchicalPhrases createHierarchicalPhrases(Pattern pattern, int minNonterminalSpan, int maxPhraseSpan) { if (hierarchicalPhraseCache.containsKey(pattern)) { return hierarchicalPhraseCache.get(pattern); } else { int arity = pattern.arity(); int size = pattern.size(); int[] patternTokens = pattern.getWordIDs(); SymbolTable vocab = corpus.getVocabulary(); if (arity==0) { int[] bounds = this.findPhrase(pattern, 0, pattern.size(), 0, this.size()-1); int[] startPositions = this.getAllPositions(bounds); MatchedHierarchicalPhrases result = this.createTriviallyHierarchicalPhrases(startPositions, pattern, vocab); return result; } else if (arity==size) { int[] startPositions = new int[]{}; MatchedHierarchicalPhrases result = this.createTriviallyHierarchicalPhrases(startPositions, pattern, vocab); return result; } else if (arity==1 && pattern.startsWithNonterminal()) { int[] terminals = new int[size-1]; for (int i=1; i<size; i++) { terminals[i-1] = patternTokens[i]; } Pattern terminalsPattern = new Pattern(vocab, terminals); MatchedHierarchicalPhrases terminalsMatch = this.createHierarchicalPhrases(terminalsPattern, minNonterminalSpan, maxPhraseSpan); MatchedHierarchicalPhrases result = terminalsMatch.copyWithInitialX(); hierarchicalPhraseCache.put(pattern, result); return result; } else if (arity==1 && pattern.endsWithNonterminal()) { int[] terminals = new int[size-1]; for (int i=0, n=size-1; i<n; i++) { terminals[i] = patternTokens[i]; } Pattern terminalsPattern = new Pattern(vocab, terminals); MatchedHierarchicalPhrases terminalsMatch = this.createHierarchicalPhrases(terminalsPattern, minNonterminalSpan, maxPhraseSpan); MatchedHierarchicalPhrases result = terminalsMatch.copyWithFinalX(); hierarchicalPhraseCache.put(pattern, result); return result; // int[] bounds = this.findPhrase(pattern, 0, size, 0, this.size()); // int[] startPositions = this.getAllPositions(bounds); //// Pattern patternX = new Pattern(pattern, PrefixTree.X); // MatchedHierarchicalPhrases result = this.createHierarchicalPhrases(startPositions, pattern, vocab); // return result; } else { int[] prefixTokens = new int[patternTokens.length - 1]; for (int i=0, n=patternTokens.length-1; i<n; i++) { prefixTokens[i] = patternTokens[i]; } int[] suffixTokens = new int[patternTokens.length - 1]; for (int i=1, n=patternTokens.length; i<n; i++) { suffixTokens[i-1] = patternTokens[i]; } Pattern prefix = new Pattern(vocab, prefixTokens); Pattern suffix = new Pattern(vocab, suffixTokens); MatchedHierarchicalPhrases prefixMatches = createHierarchicalPhrases(prefix, minNonterminalSpan, maxPhraseSpan); MatchedHierarchicalPhrases suffixMatches = createHierarchicalPhrases(suffix, minNonterminalSpan, maxPhraseSpan); MatchedHierarchicalPhrases result = HierarchicalPhrases.queryIntersect( pattern, prefixMatches, suffixMatches, minNonterminalSpan, maxPhraseSpan, this); hierarchicalPhraseCache.put(pattern, result); return result; } } } /* See Javadoc for Suffixes interface.*/ public MatchedHierarchicalPhrases createTriviallyHierarchicalPhrases(int[] startPositions, Pattern pattern, SymbolTable vocab) { if (hierarchicalPhraseCache.containsKey(pattern)) { if (logger.isLoggable(Level.FINEST)) logger.finest("Cache has " + hierarchicalPhraseCache.size() + " entries, and did contain pattern: " + pattern.toString()); return hierarchicalPhraseCache.get(pattern); } else { if (logger.isLoggable(Level.FINEST)) logger.finest("Cache has " + hierarchicalPhraseCache.size() + " entries, but did not contain pattern: " + pattern.toString()); // In the case of contiguous phrases, // the hpCache is essentially acting as Adam's Inverted Index, // because it stores the corpus-sorted indexes of each of the phrases. // It differs because it creates a HierarchicalPhrases object rather than just int[]. Arrays.sort(startPositions); HierarchicalPhrases hierarchicalPhrases = new HierarchicalPhrases(pattern, startPositions, getCorpus().getSentenceIndices(startPositions)); hierarchicalPhraseCache.put(pattern, hierarchicalPhrases); return hierarchicalPhrases; } } /* See Javadoc for Suffixes interface.*/ public int[] findPhrase(Phrase phrase) { return findPhrase(phrase, 0, phrase.size()); } /* See Javadoc for Suffixes interface.*/ public int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd, int lowerBound, int upperBound) { int[] bounds = new int[2]; lowerBound = findPhraseBound(sentence, phraseStart, phraseEnd, lowerBound, upperBound, true); if (lowerBound < 0) return null; upperBound = findPhraseBound(sentence, phraseStart, phraseEnd, lowerBound, upperBound, false); bounds[0]=lowerBound; bounds[1]=upperBound; return bounds; } /* See Javadoc for Suffixes interface.*/ public int[] getAllPositions(int[] bounds) { if (bounds != null) { int startInSuffixArray = bounds[0]; int endInSuffixArray = bounds[1]; int length = endInSuffixArray - startInSuffixArray + 1; int[] positions = new int[length]; for (int i = 0; i < length; i++) { positions[i] = getCorpusIndex(i+startInSuffixArray); } Arrays.sort(positions); return positions; } else { return new int[0]; } } /* See Javadoc for Suffixes interface.*/ public Corpus getCorpus() { return corpus; } /* See Javadoc for Suffixes interface.*/ public abstract int getCorpusIndex(int suffixIndex); /* See Javadoc for Suffixes interface.*/ public MatchedHierarchicalPhrases getMatchingPhrases(Pattern pattern) { return hierarchicalPhraseCache.get(pattern); } /* See Javadoc for Suffixes interface.*/ public int getSentenceIndex(int corpusIndex) { return corpus.getSentenceIndex(corpusIndex); } /* See Javadoc for Suffixes interface.*/ public int getSentencePosition(int sentenceIndex) { return corpus.getSentencePosition(sentenceIndex); } /* See Javadoc for Suffixes interface.*/ public SymbolTable getVocabulary() { return corpus.getVocabulary(); } /* See Javadoc for Suffixes interface.*/ public void cacheMatchingPhrases(MatchedHierarchicalPhrases matchings) { hierarchicalPhraseCache.put(matchings.getPattern(), matchings); } /* See Javadoc for Suffixes interface.*/ public abstract int size(); /** * Finds a phrase in the suffix array. The phrase is extracted * from the sentence given the start and end points. * * @param sentence the sentence/superphrase to draw the * search phrase from * @param phraseStart the start of the phrase in the sentence * (inclusive) * @param phraseEnd the end of the phrase in the sentence * (exclusive) * @return a tuple containing the (inclusive) start and the * (inclusive) end bounds in the suffix array for * the phrase */ protected int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd) { return findPhrase(sentence, phraseStart, phraseEnd, 0, size()-1); } /** * Finds the first or last occurrence of a phrase in the * suffix array, within a subset of the suffix array that * is bounded by suffixArrayStart and suffixArrayEnd. For * efficiency of looking up all subphrases in a sentence * we do not require that multplie int[]s be created for * each subphrase. Instead this method will look for the * subphrase within the sentence between phraseStart and * phraseEnd. * * @param sentence the sentence/superphrase in int * representation to draw the search * phrase from * @param phraseStart the start of the phrase in the sentence * (inclusive) * @param phraseEnd the end of the phrase in the sentence * (exclusive) * @param suffixArrayStart the point at which to start the * search in the suffix array * @param suffixArrayEnd the end point in the suffix array * beyond which the search doesn't need * to take place * @param findFirst a flag that indicates whether we * should find the first or last occurrence * of the phrase */ private int findPhraseBound( Phrase sentence, int phraseStart, int phraseEnd, int suffixArrayStart, int suffixArrayEnd, boolean findFirst ) { int low = suffixArrayStart; int high = suffixArrayEnd; // Do a binary search between the low and high points while (low <= high) { int mid = (low + high) >>> 1; int start = getCorpusIndex(mid); int diff = corpus.comparePhrase(start, sentence, phraseStart, phraseEnd); if (diff == 0) { // If the difference between the search phrase and the phrase in the corpus // is 0, then we have found it. However, there might be multiple matches in // the corpus, so we need to continue searching until we find the end point int neighbor = mid; if (findFirst) { neighbor--; } else { neighbor++; } if (neighbor >= suffixArrayStart && neighbor <= suffixArrayEnd) { int nextDiff = corpus.comparePhrase(getCorpusIndex(neighbor), sentence, phraseStart, phraseEnd); if (nextDiff == 0) { // There's another equivalent phrase, so we need to specify // in which direction to continue searching if (findFirst) { diff = 1; //search lower } else { diff = -1; //search higher } } } } if (diff < 0) { low = mid + 1; } else if (diff > 0) { high = mid - 1; } else { return mid; //this is the edge } } return -1; // key not found. } }