/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.prefix_tree; import joshua.corpus.Corpus; import joshua.corpus.MatchedHierarchicalPhrases; import joshua.corpus.RuleExtractor; import joshua.corpus.alignment.Alignments; import joshua.corpus.lexprob.LexicalProbabilities; import joshua.corpus.suffix_array.HierarchicalPhrases; import joshua.corpus.suffix_array.ParallelCorpusGrammarFactory; import joshua.corpus.suffix_array.Pattern; import joshua.corpus.suffix_array.Suffixes; import joshua.corpus.vocab.SymbolTable; import joshua.decoder.JoshuaConfiguration; import joshua.decoder.ff.tm.AbstractGrammar; import joshua.decoder.ff.tm.BilingualRule; import joshua.decoder.ff.tm.Rule; import joshua.decoder.ff.tm.Trie; import joshua.decoder.ff.tm.hiero.MemoryBasedBatchGrammar; import joshua.util.Cache; import java.io.PrintStream; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Queue; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; /** * Represents a prefix tree with suffix links, for use in extracting * hierarchical phrase-based statistical translation rules. * * @author Lane Schwartz * @version $LastChangedDate:2008-11-13 13:13:31 -0600 (Thu, 13 Nov 2008) $ */ public class PrefixTree extends AbstractGrammar { /** Logger for this class. */ private static final Logger logger = Logger.getLogger(PrefixTree.class.getName()); /** * Integer representation of the nonterminal X. * All nonterminals are guaranteed to be represented by negative integers. */ public static final int X = SymbolTable.X;//-1; /** Operating system-specific end of line character(s). */ static final byte[] newline = System.getProperty("line.separator").getBytes(); /** Root node of this tree. */ final RootNode root; /** * Responsible for performing sampling and creating translation * rules. */ final RuleExtractor ruleExtractor; /** * Max span in the source corpus of any extracted hierarchical * phrase. */ final int maxPhraseSpan; /** * Maximum number of terminals plus nonterminals allowed * in any extracted hierarchical phrase. */ final int maxPhraseLength; /** * Maximum number of nonterminals allowed in any extracted * hierarchical phrase. */ final int maxNonterminals; /** * Minimum span in the source corpus of any nonterminal in * an extracted hierarchical phrase. */ final int minNonterminalSpan; /** * Represents a very high cost, corresponding to a very * unlikely probability. */ static final float VERY_UNLIKELY = -1.0f * (float) Math.log(1.0e-9); /** * Indicates whether rules with an initial source-side * nonterminal should be extracted from phrases at the start * of a sentence, even though such rules do not have * supporting corporal evidence. * <p> * This is included for compatibility with Adam Lopez's * Hiero rule extractor, in which this setting is set to * <code>true</code>. * <p> * The default value is <code>false</code>. */ boolean sentenceInitialX = false; /** * Indicates whether rules with a final source-side nonterminal * should be extracted from phrases at the end of a sentence, * even though such rules do not have supporting corporal * evidence. * <p> * This is included for compatibility with Adam Lopez's * Hiero rule extractor, in which this setting is set to * <code>true</code>. * <p> * The default value is <code>false</code>. */ boolean sentenceFinalX = false; boolean edgeXMayViolatePhraseSpan = false; /** Unique integer identifier for the root node. */ static final int ROOT_NODE_ID = -999; /** * Unique integer identifier for the special ⊥ node * that represents the suffix of the root node. * @see Lopez (2008), footnote 9 on p73 */ static final int BOT_NODE_ID = 0;//-2000; /** Suffix array representing the source language corpus. */ final Suffixes suffixArray; /** Corpus array representing the target language corpus. */ final Corpus targetCorpus; /** */ final ParallelCorpusGrammarFactory parallelCorpus; /** * Represents alignments between words in the source corpus * and the target corpus. */ final Alignments alignments; /** Lexical translation probabilities. */ final LexicalProbabilities lexProbs; /** Symbol table */ final SymbolTable vocab; /** Empty pattern */ final Pattern epsilon; /** * Node representing phrases that start with the nonterminal * X. This node's parent is the root node of the tree. */ private final Node xnode; private Set<Integer> printedNodes = null; private Map<Integer,String> ntVocab; private PrintStream out = null; private final int ruleOwner; private final int defaultLHS; private final float oovFeatureCost; /** * Constructs a new prefix tree with suffix links using the * GENERATE_PREFIX_TREE algorithm from Lopez (2008) PhD * Thesis, Algorithm 2, p 76. * * @param parallelCorpus */ public PrefixTree(ParallelCorpusGrammarFactory parallelCorpus) { if (logger.isLoggable(Level.FINER)) logger.finer("\n\n\nConstructing new PrefixTree\n\n"); this.parallelCorpus = parallelCorpus; this.suffixArray = parallelCorpus.getSuffixArray(); this.targetCorpus = parallelCorpus.getTargetCorpus(); this.alignments = parallelCorpus.getAlignments(); this.lexProbs = parallelCorpus.getLexProbs(); this.ruleExtractor = parallelCorpus.getRuleExtractor(); this.maxPhraseSpan = parallelCorpus.getMaxPhraseSpan(); this.maxPhraseLength = parallelCorpus.getMaxPhraseLength(); this.maxNonterminals = parallelCorpus.getMaxNonterminals(); this.minNonterminalSpan = parallelCorpus.getMinNonterminalSpan(); this.vocab = parallelCorpus.getSourceCorpus().getVocabulary(); this.ruleOwner = vocab.getID(parallelCorpus.getRuleOwner()); this.defaultLHS = vocab.getID(parallelCorpus.getDefaultLHSSymbol()); this.oovFeatureCost = parallelCorpus.getOovFeatureCost(); this.root = new RootNode(this,ROOT_NODE_ID); Node bot = new BotNode(parallelCorpus, root); this.root.linkToSuffix(bot); this.ntVocab = new HashMap<Integer,String>(); ntVocab.put(PrefixTree.X, "X"); //// if (suffixArray==null) { ////// vocab = null; //// } else { // if (suffixArray != null) { //// vocab = suffixArray.getVocabulary(); // //int[] bounds = {0, suffixArray.size()-1}; // root.setBounds(0, suffixArray.size()-1); // } // root.sourceHierarchicalPhrases = HierarchicalPhrases.emptyList(vocab); // Define epsilon to be an empty pattern epsilon = new Pattern(vocab); // 1: children(p_eps) <-- children(p_eps) U p_x if (maxNonterminals > 0) { // Create and set up the X node that comes off of ROOT // Add a link from root node to X xnode = root.addChild(X); // Add a suffix link from X back to root Node suffixLink = root.calculateSuffixLink(X); if (logger.isLoggable(Level.FINEST)) { String oldSuffixLink = (xnode.suffixLink==null) ? "null" : "id"+xnode.suffixLink.objectID; String newSuffixLink = (suffixLink==null) ? "null" : "id"+suffixLink.objectID; logger.finest("Changing suffix link from " + oldSuffixLink + " to " + newSuffixLink + " for node " + xnode.toShortString(vocab) + " with token " + X); } xnode.linkToSuffix(suffixLink); } else { this.xnode = null; } if (logger.isLoggable(Level.FINEST)) logger.finest("CURRENT TREE: " + root); } /** * Constructs a new prefix tree with suffix links using the * GENERATE_PREFIX_TREE algorithm from Lopez (2008) PhD * Thesis, Algorithm 2, p 76. * <p> * This constructor does not take a suffix array parameter. * Instead any prefix tree constructed by this constructor * will assume that all possible phrases of this sentence * are valid phrases. * <p> * This constructor is meant to be used primarily for testing * purposes. * * @param sentence * @param maxPhraseSpan * @param maxPhraseLength * @param maxNonterminals */ PrefixTree(SymbolTable vocab, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals) { this(new ParallelCorpusGrammarFactory((Suffixes) null, (Suffixes) null, (Alignments) null, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost)); } /** * Sets a print stream to which newly extracted rules will be written. * * @param out a print stream * to which newly extracted rules will be written */ public void setPrintStream(PrintStream out) { logger.info("Setting output stream"); this.out = out; this.printedNodes = new HashSet<Integer>(); } /** * Modify this prefix tree by adding phrases for this * sentence. * * @param sentence */ public void add(int[] sentence) { long startTime = System.nanoTime(); int START_OF_SENTENCE = 0; int END_OF_SENTENCE = sentence.length - 1; Queue<Tuple> queue = new LinkedList<Tuple>(); if (logger.isLoggable(Level.FINER)) logger.finer("Last sentence index == I == " + END_OF_SENTENCE); // 2: for i from 1 to I for (int i=START_OF_SENTENCE; i<=END_OF_SENTENCE; i++) { //if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (" + i + ","+ i +","+root+",{"+intToString(sentence[i])+"})"); if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (\u03b5," + i + ","+ i +","+root.toShortString(vocab) +")"); // 3: Add <f_i, i, i+1, p_eps> to queue queue.add(new Tuple(epsilon, i, i, root)); } if (this.maxNonterminals > 0) { Pattern xpattern = new Pattern(vocab,X); int start = START_OF_SENTENCE; if (!sentenceInitialX) start += 1; // 4: for i from 1 to I for (int i=start; i<=END_OF_SENTENCE; i++) { //if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (" + (i-1) + ","+(i)+","+root+",{"+X+","+intToString(sentence[i])+"})"); if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (X," + (i-1) + ","+ i +","+xnode.toShortString(vocab) +")"); // 5: Add <X f_i, i-1, i+1, p_x> to queue if (edgeXMayViolatePhraseSpan) { queue.add(new Tuple(xpattern, i, i, xnode)); } else { queue.add(new Tuple(xpattern, i-1, i, xnode)); } } } // 6: While queue is not empty do while (! queue.isEmpty()) { if (logger.isLoggable(Level.FINER)) { logger.finer("\n"); if (logger.isLoggable(Level.FINEST)) logger.finest("CURRENT TREE: " + root); } // 7: Pop <alpha, i, j, p_alphaBeta> from queue Tuple tuple = queue.remove(); int i = tuple.spanStart; int j = tuple.spanEnd; Node prefixNode = tuple.prefixNode; Pattern prefixPattern = tuple.pattern; // if (prefixNode.objectID==329 //) { // || (prefixNode.objectID==28 && i==13 && j==17)) { // int x = -1; // x++; // } if (logger.isLoggable(Level.FINER)) logger.finer("Have tuple (" +prefixPattern+","+ i + ","+j+","+prefixNode.toShortString(vocab)+")"); if (j <= END_OF_SENTENCE) { // 8: If p_alphaBetaF_i elementOf children(p_alphaBeta) then if (prefixNode.hasChild(sentence[j])) { if (logger.isLoggable(Level.FINER)) logger.finer("EXISTING node for \"" + sentence[j] + "\" from " + prefixNode.toShortString(vocab) + " to node " + prefixNode.getChild(sentence[j]).toShortString(vocab) + " with pattern " + prefixPattern); // child is p_alphaBetaF_j Node child = prefixNode.getChild(sentence[j]); // 9: If p_alphaBetaF_j is inactive then if (! child.active) { // 10: Continue to next item in queue continue; // 11: Else } else { // 12: EXTEND_QUEUE(alpha beta f_j, i, j, f_1^I) if (logger.isLoggable(Level.FINER)) { logger.finer("Calling EXTEND_QUEUE("+i+","+j+","+prefixPattern+","+prefixNode.toShortString(vocab)); if (logger.isLoggable(Level.FINEST)) logger.finest("TREE BEFOR EXTEND: " + root); } extendQueue(queue, i, j, sentence, new Pattern(prefixPattern,sentence[j]), child); if (logger.isLoggable(Level.FINEST)) logger.finest("TREE AFTER EXTEND: " + root); } } else { // 13: Else // 14: children(alphaBeta) <-- children(alphaBeta) U p_alphaBetaF_j // (Add new child node) if (logger.isLoggable(Level.FINER)) logger.finer("Adding new node to node " + prefixNode.toShortString(vocab)); Node newNode = prefixNode.addChild(sentence[j]); if (logger.isLoggable(Level.FINER)) { String word = (suffixArray==null) ? ""+sentence[j] : suffixArray.getVocabulary().getWord(sentence[j]); logger.finer("Created new node " + newNode.toShortString(vocab) +" for \"" + word + "\" and \n added it to " + prefixNode.toShortString(vocab)); } // 15: p_beta <-- suffix_link(p_alpha_beta) // suffixNode in this code is p_beta_f_j, not p_beta Node suffixNode = prefixNode.calculateSuffixLink(sentence[j]); if (logger.isLoggable(Level.FINEST)) { String oldSuffixLink = (newNode.suffixLink==null) ? "null" : "id"+newNode.suffixLink.objectID; String newSuffixLink = (suffixNode==null) ? "null" : "id"+suffixNode.objectID; logger.finest("Changing suffix link from " + oldSuffixLink + " to " + newSuffixLink + " for node " + newNode.toShortString(vocab) + " (prefix node " + prefixNode.toShortString(vocab) + " ) with token " + sentence[j]); } newNode.linkToSuffix( suffixNode ); // 16: if p_beta_f_j is inactive then if (! suffixNode.active) { // 17: Mark p_alpha_beta_f_j inactive newNode.active = false; //Node.INACTIVE; // 18: else } else { Pattern extendedPattern = new Pattern(prefixPattern,sentence[j]); MatchedHierarchicalPhrases result = null; if (suffixArray != null) { // 19: Q_alpha-beta-f_j <-- query(alpha-beta-f_j, Q_alpha-beta, Q_beta-f_j) result = query(extendedPattern, newNode, prefixNode, suffixNode); } // 20: if Q_alpha_beta_f_j = ∅ (meaning that no results were found for this query) //if (result != null && result.isEmpty()) {// && prefixNode != xnode) { if (result != null && result.isEmpty()) { // 21: Mark p_alpha_beta_f_j inactive newNode.active = false; //Node.INACTIVE; // 22: else } else { // 23: Mark p_alpha_beta_f_j active newNode.active = true; //Node.ACTIVE; // 24: EXTEND_QUEUE(alpha beta f_j, i, j, f_1^I) extendQueue(queue, i, j, sentence, extendedPattern, newNode); } } } } } long endTime = System.nanoTime(); long microseconds = (endTime - startTime) / 1000; float milliseconds = microseconds / 1000.0f; logger.info("Sentence total extraction time:\t"+ milliseconds + " milliseconds"); if (logger.isLoggable(Level.FINER)) { logger.finer("\n"); if (logger.isLoggable(Level.FINEST)) logger.finest("FINAL TREE: " + root); } } /** * Implements the root QUERY algorithm (Algorithm 4) of * Adam Lopez's (2008) doctoral thesis. * * @param pattern Pattern to search for * @param node Node in the prefix tree * @param prefixNode Prefix node * @param suffixNode Suffix node * @return List of matched hierarchical phrases for the specified pattern. * * @see "Lopez (2008)" */ public MatchedHierarchicalPhrases query(Pattern pattern, Node node, Node prefixNode, Node suffixNode) { if (logger.isLoggable(Level.FINER)) logger.finer("PrefixTree.query( " + pattern + ",\n\t new node " + node + ",\n\tprefix node " + prefixNode + ",\n\tsuffix node " + suffixNode + ")"); long startTime = System.nanoTime(); MatchedHierarchicalPhrases result; // boolean stop = false; // if (pattern.toString().startsWith("[de ")) { // logger.warning("Found it! " + pattern.toString() + " yahoo"); // int x; // x=5; // x+=1; // stop = true; // } // // if (stop) { // if (stop) { // logger.info("Stopping"); // logger.info("Did you stop?"); // } // } // if (suffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) { result = suffixArray.getCachedHierarchicalPhrases().get(pattern); int[] bounds = suffixArray.findPhrase(pattern, 0, pattern.size(), prefixNode.lowBoundIndex, prefixNode.highBoundIndex); if (bounds!=null) { node.setBounds(bounds[0],bounds[1]); } } else { if (pattern.toString().startsWith("[de ")) { int x = 5; x++; } int arity = pattern.arity(); // 1: if alpha=u then // If the pattern is contiguous, look up the pattern in the suffix array if (arity == 0) { // 2: SUFFIX-ARRAY-LOOKUP(SA_f, a alpha b, l_a_alpha, h_a_alpha // Get the first and last index in the suffix array for the specified pattern int[] bounds = suffixArray.findPhrase(pattern, 0, pattern.size(), prefixNode.lowBoundIndex, prefixNode.highBoundIndex); if (bounds==null) { result = HierarchicalPhrases.emptyList(pattern); suffixArray.cacheMatchingPhrases(result); //TODO Should node.setBounds(bounds) be called here? } else { node.setBounds(bounds[0],bounds[1]); int[] startingPositions = suffixArray.getAllPositions(bounds); result = suffixArray.createTriviallyHierarchicalPhrases(startingPositions, pattern, vocab); } } else { // 3: else --- alpha is a discontiguous pattern // 8: If M_a_alpha_b has been precomputed (then result will be non-null) // 9: Retrieve M_a_alpha_b from cache of precomputations // 10: else if (suffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) { result = suffixArray.getMatchingPhrases(pattern); } else { // 16: M_a_alpha_b <-- QUERY_INTERSECT(M_a_alpha, M_alpha_b) int[] sourceWords = prefixNode.getSourcePattern().getWordIDs(); // Special handling of case when prefixNode is the X off of root (hierarchicalPhrases for that node is empty) if (arity==1 && sourceWords[0] < 0 && sourceWords[sourceWords.length-1] < 0){ result = suffixNode.getMatchedPhrases().copyWithInitialX(); } else { // Normal query intersection case (when prefixNode != X off of root) if (logger.isLoggable(Level.FINEST)) logger.finest("Calling queryIntersect("+pattern+" M_a_alpha.pattern=="+prefixNode.getSourcePattern() + ", M_alpha_b.pattern=="+suffixNode.getSourcePattern()+")"); result = HierarchicalPhrases.queryIntersect(pattern, prefixNode.getMatchedPhrases(), suffixNode.getMatchedPhrases(), minNonterminalSpan, maxPhraseSpan, suffixArray); } suffixArray.cacheMatchingPhrases(result); } } } long finalQueryTime = System.nanoTime(); if (logger.isLoggable(Level.FINE)) { long elapsedQueryTime = finalQueryTime - startTime; long microseconds = elapsedQueryTime / 1000; float milliseconds = microseconds / 1000.0f; logger.fine("Time to query pattern:\t" + pattern.toString() + "\t" + milliseconds + " milliseconds\t" + result.size() + " instances"); } // 17: Return M_a_alpha_b List<Rule> rules = ruleExtractor.extractRules(result); // node.storeResults(result, rules); storeResults(node, result, rules); if (logger.isLoggable(Level.FINE)) { long elapsedTime = System.nanoTime() - finalQueryTime; long microseconds = elapsedTime / 1000; float milliseconds = microseconds / 1000.0f; logger.fine("Time to extract rules for pattern:\t" + pattern.toString() + "\t" + milliseconds + " milliseconds\t" + result.size() + " instances"); } return result; } @SuppressWarnings("deprecation") private void storeResults(Node node, MatchedHierarchicalPhrases result, List<Rule> rules) { if (printedNodes==null || !printedNodes.contains(node.objectID)) { node.storeResults(result, rules); if (out==null) { logger.finer("Not printing rules"); } else { for (Rule rule : rules) { String ruleString = rule.toString(ntVocab, suffixArray.getVocabulary(), targetCorpus.getVocabulary()); if (logger.isLoggable(Level.FINEST)) logger.finest("Rule: " + ruleString); out.println(ruleString); } printedNodes.add(node.objectID); } } } /** * Implements Function EXTEND_QUEUE from Lopez (2008) PhD * Thesis, Algorithm 2, p 76 * * @param queue Queue of tuples * @param i Start index of the pattern in the source input * sentence (inclusive, 1-based). * @param j End index of the pattern in the source input * sentence (inclusive, 1-based). * @param sentence * @param pattern Pattern corresponding to the prefix node. * In Lopez's terminology, this pattern is * alpha f_j. * @param node Node in the prefix tree to which a new node * (corresponding to the pattern) will eventually * be attached. */ private void extendQueue(Queue<Tuple> queue, int i, int j, int[] sentence, Pattern pattern, Node node) { int J = j; if (!sentenceFinalX) J += 1; int endOfPhraseSpan = (j+1)-i+1; // 1: if |alpha| < MaxPhraseLength and j-i+1<=MaxPhraseSpan then if (pattern.size() < maxPhraseLength && J<sentence.length) { if (endOfPhraseSpan <= maxPhraseSpan) { // 2: Add <alpha f_j, i, j+1, p_alpha> to queue // (add new tuple to the queue) if (logger.isLoggable(Level.FINEST)) logger.finest("\nextendQueue: Adding tuple (" +pattern+","+ i + ","+ (j+1) +","+node+")");//(new Pattern(alphaPattern,sentence[j+1]))+"})"); queue.add(new Tuple(pattern, i, j+1, node));//, sentence[j+1])); } if (edgeXMayViolatePhraseSpan) endOfPhraseSpan -= 1; // 3: if arity(alpha) < MaxNonterminals then if (pattern.arity() < maxNonterminals && endOfPhraseSpan <= maxPhraseSpan) { Node xNode; if (! node.children.containsKey(X)) { // 4: children(p_alpha) <-- children(p_alpha) U p_alphaX // (add new child node in tree and mark in as active) xNode = node.addChild(X); if (logger.isLoggable(Level.FINEST)) logger.finest("Adding node for \"" + X + "\" from " + node + " to new node " + xNode + " with alphaPattern " + pattern + " (in extendQueue)"); Node suffixLink = node.calculateSuffixLink(X); if (logger.isLoggable(Level.FINEST)) { String oldSuffixLink = (xNode.suffixLink==null) ? "null" : "id"+xNode.suffixLink.objectID; String newSuffixLink = (suffixLink==null) ? "null" : "id"+suffixLink.objectID; logger.finest("Changing suffix link from " + oldSuffixLink + " to " + newSuffixLink + " for node " + xNode + " (prefix node " + node + " ) with token " + X); } xNode.linkToSuffix( suffixLink ); } else { xNode = node.children.get(X); if (logger.isLoggable(Level.FINEST)) logger.finest("X Node is already " + xNode + " for prefixNode " + node); } // 5: Mark p_alphaX active xNode.active = true; //Node.ACTIVE; int[] patternWords = pattern.getWordIDs(); // 6: Q_alphaX <-- Q_alpha { SymbolTable vocab = (suffixArray==null) ? null : suffixArray.getVocabulary(); Pattern xpattern = new Pattern(vocab, patternWords, X); // HierarchicalPhrases phrasesWithFinalX = new HierarchicalPhrases(xpattern, node.sourceHierarchicalPhrases); MatchedHierarchicalPhrases phrasesWithFinalX; if (suffixArray==null) { // This should only happen in certain unit tests logger.severe("This should only be encountered during unit testing!"); if (node.sourceHierarchicalPhrases==null) { node.sourceHierarchicalPhrases = HierarchicalPhrases.emptyList((SymbolTable) null); node.sourcePattern = node.sourceHierarchicalPhrases.getPattern(); } phrasesWithFinalX = node.getMatchedPhrases().copyWithFinalX(); } else { Cache<Pattern,MatchedHierarchicalPhrases> cache = suffixArray.getCachedHierarchicalPhrases(); if (cache.containsKey(xpattern)) { phrasesWithFinalX = cache.get(xpattern); } else { phrasesWithFinalX = node.getMatchedPhrases().copyWithFinalX(); suffixArray.cacheMatchingPhrases(phrasesWithFinalX); } } List<Rule> rules = (ruleExtractor==null) ? Collections.<Rule>emptyList() : ruleExtractor.extractRules(phrasesWithFinalX); //xNode.storeResults(phrasesWithFinalX, rules); storeResults(xNode, phrasesWithFinalX, rules); } if (logger.isLoggable(Level.FINEST)) logger.finest("Alpha pattern is " + pattern); // For efficiency, don't add any tuples to the queue whose patterns would exceed the max allowed number of tokens if (patternWords.length+2 <= maxPhraseLength) { int I = sentence.length; if (!sentenceFinalX) I -= 1; int min = (I<i+maxPhraseSpan) ? I : i+maxPhraseSpan-1; Pattern patternX = new Pattern(pattern, X); // 7: for k from j+1 to min(I, i+MaxPhraseLength) do for (int k=j+2; k<=min; k++) { // 8: Add <alpha f_j X, i, k, p_alphaX> to queue if (logger.isLoggable(Level.FINEST)) logger.finest("extendQueue: Adding tuple ("+patternX+","+i+","+k+","+xNode+ " ) in EXTEND_QUEUE ****************************************" ); queue.add(new Tuple(patternX, i, k, xNode)); } } else if (logger.isLoggable(Level.FINEST)) { logger.finest("Not extending " + pattern + "+X "); } } } } // /** // * Gets the root node of this tree. // * // * @return the root node of this tree // */ // public Grammar getRoot() { // return root; // } // /** // * Gets all translation rules stored in this tree. // * // * @return all translation rules stored in this tree // */ // public List<Rule> getAllRules() { // // return root.getAllRules(); // // } /* See Javadoc for java.lang.Object#toString. */ public String toString() { return root.toTreeString("", vocab); } /** * Gets the number of nodes in this tree. * <p> * This method recursively traverses through all nodes * in the tree every time this method is called. * * @return the number of nodes in this tree */ public int size() { return root.size(); } /** * Constructs an invalid, dummy prefix tree. * <p> * The unit tests for Node require a dummy PrefixTree. */ private PrefixTree() { root = null; parallelCorpus = null; suffixArray = null; targetCorpus = null; alignments = null; lexProbs = null; xnode = null; ruleExtractor = null; this.epsilon = null; this.vocab = null; this.maxPhraseSpan = Integer.MIN_VALUE; this.maxPhraseLength = Integer.MIN_VALUE; this.maxNonterminals = Integer.MIN_VALUE; this.minNonterminalSpan = Integer.MAX_VALUE; this.ruleOwner = Integer.MIN_VALUE; this.defaultLHS = Integer.MIN_VALUE; this.oovFeatureCost = Float.NaN; } /** * Gets an invalid, dummy prefix tree. * <p> * For testing purposes only. * * @return an invalid, dummy prefix tree */ static PrefixTree getDummyPrefixTree() { return new PrefixTree(); } public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity) { return new BilingualRule(lhs, sourceWords, targetWords, scores, arity, this.ruleOwner, 0, getOOVRuleID()); } public Rule constructOOVRule(int numFeatures, int sourceWord, int targetWord, boolean hasLM) { int[] french = new int[1]; french[0] = sourceWord; int[] english = new int[1]; english[0] = targetWord; float[] feat_scores = new float[numFeatures]; // TODO: This is a hack to make the decoding without a LM works /**when a ngram LM is used, the OOV word will have a cost 100. * if no LM is used for decoding, so we should set the cost of some * TM feature to be maximum * */ if ( (!hasLM) && numFeatures > 0) { feat_scores[0] = oovFeatureCost; } return new BilingualRule( this.defaultLHS, french, english, feat_scores, 0, this.ruleOwner, 0, getOOVRuleID()); } public int getNumRules() { return root.getNumRules(); } public int getOOVRuleID() { return MemoryBasedBatchGrammar.OOV_RULE_ID; } public Trie getTrieRoot() { return root; } public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) { return (endIndex - startIndex <= this.maxPhraseSpan); } }