// PrefixTree.java example
//
// Explorer
// relax-decode-master
// - third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.prefix_tree;

import joshua.corpus.Corpus;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.RuleExtractor;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.HierarchicalPhrases;
import joshua.corpus.suffix_array.ParallelCorpusGrammarFactory;
import joshua.corpus.suffix_array.Pattern;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.tm.AbstractGrammar;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.Trie;
import joshua.decoder.ff.tm.hiero.MemoryBasedBatchGrammar;
import joshua.util.Cache;

import java.io.PrintStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Represents a prefix tree with suffix links, for use in extracting
 * hierarchical phrase-based statistical translation rules.
 *
 * @author Lane Schwartz
 * @version $LastChangedDate:2008-11-13 13:13:31 -0600 (Thu, 13 Nov 2008) $
 */
public class PrefixTree extends AbstractGrammar {

	/** Logger for this class. */
	private static final Logger logger = Logger.getLogger(PrefixTree.class.getName());

	/**
	 * Integer representation of the nonterminal X, taken from
	 * <code>SymbolTable.X</code>.
	 * All nonterminals are guaranteed to be represented by negative integers.
	 */
	public static final int X = SymbolTable.X;//-1;
	
	/**
	 * Operating system-specific end of line character(s), as bytes
	 * in the platform's default charset.
	 */
	static final byte[] newline = System.getProperty("line.separator").getBytes();
	
	/** Root node of this tree. */
	final RootNode root;

	/**
	 * Responsible for performing sampling and creating translation
	 * rules.
	 */
	final RuleExtractor ruleExtractor;
	
	/**
	 * Max span in the source corpus of any extracted hierarchical
	 * phrase.
	 */
	final int maxPhraseSpan;   
	
	/**
	 * Maximum number of terminals plus nonterminals allowed
	 * in any extracted hierarchical phrase.
	 */
	final int maxPhraseLength;
	
	/**
	 * Maximum number of nonterminals allowed in any extracted
	 * hierarchical phrase.
	 */
	final int maxNonterminals;

	/**
	 * Minimum span in the source corpus of any nonterminal in
	 * an extracted hierarchical phrase.
	 */
	final int minNonterminalSpan;
	
	
	/**
	 * Represents a very high cost, corresponding to a very
	 * unlikely probability: the negated natural logarithm of 1.0e-9.
	 */
	static final float VERY_UNLIKELY = -1.0f * (float) Math.log(1.0e-9);
	
	/** 
	 * Indicates whether rules with an initial source-side
	 * nonterminal should be extracted from phrases at the start
	 * of a sentence, even though such rules do not have
	 * supporting corpus evidence.
	 * <p>
	 * This is included for compatibility with Adam Lopez's
	 * Hiero rule extractor, in which this setting is set to
	 * <code>true</code>.
	 * <p>
	 * The default value is <code>false</code>.
	 */
	boolean sentenceInitialX = false;
	
	/** 
	 * Indicates whether rules with a final source-side nonterminal 
	 * should be extracted from phrases at the end of a sentence,
	 * even though such rules do not have supporting corpus
	 * evidence.
	 * <p>
	 * This is included for compatibility with Adam Lopez's
	 * Hiero rule extractor, in which this setting is set to
	 * <code>true</code>.
	 * <p>
	 * The default value is <code>false</code>.
	 */
	boolean sentenceFinalX = false;
	
	
	/**
	 * Controls how a nonterminal at the edge of a pattern is
	 * counted against the maximum phrase span.
	 * <p>
	 * When <code>true</code>, tuples for patterns with an initial
	 * X are enqueued with a span that starts at the first terminal
	 * instead of one position earlier, and the span that is checked
	 * before appending a final X is reduced by one.
	 * <p>
	 * The default value is <code>false</code>.
	 */
	boolean edgeXMayViolatePhraseSpan = false;
	
	
	/** Unique integer identifier for the root node. */
	static final int ROOT_NODE_ID = -999;
	
	/** 
	 * Unique integer identifier for the special ⊥ node
	 * that represents the suffix of the root node.
	 * @see Lopez (2008), footnote 9 on p73
	 */
	static final int BOT_NODE_ID = 0;//-2000;

	/**
	 * Suffix array representing the source language corpus.
	 * May be <code>null</code> (e.g. in unit tests), in which case
	 * no corpus queries are performed.
	 */
	final Suffixes suffixArray;
	
	/** Corpus array representing the target language corpus. */
	final Corpus targetCorpus;
	
	/**
	 * Parallel corpus from which this tree obtains its suffix
	 * array, target corpus, alignments, lexical probabilities,
	 * rule extractor, and extraction limits.
	 */
	final ParallelCorpusGrammarFactory parallelCorpus;
	
	/**
	 * Represents alignments between words in the source corpus
	 * and the target corpus.
	 */
	final Alignments alignments;
	
	/** Lexical translation probabilities. */
	final LexicalProbabilities lexProbs;
	
	/** Symbol table of the source corpus. */
	final SymbolTable vocab;
	
	/** Empty pattern */
	final Pattern epsilon;
	
	/** 
	 * Node representing phrases that start with the nonterminal
	 * X. This node's parent is the root node of the tree.
	 * <p>
	 * This is <code>null</code> when no nonterminals are allowed
	 * (<code>maxNonterminals</code> is not positive).
	 */
	private final Node xnode;

	/**
	 * Object IDs of the nodes whose rules have already been written
	 * to the print stream, or <code>null</code> if no print stream
	 * has been set.
	 */
	private Set<Integer> printedNodes = null;
	
	/**
	 * Maps nonterminal IDs to their printable names;
	 * used when rendering rules as strings.
	 */
	private Map<Integer,String> ntVocab;
	
	/**
	 * Stream to which newly extracted rules are written,
	 * or <code>null</code> if rules should not be printed.
	 */
	private PrintStream out = null;
	
	/** Vocabulary ID of the owner assigned to rules built by this grammar. */
	private final int ruleOwner;
	
	/** Vocabulary ID of the default left-hand side symbol, used for OOV rules. */
	private final int defaultLHS;
	
	/**
	 * Cost stored in the first feature of an OOV rule
	 * when no language model is in use.
	 */
	private final float oovFeatureCost;
	
	/**
	 * Creates the initial prefix tree (with suffix links) for the
	 * GENERATE_PREFIX_TREE algorithm from Lopez (2008) PhD
	 * Thesis, Algorithm 2, p 76.
	 * <p>
	 * After construction the tree consists of the root node, linked
	 * to the special bottom node as its suffix, and, when at least
	 * one nonterminal is allowed, an X child of the root. Sentences
	 * are added afterwards via {@link #add(int[])}.
	 * 
	 * @param parallelCorpus parallel corpus supplying the suffix array,
	 *                       target corpus, alignments, lexical
	 *                       probabilities, rule extractor, extraction
	 *                       limits and rule-construction settings
	 */
	public PrefixTree(ParallelCorpusGrammarFactory parallelCorpus) {

		if (logger.isLoggable(Level.FINER)) {
			logger.finer("\n\n\nConstructing new PrefixTree\n\n");
		}

		// Corpus resources
		this.parallelCorpus = parallelCorpus;
		this.suffixArray = parallelCorpus.getSuffixArray();
		this.targetCorpus = parallelCorpus.getTargetCorpus();
		this.alignments = parallelCorpus.getAlignments();
		this.lexProbs = parallelCorpus.getLexProbs();
		this.ruleExtractor = parallelCorpus.getRuleExtractor();

		// Extraction limits
		this.maxPhraseSpan = parallelCorpus.getMaxPhraseSpan();
		this.maxPhraseLength = parallelCorpus.getMaxPhraseLength();
		this.maxNonterminals = parallelCorpus.getMaxNonterminals();
		this.minNonterminalSpan = parallelCorpus.getMinNonterminalSpan();

		// Vocabulary and the settings that are resolved through it
		this.vocab = parallelCorpus.getSourceCorpus().getVocabulary();
		this.ruleOwner = this.vocab.getID(parallelCorpus.getRuleOwner());
		this.defaultLHS = this.vocab.getID(parallelCorpus.getDefaultLHSSymbol());
		this.oovFeatureCost = parallelCorpus.getOovFeatureCost();

		// The root's suffix is the special bottom node
		this.root = new RootNode(this, ROOT_NODE_ID);
		Node bottom = new BotNode(parallelCorpus, this.root);
		this.root.linkToSuffix(bottom);

		this.ntVocab = new HashMap<Integer,String>();
		this.ntVocab.put(PrefixTree.X, "X");

		// Epsilon is the empty pattern
		this.epsilon = new Pattern(this.vocab);

		// 1: children(p_eps) <-- children(p_eps) U p_x
		if (this.maxNonterminals <= 0) {
			// No nonterminals allowed, so there is no X node off of ROOT
			this.xnode = null;
		} else {
			// Hang the X node off of ROOT
			this.xnode = this.root.addChild(X);

			// The suffix link of X leads back to ROOT
			Node xSuffix = this.root.calculateSuffixLink(X);

			if (logger.isLoggable(Level.FINEST)) {
				String before = (this.xnode.suffixLink==null) ? "null" : "id"+this.xnode.suffixLink.objectID;
				String after = (xSuffix==null) ? "null" : "id"+xSuffix.objectID;
				logger.finest("Changing suffix link from " + before + " to " + after + " for node " + this.xnode.toShortString(this.vocab) + " with token " + X);
			}

			this.xnode.linkToSuffix(xSuffix);
		}

		if (logger.isLoggable(Level.FINEST)) {
			logger.finest("CURRENT TREE:  " + this.root);
		}

	}

	/**
	 * Constructs a new prefix tree with suffix links using the
	 * GENERATE_PREFIX_TREE algorithm from Lopez (2008) PhD
	 * Thesis, Algorithm 2, p 76.
	 * <p>
	 * This constructor does not take a suffix array parameter.
	 * Instead it delegates to the main constructor with a
	 * parallel corpus built around <code>null</code> suffix arrays
	 * and <code>null</code> alignments, so any prefix tree
	 * constructed by this constructor will assume that all
	 * possible phrases of a sentence are valid phrases.
	 * <p>
	 * This constructor is meant to be used primarily for testing
	 * purposes.
	 * <p>
	 * NOTE(review): the <code>vocab</code> argument is not referenced
	 * in the body of this constructor; the delegated constructor
	 * reads its vocabulary from the parallel corpus. Confirm that
	 * this is intended.
	 *
	 * @param vocab symbol table (currently unused, see note above)
	 * @param maxPhraseSpan max span in the source corpus of any
	 *                      extracted hierarchical phrase
	 * @param maxPhraseLength maximum number of terminals plus
	 *                        nonterminals in any extracted phrase
	 * @param maxNonterminals maximum number of nonterminals in any
	 *                        extracted phrase
	 */
	PrefixTree(SymbolTable vocab, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals) {
		this(new ParallelCorpusGrammarFactory((Suffixes) null, (Suffixes) null, (Alignments) null, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost));
	}


	/**
	 * Registers the stream that receives newly extracted rules.
	 * <p>
	 * Calling this method also starts a fresh record of which
	 * nodes have already had their rules printed.
	 *
	 * @param out a print stream
	 *            to which newly extracted rules will be written
	 */
	public void setPrintStream(PrintStream out) {
		logger.info("Setting output stream");
		this.printedNodes = new HashSet<Integer>();
		this.out = out;
	}
	
	/**
	 * Grows this prefix tree with the patterns found in one
	 * source sentence.
	 * <p>
	 * This is the main loop of GENERATE_PREFIX_TREE (Lopez 2008,
	 * Algorithm 2). A queue is seeded with one tuple per sentence
	 * position for the empty pattern and, if nonterminals are
	 * allowed, one tuple per position for the pattern X. Each
	 * tuple is then extended by the word at its span end: an
	 * existing active child is extended further, an existing
	 * inactive child is skipped, and a missing child is created,
	 * linked to its suffix node and (when a suffix array is
	 * available) queried against the corpus. Nodes without any
	 * matches are marked inactive.
	 * <p>
	 * The total processing time is logged at INFO level.
	 *
	 * @param sentence word IDs of the source sentence, indexed from zero
	 */
	public void add(int[] sentence) {

		final long startNanos = System.nanoTime();

		final int firstIndex = 0;
		final int lastIndex = sentence.length - 1;

		Queue<Tuple> queue = new LinkedList<Tuple>();

		if (logger.isLoggable(Level.FINER)) {
			logger.finer("Last sentence index == I == " + lastIndex);
		}

		// 2: for i from 1 to I
		// 3:   Add <f_i, i, i+1, p_eps> to queue
		for (int pos = firstIndex; pos <= lastIndex; pos++) {
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("Adding tuple (\u03b5," + pos + ","+ pos +","+root.toShortString(vocab) +")");
			}
			queue.add(new Tuple(epsilon, pos, pos, root));
		}

		if (this.maxNonterminals > 0) {
			Pattern xpattern = new Pattern(vocab, X);

			// Without sentence-initial X there is nothing for X to cover before the first word
			final int firstXIndex = sentenceInitialX ? firstIndex : firstIndex + 1;

			// 4: for i from 1 to I
			// 5:   Add <X f_i, i-1, i+1, p_x> to queue
			for (int pos = firstXIndex; pos <= lastIndex; pos++) {
				if (logger.isLoggable(Level.FINEST)) {
					logger.finest("Adding tuple (X," + (pos-1) + ","+ pos +","+xnode.toShortString(vocab) +")");
				}
				final int spanStart = edgeXMayViolatePhraseSpan ? pos : pos - 1;
				queue.add(new Tuple(xpattern, spanStart, pos, xnode));
			}
		}

		// 6: While queue is not empty do
		while (! queue.isEmpty()) {

			if (logger.isLoggable(Level.FINER)) {
				logger.finer("\n");
				if (logger.isLoggable(Level.FINEST)) {
					logger.finest("CURRENT TREE:      " + root);
				}
			}

			// 7: Pop <alpha, i, j, p_alphaBeta> from queue
			final Tuple tuple = queue.remove();
			final int i = tuple.spanStart;
			final int j = tuple.spanEnd;
			final Node prefixNode = tuple.prefixNode;
			final Pattern prefixPattern = tuple.pattern;

			if (logger.isLoggable(Level.FINER)) {
				logger.finer("Have tuple (" +prefixPattern+","+ i + ","+j+","+prefixNode.toShortString(vocab)+")");
			}

			// Tuples whose span end lies beyond the sentence cannot be extended
			if (j > lastIndex) {
				continue;
			}

			final int word = sentence[j];

			// 8: If p_alphaBetaF_i elementOf children(p_alphaBeta) then
			if (prefixNode.hasChild(word)) {

				if (logger.isLoggable(Level.FINER)) {
					logger.finer("EXISTING node for \"" + word + "\" from " + prefixNode.toShortString(vocab) + " to node " + prefixNode.getChild(word).toShortString(vocab) + " with pattern " + prefixPattern);
				}

				// child is p_alphaBetaF_j
				final Node child = prefixNode.getChild(word);

				// 9-10: an inactive child means this tuple is dropped
				// 11-12: otherwise EXTEND_QUEUE(alpha beta f_j, i, j, f_1^I)
				if (child.active) {
					if (logger.isLoggable(Level.FINER)) {
						logger.finer("Calling EXTEND_QUEUE("+i+","+j+","+prefixPattern+","+prefixNode.toShortString(vocab));
						if (logger.isLoggable(Level.FINEST)) {
							logger.finest("TREE BEFOR EXTEND: " + root);
						}
					}
					extendQueue(queue, i, j, sentence, new Pattern(prefixPattern, word), child);
					if (logger.isLoggable(Level.FINEST)) {
						logger.finest("TREE AFTER EXTEND: " + root);
					}
				}

				continue;
			}

			// 13: Else
			// 14: children(alphaBeta) <-- children(alphaBeta) U p_alphaBetaF_j
			if (logger.isLoggable(Level.FINER)) {
				logger.finer("Adding new node to node " + prefixNode.toShortString(vocab));
			}
			final Node newNode = prefixNode.addChild(word);
			if (logger.isLoggable(Level.FINER)) {
				String wordString = (suffixArray==null) ? ""+word : suffixArray.getVocabulary().getWord(word);
				logger.finer("Created new node " + newNode.toShortString(vocab) +" for \"" + wordString + "\" and \n  added it to " + prefixNode.toShortString(vocab));
			}

			// 15: p_beta <-- suffix_link(p_alpha_beta)
			//     suffixNode in this code is p_beta_f_j, not p_beta
			final Node suffixNode = prefixNode.calculateSuffixLink(word);

			if (logger.isLoggable(Level.FINEST)) {
				String before = (newNode.suffixLink==null) ? "null" : "id"+newNode.suffixLink.objectID;
				String after = (suffixNode==null) ? "null" : "id"+suffixNode.objectID;
				logger.finest("Changing suffix link from " + before + " to " + after + " for node " + newNode.toShortString(vocab) + " (prefix node " + prefixNode.toShortString(vocab) + " ) with token " + word);
			}

			newNode.linkToSuffix(suffixNode);

			// 16-17: if p_beta_f_j is inactive then mark p_alpha_beta_f_j inactive
			if (! suffixNode.active) {
				newNode.active = false;
				continue;
			}

			// 18: else
			final Pattern extendedPattern = new Pattern(prefixPattern, word);

			// 19: Q_alpha-beta-f_j <-- query(alpha-beta-f_j, Q_alpha-beta, Q_beta-f_j)
			//     Without a suffix array no query is made and the result stays null.
			final MatchedHierarchicalPhrases matches =
				(suffixArray == null) ? null : query(extendedPattern, newNode, prefixNode, suffixNode);

			// 20: if Q_alpha_beta_f_j = ∅ (meaning that no results were found for this query)
			final boolean queryFoundNothing = (matches != null) && matches.isEmpty();

			if (queryFoundNothing) {
				// 21: Mark p_alpha_beta_f_j inactive
				newNode.active = false;
			} else {
				// 22-23: Mark p_alpha_beta_f_j active
				newNode.active = true;

				// 24: EXTEND_QUEUE(alpha beta f_j, i, j, f_1^I)
				extendQueue(queue, i, j, sentence, extendedPattern, newNode);
			}
		}

		final long endNanos = System.nanoTime();
		long microseconds = (endNanos - startNanos) / 1000;
		float milliseconds = microseconds / 1000.0f;
		logger.info("Sentence total extraction time:\t"+ milliseconds + " milliseconds");

		if (logger.isLoggable(Level.FINER)) {
			logger.finer("\n");
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("FINAL TREE:  " + root);
			}
		}
	}
	

	/**
	 * Implements the root QUERY algorithm (Algorithm 4) of
	 * Adam Lopez's (2008) doctoral thesis.
	 * <p>
	 * The matches for the pattern are obtained in one of these ways:
	 * <ul>
	 * <li>from the suffix array's cache of hierarchical phrases,
	 *     if the pattern is already cached;</li>
	 * <li>by a suffix array lookup restricted to the bounds of the
	 *     prefix node, if the pattern contains no nonterminals;</li>
	 * <li>by copying the suffix node's matches with an initial X,
	 *     if the prefix node's pattern starts and ends with a
	 *     nonterminal and the pattern has exactly one nonterminal;</li>
	 * <li>otherwise by intersecting the matches of the prefix node
	 *     and of the suffix node.</li>
	 * </ul>
	 * Whenever the suffix array lookup succeeds, the resulting
	 * bounds are recorded on <code>node</code>. Finally, rules are
	 * extracted from the matches and stored on <code>node</code>
	 * (and printed, if a print stream has been set).
	 * <p>
	 * This method dereferences the suffix array without a null
	 * check; {@link #add(int[])} only calls it when a suffix array
	 * is present.
	 *
	 * @param pattern Pattern to search for
	 * @param node Node in the prefix tree that represents the pattern
	 * @param prefixNode Prefix node
	 * @param suffixNode Suffix node
	 * @return List of matched hierarchical phrases for the specified pattern.
	 * 
	 * @see "Lopez (2008)"
	 */
	public MatchedHierarchicalPhrases query(Pattern pattern, Node node, Node prefixNode, Node suffixNode) {

		if (logger.isLoggable(Level.FINER)) logger.finer("PrefixTree.query( " + pattern + ",\n\t   new node " + node + ",\n\tprefix node " + prefixNode + ",\n\tsuffix node " + suffixNode + ")");
		long startTime = System.nanoTime();
		
		MatchedHierarchicalPhrases result;

		if (suffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) {

			// 8: If M_a_alpha_b has been precomputed
			// 9: Retrieve M_a_alpha_b from cache of precomputations
			result = suffixArray.getCachedHierarchicalPhrases().get(pattern);

			// The node still needs its suffix array bounds, even on a cache hit
			int[] bounds = suffixArray.findPhrase(pattern, 0, pattern.size(), prefixNode.lowBoundIndex, prefixNode.highBoundIndex);
			if (bounds!=null) {
				node.setBounds(bounds[0],bounds[1]);
			}

		} else {

			int arity = pattern.arity();

			// 1: if alpha=u then
			//    If the pattern is contiguous, look up the pattern in the suffix array
			if (arity == 0) {

				// 2: SUFFIX-ARRAY-LOOKUP(SA_f, a alpha b, l_a_alpha, h_a_alpha
				// Get the first and last index in the suffix array for the specified pattern
				int[] bounds = suffixArray.findPhrase(pattern, 0, pattern.size(), prefixNode.lowBoundIndex, prefixNode.highBoundIndex);
				if (bounds==null) {
					result = HierarchicalPhrases.emptyList(pattern);
					suffixArray.cacheMatchingPhrases(result);
					//TODO Should node.setBounds(bounds) be called here?
				} else {
					node.setBounds(bounds[0],bounds[1]);
					int[] startingPositions = suffixArray.getAllPositions(bounds);
					result = suffixArray.createTriviallyHierarchicalPhrases(startingPositions, pattern, vocab);
				}

			} else { // 3: else --- alpha is a discontiguous pattern

				// The cache was already consulted above and nothing has been
				// added to it since, so the matches must be computed here.

				// 16: M_a_alpha_b <-- QUERY_INTERSECT(M_a_alpha, M_alpha_b)

				int[] sourceWords = prefixNode.getSourcePattern().getWordIDs();

				// Special handling of case when prefixNode is the X off of root (hierarchicalPhrases for that node is empty)
				if (arity==1 && sourceWords[0] < 0 && sourceWords[sourceWords.length-1] < 0){

					result = suffixNode.getMatchedPhrases().copyWithInitialX();

				} else { 

					// Normal query intersection case (when prefixNode != X off of root)

					if (logger.isLoggable(Level.FINEST)) logger.finest("Calling queryIntersect("+pattern+" M_a_alpha.pattern=="+prefixNode.getSourcePattern() + ", M_alpha_b.pattern=="+suffixNode.getSourcePattern()+")");

					result = HierarchicalPhrases.queryIntersect(pattern, prefixNode.getMatchedPhrases(), suffixNode.getMatchedPhrases(), minNonterminalSpan, maxPhraseSpan, suffixArray);

				}

				suffixArray.cacheMatchingPhrases(result);
			}
		}
		
		long finalQueryTime = System.nanoTime();
		if (logger.isLoggable(Level.FINE)) {
			long elapsedQueryTime = finalQueryTime - startTime;
			long microseconds = elapsedQueryTime / 1000;
			float milliseconds = microseconds / 1000.0f;
			logger.fine("Time to query pattern:\t" + pattern.toString() + "\t" + milliseconds + " milliseconds\t" + result.size() + " instances");
		}
		
		// Extract the rules for the matches and keep both on the node
		List<Rule> rules = ruleExtractor.extractRules(result);
		storeResults(node, result, rules);
		
		if (logger.isLoggable(Level.FINE)) {
			long elapsedTime = System.nanoTime() - finalQueryTime;
			long microseconds = elapsedTime / 1000;
			float milliseconds = microseconds / 1000.0f;
			logger.fine("Time to extract rules for pattern:\t" + pattern.toString() + "\t" + milliseconds + " milliseconds\t" + result.size() + " instances");
		}

		// 17: Return M_a_alpha_b
		return result;

	}
	
	/**
	 * Stores matched phrases and their rules on a node and, if a
	 * print stream has been set, writes the rules to that stream.
	 * <p>
	 * Nothing is stored or printed for a node whose rules have
	 * already been printed. A node is only recorded as printed
	 * when a print stream is set.
	 * <p>
	 * Deprecation warnings are suppressed for this method,
	 * presumably because of the <code>Rule.toString</code> overload
	 * used to render rules.
	 *
	 * @param node Node that receives the results
	 * @param result Matched hierarchical phrases for the node's pattern
	 * @param rules Rules extracted from the matched phrases
	 */
	@SuppressWarnings("deprecation")
	private void storeResults(Node node, MatchedHierarchicalPhrases result, List<Rule> rules) {

		final boolean alreadyPrinted = (printedNodes != null) && printedNodes.contains(node.objectID);
		if (alreadyPrinted) {
			return;
		}

		node.storeResults(result, rules);

		if (out == null) {
			logger.finer("Not printing rules");
			return;
		}

		for (Rule rule : rules) {
			String ruleString = rule.toString(ntVocab, suffixArray.getVocabulary(), targetCorpus.getVocabulary());
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("Rule: " + ruleString);
			}
			out.println(ruleString);
		}
		printedNodes.add(node.objectID);
	}
	
	/**
	 * Implements Function EXTEND_QUEUE from Lopez (2008) PhD
	 * Thesis, Algorithm 2, p 76
	 * <p>
	 * Given a pattern that has just been matched over the span
	 * from i to j, this method enqueues the tuples that extend it:
	 * one tuple that extends the pattern by the next word and,
	 * if another nonterminal is allowed, one tuple per admissible
	 * span end for the pattern followed by X. In the latter case
	 * the X child of the node is created if necessary, marked
	 * active, and given the node's matches with a final X appended.
	 *
	 * @param queue Queue of tuples
	 * @param i Start index of the pattern in the source input
	 *          sentence (inclusive, zero-based).
	 * @param j End index of the pattern in the source input
	 *          sentence (inclusive, zero-based).
	 * @param sentence Word IDs of the source input sentence
	 * @param pattern Pattern corresponding to the node.
	 *                In Lopez's terminology, this pattern is
	 *                alpha f_j.
	 * @param node Node in the prefix tree to which a new node
	 *             (corresponding to an extension of the pattern)
	 *             will eventually be attached.
	 */
	private void extendQueue(Queue<Tuple> queue, int i, int j, int[] sentence, Pattern pattern, Node node) {

		// Position that must lie within the sentence for any extension to be made
		final int requiredPosition = sentenceFinalX ? j : j + 1;

		int extendedSpan = (j+1)-i+1;

		// 1: if |alpha| < MaxPhraseLength  and  j-i+1<=MaxPhraseSpan then
		if (pattern.size() >= maxPhraseLength || requiredPosition >= sentence.length) {
			return;
		}

		if (extendedSpan <= maxPhraseSpan) {
			// 2: Add <alpha f_j, i, j+1, p_alpha> to queue
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("\nextendQueue: Adding tuple (" +pattern+","+ i + ","+ (j+1) +","+node+")");
			}
			queue.add(new Tuple(pattern, i, j+1, node));
		}

		if (edgeXMayViolatePhraseSpan) {
			extendedSpan -= 1;
		}

		// 3: if arity(alpha) < MaxNonterminals then
		if (pattern.arity() >= maxNonterminals || extendedSpan > maxPhraseSpan) {
			return;
		}

		final Node xNode;

		if (node.children.containsKey(X)) {

			xNode = node.children.get(X);
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("X Node is already " + xNode + " for prefixNode " + node);
			}

		} else {

			// 4: children(p_alpha) <-- children(p_alpha) U p_alphaX
			xNode = node.addChild(X);
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("Adding node for \"" + X + "\" from " + node + " to new node " + xNode + " with alphaPattern " + pattern + "  (in extendQueue)");
			}

			Node xSuffix = node.calculateSuffixLink(X);

			if (logger.isLoggable(Level.FINEST)) {
				String before = (xNode.suffixLink==null) ? "null" : "id"+xNode.suffixLink.objectID;
				String after = (xSuffix==null) ? "null" : "id"+xSuffix.objectID;
				logger.finest("Changing suffix link from " + before + " to " + after + " for node " + xNode + " (prefix node " + node + " ) with token " + X);
			}

			xNode.linkToSuffix(xSuffix);
		}

		// 5: Mark p_alphaX active
		xNode.active = true;

		final int[] patternWords = pattern.getWordIDs();

		// 6: Q_alphaX <-- Q_alpha
		//    The pattern alpha X is built with the suffix array's vocabulary (null if there is none).
		final SymbolTable sourceVocab = (suffixArray==null) ? null : suffixArray.getVocabulary();
		final Pattern xpattern = new Pattern(sourceVocab, patternWords, X);

		final MatchedHierarchicalPhrases phrasesWithFinalX;
		if (suffixArray != null) {
			Cache<Pattern,MatchedHierarchicalPhrases> cache = suffixArray.getCachedHierarchicalPhrases();
			if (cache.containsKey(xpattern)) {
				phrasesWithFinalX = cache.get(xpattern);
			} else {
				phrasesWithFinalX = node.getMatchedPhrases().copyWithFinalX();
				suffixArray.cacheMatchingPhrases(phrasesWithFinalX);
			}
		} else {
			// This should only happen in certain unit tests
			logger.severe("This should only be encountered during unit testing!");
			if (node.sourceHierarchicalPhrases==null) {
				node.sourceHierarchicalPhrases = HierarchicalPhrases.emptyList((SymbolTable) null);
				node.sourcePattern = node.sourceHierarchicalPhrases.getPattern();
			}
			phrasesWithFinalX = node.getMatchedPhrases().copyWithFinalX();
		}

		final List<Rule> rules;
		if (ruleExtractor == null) {
			rules = Collections.<Rule>emptyList();
		} else {
			rules = ruleExtractor.extractRules(phrasesWithFinalX);
		}
		storeResults(xNode, phrasesWithFinalX, rules);

		if (logger.isLoggable(Level.FINEST)) {
			logger.finest("Alpha pattern is " + pattern);
		}

		// For efficiency, don't add any tuples to the queue whose patterns would exceed the max allowed number of tokens
		if (patternWords.length+2 > maxPhraseLength) {
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("Not extending " + pattern + "+X ");
			}
			return;
		}

		final int sentenceLimit = sentenceFinalX ? sentence.length : sentence.length - 1;

		// Last admissible span end: bounded by the sentence and by the max phrase span
		final int lastSpanEnd = (sentenceLimit<i+maxPhraseSpan) ? sentenceLimit : i+maxPhraseSpan-1;
		final Pattern patternX = new Pattern(pattern, X);

		// 7: for k from j+1 to min(I, i+MaxPhraseSpan) do
		//    (the loop starts at j+2 in this implementation)
		for (int spanEnd = j+2; spanEnd <= lastSpanEnd; spanEnd++) {

			// 8: Add <alpha f_j X, i, k, p_alphaX> to queue
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("extendQueue: Adding tuple ("+patternX+","+i+","+spanEnd+","+xNode+ " ) in EXTEND_QUEUE ****************************************" );
			}
			queue.add(new Tuple(patternX, i, spanEnd, xNode));
		}

	}


//	/**
//	 * Gets the root node of this tree.
//	 * 
//	 * @return the root node of this tree
//	 */
//	public Grammar getRoot() {
//		return root;
//	}
	
//	/**
//	 * Gets all translation rules stored in this tree.
//	 * 
//	 * @return all translation rules stored in this tree
//	 */
//	public List<Rule> getAllRules() {
//		
//		return root.getAllRules();
//		
//	}

	/**
	 * Gets a string representation of this tree, as rendered
	 * by the root node using the source vocabulary.
	 *
	 * @return string representation of the whole tree
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		return root.toTreeString("", vocab);
	}

	/**
	 * Counts the nodes in this tree.
	 * <p>
	 * The count is not cached: the root node is asked for
	 * its size on every call, which traverses the tree.
	 * 
	 * @return the number of nodes in this tree
	 */
	public int size() {
		final int nodeCount = root.size();
		return nodeCount;
	}

	
	/**
	 * Creates a dummy prefix tree that is not usable as a grammar.
	 * <p>
	 * All object references are <code>null</code> and all numeric
	 * settings hold sentinel values. The unit tests for Node
	 * require such a dummy PrefixTree.
	 */
	private PrefixTree() {
		// No corpus resources
		this.parallelCorpus = null;
		this.suffixArray = null;
		this.targetCorpus = null;
		this.alignments = null;
		this.lexProbs = null;
		this.ruleExtractor = null;
		this.vocab = null;

		// No tree structure
		this.root = null;
		this.xnode = null;
		this.epsilon = null;

		// Sentinel values for the extraction limits
		this.maxPhraseSpan = Integer.MIN_VALUE;
		this.maxPhraseLength = Integer.MIN_VALUE;
		this.maxNonterminals = Integer.MIN_VALUE;
		this.minNonterminalSpan = Integer.MAX_VALUE;

		// Sentinel values for the rule-construction settings
		this.ruleOwner = Integer.MIN_VALUE;
		this.defaultLHS = Integer.MIN_VALUE;
		this.oovFeatureCost = Float.NaN;
	}
	
	/**
	 * Creates an invalid, dummy prefix tree
	 * via the private no-argument constructor.
	 * <p>
	 * For testing purposes only.
	 * 
	 * @return a new invalid, dummy prefix tree
	 */
	static PrefixTree getDummyPrefixTree() {
		final PrefixTree dummy = new PrefixTree();
		return dummy;
	}
	
	
	/**
	 * Builds a bilingual rule from explicitly supplied parts.
	 * <p>
	 * The rule is owned by this grammar's rule owner and
	 * carries the OOV rule ID of this grammar.
	 *
	 * @param lhs left-hand side symbol of the rule
	 * @param sourceWords source side of the rule
	 * @param targetWords target side of the rule
	 * @param scores feature scores of the rule
	 * @param arity number of nonterminals in the rule
	 * @return the newly constructed rule
	 */
	public Rule constructManualRule(int lhs, int[] sourceWords,
			int[] targetWords, float[] scores, int arity) {
		final int ruleID = getOOVRuleID();
		return new BilingualRule(
				lhs, sourceWords, targetWords,
				scores, arity, this.ruleOwner,
				0, ruleID);
	}

	/**
	 * Builds a rule that translates a single out-of-vocabulary
	 * source word into a single target word.
	 * <p>
	 * The rule uses the default left-hand side symbol, has arity
	 * zero, and has all feature scores set to zero, except as
	 * described for <code>hasLM</code>.
	 *
	 * @param numFeatures number of feature scores of the rule
	 * @param sourceWord the source word
	 * @param targetWord the target word
	 * @param hasLM whether a language model is used for decoding;
	 *              if not (and there is at least one feature),
	 *              the first feature score is set to the OOV
	 *              feature cost
	 * @return the newly constructed rule
	 */
	public Rule constructOOVRule(int numFeatures, int sourceWord, int targetWord,
			boolean hasLM) {
		final int[] french = { sourceWord };
		final int[] english = { targetWord };
		final float[] featScores = new float[numFeatures];

		// TODO: This is a hack to make decoding without a LM work.
		// When an ngram LM is used, the OOV word will have a cost of 100.
		// If no LM is used for decoding, the cost of some TM feature
		// has to be set to the maximum instead.
		if (numFeatures > 0 && !hasLM) {
			featScores[0] = oovFeatureCost;
		}

		final int ruleID = getOOVRuleID();
		return new BilingualRule(
				this.defaultLHS, french, english,
				featScores, 0, this.ruleOwner,
				0, ruleID);

	}

	/**
	 * Gets the number of rules in this grammar,
	 * as reported by the root node.
	 *
	 * @return the number of rules reported by the root node
	 */
	public int getNumRules() {
		final int ruleCount = root.getNumRules();
		return ruleCount;
	}

	/**
	 * Gets the rule ID used for out-of-vocabulary rules.
	 * <p>
	 * This is the same ID that MemoryBasedBatchGrammar uses.
	 *
	 * @return the rule ID for OOV rules
	 */
	public int getOOVRuleID() {
		final int oovRuleID = MemoryBasedBatchGrammar.OOV_RULE_ID;
		return oovRuleID;
	}

	/**
	 * Gets the root of the trie that backs this grammar,
	 * which is the root node of this prefix tree.
	 *
	 * @return the root node of this tree
	 */
	public Trie getTrieRoot() {
		final Trie trieRoot = this.root;
		return trieRoot;
	}

	/**
	 * Determines whether this grammar may have rules for a span,
	 * which is the case if the span's width does not exceed
	 * the maximum phrase span.
	 *
	 * @param startIndex start index of the span
	 * @param endIndex end index of the span
	 * @param pathLength not used by this implementation
	 * @return <code>true</code> if <code>endIndex - startIndex</code>
	 *         is at most the maximum phrase span,
	 *         <code>false</code> otherwise
	 */
	public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
		final int spanWidth = endIndex - startIndex;
		return spanWidth <= this.maxPhraseSpan;
	}
	
}