/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package joshua.prefix_tree;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.Corpus;
import joshua.corpus.RuleExtractor;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
/**
 * Space-compact implementation of a prefix tree with suffix links.
 * <p>
 * Nodes are identified by consecutive integers, starting with the
 * root at 0. Per-node integers are packed into a single
 * <code>int[]</code>, per-node flags are kept in bit sets, and
 * parent-to-child arcs are kept in a map keyed by a
 * (parent node, arc) pair packed into a <code>long</code>.
 * <p>
 * <em>Note</em>: This class is under development
 * and is <em>not</em> ready for prime-time.
 *
 * @author Lane Schwartz
 */
public class CompactPrefixTree {

    private static final Logger logger = Logger.getLogger(CompactPrefixTree.class.getName());

    static final boolean ACTIVE = true;
    static final boolean INACTIVE = false;

    private static final int DEFAULT_CAPACITY = 512;
    private static final int DEFAULT_CAPACITY_INCREMENT = 512;

    /** Maximum number of nodes that can be stored in this tree. */
    private int capacity;

    /**
     * Value by which the capacity should be incremented if
     * additional space is required to store more nodes.
     */
    private int capacityIncrement;

    /** Indicates which nodes are active. */
    private final BitSet active;

    /** Indicates which nodes have children. */
    private final BitSet hasChildren;

    /**
     * Stores several pieces of information compactly.
     *
     * For each node in the tree, the following integers are
     * stored:
     *
     * <ul>
     * <li>Incoming arc value</li>
     * <li>Lower bound index</li>
     * <li>Upper bound index</li>
     * <li>Node ID of suffix link</li>
     * </ul>
     *
     * Each node is identified by a unique integer. This
     * identifier is not explicitly stored. Rather, the identifier
     * is implicitly stored as the index into the data structure.
     * <p>
     * In other words, the first values stored in data are for
     * the node with identifier 0; the next values are for the
     * node with identifier 1, and so on.
     */
    private int[] data;

    /** Number of integers stored in data for each node. */
    private static final int INTS_PER_NODE = 4;

    /** Position of the incoming arc value within a node's slot in data. */
    private static final int INCOMING_ARC_OFFSET = 0;

    /** Position of the lower bound index within a node's slot in data. */
    private static final int LOWER_BOUND_OFFSET = 1;

    /** Position of the upper bound index within a node's slot in data. */
    private static final int UPPER_BOUND_OFFSET = 2;

    /** Position of the suffix link node ID within a node's slot in data. */
    private static final int SUFFIX_LINK_OFFSET = 3;

    /** Number of bits in an int; used when packing two ints into a long. */
    private static final int BITS_PER_INT = 32;

    /** Mask that keeps only the lowest 32 bits of a long. */
    private static final long LOW_INT_MASK = 0xFFFFFFFFL;

    /** Incoming arc value recorded for the root node. */
    private static final int ROOT_NODE_INCOMING_ARC = Integer.MIN_VALUE;

    /** Unique integer identifier for the root node. */
    private static final int ROOT_NODE_ID = 0;

    /**
     * Maps from (Node ID, outgoing arc) --> Node.
     * <p>
     * This uses a long to encode (int,int).
     */
    Map<Long,Integer> children;

    /** Number of nodes currently stored in this tree. */
    private int size;

    /** Suffix array representing the source language corpus. */
    final Suffixes suffixArray;

    /** Corpus array representing the target language corpus. */
    final Corpus targetCorpus;

    /**
     * Represents alignments between words in the source corpus
     * and the target corpus.
     */
    final Alignments alignments;

    /** Lexical translation probabilities. */
    final LexicalProbabilities lexProbs;

    /** Source side symbol table */
    final SymbolTable vocab;

    /**
     * Responsible for performing sampling and creating translation
     * rules.
     */
    final RuleExtractor ruleExtractor;

    /**
     * Max span in the source corpus of any extracted hierarchical
     * phrase
     */
    final int maxPhraseSpan;

    /**
     * Maximum number of terminals plus nonterminals allowed
     * in any extracted hierarchical phrase.
     */
    final int maxPhraseLength;

    /**
     * Maximum number of nonterminals allowed in any extracted
     * hierarchical phrase.
     */
    final int maxNonterminals;

    /**
     * Minimum span in the source corpus of any nonterminal in
     * an extracted hierarchical phrase.
     */
    final int minNonterminalSpan;

    /**
     * Constructs a tree with the default initial capacity and
     * the default capacity increment.
     *
     * @param suffixArray        Suffix array representing the source language corpus
     * @param targetCorpus       Corpus array representing the target language corpus
     * @param alignments         Alignments between the source and target corpora
     * @param vocab              Source side symbol table
     * @param lexProbs           Lexical translation probabilities
     * @param ruleExtractor      Object responsible for sampling and creating rules
     * @param maxPhraseSpan      Max span in the source corpus of any extracted phrase
     * @param maxPhraseLength    Max number of terminals plus nonterminals per phrase
     * @param maxNonterminals    Max number of nonterminals per phrase
     * @param minNonterminalSpan Min span in the source corpus of any nonterminal
     */
    public CompactPrefixTree(Suffixes suffixArray, Corpus targetCorpus, Alignments alignments, SymbolTable vocab, LexicalProbabilities lexProbs, RuleExtractor ruleExtractor, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals, int minNonterminalSpan) {
        this(DEFAULT_CAPACITY, DEFAULT_CAPACITY_INCREMENT, suffixArray, targetCorpus, alignments, vocab, lexProbs, ruleExtractor, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan);
    }

    /**
     * Constructs a tree containing only the root node.
     * <p>
     * If <code>suffixArray</code> is not null, the root node's
     * bounds are set to span the entire suffix array.
     *
     * @param capacity           Initial number of nodes for which space is reserved;
     *                           must be at least 1 so that the root node fits
     * @param capacityIncrement  Number of additional nodes to make room for
     *                           each time the tree runs out of space
     * @param suffixArray        Suffix array representing the source language corpus
     * @param targetCorpus       Corpus array representing the target language corpus
     * @param alignments         Alignments between the source and target corpora
     * @param vocab              Source side symbol table
     * @param lexProbs           Lexical translation probabilities
     * @param ruleExtractor      Object responsible for sampling and creating rules
     * @param maxPhraseSpan      Max span in the source corpus of any extracted phrase
     * @param maxPhraseLength    Max number of terminals plus nonterminals per phrase
     * @param maxNonterminals    Max number of nonterminals per phrase
     * @param minNonterminalSpan Min span in the source corpus of any nonterminal
     * @throws IllegalArgumentException if <code>capacity</code> is less than 1
     */
    public CompactPrefixTree(int capacity, int capacityIncrement, Suffixes suffixArray, Corpus targetCorpus, Alignments alignments, SymbolTable vocab, LexicalProbabilities lexProbs, RuleExtractor ruleExtractor, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals, int minNonterminalSpan) {

        // The root node is always stored, so at least one slot is required.
        if (capacity < 1) {
            throw new IllegalArgumentException("capacity must be at least 1, but was " + capacity);
        }

        if (logger.isLoggable(Level.FINE)) logger.fine("\n\n\nConstructing new CompactPrefixTree\n\n");

        this.suffixArray = suffixArray;
        this.targetCorpus = targetCorpus;
        this.alignments = alignments;
        this.vocab = vocab;
        this.lexProbs = lexProbs;
        this.ruleExtractor = ruleExtractor;
        this.maxPhraseSpan = maxPhraseSpan;
        this.maxPhraseLength = maxPhraseLength;
        this.maxNonterminals = maxNonterminals;
        this.minNonterminalSpan = minNonterminalSpan;

        this.capacity = capacity;
        this.capacityIncrement = capacityIncrement;
        this.active = new BitSet(capacity);
        this.hasChildren = new BitSet(capacity);
        this.data = new int[capacity * INTS_PER_NODE];

        // Without this, every child lookup or insertion would fail on a null map.
        this.children = new HashMap<Long,Integer>();

        // Insert root node
        this.data[ROOT_NODE_ID*INTS_PER_NODE + INCOMING_ARC_OFFSET] = ROOT_NODE_INCOMING_ARC;
        this.size = 1;

        //TODO Deal with bot and botMap
        //TODO Deal with HierarchicalPhrases

        if (suffixArray != null) {
            int[] bounds = {0, suffixArray.size()-1};
            setBounds(ROOT_NODE_ID, bounds);
        }
    }

    /**
     * Gets the number of nodes currently stored in this tree,
     * including the root node.
     *
     * @return the number of nodes in this tree
     */
    public int size() {
        return size;
    }

    /**
     * Reports the value of a node's bit in the active bit set.
     *
     * @param node Identifier of the node to examine
     * @return <code>true</code> if the node's active bit is set
     */
    private boolean isActive(int node) {
        return active.get(node);
    }

    /**
     * Adds a new child node to a parent node.
     * <p>
     * The new node receives the next unused node identifier.
     * If the tree is full, storage is first enlarged by the
     * capacity increment.
     *
     * @param parentNode Node to which a child will be added.
     * @param connectingArc Integer representation of the word
     *                      that connects the parent to the child.
     */
    private void addChild(int parentNode, int connectingArc) {

        // Ensure capacity
        if (size >= capacity) {
            // Always grow by at least one node, even if a
            // non-positive increment was configured.
            int newCapacity = capacity + Math.max(capacityIncrement, 1);

            // Each node occupies INTS_PER_NODE ints, so both the
            // allocation and the copy must be scaled accordingly.
            int[] newData = new int[newCapacity * INTS_PER_NODE];
            System.arraycopy(data, 0, newData, 0, data.length);

            this.data = newData;
            this.capacity = newCapacity;
        }

        // Add the child node to the data array
        int childNode = size++;
        this.data[childNode*INTS_PER_NODE + INCOMING_ARC_OFFSET] = connectingArc;

        // Store the connection in the children map
        long key = getKey(parentNode, connectingArc);
        children.put(key, childNode);

        // Record that the parent now has at least one child
        hasChildren.set(parentNode);
    }

    /**
     * Gets the child reached from a parent node via an arc.
     *
     * @param parentNode Identifier of the parent node
     * @param outgoingArc Arc value leading from the parent to the child
     * @return identifier of the child node
     * @throws IllegalArgumentException if the parent has no child
     *         via the specified arc
     */
    private int getChild(int parentNode, int outgoingArc) {
        long key = getKey(parentNode, outgoingArc);
        Integer child = children.get(key);

        // Fail with a descriptive message rather than an
        // unboxing NullPointerException.
        if (child == null) {
            throw new IllegalArgumentException("Node " + parentNode + " has no child via arc " + outgoingArc);
        }

        return child.intValue();
    }

    /**
     * Determines whether a parent node has a child via an arc.
     *
     * @param parentNode Identifier of the parent node
     * @param outgoingArc Arc value leading away from the parent
     * @return <code>true</code> if such a child has been added
     */
    private boolean hasChild(int parentNode, int outgoingArc) {
        long key = getKey(parentNode, outgoingArc);
        return children.containsKey(key);
    }

    /**
     * Records the suffix link of a node.
     *
     * @param node Identifier of the node whose link is being set
     * @param suffixNode Identifier of the node to link to
     */
    private void linkToSuffix(int node, int suffixNode) {
        data[node*INTS_PER_NODE + SUFFIX_LINK_OFFSET] = suffixNode;
    }

    /**
     * Gets the value of the arc leading into a node.
     *
     * @param node Identifier of the node
     * @return the stored incoming arc value
     */
    private int getIncomingArcValue(int node) {
        return data[node*INTS_PER_NODE + INCOMING_ARC_OFFSET];
    }

    /**
     * Gets the suffix link of a node.
     *
     * @param node Identifier of the node
     * @return the stored suffix link node identifier
     */
    private int getSuffixLink(int node) {
        return data[node*INTS_PER_NODE + SUFFIX_LINK_OFFSET];
    }

    /**
     * Stores the lower and upper bound indices of a node.
     *
     * @param node Identifier of the node
     * @param bounds Array whose element 0 is the lower bound
     *               and whose element 1 is the upper bound
     */
    private void setBounds(int node, int[] bounds) {
        data[node*INTS_PER_NODE + LOWER_BOUND_OFFSET] = bounds[0];
        data[node*INTS_PER_NODE + UPPER_BOUND_OFFSET] = bounds[1];
    }

    /**
     * Gets the lower bound index of a node.
     *
     * @param node Identifier of the node
     * @return the stored lower bound index
     */
    private int getLowerBound(int node) {
        return data[node*INTS_PER_NODE + LOWER_BOUND_OFFSET];
    }

    /**
     * Gets the upper bound index of a node.
     *
     * @param node Identifier of the node
     * @return the stored upper bound index
     */
    private int getUpperBound(int node) {
        return data[node*INTS_PER_NODE + UPPER_BOUND_OFFSET];
    }

    /**
     * Follows a node's suffix link and then the arc labelled
     * <code>endOfPattern</code> from the linked node.
     *
     * @param node Identifier of the node whose suffix link is followed
     * @param endOfPattern Arc value to follow from the linked node
     * @return identifier of the node reached
     * @throws IllegalArgumentException if the linked node has
     *         no child via <code>endOfPattern</code>
     */
    private int calculateSuffixLink(int node, int endOfPattern) {
        int suffixOfNode = getSuffixLink(node);
        return getChild(suffixOfNode, endOfPattern);
    }

    /**
     * Determines whether any child has been added to a node.
     *
     * @param node Identifier of the node
     * @return <code>true</code> if the node has at least one child
     */
    private boolean hasExtensions(int node) {
        return hasChildren.get(node);
    }

    /**
     * Packs a (parent node, outgoing arc) pair into a single long.
     *
     * @param parentNode Identifier of the parent node
     * @param outgoingArc Arc value leading away from the parent
     * @return a key holding <code>parentNode</code> in its highest
     *         32 bits and <code>outgoingArc</code> in its lowest 32 bits
     */
    private static long getKey(int parentNode, int outgoingArc) {

        // Store the parentNode id in the highest 32 bits of the long.
        // Widen to long before shifting: shifting an int by 32
        // is a shift by 0, because int shift distances are taken mod 32.
        long key = ((long) parentNode) << BITS_PER_INT;

        // Store the outgoingArc value in the lowest 32 bits of the long.
        // Mask so that a negative arc is not sign-extended over the
        // parent id.
        key |= (outgoingArc & LOW_INT_MASK);

        return key;
    }

    /**
     * Gets the number of nodes for which storage is currently
     * reserved.
     *
     * @return the current capacity, in nodes
     */
    public int getCapacity() {
        return capacity;
    }
}