/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.prefix_tree; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import joshua.corpus.MatchedHierarchicalPhrases; import joshua.corpus.RuleExtractor; import joshua.corpus.suffix_array.ParallelCorpusGrammarFactory; import joshua.corpus.suffix_array.Pattern; import joshua.corpus.vocab.SymbolTable; import joshua.decoder.ff.tm.BasicRuleCollection; import joshua.decoder.ff.tm.Rule; import joshua.decoder.ff.tm.RuleCollection; import joshua.decoder.ff.tm.Trie; import joshua.util.Cache; /** * Represents a node in a prefix tree. * * @author Lane Schwartz */ public class Node implements Comparable<Node>, Trie { /** Logger for this class. */ private static final Logger logger = Logger.getLogger(Node.class.getName()); /** Unique integer identifier for this node. */ final int objectID; /** * The lower bound in the suffix array * for the source pattern at this node. */ int lowBoundIndex; /** * The upper bound in the suffix array * for the source pattern at this node. */ int highBoundIndex; /** Indicates whether this is an active node. */ boolean active; /** Suffix link for this node. */ Node suffixLink; /** * Maps from integer representations of words to nodes. * <p> * TODO It may be better to have a single map in PrefixTree that maps (Node,Integer) --> Node */ Map<Integer,Node> children; /** Source side hierarchical phrases for this node. */ MatchedHierarchicalPhrases sourceHierarchicalPhrases; // List<Rule> results; protected final ParallelCorpusGrammarFactory parallelCorpus; // private final Suffixes suffixArray; // private final Cache<Pattern, List<Rule>> ruleCache; // private final Cache<Pattern, MatchedHierarchicalPhrases> matchedPhrasesCache; Pattern sourcePattern; ////================================ // //add by zhifei??????????????????????????????????????????? these parameters are not intialized by the constructor // public static final int OOV_RULE_ID = 0; // private int defaultOwner; // private float oovFeatureCost = 100; // // /** // * the OOV rule should have this lhs, this should be grammar // * specific as only the grammar knows what LHS symbol can // * be combined with other rules // */ // private int defaultLHS; // private int spanLimit = 10; ////============================== // /** * Gets translation rules for this node. * <p> * The results of this method are guaranteed to be * sorted according to whatever feature functions are in use. * * Calling this method will return results equivalent to those * that would be returned by calling * <code>HierarchicalRuleExtractor#extractRules(getMatchedPhrases())</code>. * * @see RuleExtractor#extractRules(MatchedHierarchicalPhrases) * @return translation rules for this node */ protected List<Rule> getResults() { Cache<Pattern,List<Rule>> ruleCache = parallelCorpus.getSuffixArray().getCachedRules(); List<Rule> results; if (ruleCache.containsKey(sourcePattern)) { results = ruleCache.get(sourcePattern); // The rules from the cache are guaranteed to be sorted. } else { results = parallelCorpus.getRuleExtractor().extractRules(getMatchedPhrases()); // The above list of rules extracted is guaranteed to be sorted. ruleCache.put(sourcePattern, results); } // These rules are sorted. return results; } protected MatchedHierarchicalPhrases getMatchedPhrases() { //TODO Implement this method return this.sourceHierarchicalPhrases; // MatchedHierarchicalPhrases results; // // if (matchedPhrasesCache.containsKey(sourcePattern)) { // results = matchedPhrasesCache.get(sourcePattern); // } else { // // // Do some extra lookup // // // throw new RuntimeException("This code not yet implemented"); // // } // // return results; } Node(Node parent) { // this(parent.ruleCache, parent.matchedPhrasesCache, true); this(parent.parallelCorpus, true, nodeIDCounter++); } Node(ParallelCorpusGrammarFactory parallelCorpus, int objectID) { this(parallelCorpus, true, objectID); // this( // (suffixArray==null ? null : suffixArray.getCachedRules()), // (suffixArray==null ? null : suffixArray.getCachedHierarchicalPhrases()), // true, objectID); } Node(ParallelCorpusGrammarFactory parallelCorpus, boolean active) { this(parallelCorpus, active, nodeIDCounter++); } // Node(Cache<Pattern, List<Rule>> ruleCache, Cache<Pattern, MatchedHierarchicalPhrases> matchedPhrasesCache, boolean active) { // this(ruleCache, matchedPhrasesCache, active, nodeIDCounter++); // } // Node(Cache<Pattern, List<Rule>> ruleCache, Cache<Pattern, MatchedHierarchicalPhrases> matchedPhrasesCache, boolean active, int objectID) { Node(ParallelCorpusGrammarFactory parallelCorpus, boolean active, int objectID) { // this.ruleCache = ruleCache; // this.matchedPhrasesCache = matchedPhrasesCache; this.parallelCorpus = parallelCorpus; // this.suffixArray = suffixArray; this.active = active; this.suffixLink = null; this.children = new HashMap<Integer,Node>(); this.objectID = objectID; this.sourceHierarchicalPhrases = null;//HierarchicalPhrases.emptyList((SymbolTable) null); // this.results = Collections.emptyList(); } Node calculateSuffixLink(int endOfPattern) { Node suffixLink = this.suffixLink.getChild(endOfPattern); if (suffixLink==null) { throw new NoSuchChildNodeException(this, endOfPattern); } return suffixLink; } /** * Gets the representation of the source side tokens corresponding * to the hierarchical phrases for this node. * * @return the source language pattern for this node */ public Pattern getSourcePattern() { // return sourceHierarchicalPhrases.getPattern(); return sourcePattern; } /** * Gets rules for this node and the children of this node. * * @return rules for this node and the children of this node. */ public List<Rule> getAllRules() { List<Rule> results = this.getResults(); List<Rule> result = new ArrayList<Rule>( (results==null) ? Collections.<Rule>emptyList() : results); for (Node child : children.values()) { result.addAll(child.getAllRules()); } return result; } /* See Javadoc for joshua.decoder.ff.tm.Trie#getRules */ public RuleCollection getRules() { final int[] sourceSide = (sourcePattern==null) ? new int[]{} : sourcePattern.getWordIDs(); final int arity = (sourcePattern==null) ? 0 : sourcePattern.arity(); List<Rule> results = this.getResults(); return new BasicRuleCollection(arity, sourceSide, results); } /* See Javadoc for joshua.decoder.ff.tm.Trie#hasExtensions */ public boolean hasExtensions() { return ! children.isEmpty(); } /* See Javadoc for joshua.decoder.ff.tm.Trie#hasRules */ public boolean hasRules() { if (active) { MatchedHierarchicalPhrases sourceHierarchicalPhrases = this.getMatchedPhrases(); return ! sourceHierarchicalPhrases.isEmpty(); } else { return false; } } /* See Javadoc for joshua.decoder.ff.tm.Trie#matchOne */ public Trie matchOne(int symbol) { if (children.containsKey(symbol)) { Node child = children.get(symbol); if (child.active) { return child; } else { return null; } // return children.get(symbol); } else { return null; } } /* See Javadoc for joshua.decoder.ff.tm.Trie#getExtensions */ public Collection<Node> getExtensions() { return this.children.values(); } /* See Javadoc for joshua.decoder.ff.tm.Grammar#getTrieRoot */ public Trie getTrieRoot() { return this; } /** * Determines whether this node has a specified child. * * @param child * @return <code>true</code> if this node has a specified child, * <code>false</code> otherwise */ public boolean hasChild(int child) { return children.containsKey(child); } public Node getChild(int child) { return children.get(child); } public Node addChild(int child) { if (children.containsKey(child)) { throw new ChildNodeAlreadyExistsException(this, child); } else { Node node = new Node(this); children.put(child, node); return node; } } /** * Sets the suffix link for this node. * * @param suffix Suffix link for this node */ public void linkToSuffix(Node suffix) { this.suffixLink = suffix; } /** * Sets the lower and upper bounds in the suffix array * where the source pattern associated with this node * are located. * * @param lowBound the lower bound in the suffix array * for the source pattern at this node * @param highBound the upper bound in the suffix array * for the source pattern at this node */ public void setBounds(int lowBound, int highBound) { lowBoundIndex = lowBound; highBoundIndex = highBound; } /** * Stores in this node a list of source language hierarchical * phrases, the associated source language pattern, and the * list of associated translation rules. * <p> * This method is responsible for creating and storing * translation rules from the provided list of source * language hierarchical phrases. * * @param hierarchicalPhrases Source language hierarchical phrases. */ public void storeResults(MatchedHierarchicalPhrases hierarchicalPhrases, List<Rule> rules) { if (logger.isLoggable(Level.FINER)) { logger.finer("Storing " + hierarchicalPhrases.size() + " source phrases at node " + objectID + ":"); } this.sourcePattern = hierarchicalPhrases.getPattern(); // this.matchedPhrasesCache.put(sourcePattern, hierarchicalPhrases); //This is not needed, because this is put into the cache by HierarchicalRuleExtractor // this.parallelCorpus.getSuffixArray().getCachedRules().put(sourcePattern, rules); this.sourceHierarchicalPhrases = hierarchicalPhrases; // int numPhrases = hierarchicalPhrases.size(); // if (numPhrases > 0) { // int lowerBound = hierarchicalPhrases.getFirstTerminalIndex(0); // int upperBound = hierarchicalPhrases.getFirstTerminalIndex(numPhrases-1); // this.setBounds(lowerBound, upperBound); // } // this.results = rules; } /** * Gets the number of rules stored in the grammar. * * @return the number of rules stored in the grammar */ public int getNumRules() { List<Rule> results = this.getResults(); int numRules = (results==null) ? 0 : results.size(); if (children != null) { for (Node child : children.values()) { numRules += child.getNumRules(); } } return numRules; } /** * Gets the number of nodes in the sub-tree rooted at this node. * <p> * This method recursively traverses through all nodes * in the sub-tree every time this method is called. * * @return the number of nodes in the sub-tree rooted at this node */ public int size() { int size = 1; for (Node child : children.values()) { size += child.size(); } return size; } /* See Javadoc for java.lang.Object#hashCode */ public int hashCode() { return objectID*31; } /** * Compares this node to another node * based solely on their respective objectIDs. * * @param o Another node * @return <code>true</code> if this node's objectID * is equal to the other objectID, * false otherwise */ public boolean equals(Object o) { if (this==o) { return true; } else if (o instanceof Node) { Node other = (Node) o; return (objectID == other.objectID); } else { return false; } } /** * Compares this node to another node * based solely on their respective objectIDs. * * @param o Another node * @return -1 if this node's objectID is less than the other objectID, * 0 if this node's objectID is equal to the other objectID, * 1 if this node's objectID is greater than the other objectID */ public int compareTo(Node o) { Integer i = objectID; Integer j = o.objectID; return i.compareTo(j); } /** * Gets a String representation of the sub-tree rooted at this node. * * @return a String representation of the sub-tree rooted at this node */ public String toString(SymbolTable vocab, int incomingArcValue) { StringBuilder s = new StringBuilder(); s.append("[id"); s.append(objectID); s.append(' '); if (incomingArcValue==SymbolTable.X) { s.append('X'); } else if (incomingArcValue==PrefixTree.ROOT_NODE_ID) { s.append("ROOT"); } else if (vocab!=null) { s.append(vocab.getWord(incomingArcValue)); } else { s.append('v'); s.append(incomingArcValue); } s.append(" ("); if (null != suffixLink) { s.append(suffixLink.objectID); } else { s.append("null"); } s.append(')'); s.append(' '); ArrayList<Map.Entry<Integer, Node>> k = new ArrayList<Map.Entry<Integer, Node>>(children.entrySet()); Collections.sort(k, NodeEntryComparator.get()); for (Map.Entry<Integer, Node> kidEntry : k) { Integer arcValue = kidEntry.getKey(); Node kid = kidEntry.getValue(); s.append(kid.toString(vocab, arcValue)); s.append(' '); } if (!active) s.append('*'); s.append(']'); return s.toString(); } String toShortString(SymbolTable vocab) { StringBuilder s = new StringBuilder(); s.append("[id"); s.append(objectID); s.append(' '); s.append(" ("); if (null != suffixLink) { s.append(suffixLink.objectID); } else { s.append("null"); } s.append(')'); s.append(' '); s.append('{'); s.append(children.size()); s.append(" children}"); if (!active) s.append('*'); s.append(']'); return s.toString(); } protected String toTreeString(String tabs, SymbolTable vocab, int incomingArcValue) { StringBuilder s = new StringBuilder(); s.append(tabs); s.append("[id"); s.append(objectID); s.append(' '); if (incomingArcValue==SymbolTable.X) { s.append('X'); } else if (incomingArcValue==PrefixTree.ROOT_NODE_ID) { s.append("ROOT"); } else if (vocab!=null) { s.append(vocab.getWord(incomingArcValue)); } else { s.append('v'); s.append(incomingArcValue); } s.append(" ("); if (null != suffixLink) { s.append(suffixLink.objectID); } else { s.append("null"); } s.append(')'); if (children.size() > 0) { s.append(" \n\n"); ArrayList<Map.Entry<Integer, Node>> k = new ArrayList<Map.Entry<Integer, Node>>(children.entrySet()); Collections.sort(k, NodeEntryComparator.get()); for (Map.Entry<Integer, Node> kidEntry : k) { Integer arcValue = kidEntry.getKey(); Node kid = kidEntry.getValue(); s.append(kid.toTreeString(tabs+"\t", vocab, arcValue)); s.append(' '); } s.append(tabs); } else { s.append(' '); } if (!active) s.append('*'); s.append(']'); return s.toString(); } static int nodeIDCounter = 2; static void resetNodeCounter() { nodeIDCounter = 2; } // public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity) { // return new BilingualRule(lhs, sourceWords, targetWords, scores, arity, this.defaultOwner, 0, getOOVRuleID()); // } // // // // public int getOOVRuleID() { // return OOV_RULE_ID; // } // /** // * if the span covered by the chart bin is greater than the // * limit, then return false // */ // public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) { // if (this.spanLimit == -1) { // mono-glue grammar // return (startIndex == 0); // } else { // return (endIndex - startIndex <= this.spanLimit); // } // } // // public Rule constructOOVRule(int qtyFeatures, int sourceWord, int targetWord, boolean hasLM) { // int[] french = new int[1]; // french[0] = sourceWord; // int[] english = new int[1]; // english[0] = targetWord; // float[] feat_scores = new float[qtyFeatures]; // // // TODO: This is a hack to make the decoding without a LM works // /**when a ngram LM is used, the OOV word will have a cost 100. // * if no LM is used for decoding, so we should set the cost of some // * TM feature to be maximum // * */ // if ( (!hasLM) && qtyFeatures > 0) { // feat_scores[0] = oovFeatureCost; // } // // return new BilingualRule(this.defaultLHS, french, english, feat_scores, 0, this.defaultOwner, 0, getOOVRuleID()); // } }