/**
* Copyright 2003-2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.fst;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
*
* This class represents a trie, i.e. a symbol (or 'letter') tree. Each trie node has arcs, to each of which a symbol is attached.
* The symbols guide the lookup of entries in the trie.
*
* The main purpose of this particular trie implementation is not the direct use of the trie (e.g. lookup) but its conversion to a
* finite-state machine that allows for an even more efficient storage and lookup.
*
* We are very thankful to Andreas Eisele, who had the idea of using tries and transducers for our purposes and who provided us
* with the C code on which this particular implementation is based.
*
* @author benjaminroth
*
*/
public class Trie<Symbol> {

    /**
     * A node (state) of the trie. Outgoing arcs are keyed by label id; the label ids are indices into the enclosing trie's
     * {@code labels} list.
     */
    class TrieNode {
        // once an id has been assigned the hash code is frozen, so that the node can safely be used as a hash map key
        private boolean hashcodeFixed = false;
        int hashcode = -1;

        // maps a label id to the node the corresponding arc leads to
        private Map<Integer, TrieNode> labelId2node = new HashMap<Integer, TrieNode>();

        // true if this state marks the end of an entry
        protected boolean isFinal = false;

        // id for transducer representation. -1 means no id assigned.
        private int id = -1;

        // pointer to mother node - needed for minimization
        private TrieNode backPointer = null;

        /**
         * This constructs a TrieNode and specifies its predecessor.
         *
         * @param predecessor
         *            the mother node, or null for the root
         * @param label2idMap
         *            not used by this constructor; labels are looked up in the enclosing trie
         * @param labels
         *            not used by this constructor; labels are looked up in the enclosing trie
         */
        public TrieNode(TrieNode predecessor, Map<Symbol, Integer> label2idMap, List<Symbol> labels) {
            this.backPointer = predecessor;
        }

        /**
         * This adds an entry (word...) to the node and its daughters. The entry is entered from the specified index on.
         * Symbols not seen before are registered in the enclosing trie's label tables.
         *
         * @param entry
         *            word to be entered.
         * @param index
         *            position from which on the entry is to be entered at this node.
         * @return the final node of this entry.
         */
        protected TrieNode add(Symbol[] entry, int index) {
            if (index == entry.length) {
                // index points to the end of the word: already everything entered.
                this.isFinal = true;
                return this;
            }

            Symbol symbol = entry[index];
            Integer labelId = label2id.get(symbol);
            if (null == labelId) {
                // first occurrence of this symbol: its id is its position in the label list
                labelId = labels.size();
                labels.add(symbol);
                label2id.put(symbol, labelId);
            }

            // add rest of the entry to the successor reached via the label id
            TrieNode successor = this.labelId2node.get(labelId);
            if (null == successor) {
                successor = new TrieNode(this, label2id, labels);
                this.labelId2node.put(labelId, successor);
            }
            return successor.add(entry, index + 1);
        }

        /**
         * @return true iff this node has at least one outgoing arc.
         */
        protected boolean hasSuccessor() {
            return this.labelId2node.size() > 0;
        }

        /**
         * The hash code is derived from finality and from the (label id, successor id) pairs of the outgoing arcs; the
         * node's own id does not contribute. Once an id has been set the value computed at that moment is returned.
         */
        @Override
        public int hashCode() {
            if (this.hashcodeFixed) {
                return this.hashcode;
            }
            int hc = this.isFinal ? 1 : 0;
            // summation keeps the result independent of the map's iteration order
            for (Map.Entry<Integer, TrieNode> arc : this.labelId2node.entrySet()) {
                hc += arc.getKey() ^ arc.getValue().id;
            }
            return hc;
        }

        /**
         * equals compares everything important but _not_ id: finality, the set of outgoing labels and the ids of the
         * nodes the arcs lead to.
         *
         * @param other
         *            object to compare with; null and objects of other types are never equal to a node
         * @return true iff both nodes agree on the properties listed above
         */
        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            // explicit type test: rejects null as well, instead of relying on a ClassCastException
            if (!(other instanceof Trie<?>.TrieNode)) {
                return false;
            }
            Trie<?>.TrieNode otherNode = (Trie<?>.TrieNode) other;

            // both nodes have to agree on being final
            if (this.isFinal != otherNode.isFinal) {
                return false;
            }
            // both nodes have to have same outgoing edges
            if (!this.labelId2node.keySet().equals(otherNode.labelId2node.keySet())) {
                return false;
            }
            // edges must lead to nodes of same equivalence class
            for (Map.Entry<Integer, TrieNode> arc : this.labelId2node.entrySet()) {
                if (arc.getValue().id != otherNode.labelId2node.get(arc.getKey()).id) {
                    return false;
                }
            }
            return true;
        }

        public int getId() {
            return this.id;
        }

        /**
         * Assigns the id and freezes the hash code at its current value.
         *
         * @param id2
         *            the id to assign
         */
        public void setId(int id2) {
            this.id = id2;
            this.hashcode = this.hashCode();
            this.hashcodeFixed = true;
        }

        // forgets the id and unfreezes the hash code, returning the node to its state before setId was called
        private void clearId() {
            this.id = -1;
            this.hashcode = -1;
            this.hashcodeFixed = false;
        }

        public boolean hasId() {
            return this.id != -1;
        }

        public TrieNode getBackPointer() {
            return this.backPointer;
        }

        /**
         * this checks if equivalent states in the right language of this node are already identified, i.e. whether all
         * direct successors have an id.
         *
         * @return true iff so
         */
        public boolean rightIdentified() {
            for (TrieNode n : this.labelId2node.values()) {
                if (!n.hasId()) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Renders the node id (in double parentheses if final, prefixed by ">" for a node without predecessor), followed
         * by one line per outgoing arc with its label and the target node's id.
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            if (this.backPointer == null) {
                sb.append(">");
            }
            if (this.isFinal) {
                sb.append("((" + this.id + "))");
            } else {
                sb.append("(" + this.id + ")");
            }
            for (Map.Entry<Integer, TrieNode> arc : this.labelId2node.entrySet()) {
                // String.valueOf tolerates a null symbol, which the label tables accept
                String l = String.valueOf(labels.get(arc.getKey()));
                sb.append("\n");
                sb.append("|-" + l);
                sb.append(" (" + arc.getValue().id + ")");
            }
            return sb.toString();
        }

        /**
         * @return the live map from label ids to successor nodes (not a copy).
         */
        public Map<Integer, TrieNode> getArcMap() {
            return this.labelId2node;
        }
    }

    // node that points to the beginning of all words
    protected TrieNode root;

    // list of nodes that mark the end of an entry (word)
    protected List<TrieNode> finalNodes;

    // mapping from nodes to representatives in a minimized transducer view.
    protected Map<TrieNode, Integer> reprs = null;

    // mapping is done via list indices
    protected List<TrieNode> rlist = null;

    // back and forth mapping from labels to ids
    protected Map<Symbol, Integer> label2id;
    protected List<Symbol> labels;

    /**
     * Standard constructor for a trie.
     */
    public Trie() {
        this.label2id = new HashMap<Symbol, Integer>();
        this.labels = new ArrayList<Symbol>();
        this.root = new TrieNode(null, label2id, labels);
        this.finalNodes = new ArrayList<TrieNode>();
    }

    /**
     * This adds an entry to the trie.
     *
     * @param entry
     *            entry; must not be null
     */
    public void add(Symbol[] entry) {
        // add the entry and remember backpointer to final node
        this.finalNodes.add(this.root.add(entry, 0));
    }

    /**
     * This computes the minimization of the trie, i.e. equivalent nodes are identified. This is necessary to store a compact
     * version of this trie as a minimal transducer. The trie itself is not represented more compactly.
     *
     * The method may be called repeatedly, also after further entries have been added: ids assigned by an earlier run are
     * discarded first.
     */
    public void computeMinimization() {
        if (this.reprs != null) {
            // a previous run left ids and frozen hash codes behind; without a reset no node would be (re-)classified
            resetNodeIds();
        }

        // core idea: identify nodes with identical right language.
        // candidates are first all final nodes without successors
        LinkedList<TrieNode> identityCandidates = new LinkedList<TrieNode>();
        for (TrieNode fn : this.finalNodes) {
            if (!fn.hasSuccessor()) {
                identityCandidates.add(fn);
            }
        }

        // store the representants of the equivalence classes
        this.rlist = new ArrayList<TrieNode>();
        // maps nodes to their Id/representative
        this.reprs = new HashMap<TrieNode, Integer>();

        // for each identity candidate check to which nodes it is identical,
        // make new equivalence classes when needed and produce new candidates
        while (!identityCandidates.isEmpty()) {
            // pop the head element
            TrieNode currCan = identityCandidates.remove();

            // does it belong to one of the already identified equiv. classes?
            Integer knownId = this.reprs.get(currCan);
            if (knownId != null) {
                currCan.setId(knownId);
            }

            // ... if not, let it represent a new class
            if (!currCan.hasId()) {
                currCan.setId(reprs.size());
                reprs.put(currCan, currCan.getId());
                rlist.add(currCan);
            }

            TrieNode pred = currCan.getBackPointer();
            // add the predecessor of this node if...
            if (null != pred // 1. there is one
                    && !pred.hasId() // 2. it is not already processed
                    && pred.rightIdentified() // 3. but its successors are processed
            ) {
                identityCandidates.add(pred);
            }
        }
    }

    // clears the ids of all nodes reachable from the root; iterative so that deep tries cannot exhaust the call stack
    private void resetNodeIds() {
        LinkedList<TrieNode> pending = new LinkedList<TrieNode>();
        pending.add(this.root);
        while (!pending.isEmpty()) {
            TrieNode node = pending.removeLast();
            node.clearId();
            pending.addAll(node.labelId2node.values());
        }
    }

    /**
     * Lists the representative nodes of the equivalence classes found by {@link #computeMinimization()}, one node
     * description per block.
     *
     * @return the textual listing, or an empty string if no minimization has been computed yet
     */
    @Override
    public String toString() {
        if (this.reprs == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (TrieNode r : this.reprs.keySet()) {
            sb.append("\n");
            sb.append(r.toString());
        }
        return sb.toString();
    }
}