package edu.cmu.minorthird.text;
import java.util.*;
/**
* Efficient scheme for matching a rote list of sequences of tokens.
*
* @author William Cohen
*/
public class Trie{

  /** A node in the trie; children are keyed by the text of the next token. */
  private static class TrieNode{
    /** Child nodes by next token, or null until the first child is added. */
    public Map<String,TrieNode> map=null;
    /** Ids of the word sequences ending at this node, or null if none end here. */
    public List<String> endIds=null;

    @Override
    public String toString(){
      return "TrieNode(ends="+endIds+",map="+map+")";
    }
  }

  /** A match found during lookup: the ids stored at the matching node plus its location. */
  private static class TrieMatch{
    /** The id list of the trie node that matched (the node's own list, not a copy). */
    public final List<String> endIds;
    /** Token index, relative to the searched span, at which the match begins. */
    public final int start;
    /** Number of tokens covered by the match. */
    public final int length;

    public TrieMatch(List<String> endIds,int start,int length){
      this.endIds=endIds;
      this.start=start;
      this.length=length;
    }
  }

  private TrieNode root;

  /** Create an empty trie. */
  public Trie(){
    root=new TrieNode();
  }

  /**
   * Lookup matches to the trie in the span.
   *
   * Every token index of the span is tried as a starting point, and every
   * stored word sequence that begins there is reported. Matches are produced
   * in order of increasing start index and, for a given start, increasing
   * length.
   *
   * @param span the span whose tokens are matched against the trie
   * @return an iterator over the matching sub-spans
   */
  public ResultIterator lookup(Span span){
    List<TrieMatch> accum=new ArrayList<TrieMatch>();
    for(int i=0;i<span.size();i++){
      lookup(accum,span,i);
    }
    return new MyResultIterator(span,accum);
  }

  /**
   * Walk the trie along the tokens of the span beginning at index start,
   * appending a match to accum for every node on the path that ends a stored
   * word sequence.
   */
  private void lookup(List<TrieMatch> accum,Span span,int start){
    TrieNode node=root;
    // depth is the number of tokens consumed so far, i.e. the match length
    for(int depth=0;node!=null;depth++){
      if(node.endIds!=null){
        accum.add(new TrieMatch(node.endIds,start,depth));
      }
      int position=start+depth;
      if(node.map==null||position>=span.size()){
        // no children to follow, or the span has run out of tokens
        break;
      }
      // a missing child yields null, which ends the walk
      node=node.map.get(span.getToken(position).getValue());
    }
  }

  /**
   * Associate a sequence of words with a given id.
   *
   * Several ids may be attached to the same sequence. Note that an empty
   * array attaches the id to the root node, in which case lookup reports a
   * zero-length match at every start position.
   *
   * @param id the id to record for the sequence
   * @param words the token values making up the sequence, in order
   */
  public void addWords(String id,String[] words){
    TrieNode node=root;
    for(int i=0;i<words.length;i++){
      if(node.map==null){
        // a sorted map keeps the pretty-printed output in a stable order
        node.map=new TreeMap<String,TrieNode>();
      }
      TrieNode kid=node.map.get(words[i]);
      if(kid==null){
        kid=new TrieNode();
        node.map.put(words[i],kid);
      }
      node=kid;
    }
    // associate this id with the last node in the chain
    if(node.endIds==null){
      node.endIds=new ArrayList<String>();
    }
    node.endIds.add(id);
  }

  /** Pretty-print the entire trie, one word per line, indented by depth. */
  @Override
  public String toString(){
    StringBuilder b=new StringBuilder();
    toString(b,0,root);
    return b.toString();
  }

  /** Append one "| " indentation marker per level. */
  private void tab(StringBuilder b,int level){
    for(int tab=0;tab<level;tab++){
      b.append("| ");
    }
  }

  /** Recursively append the children of node, indented to the given level. */
  private void toString(StringBuilder b,int level,TrieNode node){
    if(node.map==null){
      return;
    }
    // iterate entries so each child is fetched once rather than re-looked-up by key
    for(Map.Entry<String,TrieNode> entry:node.map.entrySet()){
      TrieNode kid=entry.getValue();
      tab(b,level);
      b.append("'").append(entry.getKey()).append("'");
      if(kid.endIds!=null){
        b.append(" IDS:");
        for(String id:kid.endIds){
          b.append(" ").append(id);
        }
      }
      b.append("\n");
      toString(b,level+1,kid);
    }
  }

  /**
   * An iterator over matching spans which also returns the ids associated
   * with the span most recently returned.
   */
  public static interface ResultIterator extends Iterator<Span>{
    /** Return a list of the ids associated with the span in the Trie */
    public List<String> getAssociatedIds();
  }

  /** ResultIterator backed by a precomputed collection of matches. */
  private static class MyResultIterator implements ResultIterator{
    private final Iterator<TrieMatch> i;
    private final Span span;
    // ids of the match most recently returned by next(); null before the first call
    private List<String> lastIdList;

    public MyResultIterator(Span span,Collection<TrieMatch> c){
      this.span=span;
      this.i=c.iterator();
    }

    @Override
    public boolean hasNext(){
      return i.hasNext();
    }

    /** Delegates to the iterator of the underlying match collection. */
    @Override
    public void remove(){
      i.remove();
    }

    /**
     * Returns the id list of the match last returned by next(), or null if
     * next() has not been called yet. The list is the one stored in the trie,
     * not a copy.
     */
    @Override
    public List<String> getAssociatedIds(){
      return lastIdList;
    }

    @Override
    public Span next(){
      TrieMatch match=i.next();
      lastIdList=match.endIds;
      return span.subSpan(match.start,match.length);
    }
  }

  /**
   * Command-line demo: every argument except the last is tokenized and added
   * to a trie under the id "argv" plus its index; the last argument is loaded
   * as a document and every match found in it is printed.
   */
  public static void main(String[] argv){
    if(argv.length==0){
      // without this guard the document lookup below would index argv[-1]
      System.err.println("usage: java edu.cmu.minorthird.text.Trie [phrase ...] documentText");
      return;
    }
    BasicTextBase base=new BasicTextBase();
    Trie trie=new Trie();
    for(int i=0;i<argv.length-1;i++){
      trie.addWords(("argv"+i),base.getTokenizer().splitIntoTokens(argv[i]));
    }
    System.out.println(trie.toString());
    base.loadDocument("span",argv[argv.length-1]);
    for(Iterator<Span> i=trie.lookup(base.documentSpan("span"));i.hasNext();){
      System.out.println("match: "+i.next().asString());
    }
  }
}