/** * */ package arkref.data; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import arkref.parsestuff.U; import arkref.sent.SentenceBreaker; import com.aliasi.util.Arrays; import edu.stanford.nlp.trees.Tree; /** * Our notion of a sentence, integrating information from all subsystems (or at least maintaining pointers to those data structures) * * Note that this is different than sent.SentenceBreaker.Sentence, which only contains early-stage surface information about a sentence. * * @author brendano * */ public class Sentence implements Serializable { private static final long serialVersionUID = -921962840824846212L; public List<Word> words; private Map<String,Word> node2wordMap; private Tree rootNode; public boolean hasParse; /** optional: more surface info **/ public SentenceBreaker.Sentence surfSent = null; private int id; public Sentence(int id) { this.id = id; words=new ArrayList<Word>(); node2wordMap=new HashMap<String,Word>(); } public void setStuff(Tree root, String neTagging, boolean parseSuccess) { this.setRootNode(root); //String[] neTaggedWords = neTagging.split(" "); //simple whitespace splitting doesn't always work //because of the way PTB tokenizes e.g., 16 2/3 as a single token. //PTB sucks. List<String> neTaggedWords = new ArrayList<String>(); Pattern p = Pattern.compile("(\\S+/\\S+)\\s"); Matcher m = p.matcher(neTagging+" "); while(m.find()){ neTaggedWords.add(m.group(1)); } List<Tree> leaves = root.getLeaves(); if ( !(!parseSuccess || neTaggedWords.size() == leaves.size())) { U.pf("WARNING parser and SST tokenizers disagree on length %d vs %d\nPARSER: %s\nSST: %s\n", leaves.size(), neTaggedWords.size(), leaves, StringUtils.join(neTaggedWords," ")); } // assert !parseSuccess || neTaggedWords.length == leaves.size(); for (int i=0; i < neTaggedWords.size(); i++) { Word word = new Word(); word.sentence = this; String[] parts = neTaggedWords.get(i).split("/"); word.setNeTag(parts[parts.length-1]); String sstToken = StringUtils.join(ArrayUtils.subarray(parts, 0, parts.length-1), "/"); if (parseSuccess) { word.setNode(leaves.get(i)); // assert sstToken.equals( word.node().value() ) : String.format("SST and parser tokens disagree: [%s] vs [%s]", word.token, word.node().value()); if (! sstToken.equals( word.node().value() )) { System.out.println(String.format("SST and parser tokens disagree: [%s] vs [%s]", word.token, word.node().value())); } set_node2word(word.node(), word); } word.token = sstToken.replace("\\/", "/"); words.add(word); } } public Word node2word(Tree node) { String key = nodeKey(node); return node2wordMap.get(key); } public String nodeKey(Tree node) { return String.format("node_%s_%s", rootNode().leftCharEdge(node), node.hashCode()); } public void set_node2word(Tree node, Word w) { String key = nodeKey(node); node2wordMap.put(key, w); } public String neType(Tree leaf) { assert leaf.isLeaf(); Word w = node2word(leaf); // if (w == null) return null; return w.ssTag(); } public String text() { // oops we don't have original anymore. but maybe we don't want it. ArrayList<String> toks = new ArrayList<String>(); for (Tree L : rootNode().getLeaves()) { toks.add(L.label().toString()); } return StringUtils.join(toks, " "); } public String[] tokens() { List<Tree> leaves = rootNode().getLeaves(); String[] toks = new String[leaves.size()]; for (int i=0; i<leaves.size(); i++) toks[i] = leaves.get(i).value(); return toks; } public Tree rootNode() { return rootNode; } public int ID() { return id; } public void setRootNode(Tree rootNode) { this.rootNode = rootNode; } }