package edu.stanford.nlp.trees; import edu.stanford.nlp.process.Tokenizer; import edu.stanford.nlp.objectbank.TokenizerFactory; import edu.stanford.nlp.ling.HasIndex; import java.io.*; import java.util.*; /** * A <code>PennTreeReader</code> is a <code>TreeReader</code> that * reads in Penn Treebank-style files. Example usage: * <br> * <code> TreeReader tr = new PennTreeReader(new BufferedReader(new * InputStreamReader(new FileInputStream(file),"UTF-8")), * myTreeFactory); </code> * * @author Christopher Manning * @author Roger Levy * @version 2003/01 */ public class PennTreeReader implements TreeReader { private Reader in; private Tokenizer<String> st; private TreeNormalizer tn; private TreeFactory tf; private static final boolean DEBUG = false; /** * Read parse trees from a <code>Reader</code>. * For the defaulted arguments, you get a * <code>SimpleTreeFactory</code>, no <code>TreeNormalizer</code>, and * a <code>PennTreebankTokenizer</code>. * * @param in The <code>Reader</code> */ public PennTreeReader(Reader in) { this(in, new SimpleTreeFactory()); } /** * Read parse trees from a <code>Reader</code>. * * @param in the Reader * @param tf TreeFactory -- factory to create some kind of Tree */ public PennTreeReader(Reader in, TreeFactory tf) { this(in, tf, null, new PennTreebankTokenizer(in)); } /** * Read parse trees from a <code>Reader</code>. * * @param in The Reader * @param st The Tokenizer */ public PennTreeReader(Reader in, Tokenizer<String> st) { this(in, new SimpleTreeFactory(), null, st); } /** * Read parse trees from a Reader. * * @param in Reader * @param tf TreeFactory -- factory to create some kind of Tree * @param tn the method of normalizing trees */ public PennTreeReader(Reader in, TreeFactory tf, TreeNormalizer tn) { this(in, tf, tn, new PennTreebankTokenizer(in)); } /** * Read parse trees from a Reader. * * @param in Reader * @param tf TreeFactory -- factory to create some kind of Tree * @param tn the method of normalizing trees * @param st Tokenizer that divides up Reader */ public PennTreeReader(Reader in, TreeFactory tf, TreeNormalizer tn, Tokenizer<String> st) { this.in = in; this.tf = tf; this.tn = tn; this.st = st; // check for whacked out headers still present in Brown corpus in Treebank 3 String first = st.peek(); if (first != null && first.startsWith("*x*x*x")) { if (DEBUG) { System.err.println("PennTreeReader: skipping past whacked out header."); } int foundCount = 0; while (foundCount < 4 && st.hasNext()) { first = st.next(); if (first != null && first.startsWith("*x*x*x")) { foundCount++; } } } if (DEBUG) { System.err.println("Built PennTreeReader from " + in.getClass().getName() + " " + ((tf == null) ? "no tf" : tf.getClass().getName()) + " " + ((tn == null) ? "no tn" : tn.getClass().getName()) + " " + ((st == null) ? "no st" : st.getClass().getName())); } } private int wordIndex; /** * Reads a single tree in standard Penn Treebank format, * with or without an additional set of parens around it (an unnamed * ROOT node). If the token stream ends before the current tree is complete, a * {@link java.util.NoSuchElementException} will get thrown from * deep within the innards of this method. * * @return A single tree, or <code>null</code> at end of token stream. */ public Tree readTree() throws IOException { Tree tr = null; while (tr == null) { if (!st.hasNext()) { return null; } tr = readTreeHelper(); if (DEBUG) { if (tr == null) { System.err.println("readTreeHelper returned null tree; continuing."); } } } return tr; } private Tree readTreeHelper() throws IOException { wordIndex = 0; Tree tr = readTree(st.next()); if (tr == null || tn == null) { return tr; } else { return tn.normalizeWholeTree(tr, tf); } } private Tree readTree(String token) throws IOException { if (DEBUG) { System.out.println("readTree() next token " + token); } String name; // a paren starts new tree, a string is a leaf symbol, // o.w. IO exception if (token == null) { return null; } else if (token.equals(")")) { System.err.println("Expecting start of tree; found surplus close parenthesis ')'. Ignoring it."); return null; } else if (token.equals("(")) { // looks at next name = st.peek(); if (DEBUG) { System.out.println(" peeked is \"" + name+ '\"'); } // checks if it's a normal string and returns it as the label if (name.equals("(") || name.equals(")")) { name = null; } else { // get it for real name = st.next(); } if (tn != null) { name = tn.normalizeNonterminal(name); // we used to .intern(); } return tf.newTreeNode(name, readTrees()); } else { if (tn != null) { name = tn.normalizeTerminal(token); // was: .intern(); } else { name = token; } Tree leaf = tf.newLeaf(name); if (leaf.label() instanceof HasIndex) { HasIndex hi = (HasIndex) leaf.label(); hi.setIndex(wordIndex); } wordIndex++; return leaf; } } /** * Parse sequence of trees, followed by a single right paren. */ private List<Tree> readTrees() throws IOException { // allocate array list for temporarily storing trees List<Tree> parseTrees = new ArrayList<Tree>(); // until a paren closes all subtrees, keep reading trees String nextToken = null; String fullToken = ""; while (st.hasNext()) { nextToken = st.next(); if (nextToken.equals(")")) { break; } else if (nextToken.equals("(")) { if (!fullToken.equals("")) { parseTrees.add(readTree(fullToken)); fullToken = ""; } parseTrees.add(readTree(nextToken)); } else { fullToken += (fullToken.equals("") ? "" : " ") + nextToken; } } if (! ")".equals(nextToken)) { throw(new IOException("Expecting right paren found eof")); } if (!fullToken.equals("")) { parseTrees.add(readTree(fullToken)); } return parseTrees; } /** * Close the Reader behind this <code>TreeReader</code>. */ public void close() throws IOException { in.close(); } public static TokenizerFactory<Tree> tokenizerFactory(final TreeFactory tf, final TreeNormalizer tn, final Tokenizer<String> stringTokenizer) { return new TreeTokenizerFactory(new TreeReaderFactory() { public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in,tf,tn,stringTokenizer); } }); } /* private static class TreeTokenizerFactory implements TokenizerFactory<Tree> { TreeFactory tf; TreeNormalizer tn; Tokenizer t; public TreeTokenizerFactory(TreeFactory tf, TreeNormalizer tn, Tokenizer t) { this.tf = tf; this.tn = tn; this.t = t; } public Tokenizer<Tree> getTokenizer(final Reader r) { return new AbstractTokenizer<Tree>() { PennTreeReader tr = new PennTreeReader(r,tf,tn,t); public Tree getNext() { try { return tr.readTree(); } catch(IOException e) { System.err.println("Error in reading tree."); return null; } } }; } public Iterator<Tree> getIterator(Reader r) { return getTokenizer(r); } } */ /** * Returns an iterator over Trees which is backed by this PennTreeReader. * Warning: any IOExceptions which would normally be thrown are turned * into RuntimeExceptions. */ public Iterator<Tree> asTreeIterator() { return new Iterator<Tree>() { private Tree next = advance(); public boolean hasNext() { return next != null; } public Tree next() { if (next == null) { throw new NoSuchElementException("PennTreeReader exhausted"); } Tree t = next; next = advance(); return t; } public void remove() { throw new UnsupportedOperationException(); } private Tree advance() { Tree t = readTreeThrowRuntime(); if (t == null) closeThrowRuntime(); return t; } }; } private Tree readTreeThrowRuntime() { Tree t; try { t = readTree(); } catch (IOException e) { throw new RuntimeException(e); } return t; } private void closeThrowRuntime() { try { close(); } catch (IOException e) { throw new RuntimeException(e); } } /** * Loads treebank data from first argument and prints it. * * @param args Array of command-line arguments: specifies a filename */ public static void main(String[] args) { try { TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")); TreeReader tr = new PennTreeReader(r, tf); Tree t = tr.readTree(); while (t != null) { System.out.println(t); System.out.println(); t = tr.readTree(); } r.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } }