package edu.berkeley.nlp.treebank; import edu.berkeley.nlp.syntax.Tree; import edu.berkeley.nlp.syntax.Trees; import edu.berkeley.nlp.util.ConcatenationIterator; import java.util.*; import java.io.*; /** * @author Dan Klein */ public class PennTreebankReader { static class TreeCollection extends AbstractCollection<Tree<String>> { List<File> files; static class TreeIteratorIterator implements Iterator<Iterator<Tree<String>>> { Iterator<File> fileIterator; Iterator<Tree<String>> nextTreeIterator; public boolean hasNext() { return nextTreeIterator != null; } public Iterator<Tree<String>> next() { Iterator<Tree<String>> currentTreeIterator = nextTreeIterator; advance(); return currentTreeIterator; } public void remove() { throw new UnsupportedOperationException(); } private void advance() { nextTreeIterator = null; while (nextTreeIterator == null && fileIterator.hasNext()) { try { File file = fileIterator.next(); nextTreeIterator = new Trees.PennTreeReader(new BufferedReader(new FileReader(file))); } catch (FileNotFoundException e) { } } } TreeIteratorIterator(List<File> files) { this.fileIterator = files.iterator(); advance(); } } public Iterator<Tree<String>> iterator() { return new ConcatenationIterator<Tree<String>>(new TreeIteratorIterator(files)); } public int size() { int size = 0; Iterator i = iterator(); while (i.hasNext()) { size++; i.next(); } return size; } private List<File> getFilesUnder(String path, FileFilter fileFilter) { File root = new File(path); List<File> files = new ArrayList<File>(); addFilesUnder(root, files, fileFilter); return files; } private void addFilesUnder(File root, List<File> files, FileFilter fileFilter) { if (! fileFilter.accept(root)) return; if (root.isFile()) { files.add(root); return; } if (root.isDirectory()) { File[] children = root.listFiles(); for (int i = 0; i < children.length; i++) { File child = children[i]; addFilesUnder(child, files, fileFilter); } } } public TreeCollection(String path, int lowFileNum, int highFileNum) { this(path,lowFileNum,highFileNum,".mrg"); } public TreeCollection(String path, int lowFileNum, int highFileNum, String suffix) { FileFilter fileFilter = new NumberRangeFileFilter(suffix, lowFileNum, highFileNum, true); this.files = getFilesUnder(path, fileFilter); Collections.sort(this.files); } } public static Collection<Tree<String>> readTrees(String path) { return readTrees(path, -1, Integer.MAX_VALUE); } public static Collection<Tree<String>> readTrees(String path, int lowFileNum, int highFileNumber) { return new TreeCollection(path, lowFileNum, highFileNumber); } public static void main(String[] args) { Collection<Tree<String>> trees = readTrees(args[0]); for (Tree<String> tree : trees) { tree = (new Trees.StandardTreeNormalizer()).transformTree(tree); System.out.println(Trees.PennTreeRenderer.render(tree)); } } }