package edu.berkeley.nlp.io; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import edu.berkeley.nlp.syntax.Tree; import edu.berkeley.nlp.syntax.Trees; import edu.berkeley.nlp.util.ConcatenationIterator; /** * @author Dan Klein */ public class PennTreebankReader { static class TreeCollection extends AbstractCollection<Tree<String>> { List<File> files; Charset charset; static class TreeIteratorIterator implements Iterator<Iterator<Tree<String>>> { Iterator<File> fileIterator; Iterator<Tree<String>> nextTreeIterator; Charset charset; BufferedReader currentFileReader, lastReader, readerToClose; public boolean hasNext() { return nextTreeIterator != null; } public Iterator<Tree<String>> next() { Iterator<Tree<String>> currentTreeIterator = nextTreeIterator; advance(); return currentTreeIterator; } public void remove() { throw new UnsupportedOperationException(); } private void advance() { nextTreeIterator = null; while (nextTreeIterator == null && fileIterator.hasNext()) { File file = fileIterator.next(); // System.out.println(file); try { if (readerToClose!=null) { // System.out.println("closing "+lastReader.toString()); readerToClose.close(); } readerToClose = lastReader; lastReader = currentFileReader; // currentFileReader = new BufferedReader( // new InputStreamReader(new FileInputStream(file), this.charset)); // nextTreeIterator = new Trees.PennTreeReader(currentFileReader); nextTreeIterator = new Trees.PennTreeReader(new BufferedReader( new InputStreamReader(new FileInputStream(file), this.charset))); } catch (FileNotFoundException e) { } catch (UnsupportedCharsetException e) { throw new Error("Unsupported charset in file "+file.getPath()); } catch (IOException e) { new Error("Error closing file handle"); } } if (readerToClose!=null){ try { readerToClose.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } TreeIteratorIterator(List<File> files, Charset charset) { this.fileIterator = files.iterator(); this.charset = charset; advance(); } } @Override public Iterator<Tree<String>> iterator() { return new ConcatenationIterator<Tree<String>>(new TreeIteratorIterator(files, this.charset)); } @Override public int size() { int size = 0; Iterator i = iterator(); while (i.hasNext()) { size++; i.next(); } return size; } private List<File> getFilesUnder(String path, FileFilter fileFilter) { File root = new File(path); List<File> files = new ArrayList<File>(); addFilesUnder(root, files, fileFilter); return files; } private void addFilesUnder(File root, List<File> files, FileFilter fileFilter) { if (! fileFilter.accept(root)) return; if (root.isFile()) { files.add(root); return; } if (root.isDirectory()) { SortedSet<File> children = new TreeSet<File>(Arrays.asList(root.listFiles())); for (File child : children) { addFilesUnder(child, files, fileFilter); } } } public TreeCollection(String path, int lowFileNum, int highFileNum, Charset charset) { FileFilter fileFilter = new NumberRangeFileFilter(".mrg", lowFileNum, highFileNum, true); this.files = getFilesUnder(path, fileFilter); // for (File f : files) System.out.println(f.toString()); this.charset = charset; } public TreeCollection(String path, int lowFileNum, int highFileNum, String charsetName) { this(path,lowFileNum,highFileNum,Charset.forName(charsetName)); } public TreeCollection(String path, int lowFileNum, int highFileNum) { this(path,lowFileNum,highFileNum,Charset.defaultCharset()); } } public static Collection<Tree<String>> readTrees(String path, Charset charset) { return readTrees(path, -1, Integer.MAX_VALUE, charset); } public static Collection<Tree<String>> readTrees(String path, int lowFileNum, int highFileNumber, Charset charset) { return new TreeCollection(path, lowFileNum, highFileNumber, charset); } public static void main(String[] args) { Collection<Tree<String>> trees = readTrees(args[0], Charset.defaultCharset()); for (Tree<String> tree : trees) { tree = (new Trees.StandardTreeNormalizer()).transformTree(tree); System.out.println(Trees.PennTreeRenderer.render(tree)); } } }