DiskTreebank.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.*;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasIndex;

/**
 * A <code>DiskTreebank</code> is a <code>Collection</code> of
 * <code>Tree</code>s.
 * A <code>DiskTreebank</code> object stores merely the information to
 * get at a corpus of trees that is stored on disk.  Access is usually
 * via apply()'ing a TreeVisitor to each Tree in the Treebank or by using
 * an iterator() to get an iteration over the Trees.
 * <p/>
 * If the root Label of the Tree objects built by the TreeReader
 * implements HasIndex, then the filename and index of the tree in
 * a corpus will be inserted as they are read in.
 *
 * @author Christopher Manning
 * @author Spence Green
 */
public final class DiskTreebank extends Treebank  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(DiskTreebank.class);

  private static boolean PRINT_FILENAMES = false;

  private final List<File> filePaths = new ArrayList<>();
  private final List<FileFilter> fileFilters = new ArrayList<>();

  /*
   * Absolute path of the file currently being read.
   */
  private String currentFilename; // = null;


  /**
   * Create a new DiskTreebank. The trees are made with a <code>LabeledScoredTreeReaderFactory</code>.
   */
  public DiskTreebank() {
    this(new LabeledScoredTreeReaderFactory());
  }

  /**
   * Create a new treebank, set the encoding for file access.
   *
   * @param encoding The charset encoding to use for treebank file decoding
   */
  public DiskTreebank(String encoding) {
    this(new LabeledScoredTreeReaderFactory(), encoding);
  }

  /**
   * Create a new DiskTreebank.
   *
   * @param trf the factory class to be called to create a new
   *            <code>TreeReader</code>
   */
  public DiskTreebank(TreeReaderFactory trf) {
    super(trf);
  }

  /**
   * Create a new DiskTreebank.
   *
   * @param trf      the factory class to be called to create a new
   *                 <code>TreeReader</code>
   * @param encoding The charset encoding to use for treebank file decoding
   */
  public DiskTreebank(TreeReaderFactory trf, String encoding) {
    super(trf, encoding);
  }

  /**
   * Create a new Treebank. The trees are made with a <code>LabeledScoredTreeReaderFactory</code>.
   *
   * @param initialCapacity The initial size of the underlying Collection.
   *                        For a <code>DiskTreebank</code>, this parameter is ignored.
   */
  public DiskTreebank(int initialCapacity) {
    this(initialCapacity, new LabeledScoredTreeReaderFactory());
  }

  /**
   * Create a new Treebank.
   *
   * @param initialCapacity The initial size of the underlying Collection,
   *                        For a <code>DiskTreebank</code>, this parameter is ignored.
   * @param trf             the factory class to be called to create a new
   *                        <code>TreeReader</code>
   */
  public DiskTreebank(int initialCapacity, TreeReaderFactory trf) {
    this(trf);
  }

  /**
   * Empty a <code>Treebank</code>.
   */
  @Override
  public void clear() {
    filePaths.clear();
    fileFilters.clear();
  }

  /**
   * Load trees from given directory.  This version just records
   * the paths to be processed, and actually processes them at apply time.
   *
   * @param path file or directory to load from
   * @param filt a FilenameFilter of files to load
   */
  @Override
  public void loadPath(File path, FileFilter filt) {
    if(path.exists()) {
      filePaths.add(path);
      fileFilters.add(filt);
    } else {
      System.err.printf("%s: File/path %s does not exist. Skipping.%n" , this.getClass().getName(), path.getPath());
    }
  }

  /**
   * Applies the TreeVisitor to to all trees in the Treebank.
   *
   * @param tp A class that can process trees.
   */
  @Override
  public void apply(final TreeVisitor tp) {
    for (Tree t : this) {
      tp.visitTree(t);
    }
  }

  /**
   * Returns the absolute path of the file currently being read.
   *
   */
  public String getCurrentFilename() {
    return currentFilename;
  }

  public List<File> getCurrentPaths() {
    return Collections.unmodifiableList(filePaths);
  }

  public void printFileNames() {
    PRINT_FILENAMES = true;
  }

  private class DiskTreebankIterator implements Iterator<Tree> {

    private TreeReader tr = null;
    private Tree storedTree = null;  // null means iterator is exhausted (or not yet constructed)

    //Create local copies so that calls to loadPath() in the parent class
    //don't cause exceptions i.e., this iterator is valid over the state of DiskTreebank
    //when the iterator is created.
    private final List<File> localPathList;
    private final List<FileFilter> localFilterList;
    private int fileListPtr = 0;

    private File currentFile;
    private int curLineId = 1;

    private List<File> curFileList;
    private Iterator<File> curPathIter;

    private DiskTreebankIterator() {
      localPathList = new ArrayList<>(filePaths);
      localFilterList = new ArrayList<>(fileFilters);

      if(primeNextPath() && primeNextFile())
        storedTree = primeNextTree();
    }

    //In the case of a recursive file filter, performs a BFS through the directory structure.
    private boolean primeNextPath() {
      while(fileListPtr < localPathList.size() && fileListPtr < localFilterList.size()) {
        final File nextPath = localPathList.get(fileListPtr);
        final FileFilter nextFilter = localFilterList.get(fileListPtr);
        fileListPtr++;

        final List<File> pathListing = ((nextPath.isDirectory()) ?
                                        Arrays.asList(nextPath.listFiles(nextFilter)) : Collections.singletonList(nextPath));

        if(pathListing != null) {
          if(pathListing.size() > 1) Collections.sort(pathListing);

          curFileList = new ArrayList<>();
          for(File path : pathListing) {
            if(path.isDirectory()) {
              localPathList.add(path);
              localFilterList.add(nextFilter);
            } else {
              curFileList.add(path);
            }
          }

          if(curFileList.size() != 0) {
            curPathIter = curFileList.iterator();
            return true;
          }
        }
      }

      return false;
    }

    private boolean primeNextFile() {
      try {
        if(curPathIter.hasNext() || (primeNextPath() && curPathIter.hasNext())) {
          currentFile = curPathIter.next();
          currentFilename = currentFile.getAbsolutePath();
          if(PRINT_FILENAMES) log.info(currentFile);

          if (tr != null) { tr.close(); }
          tr = treeReaderFactory().newTreeReader(IOUtils.readerFromFile(currentFile, encoding()));
          curLineId = 1;
          return true;
        }

      } catch (UnsupportedEncodingException e) {
        System.err.printf("%s: Filesystem does not support encoding:%n%s%n", this.getClass().getName(), e.toString());
        throw new RuntimeException(e);
      } catch (FileNotFoundException e) {
        System.err.printf("%s: File does not exist:%n%s%n", this.getClass().getName(),e.toString());
        throw new RuntimeException(e);
      } catch (IOException e) {
        System.err.printf("%s: Unable to close open tree reader:%n%s%n", this.getClass().getName(),currentFile.getPath());
        throw new RuntimeException(e);
      }
      return false;
    }

    private Tree primeNextTree() {
      Tree t = null;

      try {
        t = tr.readTree();
        if(t == null && primeNextFile()) //Current file is exhausted
          t = tr.readTree();

        //Associate this tree with a file and line number
        if(t != null && t.label() != null && t.label() instanceof HasIndex) {
          HasIndex lab = (HasIndex) t.label();
          lab.setSentIndex(curLineId++);
          lab.setDocID(currentFile.getName());
        }

      } catch (IOException e) {
        System.err.printf("%s: Error reading from file %s:%n%s%n", this.getClass().getName(), currentFile.getPath(), e.toString());
        throw new RuntimeException(e);
      }

      return t;
    }

    /**
     * Returns true if the iteration has more elements.
     */
    @Override
    public boolean hasNext() { return storedTree != null; }

    /**
     * Returns the next element in the iteration.
     */
    @Override
    public Tree next() {
      if(storedTree == null)
        throw new NoSuchElementException();

      Tree ret = storedTree;
      storedTree = primeNextTree();
      return ret;
    }

    /**
     * Not supported
     */
    @Override
    public void remove() { throw new UnsupportedOperationException(); }
  }


  /**
   * Return an Iterator over Trees in the Treebank.  This is implemented
   * by building per-file MemoryTreebanks for the files in the
   * DiskTreebank.  As such, it isn't as efficient as using
   * <code>apply()</code>.
   */
  @Override
  public Iterator<Tree> iterator() {
    return new DiskTreebankIterator();
  }

}