// TreebankAnnotator.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.WordFactory;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.Generics;

import java.io.Reader;
import java.util.*;

// todo [cdm 2014]: This class is all but dead. Delete it.
/**
 * Class for getting an annotated treebank.
 *
 * @author Dan Klein
 */
public class TreebankAnnotator {

  final TreeTransformer treeTransformer;
  final TreeTransformer treeUnTransformer;
  final TreeTransformer collinizer;
  final Options op;

  public List<Tree> annotateTrees(List<Tree> trees) {
    List<Tree> annotatedTrees = new ArrayList<>();
    for (Tree tree : trees) {
      annotatedTrees.add(treeTransformer.transformTree(tree));
    }
    return annotatedTrees;
  }

  public List<Tree> deannotateTrees(List<Tree> trees) {
    List<Tree> deannotatedTrees = new ArrayList<>();
    for (Tree tree : trees) {
      deannotatedTrees.add(treeUnTransformer.transformTree(tree));
    }
    return deannotatedTrees;
  }


  public static List<Tree> getTrees(String path, int low, int high, int minLength, int maxLength) {
    Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new WordFactory()), new BobChrisTreeNormalizer()));
    treebank.loadPath(path, new NumberRangeFileFilter(low, high, true));
    List<Tree> trees = new ArrayList<>();
    for (Tree tree : treebank) {
      if (tree.yield().size() <= maxLength && tree.yield().size() >= minLength) {
        trees.add(tree);
      }
    }
    return trees;
  }

  public static List<Tree> removeDependencyRoots(List<Tree> trees) {
    List<Tree> prunedTrees = new ArrayList<>();
    for (Tree tree : trees) {
      prunedTrees.add(removeDependencyRoot(tree));
    }
    return prunedTrees;
  }

  static Tree removeDependencyRoot(Tree tree) {
    List<Tree> childList = tree.getChildrenAsList();
    Tree last = childList.get(childList.size() - 1);
    if (!last.label().value().equals(Lexicon.BOUNDARY_TAG)) {
      return tree;
    }
    List<Tree> lastGoneList = childList.subList(0, childList.size() - 1);
    tree.setChildren(lastGoneList);
    return tree;
  }

  public Tree collinize(Tree tree) {
    return collinizer.transformTree(tree);
  }

  public TreebankAnnotator(Options op, String treebankRoot) {
    //    op.tlpParams = new EnglishTreebankParserParams();
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones.  Otherwise need training data.
    // op.trainOptions.splitters = Generics.newHashSet(Arrays.asList(op.tlpParams.splitters()));
    op.trainOptions.splitters = ParentAnnotationStats.getEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    op.setOptions("-acl03pcfg", "-cnf");
    treeTransformer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(op.tlpParams);
    treeUnTransformer = new Debinarizer(op.forceCNF);
    collinizer = op.tlpParams.collinizer();
    this.op = op;
  }


  public static void main(String[] args) {
    CategoryWordTag.printWordTag = false;
    String path = args[0];
    List<Tree> trees = getTrees(path, 200, 219, 0, 10);
    trees.iterator().next().pennPrint();
    Options op = new Options();
    List<Tree> annotatedTrees = TreebankAnnotator.removeDependencyRoots(new TreebankAnnotator(op, path).annotateTrees(trees));
    annotatedTrees.iterator().next().pennPrint();
  }

}