package edu.stanford.nlp.international.spanish.pipeline;

import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.spanish.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;

import java.io.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.function.Predicate;

/**
 * A tool which accepts raw AnCora-3.0 Spanish XML files and produces
 * normalized / pre-processed PTB-style treebanks for use with CoreNLP
 * tools.
 *
 * This is a substitute for an awkward and complicated string of
 * command-line invocations. The produced corpus is the standard
 * treebank which has been used to train the CoreNLP Spanish models.
 *
 * The preprocessing steps performed here include:
 *
 * - Expansion and automatic tagging of multi-word tokens (see
 *   {@link MultiWordPreprocessor},
 *   {@link SpanishTreeNormalizer#normalizeForMultiWord(Tree, TreeFactory)})
 * - Heuristic parsing of expanded multi-word tokens (see
 *   {@link MultiWordTreeExpander})
 * - Splitting of elided forms (<em>al</em>, <em>del</em>,
 *   <em>conmigo</em>, etc.) and clitic pronouns from verb forms (see
 *   {@link SpanishTreeNormalizer#expandElisions(Tree)},
 *   {@link SpanishTreeNormalizer#expandCliticPronouns(Tree)})
 * - Miscellaneous cleanup of parse trees, spelling fixes, parsing
 *   error corrections (see {@link SpanishTreeNormalizer})
 *
 * Apart from raw corpus data, this processor depends upon unigram
 * part-of-speech tag data. If not provided explicitly to the
 * processor, the data will be collected from the given files. (You can
 * pre-compute POS data from AnCora XML using {@link AnCoraPOSStats}.)
 *
 * For invocation options, execute the class with no arguments.
 *
 * @author Jon Gauthier
 */
public class AnCoraProcessor {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(AnCoraProcessor.class);

  private final List<File> inputFiles;
  private final Properties options;

  private final TwoDimensionalCounter<String, String> unigramTagger;

  @SuppressWarnings("unchecked")
  public AnCoraProcessor(List<File> inputFiles, Properties options)
      throws IOException, ClassNotFoundException {
    this.inputFiles = inputFiles;
    this.options = options;

    if (options.containsKey("unigramTagger")) {
      // Try-with-resources so the stream is closed even if deserialization fails
      try (ObjectInputStream ois = new ObjectInputStream(
          new FileInputStream(options.getProperty("unigramTagger")))) {
        unigramTagger = (TwoDimensionalCounter<String, String>) ois.readObject();
      }
    } else {
      unigramTagger = new TwoDimensionalCounter<>();
    }
  }

  public List<Tree> process() throws InterruptedException, IOException, ExecutionException {
    // Each of the following subroutines is multithreaded; there is a serial
    // bottleneck between the method calls
    List<Tree> trees = loadTrees();
    trees = fixMultiWordTokens(trees);

    return trees;
  }
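  /*
   * A minimal sketch of programmatic use (the file paths below are
   * hypothetical; any unigram tagger is built from the input files when the
   * `unigramTagger` option is absent):
   *
   *   Properties options = new Properties();
   *   List<File> files = Arrays.asList(
   *       new File("/path/to/ancora-1.xml"), new File("/path/to/ancora-2.xml"));
   *   List<Tree> trees = new AnCoraProcessor(files, options).process();
   */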
  /**
   * Use {@link SpanishXMLTreeReader} to load the trees from the provided files,
   * and begin collecting some statistics to be used in later MWE cleanup.
   *
   * NB: Much of the important cleanup happens implicitly here; the XML tree
   * reader triggers the tree normalization routine.
   */
  private List<Tree> loadTrees()
      throws InterruptedException, IOException, ExecutionException {
    boolean ner = PropertiesUtils.getBool(options, "ner", false);
    final String encoding = new SpanishTreebankLanguagePack().getEncoding();

    final SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(true, true, ner, false);

    List<Tree> trees = new ArrayList<>();
    for (File file : inputFiles) {
      Pair<TwoDimensionalCounter<String, String>, List<Tree>> ret =
          processTreeFile(file, trf, encoding);
      if (ret == null)  // I/O failure was already reported; skip this file
        continue;

      Counters.addInPlace(unigramTagger, ret.first());
      trees.addAll(ret.second());
    }

    return trees;
  }

  /**
   * Processes a single file containing AnCora XML trees. Returns MWE statistics
   * for the trees in the file and the actual parsed trees, or {@code null} if
   * the file could not be read.
   */
  private static Pair<TwoDimensionalCounter<String, String>, List<Tree>> processTreeFile(
      File file, SpanishXMLTreeReaderFactory trf, String encoding) {

    TwoDimensionalCounter<String, String> tagger = new TwoDimensionalCounter<>();

    try {
      Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
      TreeReader tr = trf.newTreeReader(file.getPath(), in);

      List<Tree> trees = new ArrayList<>();
      Tree t, splitPoint;

      while ((t = tr.readTree()) != null) {
        // We may need to split the current tree into multiple parts.
        // (If not, a call to `split` with a `null` split point is a no-op.)
        do {
          splitPoint = findSplitPoint(t);
          Pair<Tree, Tree> split = split(t, splitPoint);

          Tree toAdd = split.first();
          t = split.second();

          trees.add(toAdd);
          updateTagger(tagger, toAdd);
        } while (splitPoint != null);
      }

      tr.close();

      return new Pair<>(tagger, trees);
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }

  private static void updateTagger(TwoDimensionalCounter<String, String> tagger, Tree t) {
    List<CoreLabel> yield = t.taggedLabeledYield();
    for (CoreLabel label : yield) {
      if (label.tag().equals(SpanishTreeNormalizer.MW_TAG))
        continue;

      tagger.incrementCount(label.word(), label.tag());
    }
  }

  private static TreeNormalizer splittingNormalizer = new SpanishSplitTreeNormalizer();
  private static TreeFactory splittingTreeFactory = new LabeledScoredTreeFactory();

  /**
   * Split the given tree based on a split point such that the terminals leading
   * up to the split point are in the left returned tree and those following the
   * split point are in the right returned tree.
   *
   * AnCora contains a nontrivial number of trees with multiple sentences in
   * them. This method is used to break apart these sentences into separate
   * trees.
   *
   * @param t Tree from which to extract a subtree. This may be
   *          modified during processing.
   * @param splitPoint Point up to which to extract. If {@code null},
   *                   {@code t} is returned unchanged in the place of
   *                   the right tree.
   * @return A pair where the left tree contains every terminal leading
   *         up to and including {@code splitPoint} and the right tree
   *         contains every terminal following {@code splitPoint}.
   *         Both trees may be normalized before return.
   */
  static Pair<Tree, Tree> split(Tree t, Tree splitPoint) {
    if (splitPoint == null)
      return new Pair<>(t, null);

    Tree left = t.prune(new LeftOfFilter(splitPoint, t));
    Tree right = t.prune(new RightOfExclusiveFilter(splitPoint, t));

    left = splittingNormalizer.normalizeWholeTree(left, splittingTreeFactory);
    right = splittingNormalizer.normalizeWholeTree(right, splittingTreeFactory);

    return new Pair<>(left, right);
  }
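  /*
   * Schematic example of the split (bracketing simplified, not verbatim
   * AnCora output): given a tree holding two sentences,
   *
   *   (sentence (sn El gato) (grup.verb duerme) (fp .)
   *             (sn El perro) (grup.verb ladra) (fp .))
   *
   * with the first `fp` preterminal as the split point, `split` returns the
   * two single-sentence trees
   *
   *   (sentence (sn El gato) (grup.verb duerme) (fp .))
   *   (sentence (sn El perro) (grup.verb ladra) (fp .))
   *
   * each normalized by the splitting TreeNormalizer above.
   */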
  /**
   * Accepts any tree node to the left of the provided node (or the provided
   * node itself).
   */
  private static class LeftOfFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = -5146948439247427344L;

    private Tree reference;
    private Tree root;

    /**
     * @param reference Node to which nodes provided to this filter
     *                  should be compared
     * @param root Root of the tree which contains the reference node
     *             and all nodes which may be provided to the filter
     */
    private LeftOfFilter(Tree reference, Tree root) {
      this.reference = reference;
      this.root = root;
    }

    @Override
    public boolean test(Tree obj) {
      if (obj == reference || obj.dominates(reference) || reference.dominates(obj))
        return true;

      Tree rightmostDescendant = getRightmostDescendant(obj);
      return Trees.rightEdge(rightmostDescendant, root) <= Trees.leftEdge(reference, root);
    }

    private Tree getRightmostDescendant(Tree t) {
      if (t.isLeaf()) return t;
      else return getRightmostDescendant(t.children()[t.children().length - 1]);
    }
  }

  /**
   * Accepts any tree node to the right of the provided node.
   */
  private static class RightOfExclusiveFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 8283161954004080591L;

    private Tree root;

    // This should be the leftmost terminal node of the filtered tree
    private Tree firstToKeep;

    /**
     * @param reference Node to which nodes provided to this filter
     *                  should be compared
     * @param root Root of the tree which contains the reference node
     *             and all nodes which may be provided to the filter
     */
    private RightOfExclusiveFilter(Tree reference, Tree root) {
      this.root = root;
      firstToKeep = getFollowingTerminal(reference, root);
    }

    @Override
    public boolean test(Tree obj) {
      if (obj.dominates(firstToKeep))
        return true;

      Tree leftmostDescendant = getLeftmostDescendant(obj);
      return Trees.rightEdge(leftmostDescendant, root) > Trees.leftEdge(firstToKeep, root);
    }

    /**
     * Get the terminal node which immediately follows the given node.
     */
    private Tree getFollowingTerminal(Tree terminal, Tree root) {
      Tree sibling = getRightSiblingOrRightAncestor(terminal, root);
      if (sibling == null)
        return null;
      return getLeftmostDescendant(sibling);
    }

    /**
     * Get the right sibling of the given node, or some node which is
     * the right sibling of an ancestor of the given node.
     *
     * If no such node can be found, this method returns {@code null}.
     */
    private Tree getRightSiblingOrRightAncestor(Tree t, Tree root) {
      Tree parent = t.parent(root);
      if (parent == null) return null;

      int idxWithinParent = parent.objectIndexOf(t);
      if (idxWithinParent < parent.numChildren() - 1)
        // Easy case: just return the immediate right sibling
        return parent.getChild(idxWithinParent + 1);

      return getRightSiblingOrRightAncestor(parent, root);
    }

    private Tree getLeftmostDescendant(Tree t) {
      if (t.isLeaf()) return t;
      else return getLeftmostDescendant(t.children()[0]);
    }
  }
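  /*
   * Sketch of how the two filters cooperate inside `split` (hypothetical
   * snippet; `t` is a tree and `splitPoint` one of its preterminals):
   *
   *   Tree left  = t.prune(new LeftOfFilter(splitPoint, t));
   *   Tree right = t.prune(new RightOfExclusiveFilter(splitPoint, t));
   *
   * Tree#prune discards subtrees rejected by the predicate, so the two calls
   * partition the yield of `t` at the split point: `left` keeps the split
   * point and everything before it, `right` keeps everything after it.
   */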
  /**
   * Matches a point in the AnCora corpus which is the delimiter
   * between two sentences.
   *
   * @see #split(Tree, Tree)
   */
  private static final TregexPattern pSplitPoint =
      TregexPattern.compile("fp $+ /^[^f]/ > S|sentence");

  /**
   * Find the next point (preterminal) at which the given tree should
   * be split.
   *
   * @param t Tree to search
   * @return The endpoint of a subtree which should be extracted, or
   *         {@code null} if there are no subtrees which need to be
   *         extracted.
   */
  static Tree findSplitPoint(Tree t) {
    TregexMatcher m = pSplitPoint.matcher(t);
    if (m.find())
      return m.getMatch();
    return null;
  }

  private class MultiWordProcessor
      implements ThreadsafeProcessor<Collection<Tree>, Collection<Tree>> {

    private final TreeNormalizer tn;
    private final Factory<TreeNormalizer> tnf;
    private final TreeFactory tf;

    private final boolean ner;

    // NB: TreeNormalizer is not thread-safe, and so we need to accept + store a
    // TreeNormalizer factory instead
    public MultiWordProcessor(Factory<TreeNormalizer> tnf, TreeFactory tf, boolean ner) {
      this.tnf = tnf;
      this.tn = tnf.create();
      this.tf = tf;
      this.ner = ner;
    }

    @Override
    public Collection<Tree> process(Collection<Tree> coll) {
      List<Tree> ret = new ArrayList<>();

      // Apparently TsurgeonPatterns are not thread safe
      MultiWordTreeExpander expander = new MultiWordTreeExpander();

      for (Tree t : coll) {
        // Begin with basic POS / phrasal category inference
        MultiWordPreprocessor
            .traverseAndFix(t, null, AnCoraProcessor.this.unigramTagger, ner);

        // Now "decompress" further the expanded trees formed by multi-word token splitting
        t = expander.expandPhrases(t, tn, tf);
        t = tn.normalizeWholeTree(t, tf);

        ret.add(t);
      }

      return ret;
    }

    @Override
    public ThreadsafeProcessor<Collection<Tree>, Collection<Tree>> newInstance() {
      return new MultiWordProcessor(tnf, tf, ner);
    }
  }

  /**
   * Fix tree structure, phrasal categories and part-of-speech labels in newly
   * expanded multi-word tokens.
   */
  private List<Tree> fixMultiWordTokens(List<Tree> trees)
      throws InterruptedException, ExecutionException {
    boolean ner = PropertiesUtils.getBool(options, "ner", false);

    // Shared resources
    Factory<TreeNormalizer> tnf = new Factory<TreeNormalizer>() {
      @Override
      public TreeNormalizer create() {
        return new SpanishTreeNormalizer(true, false, false);
      }
    };
    TreeFactory tf = new LabeledScoredTreeFactory();

    ThreadsafeProcessor<Collection<Tree>, Collection<Tree>> processor =
        new MultiWordProcessor(tnf, tf, ner);

    int availableProcessors = Runtime.getRuntime().availableProcessors();
    MulticoreWrapper<Collection<Tree>, Collection<Tree>> wrapper =
        new MulticoreWrapper<>(availableProcessors, processor, false);

    // Chunk our work so that parallelization is actually worth it
    int numChunks = availableProcessors * 20;
    List<List<Tree>> chunked = CollectionUtils.partitionIntoFolds(trees, numChunks);

    List<Tree> ret = new ArrayList<>();
    for (final Collection<Tree> coll : chunked) {
      wrapper.put(coll);

      while (wrapper.peek())
        ret.addAll(wrapper.poll());
    }

    wrapper.join();
    while (wrapper.peek())
      ret.addAll(wrapper.poll());

    return ret;
  }
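  /*
   * The loop above follows the standard MulticoreWrapper protocol. A minimal
   * sketch, assuming a hypothetical ThreadsafeProcessor<String, String>
   * named `myProcessor`:
   *
   *   MulticoreWrapper<String, String> w = new MulticoreWrapper<>(4, myProcessor, false);
   *   for (String s : inputs) {
   *     w.put(s);                                // feed work to the pool
   *     while (w.peek()) results.add(w.poll());  // drain any finished results
   *   }
   *   w.join();                                  // wait for outstanding jobs
   *   while (w.peek()) results.add(w.poll());    // drain the tail
   */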
  private static final String usage =
      String.format("Usage: java %s [OPTIONS] file(s)%n%n", AnCoraProcessor.class.getName()) +
          "Options:\n" +
          "  -unigramTagger <tagger_path>: Path to a serialized `TwoDimensionalCounter` which\n" +
          "    should be used for unigram tagging in multi-word token expansion. If this option\n" +
          "    is not provided, a unigram tagger will be built from the provided corpus data.\n" +
          "    (This option is useful if you are processing splits of the corpus separately but\n" +
          "    want each step to benefit from a complete tagger.)\n" +
          "  -ner: Add NER-specific information to trees\n";

  private static final Map<String, Integer> argOptionDefs = new HashMap<>();
  static {
    argOptionDefs.put("unigramTagger", 1);
    argOptionDefs.put("ner", 0);
  }

  public static void main(String[] args)
      throws InterruptedException, IOException, ExecutionException, ClassNotFoundException {
    if (args.length < 1) {
      log.info(usage);
      return;  // Nothing to process; proceeding would fail on an empty file list
    }

    Properties options = StringUtils.argsToProperties(args, argOptionDefs);
    String[] remainingArgs = options.getProperty("").split(" ");
    List<File> fileList = new ArrayList<>();
    for (String arg : remainingArgs)
      fileList.add(new File(arg));

    AnCoraProcessor processor = new AnCoraProcessor(fileList, options);
    List<Tree> trees = processor.process();

    for (Tree t : trees)
      System.out.println(t);
  }

}
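/*
 * Example command-line invocation (illustrative; the corpus path is
 * hypothetical):
 *
 *   java edu.stanford.nlp.international.spanish.pipeline.AnCoraProcessor \
 *       -ner /path/to/ancora/*.xml > ancora-processed.ptb
 */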