// ArabicTreeNormalizer.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.trees.international.arabic; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import edu.stanford.nlp.international.arabic.pipeline.DefaultLexicalMapper;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.trees.BobChrisTreeNormalizer;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Pair;

/**
 * Normalizes both terminals and non-terminals in Penn Arabic Treebank (ATB)
 * trees. Among the normalizations that can be performed:
 *
 * <ul>
 * <li> Adds the language pack's start symbol as a root node on top of every tree
 * <li> Strips all the interesting stuff off of the POS tags.
 * <li> Can keep NP-TMP annotations (retainNPTmp parameter)
 * <li> Can keep NP-SBJ annotations on subjects that are immediate children of
 *           a VP (retainNPSbj parameter)
 * <li> Can keep PP-CLR annotations (retainPPClr parameter)
 * <li> Can keep whatever annotations there are on verbs that are sisters
 *           to predicatively marked (-PRD) elements (markPRDverb parameter)
 *           [Chris Nov 2006: I'm a bit unsure on that one!]
 * <li> Can keep categories unchanged, i.e., not mapped to basic categories
 *           (changeNoLabels parameter)
 * <li> Treats pronoun deletion markers and traces ("-NONE-") as empty
 *           elements and prunes them (see {@link ArabicEmptyFilter})
 * </ul>
 *
 * @author Roger Levy
 * @author Anna Rafferty
 * @author Spence Green
 */
public class ArabicTreeNormalizer extends BobChrisTreeNormalizer  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicTreeNormalizer.class);

  /**
   * Splits a leaf into surface form and morphological analysis. The marker is
   * quoted so that it is matched literally, exactly as the {@code contains()}
   * check that precedes the split treats it.
   */
  private static final Pattern MORPHO_MARK_SPLITTER =
      Pattern.compile(Pattern.quote(MorphoFeatureSpecification.MORPHO_MARK));

  private final boolean retainNPTmp;
  private final boolean retainNPSbj;
  private final boolean markPRDverb;
  private final boolean changeNoLabels;
  private final boolean retainPPClr;

  private final Pattern prdPattern;
  private final TregexPattern prdVerbPattern;
  private final TregexPattern npSbjPattern;
  private final String rootLabel;

  private final Mapper lexMapper = new DefaultLexicalMapper();

  /**
   * Creates a normalizer with every option spelled out.
   *
   * @param retainNPTmp keep the {@code NP-TMP} label on categories that start with it
   * @param markPRDverb keep {@code X-PRD} categories during label normalization and,
   *                    on whole trees, move the PRD marking onto the sister verb
   * @param changeNoLabels return non-terminal categories unchanged
   * @param retainNPSbj keep the {@code NP-SBJ} label on categories that start with it
   *                    (whole-tree normalization then relabels those that are not
   *                    immediate children of a VP back to {@code NP})
   * @param retainPPClr keep the {@code PP-CLR} label on categories that start with it
   */
  public ArabicTreeNormalizer(boolean retainNPTmp, boolean markPRDverb, boolean changeNoLabels,
      boolean retainNPSbj, boolean retainPPClr) {
    super(new ArabicTreebankLanguagePack());
    this.retainNPTmp = retainNPTmp;
    this.retainNPSbj = retainNPSbj;
    this.markPRDverb = markPRDverb;
    this.changeNoLabels = changeNoLabels;
    this.retainPPClr = retainPPClr;

    rootLabel = tlp.startSymbol();

    // A verbal tag (V followed by anything but P) that is a child of VP and has a
    // sister whose label ends in -PRD; the sister is captured as "prd".
    prdVerbPattern  = TregexPattern.compile("/^V[^P]/ > VP $ /-PRD$/=prd");

    prdPattern = Pattern.compile("^[A-Z]+-PRD");

    //Marks NP subjects that *do not* occur in verb-initial clauses
    npSbjPattern = TregexPattern.compile("/^NP-SBJ/ !> @VP");

    emptyFilter = new ArabicEmptyFilter();
  }

  /** Same as the five-argument constructor with {@code retainNPSbj} and {@code retainPPClr} off. */
  public ArabicTreeNormalizer(boolean retainNPTmp, boolean markPRDverb,
      boolean changeNoLabels) {
    this(retainNPTmp, markPRDverb, changeNoLabels, false, false);
  }

  /** Same as the three-argument constructor with {@code changeNoLabels} off. */
  public ArabicTreeNormalizer(boolean retainNPTmp, boolean markPRDverb) {
    this(retainNPTmp,markPRDverb,false);
  }

  /** Same as the two-argument constructor with {@code markPRDverb} off. */
  public ArabicTreeNormalizer(boolean retainNPTmp) {
    this(retainNPTmp,false);
  }

  /** Creates a normalizer with every option switched off. */
  public ArabicTreeNormalizer() {
    this(false);
  }

  /**
   * Normalizes a non-terminal category according to the options given at
   * construction time. The first applicable rule wins: leave unchanged
   * ({@code changeNoLabels}), keep {@code NP-TMP}, keep {@code NP-SBJ}, keep
   * {@code PP-CLR}, keep a full {@code X-PRD} label ({@code markPRDverb});
   * otherwise the superclass normalization is used.
   *
   * @param category the raw category; may be {@code null}
   * @return the interned normalized category. With {@code changeNoLabels} a
   *         {@code null} category is returned as {@code null} (previously this
   *         threw a {@link NullPointerException}).
   */
  @Override
  public String normalizeNonterminal(String category) {
    if (changeNoLabels) {
      // "Unchanged" includes null: there is nothing to intern in that case.
      return category == null ? null : category.intern();
    }

    final String normalizedString;
    if (category == null) {
      // the superclass turns null into ROOT
      normalizedString = super.normalizeNonterminal(null);
    } else if (retainNPTmp && category.startsWith("NP-TMP")) {
      normalizedString = "NP-TMP";
    } else if (retainNPSbj && category.startsWith("NP-SBJ")) {
      normalizedString = "NP-SBJ";
    } else if (retainPPClr && category.startsWith("PP-CLR")) {
      normalizedString = "PP-CLR";
    } else if (markPRDverb && prdPattern.matcher(category).matches()) {
      normalizedString = category;
    } else {
      // otherwise, return the basicCategory
      normalizedString = super.normalizeNonterminal(category);
    }

    return normalizedString.intern();
  }

  /**
   * Normalizes a complete tree: prunes empty elements, splices out A-over-A
   * nodes, cleans up leaves, tags and phrasal nodes, applies the optional
   * PRD / NP-SBJ re-labelling, and finally puts the start symbol on top.
   *
   * @param tree the tree to normalize (modified in place where possible)
   * @param tf factory used for any nodes that have to be created
   * @return the normalized tree, or {@code null} if nothing is left of it
   *         (e.g. it consisted only of empty elements or enclosing brackets);
   *         readers such as PennTreeReader then move on to the next tree
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    // Both prune() and spliceOut() may hand back null when the root itself is
    // removed, so they must not be chained blindly.
    Tree pruned = tree.prune(emptyFilter, tf);
    if (pruned == null) {
      return null;
    }
    tree = pruned.spliceOut(aOverAFilter, tf);
    if (tree == null) {
      return null;
    }

    for (Tree t : tree) {
      if (t.isLeaf()) {
        normalizeLeaf(t);
      } else if (t.isPreTerminal()) {
        normalizePreTerminal(t);
      } else { //Phrasal nodes
        addMissingPreTerminals(t, tf);
      }
    }//Every node in the tree has now been processed

    //
    // Additional processing for specific phrasal annotations
    //

    // special global coding for moving PRD annotation from constituent to verb tag.
    if (markPRDverb) {
      TregexMatcher m = prdVerbPattern.matcher(tree);
      Tree match = null;
      while (m.find()) {
        // The same verb can match once per -PRD sister; only mark it the first time.
        if (m.getMatch() != match) {
          match = m.getMatch();
          match.label().setValue(match.label().value() + "-PRDverb");
          Tree prd = m.getNode("prd");
          prd.label().setValue(super.normalizeNonterminal(prd.label().value()));
        }
      }
    }

    //Mark *only* subjects in verb-initial clauses: every NP-SBJ that is not an
    //immediate child of a VP goes back to plain NP.
    if (retainNPSbj) {
      TregexMatcher m = npSbjPattern.matcher(tree);
      while (m.find()) {
        m.getMatch().label().setValue("NP");
      }
    }

    if (tree.isPreTerminal()) {
      // The whole tree is a bare tag: bad!
      String val = tree.label().value();
      boolean wrappable = "CC".equals(val) || "CONJ".equals(val)
          || (val != null && val.startsWith("PUNC"));
      if (wrappable) {
        System.err.printf("%s: Bare tagged word being wrapped in FRAG\n%s\n",this.getClass().getName(),tree.pennString());
        tree = tf.newTreeNode("FRAG", Collections.singletonList(tree));
      } else {
        System.err.printf("%s: Bare tagged word\n%s\n",this.getClass().getName(),tree.pennString());
      }
    }

    //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null && (tree.value() == null || tree.value().equals("")) && tree.numChildren() <= 1) {
      tree = tree.firstChild();
    }

    if (tree != null) {
      // The value can still be null here (unlabelled node with several children),
      // so compare null-safely; such a node simply gets the root put on top.
      String top = tree.value();
      if (top == null || !top.equals(rootLabel)) {
        tree = tf.newTreeNode(rootLabel, Collections.singletonList(tree));
      }
    }

    return tree;
  }

  /**
   * Strips the morphological analysis off a leaf and stores it as the original
   * text of the leaf's CoreLabel. Leaves without the morphology marker are left
   * alone; malformed annotations and non-CoreLabel leaves are reported on stderr.
   */
  private void normalizeLeaf(Tree leaf) {
    String raw = leaf.value();
    if (raw == null || !raw.contains(MorphoFeatureSpecification.MORPHO_MARK)) {
      return;
    }

    String[] toks = MORPHO_MARK_SPLITTER.split(raw);
    if (toks.length != 2) {
      System.err.printf("%s: Word contains malformed morph annotation: %s%n",this.getClass().getName(),leaf.value());
      return;
    }
    if (!(leaf.label() instanceof CoreLabel)) {
      System.err.printf("%s: Cannot store morph analysis in non-CoreLabel: %s%n",this.getClass().getName(),leaf.label().getClass().getName());
      return;
    }

    CoreLabel label = (CoreLabel) leaf.label();
    String word = toks[0].trim().intern();
    label.setValue(word);
    label.setWord(word);

    Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(toks[0], toks[1]);
    String lemma = lemmaMorph.first();
    String morphAnalysis = lemmaMorph.second();
    if (lemma.equals(toks[0])) {
      // Lemma identical to the surface form: keep the analysis as it was written.
      label.setOriginalText(toks[1].trim().intern());
    } else {
      // TODO(speneg): Does this help?
      String newLemma = lexMapper.map(null, lemma);
      if (newLemma == null || newLemma.trim().length() == 0) {
        newLemma = lemma;
      }
      String newMorphAnalysis = newLemma + MorphoFeatureSpecification.LEMMA_MARK + morphAnalysis;
      label.setOriginalText(newMorphAnalysis.intern());
    }
  }

  /** Copies a preterminal's value into its tag field, reporting missing tags on stderr. */
  private void normalizePreTerminal(Tree preTerminal) {
    String tag = preTerminal.value();
    if (tag == null || tag.equals("")) {
      System.err.printf("%s: missing tag for\n%s\n",this.getClass().getName(),preTerminal.pennString());
    } else if (preTerminal.label() instanceof HasTag) {
      ((HasTag) preTerminal.label()).setTag(tag);
    }
  }

  /**
   * Wraps every leaf that hangs directly off a phrasal node in a DUMMYTAG
   * preterminal. There are some nodes "/" missing preterminals.
   */
  private void addMissingPreTerminals(Tree phrase, TreeFactory tf) {
    int nk = phrase.numChildren();
    List<Tree> newKids = new ArrayList<>(nk);
    for (int j = 0; j < nk; j++) {
      Tree child = phrase.getChild(j);
      if (child.isLeaf()) {
        System.err.printf("%s: Splicing in DUMMYTAG for%n%s%n",this.getClass().getName(),phrase.toString());
        newKids.add(tf.newTreeNode("DUMMYTAG", Collections.singletonList(child)));
      } else {
        newKids.add(child);
      }
    }
    phrase.setChildren(newKids);
  }


  /**
   * Remove traces and pronoun deletion markers.
   */
  public static class ArabicEmptyFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 7417844982953945964L;

    /**
     * @param t the node to check
     * @return {@code false} if {@code t} is a preterminal that is either a
     *         trace ({@code -NONE-}) or a {@code PRON_1S}/{@code PRP} tag over a
     *         pronoun deletion marker; {@code true} for every other node
     */
    @Override
    public boolean test(Tree t) {
      if (!t.isPreTerminal()) {
        return true;
      }
      // Constant-first comparisons keep an unlabelled node from causing an NPE.
      String tag = t.value();

      // Traces
      if ("-NONE-".equals(tag)) {
        return false;
      }

      // Pronoun deletions
      if ("PRON_1S".equals(tag) || "PRP".equals(tag)) {
        String word = t.firstChild().value();
        if ("nullp".equals(word) || "نللة".equals(word) || "-~a".equals(word)) {
          return false;
        }
      }

      return true;
    }
  }

  private static final long serialVersionUID = -1592231121068698494L;
}