ArabicHeadFinder.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees.international.arabic; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.regex.Pattern;

import edu.stanford.nlp.trees.AbstractCollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;

/**
 * Find the head of an Arabic tree, using the usual kind of heuristic
 * head finding rules.
 * <p>
 * <i>Implementation notes.</i>
 * TO DO: make sure that -PRD marked elements are always chosen as heads.
 * (Has this now been successfully done or not??)
 * <p>
 * Mona: I added the 8 new Nonterm for the merged DT with its following
 * category as a rule the DT nonterm is right headed, the 8 new nonterm DTs
 * are: DTCD, DTRB, DTRP, DTJJ, DTNN, DTNNS, DTNNP, DTNNPS.
 * This was added Dec 7th, 2004.
 *
 * @author Roger Levy
 * @author Mona Diab
 * @author Christopher Manning (added new stuff for ATBp3v3
 */
public class ArabicHeadFinder extends AbstractCollinsHeadFinder  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ArabicHeadFinder.class);
  private static final long serialVersionUID = 6203368998430280740L;
  protected TagSet tagSet;

  /* A work in progress. There may well be a better way to parameterize the HeadFinders via tagset. */
  public enum TagSet {
    BIES_COLLAPSED {
      @Override
      String prep()  { return "IN"; }
      @Override
      String noun()  { return "NN"; } // really there should be several here.
      @Override
      String det()  { return "DT"; }
      @Override
      String adj()  { return "JJ"; }
      @Override
      String detPlusNoun()  { return "DTNN"; }  // really there should be several here; major point is that the det part is ignored completely
      @Override
      TreebankLanguagePack langPack()  { return new ArabicTreebankLanguagePack(); }
    },
    ORIGINAL {
      @Override
      String prep()  { return "PREP"; }
      @Override
      String noun()  { return "NOUN"; }
      @Override
      String det()  { return "DET"; }
      @Override
      String adj()  { return "ADJ"; }
      @Override
      String detPlusNoun()  { return "DET+NN"; }
      @Override
      TreebankLanguagePack langPack()  { return new ArabicTreebankLanguagePack(); }
    };

    abstract String prep();
    abstract String noun();
    abstract String adj();
    abstract String det();
    abstract String detPlusNoun();
    abstract TreebankLanguagePack langPack();

    static TagSet tagSet(String str) {
      switch (str) {
        case "BIES_COLLAPSED":
          return BIES_COLLAPSED;
        case "ORIGINAL":
          return ORIGINAL;
        default:
          throw new IllegalArgumentException("Don't know anything about tagset " + str);
      }
    }
  }

  public ArabicHeadFinder() {
    this(new ArabicTreebankLanguagePack());
  }

  /**
   * Construct an ArabicHeadFinder with a String parameter corresponding to the tagset in use.
   * @param tagSet Either "ORIGINAL" or "BIES_COLLAPSED"
   */
  public ArabicHeadFinder(String tagSet) {
    this(TagSet.tagSet(tagSet));
  }

  public ArabicHeadFinder(TagSet tagSet) {
    this(tagSet.langPack(), tagSet);
    //this(new ArabicTreebankLanguagePack(), tagSet);
  }

  public ArabicHeadFinder(TreebankLanguagePack tlp) {
    this(tlp,TagSet.BIES_COLLAPSED);
  }

  protected ArabicHeadFinder(TreebankLanguagePack tlp, TagSet tagSet) {
    super(tlp);
    this.tagSet = tagSet;
    //log.info("##testing: noun tag is " + tagSet.noun());

    nonTerminalInfo = Generics.newHashMap();

    nonTerminalInfo.put("NX", new String[][]{{"left", "DT","DTNN","DTNNS","DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT", "MWNP"}});
    nonTerminalInfo.put("ADJP", new String[][]{{"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR", "MWADJP"}, {"right", "ADJP", "VN", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}, {"right", "RB", "MWADVP", "CD","DTRB","DTCD"}, {"right", "DT"}}); // sometimes right, sometimes left headed??
    nonTerminalInfo.put("MWADJP", new String[][]{{"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR"}, {"right", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}, {"right", "RB", "MWADVP", "CD","DTRB","DTCD"}, {"right", "DT"}}); // sometimes right, sometimes left headed??
    nonTerminalInfo.put("ADVP", new String[][]{{"left", "WRB", "RB", "MWADVP", "ADVP", "WHADVP","DTRB"}, {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...)
    nonTerminalInfo.put("MWADVP", new String[][]{{"left", "WRB", "RB", "ADVP", "WHADVP","DTRB"}, {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...)
    nonTerminalInfo.put("CONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}});
    nonTerminalInfo.put("MWCONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}});
    nonTerminalInfo.put("FRAG", new String[][]{{"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "VBP"}});
    nonTerminalInfo.put("MWFRAG", new String[][]{{"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "VBP"}});
    nonTerminalInfo.put("INTJ", new String[][]{{"left", "RP", "UH", "DTRP"}});
    nonTerminalInfo.put("LST", new String[][]{{"left"}});
    nonTerminalInfo.put("NAC", new String[][]{{"left", "NP", "SBAR", "PP", "MWP","ADJP", "S", "PRT", "UCP"}, {"left", "ADVP"}}); // note: maybe CC, RB should be the heads?
    nonTerminalInfo.put("NP", new String[][]{{"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "NP", "PRP", "WHNP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "NOFUNC", "NO_FUNC", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"}, {"right", "CD", "DTCD"}, {"left", "PRP$"}, {"right", "DT"}}); // should the JJ rule be left or right?
    nonTerminalInfo.put("MWNP", new String[][]{{"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "PRP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"}, {"right", "CD", "DTCD"}, {"left", "PRP$"}, {"right", "DT"}}); // should the JJ rule be left or right?
    nonTerminalInfo.put("PP", new String[][]{{"left", tagSet.prep(), "MWPP", "PP", "MWP","PRT", "X"}, {"left", "NNP", "RP", tagSet.noun(), "MWNP"}, {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT
    nonTerminalInfo.put("MWPP", new String[][]{{"left", tagSet.prep(), "PP", "MWP","PRT", "X"}, {"left", "NNP", "RP", tagSet.noun(), "MWNP"}, {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT
    nonTerminalInfo.put("PRN", new String[][]{{"left", "NP"}}); // don't get PUNC
    nonTerminalInfo.put("MWPRN", new String[][]{{"left", "IN"}}); // don't get PUNC
    nonTerminalInfo.put("PRT", new String[][]{{"left", "RP", "PRT", "IN", "DTRP"}});
    nonTerminalInfo.put("QP", new String[][]{{"right", "CD", "DTCD", tagSet.noun(), "MWNP", tagSet.adj(), "MWADJP", "NNS", "NNP", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}});

    nonTerminalInfo.put("S", new String[][]{{"left", "VP", "MWVP", "S"}, {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here!
    nonTerminalInfo.put("MWS", new String[][]{{"left", "VP", "MWVP", "S"}, {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here!
    nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "MWVP", "PP", "MWP"}}); // to be principled, we need -PRD sensitivity here too.
    nonTerminalInfo.put("SBAR", new String[][]{{"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}});
    nonTerminalInfo.put("MWSBAR", new String[][]{{"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}});
    nonTerminalInfo.put("SBARQ", new String[][]{{"left", "WHNP", "WHADVP", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS","DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}}); // copied from SBAR rule -- look more closely when there's time
    nonTerminalInfo.put("UCP", new String[][]{{"left"}});
    nonTerminalInfo.put("VP", new String[][]{{"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "RB", "MWADVP", "X","VB"}, {"left", "IN"}, {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information?
    nonTerminalInfo.put("MWVP", new String[][]{{"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "MWVP", "RB", "MWADVP", "X","VB"}, {"left", "IN"}, {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information?

    
    //also, RB is used as gerunds

    nonTerminalInfo.put("WHADVP", new String[][]{{"left", "WRB", "WP"}, {"right", "CC", "MWCONJP"}, {"left", "IN"}});
    nonTerminalInfo.put("WHNP", new String[][]{{"right", "WP"}});
    nonTerminalInfo.put("WHPP", new String[][]{{"left",  "IN", "MWPP", "RB", "MWADVP"}});
    nonTerminalInfo.put("X", new String[][]{{"left"}});

    //Added by Mona 12/7/04 for the newly created DT nonterm cat
    nonTerminalInfo.put("DTNN", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNS", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNP", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNPS", new String[][]{{"right"}});
    nonTerminalInfo.put("DTJJ", new String[][]{{"right"}});
    nonTerminalInfo.put("DTRP", new String[][]{{"right"}});
    nonTerminalInfo.put("DTRB", new String[][]{{"right"}});
    nonTerminalInfo.put("DTCD", new String[][]{{"right"}});
    nonTerminalInfo.put("DTIN", new String[][]{{"right"}});

    // stand-in dependency:
    nonTerminalInfo.put("EDITED", new String[][]{{"left"}});
    nonTerminalInfo.put(tlp.startSymbol(), new String[][]{{"left"}});

    // one stray SINV in the training set...garbage head rule here.
    nonTerminalInfo.put("SINV", new String[][]{{"left","ADJP","VP"}});
  }


  private final Pattern predPattern = Pattern.compile(".*-PRD$");

  /**
   * Predicatively marked elements in a sentence should be noted as heads
   */
  @Override
  protected Tree findMarkedHead(Tree t) {
    String cat = t.value();
    if (cat.equals("S")) {
      Tree[] kids = t.children();
      for (Tree kid : kids) {
        if (predPattern.matcher(kid.value()).matches()) {
          return kid;
        }
      }
    }
    return null;
  }

}