package edu.stanford.nlp.coref.hybrid.sieve;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Dictionaries.Person;
import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter;
import edu.stanford.nlp.coref.hybrid.HybridCorefProperties;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;

public abstract class Sieve implements Serializable {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(Sieve.class);

  private static final long serialVersionUID = 3986463332365306868L;

  public enum ClassifierType { RULE, RF, ORACLE }

  public ClassifierType classifierType = null;

  protected Locale lang;

  public final String sievename;

  /** the maximum sentence distance for linking two mentions */
  public int maxSentDist = -1;

  /** type of mention we want to resolve. e.g., if mType is PRONOMINAL, we only resolve pronoun mentions */
  public final Set<MentionType> mType;

  /** type of mention we want to compare to. e.g., if aType is PROPER, the resolution can be done only with a PROPER antecedent */
  public final Set<MentionType> aType;

  public final Set<String> mTypeStr;
  public final Set<String> aTypeStr;

  public Properties props = null;

  public Sieve() {
    this.lang = Locale.ENGLISH;
    this.sievename = this.getClass().getSimpleName();
    this.aType = new HashSet<>(Arrays.asList(MentionType.values()));
    this.mType = new HashSet<>(Arrays.asList(MentionType.values()));
    this.maxSentDist = 1000;
    this.mTypeStr = Generics.newHashSet();
    this.aTypeStr = Generics.newHashSet();
  }

  public Sieve(Properties props) {
    this.lang = HybridCorefProperties.getLanguage(props);
    this.sievename = this.getClass().getSimpleName();
    this.aType = HybridCorefProperties.getAntecedentType(props, sievename);
    this.mType = HybridCorefProperties.getMentionType(props, sievename);
    this.maxSentDist = HybridCorefProperties.getMaxSentDistForSieve(props, sievename);
    this.mTypeStr = HybridCorefProperties.getMentionTypeStr(props, sievename);
    this.aTypeStr = HybridCorefProperties.getAntecedentTypeStr(props, sievename);
  }

  public Sieve(Properties props, String sievename) {
    this.lang = HybridCorefProperties.getLanguage(props);
    this.sievename = sievename;
    this.aType = HybridCorefProperties.getAntecedentType(props, sievename);
    this.mType = HybridCorefProperties.getMentionType(props, sievename);
    this.maxSentDist = HybridCorefProperties.getMaxSentDistForSieve(props, sievename);
    this.mTypeStr = HybridCorefProperties.getMentionTypeStr(props, sievename);
    this.aTypeStr = HybridCorefProperties.getAntecedentTypeStr(props, sievename);
  }

  public String resolveMention(Document document, Dictionaries dict, Properties props) throws Exception {
    StringBuilder sbLog = new StringBuilder();

    if (HybridCorefProperties.debug(props)) {
      sbLog.append("=======================================================");
      sbLog.append(HybridCorefPrinter.printRawDoc(document, true, true));
    }

    for (List<Mention> mentionsInSent : document.predictedMentions) {
      for (int mIdx = 0; mIdx < mentionsInSent.size(); mIdx++) {
        Mention m = mentionsInSent.get(mIdx);
        if (skipMentionType(m, props)) continue;
        findCoreferentAntecedent(m, mIdx, document, dict, props, sbLog);
      }
    }
    return sbLog.toString();
  }
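  /*
   * A minimal subclass sketch (illustrative only, not one of the distributed sieves): a concrete
   * Sieve only has to implement findCoreferentAntecedent, scan earlier mentions, and call
   * Sieve.merge when it commits to a link. The class name and the exact-string-match test below
   * are assumptions made for the example.
   *
   *   public class ExactStringMatchSketchSieve extends Sieve {
   *     public ExactStringMatchSketchSieve(Properties props) { super(props); }
   *
   *     @Override
   *     public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict,
   *         Properties props, StringBuilder sbLog) {
   *       // look only at mentions in the same sentence that precede m
   *       List<Mention> candidates = document.predictedMentions.get(m.sentNum).subList(0, mIdx);
   *       for (Mention ant : candidates) {
   *         if (ant.spanToString().equalsIgnoreCase(m.spanToString())) {
   *           Sieve.merge(document, m.mentionID, ant.mentionID);   // link m into ant's cluster
   *           return;
   *         }
   *       }
   *     }
   *   }
   */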
  public abstract void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict,
      Properties props, StringBuilder sbLog) throws Exception;

  // load sieve (from file or make a deterministic sieve)
  public static Sieve loadSieve(Properties props, String sievename) throws Exception {
    // log.info("Loading sieve: "+sievename+" ...");
    switch (HybridCorefProperties.getClassifierType(props, sievename)) {
      case RULE:
        DeterministicCorefSieve sieve =
            (DeterministicCorefSieve) Class.forName("edu.stanford.nlp.coref.hybrid.sieve." + sievename).getConstructor().newInstance();
        sieve.props = props;
        sieve.lang = HybridCorefProperties.getLanguage(props);
        return sieve;

      case RF:
        log.info("Loading sieve: " + sievename + " from " + HybridCorefProperties.getPathModel(props, sievename) + " ... ");
        RFSieve rfsieve = IOUtils.readObjectFromURLOrClasspathOrFileSystem(HybridCorefProperties.getPathModel(props, sievename));
        rfsieve.thresMerge = HybridCorefProperties.getMergeThreshold(props, sievename);
        log.info("done. Merging threshold: " + rfsieve.thresMerge);
        return rfsieve;

      case ORACLE:
        OracleSieve oracleSieve = new OracleSieve(props, sievename);
        oracleSieve.props = props;
        return oracleSieve;

      default:
        throw new RuntimeException("no sieve type specified");
    }
  }

  public static List<Sieve> loadSieves(Properties props) throws Exception {
    List<Sieve> sieves = new ArrayList<>();
    String sieveProp = HybridCorefProperties.getSieves(props);
    String currentSieveForTrain = HybridCorefProperties.getCurrentSieveForTrain(props);
    String[] sievenames = (currentSieveForTrain == null) ?
        sieveProp.trim().split(",\\s*") : sieveProp.split(currentSieveForTrain)[0].trim().split(",\\s*");
    for (String sievename : sievenames) {
      Sieve sieve = loadSieve(props, sievename);
      sieves.add(sieve);
    }
    return sieves;
  }
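  /*
   * Illustrative loading sketch (the property key and sieve names shown here are example values;
   * the real keys and defaults live in HybridCorefProperties): loadSieves() reads the
   * comma-separated sieve list from the properties, and loadSieve() builds each sieve by
   * reflection (RULE), deserialization of a saved model (RF), or direct construction (ORACLE).
   *
   *   Properties props = new Properties();
   *   props.setProperty("coref.sieves", "SpeakerMatch, ExactStringMatch, PronounMatch");  // example value
   *   List<Sieve> sieves = Sieve.loadSieves(props);
   *   for (Sieve s : sieves) {
   *     // document and dictionaries are assumed to be an already-annotated Document and a Dictionaries instance
   *     String log = s.resolveMention(document, dictionaries, props);  // mutates document.corefClusters
   *   }
   */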
  public static boolean hasThat(List<CoreLabel> words) {
    for (CoreLabel cl : words) {
      if (cl.word().equalsIgnoreCase("that") && cl.tag().equalsIgnoreCase("IN")) {
        return true;
      }
    }
    return false;
  }

  public static boolean hasToVerb(List<CoreLabel> words) {
    for (int i = 0; i < words.size() - 1; i++) {
      if (words.get(i).tag().equals("TO") && words.get(i + 1).tag().startsWith("V")) return true;
    }
    return false;
  }

  private boolean skipMentionType(Mention m, Properties props) {
    if (mType.contains(m.mentionType)) return false;
    return true;
  }

  public static void merge(Document document, int mID, int antID) {
    CorefCluster c1 = document.corefClusters.get(document.predictedMentionsByID.get(mID).corefClusterID);
    CorefCluster c2 = document.corefClusters.get(document.predictedMentionsByID.get(antID).corefClusterID);
    if (c1 == c2) return;

    int removeID = c1.getClusterID();
    CorefCluster.mergeClusters(c2, c1);
    document.mergeIncompatibles(c2, c1);
    document.corefClusters.remove(removeID);
  }

  // check if two mentions are really coref in gold annotation
  public static boolean isReallyCoref(Document document, int mID, int antID) {
    if (!document.goldMentionsByID.containsKey(mID) || !document.goldMentionsByID.containsKey(antID)) {
      return false;
    }
    int mGoldClusterID = document.goldMentionsByID.get(mID).goldCorefClusterID;
    int aGoldClusterID = document.goldMentionsByID.get(antID).goldCorefClusterID;
    return (mGoldClusterID == aGoldClusterID);
  }

  protected static boolean skipForAnalysis(Mention ant, Mention m, Properties props) {
    if (!HybridCorefProperties.doAnalysis(props)) return false;

    String skipMentionType = HybridCorefProperties.getSkipMentionType(props);
    String skipAntType = HybridCorefProperties.getSkipAntecedentType(props);

    return matchedMentionType(ant, skipAntType) && matchedMentionType(m, skipMentionType);
  }

  protected static boolean matchedMentionType(Mention m, Set<String> types) {
    if (types.isEmpty()) return true;
    for (String type : types) {
      if (matchedMentionType(m, type)) return true;
    }
    return false;
  }

  protected static boolean matchedMentionType(Mention m, String type) {
    if (type == null) return false;
    if (type.equalsIgnoreCase("all") || type.equalsIgnoreCase(m.mentionType.toString())) return true;

    // check pronoun specific type
    if (type.equalsIgnoreCase("he") && m.isPronominal() && m.person == Person.HE) return true;
    if (type.equalsIgnoreCase("she") && m.isPronominal() && m.person == Person.SHE) return true;
    if (type.equalsIgnoreCase("you") && m.isPronominal() && m.person == Person.YOU) return true;
    if (type.equalsIgnoreCase("I") && m.isPronominal() && m.person == Person.I) return true;
    if (type.equalsIgnoreCase("it") && m.isPronominal() && m.person == Person.IT) return true;
    if (type.equalsIgnoreCase("they") && m.isPronominal() && m.person == Person.THEY) return true;
    if (type.equalsIgnoreCase("we") && m.isPronominal() && m.person == Person.WE) return true;

    // check named entity type
    if (type.toLowerCase().startsWith("ne:")) {
      if (type.toLowerCase().substring(3).startsWith(m.nerString.toLowerCase().substring(0, Math.min(3, m.nerString.length())))) return true;
    }

    return false;
  }

  public static List<Mention> getOrderedAntecedents(
      Mention m,
      int antecedentSentence,
      int mPosition,
      List<List<Mention>> orderedMentionsBySentence,
      Dictionaries dict) {
    List<Mention> orderedAntecedents = new ArrayList<>();

    // ordering antecedents
    if (antecedentSentence == m.sentNum) {   // same sentence
      orderedAntecedents.addAll(orderedMentionsBySentence.get(m.sentNum).subList(0, mPosition));
      if (dict.relativePronouns.contains(m.spanToString())) Collections.reverse(orderedAntecedents);
      else {
        orderedAntecedents = sortMentionsByClause(orderedAntecedents, m);
      }
    } else {    // previous sentence
      orderedAntecedents.addAll(orderedMentionsBySentence.get(antecedentSentence));
    }

    return orderedAntecedents;
  }
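  /*
   * Illustrative note on the ordering used above: sortMentionsByClause walks up the parse tree from
   * the mention's subtree, and at each S-, NP-, or TOP-labelled ancestor appends the not-yet-added
   * candidates that the ancestor dominates. Antecedents in the innermost enclosing clause are
   * therefore tried first, with candidates from progressively larger clauses following; for a
   * relative pronoun the candidate list is simply reversed instead.
   */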
  /** Divides a sentence into clauses and sorts the antecedents for pronoun matching */
  private static List<Mention> sortMentionsByClause(List<Mention> l, Mention m1) {
    List<Mention> sorted = new ArrayList<>();
    Tree tree = m1.contextParseTree;
    Tree current = m1.mentionSubTree;
    if (tree == null || current == null) return l;

    while (true) {
      current = current.ancestor(1, tree);
      String curLabel = current.label().value();
      if ("TOP".equals(curLabel) || curLabel.startsWith("S") || curLabel.equals("NP")) {
//      if(current.label().value().startsWith("S")){
        for (Mention m : l) {
          if (!sorted.contains(m) && current.dominates(m.mentionSubTree)) sorted.add(m);
        }
      }
      if (current.ancestor(1, tree) == null) break;
    }
    return sorted;
  }

}