package edu.stanford.nlp.coref.hybrid.sieve;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Dictionaries.Person;
import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter;
import edu.stanford.nlp.coref.hybrid.HybridCorefProperties;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
public abstract class Sieve implements Serializable {
/** Logger shared by every sieve; final because it is never reassigned. */
private static final Redwood.RedwoodChannels log = Redwood.channels(Sieve.class);
private static final long serialVersionUID = 3986463332365306868L;
/**
 * Kinds of sieve that {@link #loadSieve} knows how to build: {@code RULE} is instantiated
 * reflectively by class name, {@code RF} is read from a model path, and {@code ORACLE}
 * is constructed as an {@code OracleSieve}.
 */
public enum ClassifierType {RULE, RF, ORACLE}
/** Classifier kind of this sieve; not assigned anywhere in this class. */
public ClassifierType classifierType = null;
/** Language of the sieve; set by the constructors (and again by {@link #loadSieve} for RULE sieves). */
protected Locale lang;
/** Name of the sieve: either the simple class name or the name passed to the constructor. */
public final String sievename;
/** the maximum sentence distance for linking two mentions */
public int maxSentDist = -1;
/** type of mention we want to resolve. e.g., if mType is PRONOMINAL, we only resolve pronoun mentions */
public final Set<MentionType> mType;
/** type of mention we want to compare to. e.g., if aType is PROPER, the resolution can be done only with PROPER antecedent */
public final Set<MentionType> aType;
/** Mention type names as strings; empty in the default constructor, otherwise read from the properties. */
public final Set<String> mTypeStr;
/** Antecedent type names as strings; empty in the default constructor, otherwise read from the properties. */
public final Set<String> aTypeStr;
/** Configuration of the sieve; the constructors leave it null, {@link #loadSieve} sets it for RULE and ORACLE sieves. */
public Properties props = null;
public Sieve() {
this.lang = Locale.ENGLISH;
this.sievename = this.getClass().getSimpleName();
this.aType = new HashSet<>(Arrays.asList(MentionType.values()));
this.mType = new HashSet<>(Arrays.asList(MentionType.values()));
this.maxSentDist = 1000;
this.mTypeStr = Generics.newHashSet();
this.aTypeStr = Generics.newHashSet();
}
public Sieve(Properties props){
this.lang = HybridCorefProperties.getLanguage(props);
this.sievename = this.getClass().getSimpleName();
this.aType = HybridCorefProperties.getAntecedentType(props, sievename);
this.mType = HybridCorefProperties.getMentionType(props, sievename);
this.maxSentDist = HybridCorefProperties.getMaxSentDistForSieve(props, sievename);
this.mTypeStr = HybridCorefProperties.getMentionTypeStr(props, sievename);
this.aTypeStr = HybridCorefProperties.getAntecedentTypeStr(props, sievename);
}
/**
 * Builds a sieve with an explicit name and configures it from {@code props}.
 * Note that the {@code props} field itself is not assigned here.
 *
 * @param props     properties queried through {@link HybridCorefProperties}
 * @param sievename name used both as the sieve's name and as the key for the per-sieve lookups
 */
public Sieve(Properties props, String sievename) {
  this.sievename = sievename;
  this.lang = HybridCorefProperties.getLanguage(props);

  // Enum-typed filters for antecedents and mentions.
  this.aType = HybridCorefProperties.getAntecedentType(props, sievename);
  this.mType = HybridCorefProperties.getMentionType(props, sievename);

  this.maxSentDist = HybridCorefProperties.getMaxSentDistForSieve(props, sievename);

  // String-typed filters for mentions and antecedents.
  this.mTypeStr = HybridCorefProperties.getMentionTypeStr(props, sievename);
  this.aTypeStr = HybridCorefProperties.getAntecedentTypeStr(props, sievename);
}
/**
 * Runs this sieve over every predicted mention of the document, sentence by sentence.
 * Mentions whose type is not in {@link #mType} are skipped; all others are handed to
 * {@link #findCoreferentAntecedent}.
 *
 * @param document document whose {@code predictedMentions} are resolved
 * @param dict     dictionaries forwarded to {@code findCoreferentAntecedent}
 * @param props    properties; when debugging is enabled the raw document is put in the log
 * @return the text accumulated in the log buffer (possibly empty)
 * @throws Exception whatever {@code findCoreferentAntecedent} or the debug printer throws
 */
public String resolveMention(Document document, Dictionaries dict, Properties props) throws Exception {
  final StringBuilder trace = new StringBuilder();

  if (HybridCorefProperties.debug(props)) {
    trace.append("=======================================================")
        .append(HybridCorefPrinter.printRawDoc(document, true, true));
  }

  for (List<Mention> sentenceMentions : document.predictedMentions) {
    int position = 0;
    while (position < sentenceMentions.size()) {
      Mention candidate = sentenceMentions.get(position);
      if (!skipMentionType(candidate, props)) {
        findCoreferentAntecedent(candidate, position, document, dict, props, trace);
      }
      position++;
    }
  }

  return trace.toString();
}
/**
 * Looks for a coreferent antecedent of a single mention; each concrete sieve supplies the logic.
 * {@link #resolveMention} calls this once for every predicted mention whose type is in {@link #mType}.
 *
 * @param m        the mention to resolve
 * @param mIdx     index of {@code m} within its sentence's list in {@code document.predictedMentions}
 * @param document the document being processed
 * @param dict     dictionaries passed through from {@code resolveMention}
 * @param props    properties passed through from {@code resolveMention}
 * @param sbLog    log buffer; {@code resolveMention} returns its contents as a string
 * @throws Exception implementation-specific failures
 */
public abstract void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception;
/**
 * Loads one sieve according to the classifier type configured for {@code sievename}:
 * a RULE sieve is instantiated reflectively from this package, an RF sieve is read from
 * its model path, and an ORACLE sieve is constructed directly.
 *
 * @param props     properties describing the sieve
 * @param sievename name of the sieve; for RULE sieves also the simple class name to instantiate
 * @return the loaded sieve
 * @throws Exception if reflection or model loading fails
 * @throws RuntimeException if the classifier type matches none of the known kinds
 */
public static Sieve loadSieve(Properties props, String sievename) throws Exception {
  switch (HybridCorefProperties.getClassifierType(props, sievename)) {
    case RULE: {
      String className = "edu.stanford.nlp.coref.hybrid.sieve." + sievename;
      DeterministicCorefSieve ruleSieve =
          (DeterministicCorefSieve) Class.forName(className).getConstructor().newInstance();
      ruleSieve.props = props;
      ruleSieve.lang = HybridCorefProperties.getLanguage(props);
      return ruleSieve;
    }
    case RF: {
      String modelPath = HybridCorefProperties.getPathModel(props, sievename);
      log.info("Loading sieve: " + sievename + " from " + modelPath + " ... ");
      RFSieve forestSieve = IOUtils.readObjectFromURLOrClasspathOrFileSystem(modelPath);
      forestSieve.thresMerge = HybridCorefProperties.getMergeThreshold(props, sievename);
      log.info("done. Merging threshold: " + forestSieve.thresMerge);
      return forestSieve;
    }
    case ORACLE: {
      OracleSieve goldSieve = new OracleSieve(props, sievename);
      goldSieve.props = props;
      return goldSieve;
    }
    default:
      throw new RuntimeException("no sieve type specified");
  }
}
/**
 * Loads the sieves listed (comma-separated) in the properties, in order.
 * When a "current sieve for training" is configured, only the sieves listed before it are
 * loaded; if that name does not occur in the list, every sieve is loaded.
 *
 * <p>The training sieve is matched against whole, trimmed names. Cutting the raw list with
 * {@code String.split(currentSieveForTrain)} would treat the name as a regular expression
 * and match substrings, which throws when the list consists only of the training sieve,
 * yields an empty sieve name when the training sieve is listed first, and cuts inside a
 * longer name that merely contains the training sieve's name.
 *
 * @param props properties holding the sieve list and, optionally, the sieve being trained
 * @return the loaded sieves in list order; empty when no sieve precedes the training sieve
 * @throws Exception if any individual sieve fails to load
 */
public static List<Sieve> loadSieves(Properties props) throws Exception {
  String sieveProp = HybridCorefProperties.getSieves(props);
  String currentSieveForTrain = HybridCorefProperties.getCurrentSieveForTrain(props);
  String stopAt = (currentSieveForTrain == null) ? null : currentSieveForTrain.trim();

  List<Sieve> sieves = new ArrayList<>();
  for (String rawName : sieveProp.split(",")) {
    String sievename = rawName.trim();
    if (sievename.isEmpty()) {
      // Stray commas or blank entries do not name a sieve.
      continue;
    }
    if (sievename.equals(stopAt)) {
      // Everything from the sieve being trained onwards is left out.
      break;
    }
    sieves.add(loadSieve(props, sievename));
  }
  return sieves;
}
/**
 * Tells whether the token list contains the word "that" (any casing) tagged {@code IN}
 * (any casing).
 * Tokens without a word or tag simply do not match instead of raising a NullPointerException.
 *
 * @param words tokens to inspect
 * @return true if at least one token is "that" tagged IN
 */
public static boolean hasThat(List<CoreLabel> words) {
  for (CoreLabel token : words) {
    // Constant-first comparisons are null-safe for missing word/tag annotations.
    if ("that".equalsIgnoreCase(token.word()) && "IN".equalsIgnoreCase(token.tag())) {
      return true;
    }
  }
  return false;
}
/**
 * Tells whether the token list contains a token tagged {@code TO} immediately followed by
 * a token whose tag starts with {@code V}.
 * Tokens without a tag do not match instead of raising a NullPointerException.
 *
 * @param words tokens to inspect
 * @return true if such an adjacent pair exists
 */
public static boolean hasToVerb(List<CoreLabel> words) {
  for (int i = 0; i + 1 < words.size(); i++) {
    if (!"TO".equals(words.get(i).tag())) {
      continue;
    }
    String nextTag = words.get(i + 1).tag();
    if (nextTag != null && nextTag.startsWith("V")) {
      return true;
    }
  }
  return false;
}
/**
 * Decides whether this sieve should ignore a mention.
 *
 * @param m     the mention under consideration
 * @param props unused here
 * @return true when the mention's type is not one of the types in {@link #mType}
 */
private boolean skipMentionType(Mention m, Properties props) {
  return !mType.contains(m.mentionType);
}
/**
 * Merges the predicted cluster of mention {@code mID} into the predicted cluster of
 * {@code antID}. Nothing happens when both mentions already share the same cluster object.
 * Otherwise the clusters and their incompatibility records are merged, and the entry of the
 * mention's former cluster is removed from {@code document.corefClusters}.
 *
 * @param document document holding the clusters and the mention lookup
 * @param mID      ID of the mention being resolved
 * @param antID    ID of the antecedent
 */
public static void merge(Document document, int mID, int antID) {
  CorefCluster absorbed = document.corefClusters.get(document.predictedMentionsByID.get(mID).corefClusterID);
  CorefCluster surviving = document.corefClusters.get(document.predictedMentionsByID.get(antID).corefClusterID);

  if (absorbed != surviving) {
    // Remember the ID before merging; it is the key dropped from the cluster table afterwards.
    int absorbedID = absorbed.getClusterID();
    CorefCluster.mergeClusters(surviving, absorbed);
    document.mergeIncompatibles(surviving, absorbed);
    document.corefClusters.remove(absorbedID);
  }
}
/**
 * Checks against the gold annotation whether two mentions are coreferent.
 *
 * @param document document providing {@code goldMentionsByID}
 * @param mID      ID of the mention
 * @param antID    ID of the antecedent
 * @return true only if both IDs are gold mentions and share the same gold cluster ID
 */
public static boolean isReallyCoref(Document document, int mID, int antID) {
  if (document.goldMentionsByID.containsKey(mID) && document.goldMentionsByID.containsKey(antID)) {
    int goldClusterOfMention = document.goldMentionsByID.get(mID).goldCorefClusterID;
    int goldClusterOfAntecedent = document.goldMentionsByID.get(antID).goldCorefClusterID;
    return goldClusterOfMention == goldClusterOfAntecedent;
  }
  // At least one of the two is not a gold mention.
  return false;
}
/**
 * Decides whether a mention/antecedent pair should be skipped while doing analysis.
 *
 * @param ant   candidate antecedent
 * @param m     mention being resolved
 * @param props properties providing the analysis switch and the two type filters to skip
 * @return false when analysis is off; otherwise true iff the antecedent matches the
 *         configured skip-antecedent type and the mention matches the skip-mention type
 */
protected static boolean skipForAnalysis(Mention ant, Mention m, Properties props) {
  if (HybridCorefProperties.doAnalysis(props)) {
    String mentionTypeToSkip = HybridCorefProperties.getSkipMentionType(props);
    String antecedentTypeToSkip = HybridCorefProperties.getSkipAntecedentType(props);
    return matchedMentionType(ant, antecedentTypeToSkip) && matchedMentionType(m, mentionTypeToSkip);
  }
  return false;
}
/**
 * Checks a mention against a set of type descriptors.
 *
 * @param m     the mention to test
 * @param types type descriptors; an empty set matches every mention
 * @return true if the set is empty or the mention matches at least one descriptor
 */
protected static boolean matchedMentionType(Mention m, Set<String> types) {
  if (types.isEmpty()) {
    return true;
  }
  boolean matched = false;
  for (String candidate : types) {
    if (matchedMentionType(m, candidate)) {
      matched = true;
      break;
    }
  }
  return matched;
}
/**
 * Checks a mention against a single type descriptor (case-insensitive). A descriptor matches when it is
 * <ul>
 *   <li>{@code "all"} or the name of the mention's {@code mentionType};</li>
 *   <li>one of {@code he, she, you, I, it, they, we} and the mention is pronominal with the
 *       corresponding {@link Person};</li>
 *   <li>{@code "ne:<label>"} where the label starts with the first (up to) three characters of
 *       the mention's NER string.</li>
 * </ul>
 * Case folding for the named-entity check uses {@link Locale#ROOT}, so the outcome does not
 * depend on the JVM default locale (e.g. Turkish dotless i in "MISC"). A mention without an
 * NER string never matches an {@code ne:} descriptor.
 *
 * @param m    the mention to test
 * @param type type descriptor; {@code null} matches nothing
 * @return true if the mention matches the descriptor
 */
protected static boolean matchedMentionType(Mention m, String type) {
  if (type == null) {
    return false;
  }
  if (type.equalsIgnoreCase("all") || type.equalsIgnoreCase(m.mentionType.toString())) {
    return true;
  }

  // check pronoun specific type
  boolean pronounMatch =
      (type.equalsIgnoreCase("he") && m.isPronominal() && m.person == Person.HE)
      || (type.equalsIgnoreCase("she") && m.isPronominal() && m.person == Person.SHE)
      || (type.equalsIgnoreCase("you") && m.isPronominal() && m.person == Person.YOU)
      || (type.equalsIgnoreCase("I") && m.isPronominal() && m.person == Person.I)
      || (type.equalsIgnoreCase("it") && m.isPronominal() && m.person == Person.IT)
      || (type.equalsIgnoreCase("they") && m.isPronominal() && m.person == Person.THEY)
      || (type.equalsIgnoreCase("we") && m.isPronominal() && m.person == Person.WE);
  if (pronounMatch) {
    return true;
  }

  // check named entity type
  String lowerType = type.toLowerCase(Locale.ROOT);
  if (!lowerType.startsWith("ne:") || m.nerString == null) {
    return false;
  }
  String lowerNer = m.nerString.toLowerCase(Locale.ROOT);
  // Only the first three characters of the NER label take part in the comparison.
  String nerPrefix = lowerNer.substring(0, Math.min(3, lowerNer.length()));
  return lowerType.substring(3).startsWith(nerPrefix);
}
/**
 * Collects the antecedent candidates of a mention from one sentence, in search order.
 * For an earlier sentence this is a copy of that sentence's mention list. For the mention's
 * own sentence it is the mentions before {@code mPosition}, reversed if the mention's span is
 * a relative pronoun and otherwise ordered by enclosing clause.
 *
 * @param m                         mention being resolved
 * @param antecedentSentence        index of the sentence to take candidates from
 * @param mPosition                 position of {@code m} in its own sentence's mention list
 * @param orderedMentionsBySentence mentions of the document grouped by sentence
 * @param dict                      dictionaries providing the relative pronoun list
 * @return a new list of candidate antecedents
 */
public static List<Mention> getOrderedAntecedents(
    Mention m,
    int antecedentSentence,
    int mPosition,
    List<List<Mention>> orderedMentionsBySentence,
    Dictionaries dict) {
  if (antecedentSentence != m.sentNum) {
    // previous sentence: keep its mentions in their given order
    return new ArrayList<>(orderedMentionsBySentence.get(antecedentSentence));
  }

  // same sentence: only mentions that precede m are candidates
  List<Mention> preceding = new ArrayList<>(orderedMentionsBySentence.get(m.sentNum).subList(0, mPosition));
  if (dict.relativePronouns.contains(m.spanToString())) {
    Collections.reverse(preceding);
    return preceding;
  }
  return sortMentionsByClause(preceding, m);
}
/**
 * Divides a sentence into clauses and sorts the antecedents for pronoun matching.
 * Walks from the mention's subtree up to the root of its context parse tree. At every
 * ancestor labelled {@code TOP}, {@code NP} or starting with {@code S}, the not-yet-collected
 * mentions dominated by that ancestor are appended, so candidates in the innermost enclosing
 * constituent come first. Mentions dominated by no such ancestor are not part of the result.
 *
 * <p>The input list is returned unchanged when parse information is missing: either tree is
 * null, or the mention's subtree has no parent inside the context tree. The latter case is
 * handled explicitly because dereferencing the absent parent would raise a NullPointerException.
 * Ancestors whose label value is null are skipped for the same reason.
 *
 * @param l  candidate antecedents
 * @param m1 mention whose parse context drives the ordering
 * @return the reordered candidates, or {@code l} itself when no parse context is usable
 */
private static List<Mention> sortMentionsByClause(List<Mention> l, Mention m1) {
  Tree tree = m1.contextParseTree;
  Tree start = m1.mentionSubTree;
  if (tree == null || start == null) {
    return l;
  }

  Tree node = start.ancestor(1, tree);
  if (node == null) {
    // The subtree is the root of the context tree or not contained in it.
    return l;
  }

  List<Mention> sorted = new ArrayList<>();
  for (; node != null; node = node.ancestor(1, tree)) {
    String label = node.label().value();
    boolean clauseLike = label != null
        && ("TOP".equals(label) || label.startsWith("S") || label.equals("NP"));
    if (!clauseLike) {
      continue;
    }
    for (Mention m : l) {
      if (!sorted.contains(m) && node.dominates(m.mentionSubTree)) {
        sorted.add(m);
      }
    }
  }
  return sorted;
}
}