package edu.stanford.nlp.coref.hybrid.sieve; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Properties; import java.util.Set; import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.coref.CorefRules; import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Dictionaries.Animacy; import edu.stanford.nlp.coref.data.Dictionaries.Gender; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Dictionaries.Number; import edu.stanford.nlp.coref.data.Dictionaries.Person; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Document.DocType; import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter; import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; import edu.stanford.nlp.coref.hybrid.rf.RandomForest; import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.RVFDatum; import edu.stanford.nlp.math.ArrayMath; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.stats.Counters; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.logging.Redwood; public class RFSieve extends Sieve { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(RFSieve.class); private static final long serialVersionUID = -4090017054885920527L; // for RF sieve public RandomForest rf; /** the probability threshold for merging two mentions */ public double thresMerge; // constructor for RF sieve public RFSieve(RandomForest rf, Properties props, String sievename) { super(props, sievename); this.rf = rf; this.props = props; this.classifierType = ClassifierType.RF; } public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception { int sentIdx = m.sentNum; Counter<Integer> probs = new ClassicCounter<>(); int mentionDist = 0; for(int sentDist=0 ; sentDist <= Math.min(this.maxSentDist, sentIdx) ; sentDist++) { List<Mention> candidates = getOrderedAntecedents(m, sentIdx-sentDist, mIdx, document.predictedMentions, dict); for(Mention candidate : candidates) { if(skipForAnalysis(candidate, m, props)) continue; if(candidate == m) continue; if(!aType.contains(candidate.mentionType)) continue; if(m.mentionType == MentionType.PRONOMINAL) { if(!matchedMentionType(m, mTypeStr)) continue; if(!matchedMentionType(candidate, aTypeStr)) continue; } if(sentDist==0 && m.appearEarlierThan(candidate)) continue; // ignore cataphora mentionDist++; RVFDatum<Boolean, String> datum = extractDatum(m, candidate, document, mentionDist, dict, props, sievename); double probTrue = 0; if(this.classifierType == ClassifierType.RF) { probTrue = this.rf.probabilityOfTrue(datum); } probs.setCount(candidate.mentionID, probTrue); } } if(HybridCorefProperties.debug(props)) { sbLog.append(HybridCorefPrinter.printErrorLog(m, document, probs, mIdx, dict, this)); } if(probs.size() > 0 && Counters.max(probs) > this.thresMerge) { // merge highest prob candidate int antID = Counters.argmax(probs); Sieve.merge(document, m.mentionID, antID); } } public static RVFDatum<Boolean, String> extractDatum(Mention m, Mention candidate, Document document, int mentionDist, Dictionaries dict, Properties props, String sievename) { try { boolean label = (document.goldMentions==null)? false : document.isCoref(m, candidate); Counter<String> features = new ClassicCounter<>(); CorefCluster mC = document.corefClusters.get(m.corefClusterID); CorefCluster aC = document.corefClusters.get(candidate.corefClusterID); CoreLabel mFirst = m.sentenceWords.get(m.startIndex); CoreLabel mLast = m.sentenceWords.get(m.endIndex-1); CoreLabel mPreceding = (m.startIndex>0)? m.sentenceWords.get(m.startIndex-1) : null; CoreLabel mFollowing = (m.endIndex < m.sentenceWords.size())? m.sentenceWords.get(m.endIndex) : null; CoreLabel aFirst = candidate.sentenceWords.get(candidate.startIndex); CoreLabel aLast = candidate.sentenceWords.get(candidate.endIndex-1); CoreLabel aPreceding = (candidate.startIndex>0)? candidate.sentenceWords.get(candidate.startIndex-1) : null; CoreLabel aFollowing = (candidate.endIndex < candidate.sentenceWords.size())? candidate.sentenceWords.get(candidate.endIndex) : null; //////////////////////////////////////////////////////////////////////////////// /////// basic features: distance, doctype, mention length, roles //////////// //////////////////////////////////////////////////////////////////////////////// if(HybridCorefProperties.useBasicFeatures(props, sievename)) { int sentDist = m.sentNum - candidate.sentNum; features.incrementCount("SENTDIST", sentDist); features.incrementCount("MENTIONDIST", mentionDist); int minSentDist = sentDist; for(Mention a : aC.corefMentions) { minSentDist = Math.min(minSentDist, Math.abs(m.sentNum - a.sentNum)); } features.incrementCount("MINSENTDIST", minSentDist); // When they are in the same sentence, divides a sentence into clauses and add such feature if(CorefProperties.useConstituencyParse(props)) { if(m.sentNum == candidate.sentNum) { int clauseCount = 0; Tree tree = m.contextParseTree; Tree current = m.mentionSubTree; while(true){ current = current.ancestor(1, tree); if(current.label().value().startsWith("S")) { clauseCount++; } if(current.dominates(candidate.mentionSubTree)) break; if(current.label().value().equals("ROOT") || current.ancestor(1, tree)==null) break; } features.incrementCount("CLAUSECOUNT", clauseCount); } } if(document.docType == DocType.CONVERSATION) features.incrementCount("B-DOCTYPE-"+document.docType); if(m.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0")) { features.incrementCount("B-SPEAKER-PER0"); } if(document.docInfo!=null && document.docInfo.containsKey("DOC_ID")) { features.incrementCount("B-DOCSOURCE-"+document.docInfo.get("DOC_ID").split("/")[1]); } features.incrementCount("M-LENGTH", m.originalSpan.size()); features.incrementCount("A-LENGTH", candidate.originalSpan.size()); if(m.originalSpan.size() < candidate.originalSpan.size()) features.incrementCount("B-A-ISLONGER"); features.incrementCount("A-SIZE", aC.getCorefMentions().size()); features.incrementCount("M-SIZE", mC.getCorefMentions().size()); String antRole = "A-NOROLE"; String mRole = "M-NOROLE"; if(m.isSubject) mRole = "M-SUBJ"; if(m.isDirectObject) mRole = "M-DOBJ"; if(m.isIndirectObject) mRole = "M-IOBJ"; if(m.isPrepositionObject) mRole = "M-POBJ"; if(candidate.isSubject) antRole = "A-SUBJ"; if(candidate.isDirectObject) antRole = "A-DOBJ"; if(candidate.isIndirectObject) antRole = "A-IOBJ"; if(candidate.isPrepositionObject) antRole = "A-POBJ"; features.incrementCount("B-"+mRole); features.incrementCount("B-"+antRole); features.incrementCount("B-"+antRole+"-"+mRole); if(HybridCorefProperties.combineObjectRoles(props, sievename)) { // combine all objects if(m.isDirectObject || m.isIndirectObject || m.isPrepositionObject || candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) { if(m.isDirectObject || m.isIndirectObject || m.isPrepositionObject) { mRole = "M-OBJ"; features.incrementCount("B-M-OBJ"); } if(candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) { antRole = "A-OBJ"; features.incrementCount("B-A-OBJ"); } features.incrementCount("B-"+antRole+"-"+mRole); } } if(mFirst.word().toLowerCase().matches("a|an")) { features.incrementCount("B-M-START-WITH-INDEFINITE"); } if(aFirst.word().toLowerCase().matches("a|an")) { features.incrementCount("B-A-START-WITH-INDEFINITE"); } if(mFirst.word().equalsIgnoreCase("the")) { features.incrementCount("B-M-START-WITH-DEFINITE"); } if(aFirst.word().equalsIgnoreCase("the")) { features.incrementCount("B-A-START-WITH-DEFINITE"); } if(dict.indefinitePronouns.contains(m.lowercaseNormalizedSpanString())) { features.incrementCount("B-M-INDEFINITE-PRONOUN"); } if(dict.indefinitePronouns.contains(candidate.lowercaseNormalizedSpanString())) { features.incrementCount("B-A-INDEFINITE-PRONOUN"); } if(dict.indefinitePronouns.contains(mFirst.word().toLowerCase())) { features.incrementCount("B-M-INDEFINITE-ADJ"); } if(dict.indefinitePronouns.contains(aFirst.word().toLowerCase())){ features.incrementCount("B-A-INDEFINITE-ADJ"); } if(dict.reflexivePronouns.contains(m.headString)) { features.incrementCount("B-M-REFLEXIVE"); } if(dict.reflexivePronouns.contains(candidate.headString)) { features.incrementCount("B-A-REFLEXIVE"); } if(m.headIndex == m.endIndex-1) features.incrementCount("B-M-HEADEND"); if(m.headIndex < m.endIndex-1) { CoreLabel headnext = m.sentenceWords.get(m.headIndex+1); if(headnext.word().matches("that|,") || headnext.tag().startsWith("W")) { features.incrementCount("B-M-HASPOSTPHRASE"); if(mFirst.tag().equals("DT") && mFirst.word().toLowerCase().matches("the|this|these|those")) features.incrementCount("B-M-THE-HASPOSTPHRASE"); else if(mFirst.word().toLowerCase().matches("a|an")) features.incrementCount("B-M-INDEFINITE-HASPOSTPHRASE"); } } // shape feature from Bjorkelund & Kuhn StringBuilder sb = new StringBuilder(); List<Mention> sortedMentions = new ArrayList<>(aC.corefMentions.size()); sortedMentions.addAll(aC.corefMentions); Collections.sort(sortedMentions, new CorefChain.MentionComparator()); for(Mention a : sortedMentions) { sb.append(a.mentionType).append("-"); } features.incrementCount("B-A-SHAPE-"+sb.toString()); sb = new StringBuilder(); sortedMentions = new ArrayList<>(mC.corefMentions.size()); sortedMentions.addAll(mC.corefMentions); Collections.sort(sortedMentions, new CorefChain.MentionComparator()); for(Mention men : sortedMentions) { sb.append(men.mentionType).append("-"); } features.incrementCount("B-M-SHAPE-"+sb.toString()); if(CorefProperties.useConstituencyParse(props)) { sb = new StringBuilder(); Tree mTree = m.contextParseTree; Tree mHead = mTree.getLeaves().get(m.headIndex).ancestor(1, mTree); for(Tree node : mTree.pathNodeToNode(mHead, mTree)){ sb.append(node.value()).append("-"); if(node.value().equals("S")) break; } features.incrementCount("B-M-SYNPATH-"+sb.toString()); sb = new StringBuilder(); Tree aTree = candidate.contextParseTree; Tree aHead = aTree.getLeaves().get(candidate.headIndex).ancestor(1, aTree); for(Tree node : aTree.pathNodeToNode(aHead, aTree)){ sb.append(node.value()).append("-"); if(node.value().equals("S")) break; } features.incrementCount("B-A-SYNPATH-"+sb.toString()); } features.incrementCount("A-FIRSTAPPEAR", aC.representative.sentNum); features.incrementCount("M-FIRSTAPPEAR", mC.representative.sentNum); int docSize = document.predictedMentions.size(); // document size in # of sentences features.incrementCount("A-FIRSTAPPEAR-NORMALIZED", aC.representative.sentNum/docSize); features.incrementCount("M-FIRSTAPPEAR-NORMALIZED", mC.representative.sentNum/docSize); } //////////////////////////////////////////////////////////////////////////////// /////// mention detection features //////////// //////////////////////////////////////////////////////////////////////////////// if(HybridCorefProperties.useMentionDetectionFeatures(props, sievename)) { // bare plurals if(m.originalSpan.size()==1 && m.headWord.tag().equals("NNS")) features.incrementCount("B-M-BAREPLURAL"); if(candidate.originalSpan.size()==1 && candidate.headWord.tag().equals("NNS")) features.incrementCount("B-A-BAREPLURAL"); // pleonastic it if(CorefProperties.useConstituencyParse(props)) { if(RuleBasedCorefMentionFinder.isPleonastic(m, m.contextParseTree) || RuleBasedCorefMentionFinder.isPleonastic(candidate, candidate.contextParseTree)) { features.incrementCount("B-PLEONASTICIT"); } } // quantRule if(dict.quantifiers.contains(mFirst.word().toLowerCase(Locale.ENGLISH))) features.incrementCount("B-M-QUANTIFIER"); if(dict.quantifiers.contains(aFirst.word().toLowerCase(Locale.ENGLISH))) features.incrementCount("B-A-QUANTIFIER"); // starts with negation if(mFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not") || aFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not")) { features.incrementCount("B-NEGATIVE-START"); } // parititive rule if(RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dict)) features.incrementCount("B-M-PARTITIVE"); if(RuleBasedCorefMentionFinder.partitiveRule(candidate, candidate.sentenceWords, dict)) features.incrementCount("B-A-PARTITIVE"); // % if(m.headString.equals("%")) features.incrementCount("B-M-HEAD%"); if(candidate.headString.equals("%")) features.incrementCount("B-A-HEAD%"); // adjective form of nations if(dict.isAdjectivalDemonym(m.spanToString())) features.incrementCount("B-M-ADJ-DEMONYM"); if(dict.isAdjectivalDemonym(candidate.spanToString())) features.incrementCount("B-A-ADJ-DEMONYM"); // ends with "etc." if(m.lowercaseNormalizedSpanString().endsWith("etc.")) features.incrementCount("B-M-ETC-END"); if(candidate.lowercaseNormalizedSpanString().endsWith("etc.")) features.incrementCount("B-A-ETC-END"); } //////////////////////////////////////////////////////////////////////////////// /////// attributes, attributes agree //////////// //////////////////////////////////////////////////////////////////////////////// features.incrementCount("B-M-NUMBER-"+m.number); features.incrementCount("B-A-NUMBER-"+candidate.number); features.incrementCount("B-M-GENDER-"+m.gender); features.incrementCount("B-A-GENDER-"+candidate.gender); features.incrementCount("B-M-ANIMACY-"+m.animacy); features.incrementCount("B-A-ANIMACY-"+candidate.animacy); features.incrementCount("B-M-PERSON-"+m.person); features.incrementCount("B-A-PERSON-"+candidate.person); features.incrementCount("B-M-NETYPE-"+m.nerString); features.incrementCount("B-A-NETYPE-"+candidate.nerString); features.incrementCount("B-BOTH-NUMBER-"+candidate.number+"-"+m.number); features.incrementCount("B-BOTH-GENDER-"+candidate.gender+"-"+m.gender); features.incrementCount("B-BOTH-ANIMACY-"+candidate.animacy+"-"+m.animacy); features.incrementCount("B-BOTH-PERSON-"+candidate.person+"-"+m.person); features.incrementCount("B-BOTH-NETYPE-"+candidate.nerString+"-"+m.nerString); Set<Number> mcNumber = Generics.newHashSet(); for(Number n : mC.numbers) { features.incrementCount("B-MC-NUMBER-"+n); mcNumber.add(n); } if(mcNumber.size()==1) { features.incrementCount("B-MC-CLUSTERNUMBER-"+mcNumber.iterator().next()); } else { mcNumber.remove(Number.UNKNOWN); if(mcNumber.size() == 1) features.incrementCount("B-MC-CLUSTERNUMBER-"+mcNumber.iterator().next()); else features.incrementCount("B-MC-CLUSTERNUMBER-CONFLICT"); } Set<Gender> mcGender = Generics.newHashSet(); for(Gender g : mC.genders) { features.incrementCount("B-MC-GENDER-"+g); mcGender.add(g); } if(mcGender.size()==1) { features.incrementCount("B-MC-CLUSTERGENDER-"+mcGender.iterator().next()); } else { mcGender.remove(Gender.UNKNOWN); if(mcGender.size() == 1) features.incrementCount("B-MC-CLUSTERGENDER-"+mcGender.iterator().next()); else features.incrementCount("B-MC-CLUSTERGENDER-CONFLICT"); } Set<Animacy> mcAnimacy = Generics.newHashSet(); for(Animacy a : mC.animacies) { features.incrementCount("B-MC-ANIMACY-"+a); mcAnimacy.add(a); } if(mcAnimacy.size()==1) { features.incrementCount("B-MC-CLUSTERANIMACY-"+mcAnimacy.iterator().next()); } else { mcAnimacy.remove(Animacy.UNKNOWN); if(mcAnimacy.size() == 1) features.incrementCount("B-MC-CLUSTERANIMACY-"+mcAnimacy.iterator().next()); else features.incrementCount("B-MC-CLUSTERANIMACY-CONFLICT"); } Set<String> mcNER = Generics.newHashSet(); for(String t : mC.nerStrings) { features.incrementCount("B-MC-NETYPE-"+t); mcNER.add(t); } if(mcNER.size()==1) { features.incrementCount("B-MC-CLUSTERNETYPE-"+mcNER.iterator().next()); } else { mcNER.remove("O"); if(mcNER.size() == 1) features.incrementCount("B-MC-CLUSTERNETYPE-"+mcNER.iterator().next()); else features.incrementCount("B-MC-CLUSTERNETYPE-CONFLICT"); } Set<Number> acNumber = Generics.newHashSet(); for(Number n : aC.numbers) { features.incrementCount("B-AC-NUMBER-"+n); acNumber.add(n); } if(acNumber.size()==1) { features.incrementCount("B-AC-CLUSTERNUMBER-"+acNumber.iterator().next()); } else { acNumber.remove(Number.UNKNOWN); if(acNumber.size() == 1) features.incrementCount("B-AC-CLUSTERNUMBER-"+acNumber.iterator().next()); else features.incrementCount("B-AC-CLUSTERNUMBER-CONFLICT"); } Set<Gender> acGender = Generics.newHashSet(); for(Gender g : aC.genders) { features.incrementCount("B-AC-GENDER-"+g); acGender.add(g); } if(acGender.size()==1) { features.incrementCount("B-AC-CLUSTERGENDER-"+acGender.iterator().next()); } else { acGender.remove(Gender.UNKNOWN); if(acGender.size() == 1) features.incrementCount("B-AC-CLUSTERGENDER-"+acGender.iterator().next()); else features.incrementCount("B-AC-CLUSTERGENDER-CONFLICT"); } Set<Animacy> acAnimacy = Generics.newHashSet(); for(Animacy a : aC.animacies) { features.incrementCount("B-AC-ANIMACY-"+a); acAnimacy.add(a); } if(acAnimacy.size()==1) { features.incrementCount("B-AC-CLUSTERANIMACY-"+acAnimacy.iterator().next()); } else { acAnimacy.remove(Animacy.UNKNOWN); if(acAnimacy.size() == 1) features.incrementCount("B-AC-CLUSTERANIMACY-"+acAnimacy.iterator().next()); else features.incrementCount("B-AC-CLUSTERANIMACY-CONFLICT"); } Set<String> acNER = Generics.newHashSet(); for(String t : aC.nerStrings) { features.incrementCount("B-AC-NETYPE-"+t); acNER.add(t); } if(acNER.size()==1) { features.incrementCount("B-AC-CLUSTERNETYPE-"+acNER.iterator().next()); } else { acNER.remove("O"); if(acNER.size() == 1) features.incrementCount("B-AC-CLUSTERNETYPE-"+acNER.iterator().next()); else features.incrementCount("B-AC-CLUSTERNETYPE-CONFLICT"); } if(m.numbersAgree(candidate)) features.incrementCount("B-NUMBER-AGREE"); if(m.gendersAgree(candidate)) features.incrementCount("B-GENDER-AGREE"); if(m.animaciesAgree(candidate)) features.incrementCount("B-ANIMACY-AGREE"); if(CorefRules.entityAttributesAgree(mC, aC)) features.incrementCount("B-ATTRIBUTES-AGREE"); if(CorefRules.entityPersonDisagree(document, m, candidate, dict)) features.incrementCount("B-PERSON-DISAGREE"); //////////////////////////////////////////////////////////////////////////////// /////// dcoref rules //////////// //////////////////////////////////////////////////////////////////////////////// if(HybridCorefProperties.useDcorefRules(props, sievename)) { if(CorefRules.entityIWithinI(m, candidate, dict)) features.incrementCount("B-i-within-i"); if(CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) features.incrementCount("B-ANT-IS-SPEAKER"); if(CorefRules.entitySameSpeaker(document, m, candidate)) features.incrementCount("B-SAME-SPEAKER"); if(CorefRules.entitySubjectObject(m, candidate)) features.incrementCount("B-SUBJ-OBJ"); for(Mention a : aC.corefMentions) { if(CorefRules.entitySubjectObject(m, a)) features.incrementCount("B-CLUSTER-SUBJ-OBJ"); } if(CorefRules.entityPersonDisagree(document, m, candidate, dict) && CorefRules.entitySameSpeaker(document, m, candidate)) features.incrementCount("B-PERSON-DISAGREE-SAME-SPEAKER"); if(CorefRules.entityIWithinI(mC, aC, dict)) features.incrementCount("B-ENTITY-IWITHINI"); if(CorefRules.antecedentMatchesMentionSpeakerAnnotation(m, candidate, document)) features.incrementCount("B-ANT-IS-SPEAKER-OF-MENTION"); Set<MentionType> mType = HybridCorefProperties.getMentionType(props, sievename); if(mType.contains(MentionType.PROPER) || mType.contains(MentionType.NOMINAL)) { if(m.headString.equals(candidate.headString)) features.incrementCount("B-HEADMATCH"); if(CorefRules.entityHeadsAgree(mC, aC, m, candidate, dict)) features.incrementCount("B-HEADSAGREE"); if(CorefRules.entityExactStringMatch(mC, aC, dict, document.roleSet)) features.incrementCount("B-EXACTSTRINGMATCH"); if(CorefRules.entityHaveExtraProperNoun(m, candidate, new HashSet<>())) features.incrementCount("B-HAVE-EXTRA-PROPER-NOUN"); if(CorefRules.entityBothHaveProper(mC, aC)) features.incrementCount("B-BOTH-HAVE-PROPER"); if(CorefRules.entityHaveDifferentLocation(m, candidate, dict)) features.incrementCount("B-HAVE-DIFF-LOC"); if(CorefRules.entityHaveIncompatibleModifier(mC, aC)) features.incrementCount("B-HAVE-INCOMPATIBLE-MODIFIER"); if(CorefRules.entityIsAcronym(document, mC, aC)) features.incrementCount("B-IS-ACRONYM"); if(CorefRules.entityIsApposition(mC, aC, m, candidate)) features.incrementCount("B-IS-APPOSITION"); if(CorefRules.entityIsPredicateNominatives(mC, aC, m, candidate)) features.incrementCount("B-IS-PREDICATE-NOMINATIVES"); if(CorefRules.entityIsRoleAppositive(mC, aC, m, candidate, dict)) features.incrementCount("B-IS-ROLE-APPOSITIVE"); if(CorefRules.entityNumberInLaterMention(m, candidate)) features.incrementCount("B-NUMBER-IN-LATER"); if(CorefRules.entityRelaxedExactStringMatch(mC, aC, m, candidate, dict, document.roleSet)) features.incrementCount("B-RELAXED-EXACT-STRING-MATCH"); if(CorefRules.entityRelaxedHeadsAgreeBetweenMentions(mC, aC, m, candidate)) features.incrementCount("B-RELAXED-HEAD-AGREE"); if(CorefRules.entitySameProperHeadLastWord(m, candidate)) features.incrementCount("B-SAME-PROPER-HEAD"); if(CorefRules.entitySameProperHeadLastWord(mC, aC, m, candidate)) features.incrementCount("B-CLUSTER-SAME-PROPER-HEAD"); if(CorefRules.entityWordsIncluded(mC, aC, m, candidate)) features.incrementCount("B-WORD-INCLUSION"); } if(mType.contains(MentionType.LIST)) { features.incrementCount("NUM-LIST-", numEntitiesInList(m)); if(m.spanToString().contains("two") || m.spanToString().contains("2") || m.spanToString().contains("both")) features.incrementCount("LIST-M-TWO"); if(m.spanToString().contains("three") || m.spanToString().contains("3")) features.incrementCount("LIST-M-THREE"); if(candidate.spanToString().contains("two") || candidate.spanToString().contains("2") || candidate.spanToString().contains("both")) { features.incrementCount("B-LIST-A-TWO"); } if(candidate.spanToString().contains("three") || candidate.spanToString().contains("3")) { features.incrementCount("B-LIST-A-THREE"); } } if(mType.contains(MentionType.PRONOMINAL)) { if(dict.firstPersonPronouns.contains(m.headString)) features.incrementCount("B-M-I"); if(dict.secondPersonPronouns.contains(m.headString)) features.incrementCount("B-M-YOU"); if(dict.thirdPersonPronouns.contains(m.headString)) features.incrementCount("B-M-3RDPERSON"); if(dict.possessivePronouns.contains(m.headString)) features.incrementCount("B-M-POSSESSIVE"); if(dict.neutralPronouns.contains(m.headString)) features.incrementCount("B-M-NEUTRAL"); if(dict.malePronouns.contains(m.headString)) features.incrementCount("B-M-MALE"); if(dict.femalePronouns.contains(m.headString)) features.incrementCount("B-M-FEMALE"); if(dict.firstPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-I"); if(dict.secondPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-YOU"); if(dict.thirdPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-3RDPERSON"); if(dict.possessivePronouns.contains(candidate.headString)) features.incrementCount("B-A-POSSESSIVE"); if(dict.neutralPronouns.contains(candidate.headString)) features.incrementCount("B-A-NEUTRAL"); if(dict.malePronouns.contains(candidate.headString)) features.incrementCount("B-A-MALE"); if(dict.femalePronouns.contains(candidate.headString)) features.incrementCount("B-A-FEMALE"); features.incrementCount("B-M-GENERIC-"+m.generic); features.incrementCount("B-A-GENERIC-"+candidate.generic); if(HybridCorefPrinter.dcorefPronounSieve.skipThisMention(document, m, mC, dict)) { features.incrementCount("B-SKIPTHISMENTION-true"); } if(m.spanToString().equalsIgnoreCase("you") && mFollowing!=null && mFollowing.word().equalsIgnoreCase("know")) { features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((mPreceding==null)? "NULL" : mPreceding.tag()) ); features.incrementCount("B-YOUKNOW-PRECEDING-WORD-"+ ((mPreceding==null)? "NULL" : mPreceding.word().toLowerCase()) ); CoreLabel nextword = (m.endIndex+1 < m.sentenceWords.size())? m.sentenceWords.get(m.endIndex+1) : null; features.incrementCount("B-YOUKNOW-FOLLOWING-POS-" + ((nextword==null)? "NULL" : nextword.tag()) ); features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-"+ ((nextword==null)? "NULL" : nextword.word().toLowerCase()) ); } if(candidate.spanToString().equalsIgnoreCase("you") && aFollowing!=null && aFollowing.word().equalsIgnoreCase("know")) { features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((aPreceding==null)? "NULL" : aPreceding.tag()) ); features.incrementCount("B-YOUKNOW-PRECEDING-WORD-"+ ((aPreceding==null)? "NULL" : aPreceding.word().toLowerCase()) ); CoreLabel nextword = (candidate.endIndex+1 < candidate.sentenceWords.size())? candidate.sentenceWords.get(candidate.endIndex+1) : null; features.incrementCount("B-YOUKNOW-FOLLOWING-POS-" + ((nextword==null)? "NULL" : nextword.tag()) ); features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-"+ ((nextword==null)? "NULL" : nextword.word().toLowerCase()) ); } } // discourse match features if(m.person==Person.YOU && document.docType==DocType.ARTICLE && m.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0")) { features.incrementCount("B-DISCOURSE-M-YOU-GENERIC?"); } if(candidate.generic && candidate.person==Person.YOU) features.incrementCount("B-DISCOURSE-A-YOU-GENERIC?"); String mString = m.lowercaseNormalizedSpanString(); String antString = candidate.lowercaseNormalizedSpanString(); // I-I if(m.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString) && candidate.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, m, candidate)) { features.incrementCount("B-DISCOURSE-I-I-SAMESPEAKER"); } // (speaker - I) if ((m.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString)) && CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) { features.incrementCount("B-DISCOURSE-SPEAKER-I"); } // (I - speaker) if ((candidate.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString)) && CorefRules.antecedentIsMentionSpeaker(document, candidate, m, dict)) { features.incrementCount("B-DISCOURSE-I-SPEAKER"); } // Can be iffy if more than two speakers... but still should be okay most of the time if (dict.secondPersonPronouns.contains(mString) && dict.secondPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, m, candidate)) { features.incrementCount("B-DISCOURSE-BOTH-YOU"); } // previous I - you or previous you - I in two person conversation if (((m.person==Person.I && candidate.person==Person.YOU || (m.person==Person.YOU && candidate.person==Person.I)) && (m.headWord.get(CoreAnnotations.UtteranceAnnotation.class)-candidate.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) && document.docType==DocType.CONVERSATION)) { features.incrementCount("B-DISCOURSE-I-YOU"); } if (dict.reflexivePronouns.contains(m.headString) && CorefRules.entitySubjectObject(m, candidate)){ features.incrementCount("B-DISCOURSE-REFLEXIVE"); } if(m.person==Person.I && candidate.person==Person.I && !CorefRules.entitySameSpeaker(document, m, candidate)) { features.incrementCount("B-DISCOURSE-I-I-DIFFSPEAKER"); } if(m.person==Person.YOU && candidate.person==Person.YOU && !CorefRules.entitySameSpeaker(document, m, candidate)) { features.incrementCount("B-DISCOURSE-YOU-YOU-DIFFSPEAKER"); } if(m.person==Person.WE && candidate.person==Person.WE && !CorefRules.entitySameSpeaker(document, m, candidate)) { features.incrementCount("B-DISCOURSE-WE-WE-DIFFSPEAKER"); } } //////////////////////////////////////////////////////////////////////////////// /////// POS features //////////// //////////////////////////////////////////////////////////////////////////////// if(HybridCorefProperties.usePOSFeatures(props, sievename)) { features.incrementCount("B-LEXICAL-M-HEADPOS-"+m.headWord.tag()); features.incrementCount("B-LEXICAL-A-HEADPOS-"+candidate.headWord.tag()); features.incrementCount("B-LEXICAL-M-FIRSTPOS-"+mFirst.tag()); features.incrementCount("B-LEXICAL-A-FIRSTPOS-"+aFirst.tag()); features.incrementCount("B-LEXICAL-M-LASTPOS-"+mLast.tag()); features.incrementCount("B-LEXICAL-A-LASTPOS-"+aLast.tag()); features.incrementCount("B-LEXICAL-M-PRECEDINGPOS-"+ ((mPreceding==null)? "NULL" : mPreceding.tag()) ); features.incrementCount("B-LEXICAL-A-PRECEDINGPOS-"+ ((aPreceding==null)? "NULL" : aPreceding.tag()) ); features.incrementCount("B-LEXICAL-M-FOLLOWINGPOS-"+ ((mFollowing==null)? "NULL" : mFollowing.tag()) ); features.incrementCount("B-LEXICAL-A-FOLLOWINGPOS-"+ ((aFollowing==null)? "NULL" : aFollowing.tag()) ); } //////////////////////////////////////////////////////////////////////////////// /////// lexical features //////////// //////////////////////////////////////////////////////////////////////////////// if(HybridCorefProperties.useLexicalFeatures(props, sievename)) { features.incrementCount("B-LEXICAL-M-HEADWORD-"+m.headString.toLowerCase()); features.incrementCount("B-LEXICAL-A-HEADWORD-"+candidate.headString.toLowerCase()); features.incrementCount("B-LEXICAL-M-FIRSTWORD-"+mFirst.word().toLowerCase()); features.incrementCount("B-LEXICAL-A-FIRSTWORD-"+aFirst.word().toLowerCase()); features.incrementCount("B-LEXICAL-M-LASTWORD-"+mLast.word().toLowerCase()); features.incrementCount("B-LEXICAL-A-LASTWORD-"+aLast.word().toLowerCase()); features.incrementCount("B-LEXICAL-M-PRECEDINGWORD-"+ ((mPreceding==null)? "NULL" : mPreceding.word().toLowerCase()) ); features.incrementCount("B-LEXICAL-A-PRECEDINGWORD-"+ ((aPreceding==null)? "NULL" : aPreceding.word().toLowerCase()) ); features.incrementCount("B-LEXICAL-M-FOLLOWINGWORD-"+ ((mFollowing==null)? "NULL" : mFollowing.word().toLowerCase()) ); features.incrementCount("B-LEXICAL-A-FOLLOWINGWORD-"+ ((aFollowing==null)? "NULL" : aFollowing.word().toLowerCase()) ); //extra headword, modifiers lexical features for(String mHead : mC.heads) { if(!aC.heads.contains(mHead)) features.incrementCount("B-LEXICAL-MC-EXTRAHEAD-"+mHead); } for(String mWord : mC.words) { if(!aC.words.contains(mWord)) features.incrementCount("B-LEXICAL-MC-EXTRAWORD-"+mWord); } } //////////////////////////////////////////////////////////////////////////////// /////// word vector features //////////// //////////////////////////////////////////////////////////////////////////////// // cosine if(HybridCorefProperties.useWordEmbedding(props, sievename)) { // dimension int dim = dict.vectors.entrySet().iterator().next().getValue().length; // distance between headword float[] mV = dict.vectors.get(m.headString.toLowerCase()); float[] aV = dict.vectors.get(candidate.headString.toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-HEADWORD", cosine(mV, aV)); } mV = dict.vectors.get(mFirst.word().toLowerCase()); aV = dict.vectors.get(aFirst.word().toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-FIRSTWORD", cosine(mV, aV)); } mV = dict.vectors.get(mLast.word().toLowerCase()); aV = dict.vectors.get(aLast.word().toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-LASTWORD", cosine(mV, aV)); } if(mPreceding!=null && aPreceding!=null) { mV = dict.vectors.get(mPreceding.word().toLowerCase()); aV = dict.vectors.get(aPreceding.word().toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-PRECEDINGWORD", cosine(mV, aV)); } } if(mFollowing!=null && aFollowing!=null) { mV = dict.vectors.get(mFollowing.word().toLowerCase()); aV = dict.vectors.get(aFollowing.word().toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-FOLLOWINGWORD", cosine(mV, aV)); } } float[] aggreM = new float[dim]; float[] aggreA = new float[dim]; for(CoreLabel cl : m.originalSpan) { float[] v = dict.vectors.get(cl.word().toLowerCase()); if(v==null) continue; ArrayMath.pairwiseAddInPlace(aggreM, v); } for(CoreLabel cl : candidate.originalSpan) { float[] v = dict.vectors.get(cl.word().toLowerCase()); if(v==null) continue; ArrayMath.pairwiseAddInPlace(aggreA, v); } if(ArrayMath.L2Norm(aggreM)!=0 && ArrayMath.L2Norm(aggreA)!=0) { features.incrementCount("WORDVECTOR-AGGREGATE-DIFF", cosine(aggreM, aggreA)); } int cnt = 0; double dist = 0; for(CoreLabel mcl : m.originalSpan) { for(CoreLabel acl : candidate.originalSpan) { mV = dict.vectors.get(mcl.word().toLowerCase()); aV = dict.vectors.get(acl.word().toLowerCase()); if(mV==null || aV==null) continue; cnt++; dist += cosine(mV, aV); } } features.incrementCount("WORDVECTOR-AVG-DIFF", dist/cnt); } return new RVFDatum<>(features, label); } catch (Exception e) { log.info("Datum Extraction failed in Sieve.java while processing document: "+document.docInfo.get("DOC_ID")+" part: "+document.docInfo.get("DOC_PART")); throw new RuntimeException(e); } } // assume the input vectors are normalized private static double cosine(float[] normalizedVector1, float[] normalizedVector2) { double inner = ArrayMath.innerProduct(normalizedVector1, normalizedVector2); return inner; } public static int numEntitiesInList(Mention m) { int num = 0; for(int i=1 ; i < m.originalSpan.size() ; i++) { CoreLabel cl = m.originalSpan.get(i); if(cl.word().equals(",")) num++; if((cl.word().equalsIgnoreCase("and") || cl.word().equalsIgnoreCase("or")) && !m.originalSpan.get(i-1).word().equals(",")) num++; } return num; } }