package edu.stanford.nlp.dcoref;

import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationPipeline;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.*;

/**
 * Rule-based mention detection for coreference resolution.
 *
 * <p>Extracts candidate mentions from each sentence — pre-marked entity spans,
 * named-entity spans, NP/PRP constituents, and enumerations — finds a syntactic
 * head for each, and then removes spurious mentions with hand-written filter
 * rules (pleonastic "it", partitives, bare NPs, stop-listed spans, etc.).
 */
public class RuleBasedCorefMentionFinder implements CorefMentionFinder {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(RuleBasedCorefMentionFinder.class);

  /**
   * NE labels that never form a mention on their own (the "O" tag plus numeric
   * and temporal entity types). Hoisted to a precompiled Pattern so the regex
   * is compiled once instead of on every NE transition via String.matches.
   */
  private static final Pattern NON_MENTION_NE = Pattern.compile("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET");

  /** Whether {@link #extractPredictedMentions} assigns fresh mention IDs. */
  protected boolean assignIds = true;
  //  protected int maxID = -1;

  /** Head finder used by {@link #safeHead}; fixed to SemanticHeadFinder in the constructor. */
  private final HeadFinder headFinder;

  /** Lazily created parser (possibly pos+parse pipeline); see {@link #getParser}. */
  protected Annotator parserProcessor;

  /** If true, mentions with no exact tree span are re-parsed in a dummy context to find a head. */
  private final boolean allowReparsing;

  public RuleBasedCorefMentionFinder() {
    this(Constants.ALLOW_REPARSING);
  }

  public RuleBasedCorefMentionFinder(boolean allowReparsing) {
    SieveCoreferenceSystem.logger.fine("Using SEMANTIC HEAD FINDER!!!!!!!!!!!!!!!!!!!");
    this.headFinder = new SemanticHeadFinder();
    this.allowReparsing = allowReparsing;
  }

  /**
   * When mention boundaries are given: keeps the gold mention spans but runs
   * head finding, bare-plural marking, and the spurious-mention filters over them.
   *
   * @param allGoldMentions per-sentence gold mentions (parallel to the doc's sentences)
   * @param doc the annotated document supplying sentences/trees
   * @param dict dictionaries used by the filter rules
   * @return per-sentence filtered mention lists
   */
  public List<List<Mention>> filterPredictedMentions(List<List<Mention>> allGoldMentions, Annotation doc, Dictionaries dict){
    List<List<Mention>> predictedMentions = new ArrayList<>();

    for(int i = 0 ; i < allGoldMentions.size(); i++){
      CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
      List<Mention> goldMentions = allGoldMentions.get(i);
      List<Mention> mentions = new ArrayList<>();
      predictedMentions.add(mentions);
      mentions.addAll(goldMentions);
      findHead(s, mentions);

      // todo [cdm 2013]: This block seems to do nothing - the two sets are never used
      Set<IntPair> mentionSpanSet = Generics.newHashSet();
      Set<IntPair> namedEntitySpanSet = Generics.newHashSet();
      for(Mention m : mentions) {
        mentionSpanSet.add(new IntPair(m.startIndex, m.endIndex));
        if(!m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O")) {
          namedEntitySpanSet.add(new IntPair(m.startIndex, m.endIndex));
        }
      }

      setBarePlural(mentions);
      removeSpuriousMentions(s, mentions, dict);
    }
    return predictedMentions;
  }

  /** Main method of mention detection.
   *  Extract all NP, PRP or NE, and filter out by manually written patterns.
   */
  @Override
  public List<List<Mention>> extractPredictedMentions(Annotation doc, int maxID, Dictionaries dict) {
    //    this.maxID = _maxID;
    List<List<Mention>> predictedMentions = new ArrayList<>();
    for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {

      List<Mention> mentions = new ArrayList<>();
      predictedMentions.add(mentions);
      // shared across the extractors so later extractors skip already-covered spans
      Set<IntPair> mentionSpanSet = Generics.newHashSet();
      Set<IntPair> namedEntitySpanSet = Generics.newHashSet();

      extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
      extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
      extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet);
      extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet);
      findHead(s, mentions);
      setBarePlural(mentions);
      removeSpuriousMentions(s, mentions, dict);
    }

    // assign mention IDs
    if(assignIds) assignMentionIDs(predictedMentions, maxID);

    return predictedMentions;
  }

  /** Gives every mention a fresh ID, numbering upwards from {@code maxID + 1}. */
  protected static void assignMentionIDs(List<List<Mention>> predictedMentions, int maxID) {
    for(List<Mention> mentions : predictedMentions) {
      for(Mention m : mentions) {
        m.mentionID = (++maxID);
      }
    }
  }

  /** Marks single-token plural-noun (NNS) mentions as generic. */
  protected static void setBarePlural(List<Mention> mentions) {
    for (Mention m : mentions) {
      String pos = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      if(m.originalSpan.size()==1 && pos.equals("NNS")) m.generic = true;
    }
  }

  /**
   * Extracts mentions whose boundaries were pre-marked with
   * MentionTokenAnnotation (MultiTokenTag start/end markers) on the tokens.
   */
  protected static void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    int beginIndex = -1;  // -1 means no open mention
    for(CoreLabel w : sent) {
      MultiTokenTag t = w.get(CoreAnnotations.MentionTokenAnnotation.class);
      if (t != null) {
        // Part of a mention
        if (t.isStart()) {
          // Start of mention
          beginIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
        }
        if (t.isEnd()) {
          // end of mention
          int endIndex = w.get(CoreAnnotations.IndexAnnotation.class);
          if (beginIndex >= 0) {
            IntPair mSpan = new IntPair(beginIndex, endIndex);
            int dummyMentionId = -1;
            Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency,
                    new ArrayList<>(sent.subList(beginIndex, endIndex)));
            mentions.add(m);
            mentionSpanSet.add(mSpan);
            beginIndex = -1;
          } else {
            SieveCoreferenceSystem.logger.warning("Start of marked mention not found in sentence: " +
                    t + " at tokenIndex=" + (w.get(CoreAnnotations.IndexAnnotation.class)-1)+ " for " +
                    s.get(CoreAnnotations.TextAnnotation.class));
          }
        }
      }
    }
  }

  /**
   * Extracts maximal same-NE-tag token runs as mentions, skipping the
   * numeric/temporal tags in {@link #NON_MENTION_NE}. A trailing "'s" token is
   * folded into the preceding NE span.
   */
  protected static void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    String preNE = "O";
    int beginIndex = -1;
    for(CoreLabel w : sent) {
      String nerString = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if(!nerString.equals(preNE)) {
        // NE tag changed: close off the previous run [beginIndex, endIndex)
        int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
        if(!NON_MENTION_NE.matcher(preNE).matches()){
          if(w.get(CoreAnnotations.TextAnnotation.class).equals("'s")) endIndex++;
          IntPair mSpan = new IntPair(beginIndex, endIndex);
          // Need to check if beginIndex < endIndex because, for
          // example, there could be a 's mislabeled by the NER and
          // attached to the previous NER by the earlier heuristic
          if(beginIndex < endIndex && !mentionSpanSet.contains(mSpan)) {
            int dummyMentionId = -1;
            Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency,
                    new ArrayList<>(sent.subList(beginIndex, endIndex)));
            mentions.add(m);
            mentionSpanSet.add(mSpan);
            namedEntitySpanSet.add(mSpan);
          }
        }
        beginIndex = endIndex;
        preNE = nerString;
      }
    }
    // NE at the end of sentence
    if(!NON_MENTION_NE.matcher(preNE).matches()) {
      IntPair mSpan = new IntPair(beginIndex, sent.size());
      if(!mentionSpanSet.contains(mSpan)) {
        int dummyMentionId = -1;
        Mention m = new Mention(dummyMentionId, beginIndex, sent.size(), dependency,
                new ArrayList<>(sent.subList(beginIndex, sent.size())));
        mentions.add(m);
        mentionSpanSet.add(mSpan);
        namedEntitySpanSet.add(mSpan);
      }
    }
  }

  private static final TregexPattern npOrPrpMentionPattern = TregexPattern.compile("/^(?:NP|PRP)/");

  /**
   * Extracts every NP or PRP constituent of the parse tree as a candidate
   * mention, unless its span is already covered or lies inside a named entity.
   */
  protected static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    tree.indexLeaves();
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

    TregexPattern tgrepPattern = npOrPrpMentionPattern;
    TregexMatcher matcher = tgrepPattern.matcher(tree);
    while (matcher.find()) {
      Tree t = matcher.getMatch();
      List<Tree> mLeaves = t.getLeaves();
      int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
      if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
      IntPair mSpan = new IntPair(beginIdx, endIdx);
      if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
        int dummyMentionId = -1;
        Mention m = new Mention(dummyMentionId, beginIdx, endIdx, dependency,
                new ArrayList<>(sent.subList(beginIdx, endIdx)), t);
        mentions.add(m);
        mentionSpanSet.add(mSpan);
      }
    }
  }

  /** Extract enumerations (A, B, and C) */
  private static final TregexPattern enumerationsMentionPattern = TregexPattern.compile("NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))");

  /**
   * Extracts the conjuncts of coordinated NPs ("A, B, and C") as separate
   * mentions, skipping spans already covered or inside a named entity.
   */
  protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

    TregexPattern tgrepPattern = enumerationsMentionPattern;
    TregexMatcher matcher = tgrepPattern.matcher(tree);
    Map<IntPair, Tree> spanToMentionSubTree = Generics.newHashMap();
    while (matcher.find()) {
      matcher.getMatch();
      Tree m1 = matcher.getNode("m1");
      Tree m2 = matcher.getNode("m2");

      List<Tree> mLeaves = m1.getLeaves();
      int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
      spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m1);

      mLeaves = m2.getLeaves();
      beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
      spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m2);
    }

    for(IntPair mSpan : spanToMentionSubTree.keySet()){
      if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
        int dummyMentionId = -1;
        Mention m = new Mention(dummyMentionId, mSpan.get(0), mSpan.get(1), dependency,
                new ArrayList<>(sent.subList(mSpan.get(0), mSpan.get(1))), spanToMentionSubTree.get(mSpan));
        mentions.add(m);
        mentionSpanSet.add(mSpan);
      }
    }
  }

  /** Check whether a mention is inside of a named entity */
  private static boolean insideNE(IntPair mSpan, Set<IntPair> namedEntitySpanSet) {
    for (IntPair span : namedEntitySpanSet){
      if(span.get(0) <= mSpan.get(0) && mSpan.get(1) <= span.get(1)) return true;
    }
    return false;
  }

  /**
   * Sets headIndex/headWord/headString on each mention from its syntactic
   * head. If the computed head falls outside the mention's span, falls back to
   * the mention's first token and logs a warning.
   */
  protected void findHead(CoreMap s, List<Mention> mentions) {
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    tree.indexSpans(0);
    for (Mention m : mentions){
      Tree head = findSyntacticHead(m, tree, sent);
      m.headIndex = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      m.headWord = sent.get(m.headIndex);
      m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
      int start = m.headIndex - m.startIndex;
      if (start < 0 || start >= m.originalSpan.size()) {
        SieveCoreferenceSystem.logger.warning("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex +
                ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
        SieveCoreferenceSystem.logger.warning("Setting head string to entire mention");
        m.headIndex = m.startIndex;
        m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
        m.headString = m.originalSpan.toString();
      }
    }
  }

  /**
   * Finds the head leaf of a mention in {@code root}. First tries an exact
   * span match in the tree; otherwise (if allowed) re-parses the mention
   * embedded in a dummy "It was ... ." sentence; otherwise falls back to the
   * lowest common ancestor span and finally a last-noun heuristic.
   */
  protected Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
    // mention ends with 's
    int endIdx = m.endIndex;
    if (m.originalSpan.size() > 0) {
      String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
      if((lastWord.equals("'s") || lastWord.equals("'"))
          && m.originalSpan.size() != 1 ) endIdx--;
    }

    Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
    //
    // found an exact match
    //
    if (exactMatch != null) {
      return safeHead(exactMatch, endIdx);
    }

    // no exact match found
    // in this case, we parse the actual extent of the mention, embedded in a sentence
    // context, so as to make the parser work better :-)
    if (allowReparsing) {
      int approximateness = 0;
      List<CoreLabel> extentTokens = new ArrayList<>();
      extentTokens.add(initCoreLabel("It"));
      extentTokens.add(initCoreLabel("was"));
      final int ADDED_WORDS = 2;
      for (int i = m.startIndex; i < endIdx; i++) {
        // Add everything except separated dashes! The separated dashes mess with the parser too badly.
        CoreLabel label = tokens.get(i);
        if ( ! "-".equals(label.word())) {
          // necessary to copy tokens in case the parser does things like
          // put new indices on the tokens
          extentTokens.add((CoreLabel) label.labelFactory().newLabel(label));
        } else {
          approximateness++;
        }
      }
      extentTokens.add(initCoreLabel("."));

      // constrain the parse to the part we're interested in.
      // Starting from ADDED_WORDS comes from skipping "It was".
      // -1 to exclude the period.
      // We now let it be any kind of nominal constituent, since there
      // are VP and S ones
      ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
      List<ParserConstraint> constraints = Collections.singletonList(constraint);
      Tree tree = parse(extentTokens, constraints);
      convertToCoreLabels(tree);  // now unnecessary, as parser uses CoreLabels?
      tree.indexSpans(m.startIndex - ADDED_WORDS);  // remember it has ADDED_WORDS extra words at the beginning
      Tree subtree = findPartialSpan(tree, m.startIndex);
      // There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
      // Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
      // passed the right end (that is, just that final period).
      Tree extentHead = safeHead(subtree, endIdx);
      assert(extentHead != null);
      // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
      // Because we deleted dashes, it's index will be >= the index in the extent parse tree
      CoreLabel l = (CoreLabel) extentHead.label();
      Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
      assert(realHead != null);
      return realHead;
    }

    // If reparsing wasn't allowed, try to find a span in the tree
    // which happens to have the head
    Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
    if (wordMatch != null) {
      Tree head = safeHead(wordMatch, endIdx);
      if (head != null) {
        int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
        if (index >= m.startIndex && index < endIdx) {
          return head;
        }
      }
    }

    // If that didn't work, guess that it's the last word
    int lastNounIdx = endIdx-1;
    for(int i=m.startIndex ; i < m.endIndex ; i++) {
      if(tokens.get(i).tag().startsWith("N")) lastNounIdx = i;
      else if(tokens.get(i).tag().startsWith("W")) break;
    }

    List<Tree> leaves = root.getLeaves();
    Tree endLeaf = leaves.get(lastNounIdx);
    return endLeaf;
  }

  /** Find the tree that covers the portion of interest. */
  private static Tree findPartialSpan(final Tree root, final int start) {
    CoreLabel label = (CoreLabel) root.label();
    int startIndex = label.get(CoreAnnotations.BeginIndexAnnotation.class);
    if (startIndex == start) {
      return root;
    }
    // descend into the child whose span contains start
    for (Tree kid : root.children()) {
      CoreLabel kidLabel = (CoreLabel) kid.label();
      int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
      int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class);
      if (kidStart <= start && kidEnd > start) {
        return findPartialSpan(kid, start);
      }
    }
    throw new RuntimeException("Shouldn't happen: " + start + " " + root);
  }

  /**
   * Finds the leaf of {@code root} whose word equals {@code token} and whose
   * index is within {@code approximateness} of {@code index} (allowing for
   * dashes deleted before reparsing). Falls back to any word match, and then
   * to the second-to-last leaf, logging warnings.
   */
  private static Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) {
    // log.info("Searching " + root + "\n  for " + token + " at position " + index + " (plus up to " + approximateness + ")");
    List<Tree> leaves = root.getLeaves();
    for (Tree leaf : leaves) {
      CoreLabel label = CoreLabel.class.cast(leaf.label());
      Integer indexInteger = label.get(CoreAnnotations.IndexAnnotation.class);
      if (indexInteger == null) continue;
      int ind = indexInteger - 1;
      if (token.equals(leaf.value()) && ind >= index && ind <= index + approximateness) {
        return leaf;
      }
    }
    // this shouldn't happen
    //    throw new RuntimeException("RuleBasedCorefMentionFinder: ERROR: Failed to find head token");
    SieveCoreferenceSystem.logger.warning("RuleBasedCorefMentionFinder: Failed to find head token:\n" +
            "Tree is: " + root + "\n" +
            "token = |" + token + "|" + index + "|, approx=" + approximateness);
    for (Tree leaf : leaves) {
      if (token.equals(leaf.value())) {
        //log.info("Found something: returning " + leaf);
        return leaf;
      }
    }
    int fallback = Math.max(0, leaves.size() - 2);
    SieveCoreferenceSystem.logger.warning("RuleBasedCorefMentionFinder: Last resort: returning as head: " + leaves.get(fallback));
    return leaves.get(fallback); // last except for the added period.
  }

  /** Builds a bare CoreLabel carrying just text/value, for the dummy reparse context. */
  private static CoreLabel initCoreLabel(String token) {
    CoreLabel label = new CoreLabel();
    label.set(CoreAnnotations.TextAnnotation.class, token);
    label.set(CoreAnnotations.ValueAnnotation.class, token);
    return label;
  }

  private Tree parse(List<CoreLabel> tokens) {
    return parse(tokens, null);
  }

  /** Parses the token list (with optional parser constraints) as a one-sentence document. */
  private Tree parse(List<CoreLabel> tokens,
                     List<ParserConstraint> constraints) {
    CoreMap sent = new Annotation("");
    sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
    Annotation doc = new Annotation("");
    List<CoreMap> sents = new ArrayList<>(1);
    sents.add(sent);
    doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
    getParser().annotate(doc);
    sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
    return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
  }

  /**
   * Lazily resolves the pipeline's existing parse annotator; if the parser
   * itself requires POS tags, wraps pos+parse in a small AnnotationPipeline.
   */
  private Annotator getParser() {
    if(parserProcessor == null){
      Annotator parser = StanfordCoreNLP.getExistingAnnotator("parse");
      if (parser == null) {
        // TODO: these assertions rule out the possibility of alternately named parse/pos annotators
        throw new AssertionError("Failed to get parser - this should not be possible");
      }
      if (parser.requires().contains(CoreAnnotations.PartOfSpeechAnnotation.class)) {
        Annotator tagger = StanfordCoreNLP.getExistingAnnotator("pos");
        if (tagger == null) {
          throw new AssertionError("Parser required tagger, but failed to find the pos annotator");
        }
        List<Annotator> annotators = Generics.newArrayList();
        annotators.add(tagger);
        annotators.add(parser);
        parserProcessor = new AnnotationPipeline(annotators);
      } else {
        parserProcessor = parser;
      }
    }
    return parserProcessor;
  }

  // This probably isn't needed now; everything is always a core label. But no-op.
  private static void convertToCoreLabels(Tree tree) {
    Label l = tree.label();
    if (! (l instanceof CoreLabel)) {
      CoreLabel cl = new CoreLabel();
      cl.setValue(l.value());
      tree.setLabel(cl);
    }

    for (Tree kid : tree.children()) {
      convertToCoreLabels(kid);
    }
  }

  /**
   * Returns the head terminal of {@code top}, refusing any head whose token
   * index is at or past {@code endIndex} (e.g. the artificially added period);
   * otherwise scans leaves right-to-left for one inside the bound, and as a
   * last resort returns {@code top} itself.
   */
  private Tree safeHead(Tree top, int endIndex) {
    // The trees passed in do not have the CoordinationTransformer
    // applied, but that just means the SemanticHeadFinder results are
    // slightly worse.
    Tree head = top.headTerminal(headFinder);
    // One obscure failure case is that the added period becomes the head. Disallow this.
    if (head != null) {
      Integer headIndexInteger = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class);
      if (headIndexInteger != null) {
        int headIndex = headIndexInteger - 1;
        if (headIndex < endIndex) {
          return head;
        }
      }
    }
    // if no head found return the right-most leaf
    List<Tree> leaves = top.getLeaves();
    int candidate = leaves.size() - 1;
    while (candidate >= 0) {
      head = leaves.get(candidate);
      Integer headIndexInteger = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class);
      if (headIndexInteger != null) {
        int headIndex = headIndexInteger - 1;
        if (headIndex < endIndex) {
          return head;
        }
      }
      candidate--;
    }
    // fallback: return top
    return top;
  }

  /** Returns the lowest tree node covering both the start and end-1 leaves. */
  static Tree findTreeWithSmallestSpan(Tree tree, int start, int end) {
    List<Tree> leaves = tree.getLeaves();
    Tree startLeaf = leaves.get(start);
    Tree endLeaf = leaves.get(end - 1);
    return Trees.getLowestCommonAncestor(Arrays.asList(startLeaf, endLeaf), tree);
  }

  /** Returns the node whose indexed span is exactly [start, end), or null. */
  private static Tree findTreeWithSpan(Tree tree, int start, int end) {
    CoreLabel l = (CoreLabel) tree.label();
    if (l != null && l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && l.containsKey(CoreAnnotations.EndIndexAnnotation.class)) {
      int myStart = l.get(CoreAnnotations.BeginIndexAnnotation.class);
      int myEnd = l.get(CoreAnnotations.EndIndexAnnotation.class);
      if (start == myStart && end == myEnd){
        // found perfect match
        return tree;
      } else if (end < myStart) {
        return null;
      } else if (start >= myEnd) {
        return null;
      }
    }

    // otherwise, check inside children - a match is possible
    for (Tree kid : tree.children()) {
      if (kid == null) continue;
      Tree ret = findTreeWithSpan(kid, start, end);
      // found matching child
      if (ret != null) return ret;
    }

    // no match
    return null;
  }

  /** Filter out all spurious mentions */
  protected static void removeSpuriousMentions(CoreMap s, List<Mention> mentions, Dictionaries dict) {
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Set<Mention> remove = Generics.newHashSet();

    for(Mention m : mentions){
      String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      // pleonastic it
      if(isPleonastic(m, tree)) remove.add(m);

      // non word such as 'hmm'
      if(dict.nonWords.contains(m.headString)) remove.add(m);

      // quantRule : not starts with 'any', 'all' etc
      if (m.originalSpan.size() > 0 &&
          dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH))) remove.add(m);

      // partitiveRule
      if (partitiveRule(m, sent, dict)) remove.add(m);

      // bareNPRule
      if (headPOS.equals("NN") && !dict.temporals.contains(m.headString)
          && (m.originalSpan.size()==1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) {
        remove.add(m);
      }

      // remove generic rule
      //  if(m.generic==true) remove.add(m);

      if (m.headString.equals("%")) remove.add(m);
      if (headNE.equals("PERCENT") || headNE.equals("MONEY")) remove.add(m);

      // adjective form of nations
      if (dict.isAdjectivalDemonym(m.spanToString())) remove.add(m);

      // stop list (e.g., U.S., there)
      if (inStopList(m)) remove.add(m);
    }

    // nested mention with shared headword (except apposition, enumeration): pick larger one
    for (Mention m1 : mentions){
      for (Mention m2 : mentions){
        if (m1==m2 || remove.contains(m1) || remove.contains(m2)) continue;
        if (m1.sentNum==m2.sentNum && m1.headWord==m2.headWord && m2.insideIn(m1)) {
          // keep the inner mention if it is followed by "," or CC (apposition/enumeration)
          if (m2.endIndex < sent.size() && (sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",")
              || sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) {
            continue;
          }
          remove.add(m2);
        }
      }
    }
    mentions.removeAll(remove);
  }

  /** Hand-built stop list of spans that are never useful mentions. */
  private static boolean inStopList(Mention m) {
    String mentionSpan = m.spanToString().toLowerCase(Locale.ENGLISH);
    if (mentionSpan.equals("u.s.") || mentionSpan.equals("u.k.")
        || mentionSpan.equals("u.s.s.r")) return true;
    if (mentionSpan.equals("there") || mentionSpan.startsWith("etc.")
        || mentionSpan.equals("ltd.")) return true;
    if (mentionSpan.startsWith("'s ")) return true;
    if (mentionSpan.endsWith("etc.")) return true;

    return false;
  }

  /** True if the mention is the complement of a partitive ("a group of X", "some of X"). */
  private static boolean partitiveRule(Mention m, List<CoreLabel> sent, Dictionaries dict) {
    return m.startIndex >= 2 &&
            sent.get(m.startIndex - 1).get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("of") &&
            dict.parts.contains(sent.get(m.startIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH));
  }

  /** Check whether pleonastic 'it'. E.g., It is possible that ... */
  private static final TregexPattern[] pleonasticPatterns = getPleonasticPatterns();

  private static boolean isPleonastic(Mention m, Tree tree) {
    if ( ! m.spanToString().equalsIgnoreCase("it")) return false;
    for (TregexPattern p : pleonasticPatterns) {
      if (checkPleonastic(m, tree, p)) {
        // SieveCoreferenceSystem.logger.fine("RuleBasedCorefMentionFinder: matched pleonastic pattern '" + p + "' for " + tree);
        return true;
      }
    }
    return false;
  }

  private static TregexPattern[] getPleonasticPatterns() {
    final String[] patterns = {
            // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
            // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
            //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
            "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))", // in practice, go with this one (best results)

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....

            // these next 5 had buggy space in "$ ..", which I fixed
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
            // certain can be either but relatively likely pleonastic with it ... be
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones

            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
    };

    TregexPattern[] tgrepPatterns = new TregexPattern[patterns.length];

    for (int i = 0; i < tgrepPatterns.length; i++) {
      tgrepPatterns[i] = TregexPattern.compile(patterns[i]);
    }

    return tgrepPatterns;
  }

  /** True if the mention's head token is the PRP matched as m1 by the pleonastic pattern. */
  private static boolean checkPleonastic(Mention m, Tree tree, TregexPattern tgrepPattern) {
    try {
      TregexMatcher matcher = tgrepPattern.matcher(tree);
      while (matcher.find()) {
        Tree np1 = matcher.getNode("m1");
        if (((CoreLabel)np1.label()).get(CoreAnnotations.BeginIndexAnnotation.class)+1 == m.headWord.get(CoreAnnotations.IndexAnnotation.class)) {
          return true;
        }
      }
    } catch (Exception e) {
      // NOTE(review): best-effort match — a Tregex failure just means "not pleonastic";
      // consider routing this through the logger instead of stderr.
      e.printStackTrace();
    }
    return false;
  }

}