//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.Sets;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.uima.grammar.ParseTree;
import uk.gov.dstl.baleen.uima.grammar.ParseTreeNode;
/**
* Sieves based on very specific (precise) rules.
* <p>
* Includes acronyms or certain constructs like "Prime Minister, Tony Blair".
* <p>
* Our parser, OpenNlp, does not output (,) so we need to do a manual check for that.
*
*/
public class PreciseConstructsSieve extends AbstractCoreferenceSieve {
private static final Predicate<WordToken> CONJUNCTION_FILTER = w -> "CC".equals(w.getPartOfSpeech());
private static final Pattern COMMA = Pattern.compile("\\s*,\\s*");
private final ParseTree parseTree;
/**
* Constructor for PreciseConstructsSieve
*/
public PreciseConstructsSieve(JCas jCas, ParseTree parseTree, List<Cluster> clusters, List<Mention> mentions) {
super(jCas, clusters, mentions);
this.parseTree = parseTree;
}
@Override
public void sieve() {
parseTree.traverseChildren(children -> {
for (int i = 0; i < children.size() - 1; i++) {
final ParseTreeNode a = children.get(i);
final ParseTreeNode b = children.get(i + 1);
if ("NP".equals(a.getChunk().getChunkType()) && "NP".equals(b.getChunk().getChunkType())) {
// Appositive - look for two NP chunks
doAppositive(a, b);
}else if ("NP".equals(a.getChunk().getChunkType()) && "VP".equals(b.getChunk().getChunkType())) {
// Predicate nominative - (NP VP(is / was) ) then take the NP under VP as
doPredicateNominative(a, b);
}else if ("NP".equals(a.getChunk().getChunkType()) && "WHNP".equals(b.getChunk().getChunkType())) {
// Relative pronoun
doRelativePronoun(a, b);
}
}
});
// TODO: Role appositive - slightly unclear how this is used. I guess its the "The actress
// Rachel is in the show. The actress plays a single role"
// Which is an the import anamorphic relation. However in that example "actress" is not
// found as a NP / Entity in baleen. Perhaps we should create a
// role annotation and then use that? (effective look for ROLE PERSON to fulfil this rule)
// Acronym
// The implement here depends on the acronym generator
doAcronym();
// Denoymns: Nationality - Country
// We are fortunate that we have Nationality and Location entities, and we already have the
// existing
// NationalityToLocation annotator, so this is not required.
}
private void doAppositive(ParseTreeNode a, ParseTreeNode b){
// Is there a comma between them, without AND/BUT/ETC
// Not in paper: Need to see if there's an AND in the larger noun phrase, eg
// Police, Fire and Ambulance (will get police-fire at the moment)
final String between = getJCas().getDocumentText().substring(a.getChunk().getEnd(),
b.getChunk().getBegin());
final ParseTreeNode parent = a.getParent();
// Special case there if there's its a location "London, UK" will match
// but we don't want it too. Probably need both the a and b to have a location
// before its wrong. Of course these depend on the quality of the entity
// extraction.
boolean notCoversLocation = !coversLocation(a) || !coversLocation(b);
if (COMMA.matcher(between).matches() && !parent.containsWord(CONJUNCTION_FILTER) && notCoversLocation) {
addCoveredToCluster(a.getChunk(), b.getChunk());
}
}
private void doPredicateNominative(ParseTreeNode a, ParseTreeNode b){
final Optional<ParseTreeNode> np = b.getChildren().stream()
.filter(n -> "NP".equals(n.getChunk().getChunkType()))
.findFirst();
final Optional<WordToken> is = b.getWords().stream()
.filter(w -> "is".equalsIgnoreCase(w.getCoveredText()))
.findFirst();
if (np.isPresent() && is.isPresent()) {
addCoveredToCluster(a.getChunk(), np.get().getChunk());
}
}
private void doRelativePronoun(ParseTreeNode a, ParseTreeNode b){
// The NP could be something that interests us, or it could a subpart of a large
// NP.
final List<Mention> mention = findMentionsExactly(a.getChunk().getBegin(), a.getChunk().getEnd());
final List<Mention> pronoun = findMentionsExactly(b.getChunk().getBegin(), b.getChunk().getEnd());
addPairwiseToCluster(mention, pronoun);
}
private void doAcronym(){
for (int i = 0; i < getMentions().size(); i++) {
final Mention a = getMentions().get(i);
final Set<String> aAcronyms = a.getAcronyms();
for (int j = i + 1; j < getMentions().size(); j++) {
final Mention b = getMentions().get(j);
final Set<String> bAcronyms = b.getAcronyms();
if (aAcronyms != null && bAcronyms != null && b.isAcronym() != a.isAcronym()
&& !Sets.intersection(aAcronyms, bAcronyms).isEmpty()) {
addToCluster(a, b);
}
}
}
}
private boolean coversLocation(ParseTreeNode a) {
return findMentionsUnder(a.getChunk().getBegin(), a.getChunk().getEnd())
.stream()
.anyMatch(m -> m.getAnnotation() instanceof Location);
}
}