//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.uima.jcas.JCas;
import com.google.common.primitives.Doubles;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
/**
* Sieve based on exact matching of the head word.
*/
public class ProperHeadMatchSieve extends AbstractCoreferenceSieve {
private static final Pattern NUMBER = Pattern.compile("-?\\d+(,\\d+)*(\\.\\d+)?[k|m|b]?", Pattern.CASE_INSENSITIVE);
private final Set<String> spatialModifiers = new HashSet<String>(
Arrays.asList("northern", "southern", "western", "eastern", "south", "east", "north", "west",
"central", "upper", "lower", "middle", "inner", "outer"));
/**
* Constructor for ProperHeadMatchSieve
*/
public ProperHeadMatchSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions) {
super(jCas, clusters, mentions);
}
@Override
public void sieve() {
// Note: Head must be proper nouns, but ours are by construction
List<Mention> mentions = getMentionsWithHead(MentionType.ENTITY, MentionType.NP);
for (int i = 0; i < mentions.size(); i++) {
final Mention a = mentions.get(i);
String aHead = a.getHead().toLowerCase();
for (int j = i + 1; j < mentions.size(); j++) {
final Mention b = mentions.get(j);
String bHead = b.getHead().toLowerCase();
if (aHead.equals(bHead) && shouldAddMentionsToCluster(a, b)){
addToCluster(a, b);
}
}
}
}
private boolean hasSameModifiers(Mention a, Mention b) {
// TODO: The paper says location named entities, other proper nouns or other spatial
// modifiers but since locations should be other proper nouns we ignore that clause. We
// could look for Locations covered by the annotation.
final Set<String> aModifiers = getSpatialAndPNModifier(a);
final Set<String> bModifiers = getSpatialAndPNModifier(b);
return aModifiers.size() == bModifiers.size() && aModifiers.containsAll(bModifiers);
}
private Set<String> getSpatialAndPNModifier(Mention a) {
return a.getWords().stream()
.filter(w -> w.getPartOfSpeech().startsWith("NP") || spatialModifiers.contains(w.getCoveredText()))
.map(w -> w.getCoveredText().toLowerCase())
.collect(Collectors.toSet());
}
// Asymetric
private List<Double> extractNumbers(String text) {
final List<Double> list = new LinkedList<>();
final Matcher matcher = NUMBER.matcher(text);
while (matcher.find()) {
final Double d = Doubles.tryParse(matcher.group().replaceAll(",", ""));
if (d != null) {
list.add(d);
}
}
return list;
}
// Asymetric
private boolean hasSameNumbers(Collection<Double> aNumbers, Collection<Double> bNumbers) {
for (final double b : bNumbers) {
boolean found = false;
for (final double a : aNumbers) {
// 'Fuzzy match' the numbers
if (Math.abs(a - b) < 0.01 * Math.max(Math.abs(a), Math.abs(a))) {
found = true;
break;
}
}
if (!found) {
return false;
}
}
return true;
}
private boolean shouldAddMentionsToCluster(Mention a, Mention b){
// Not i-within-i
if (a.overlaps(b)) {
return false;
}
// No modifier
if (!hasSameModifiers(a, b)) {
return false;
}
// No numerical mismatches
final List<Double> aNumbers = extractNumbers(a.getText());
final List<Double> bNumbers = extractNumbers(b.getText());
if (!hasSameNumbers(aNumbers, bNumbers)) {
return false;
}
return true;
}
}