//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.coreference.impl.sieves; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.uima.jcas.JCas; import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster; import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention; import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType; /** * Coreference based on exact matching. * * See 3.3.3 Pass 3. */ public class ExactStringMatchSieve extends AbstractCoreferenceSieve { private static final Set<String> EXCLUDED = new HashSet<>(Arrays.asList("that", "there")); /** * Constructor for ExactStringMatchSieve */ public ExactStringMatchSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions) { super(jCas, clusters, mentions); } @Override public void sieve() { List<Mention> mentions = getMentions(MentionType.ENTITY, MentionType.NP).stream() .filter(m -> m.getHead() != null) .filter(m -> !EXCLUDED.contains(m.getHead().toLowerCase())) .collect(Collectors.toList()); for (int i = 0; i < mentions.size(); i++) { final Mention a = mentions.get(i); final String aText = a.getText(); for (int j = i + 1; j < mentions.size(); j++) { final Mention b = mentions.get(j); final String bText = b.getText(); if (aText.equalsIgnoreCase(bText)) { addToCluster(a, b); } } } } }