//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
 * Find groups of entities of the same type and same value, ignoring case and possessive apostrophes, and point them at the same ReferenceTarget.
 *
 * <p>If two or more existing ReferenceTargets exist within a group, then the group is either ignored
 * or its ReferenceTargets are merged, depending on the mergeReferents parameter.
 * When merging, every entity in the document that referred to one of the merged ReferenceTargets
 * (not only the members of the group) is re-pointed at the replacement ReferenceTarget.
 * If there is one existing ReferenceTarget within a group then that is used as the ReferenceTarget for all entities in that group.</p>
 *
 * <p>Entities whose value (or covered text) is missing or empty after normalisation are not grouped.</p>
 *
 * @baleen.javadoc
 */
public class CorefCapitalisationAndApostrophe extends BaleenAnnotator {
  /**
   * If two entities are thought to be coreferences, but they have different existing reference targets, should we merge them?
   *
   * @baleen.config false
   */
  public static final String PARAM_MERGE_REFERENTS = "mergeReferents";

  @ConfigurationParameter(name = PARAM_MERGE_REFERENTS, defaultValue = "false")
  boolean mergeReferents = false;

  /**
   * Groups entities by type and normalised value, then assigns a shared ReferenceTarget to every
   * group that has more than one member.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    // LinkedHashMap keeps groups in first-seen order so that ReferenceTargets are created
    // in a reproducible order between runs.
    Map<String, List<Entity>> groups = new LinkedHashMap<>();

    for (Entity entity : JCasUtil.select(jCas, Entity.class)) {
      String value = getEntityValue(entity);
      if (value == null) {
        continue;
      }

      String normalized = normalizeValue(value);
      if (normalized.isEmpty()) {
        // Entities without any text carry no evidence that they refer to the same thing.
        continue;
      }

      String key = entity.getType().getName().toUpperCase(Locale.ROOT) + "::" + normalized;
      groups.computeIfAbsent(key, k -> new ArrayList<>()).add(entity);
    }

    for (List<Entity> group : groups.values()) {
      if (group.size() <= 1) {
        continue;
      }

      // Collected per group, after earlier groups have been processed, so that any
      // referents replaced by an earlier merge are already reflected here.
      Set<ReferenceTarget> rts = new HashSet<>();
      for (Entity e : group) {
        if (e.getReferent() != null) {
          rts.add(e.getReferent());
        }
      }

      ReferenceTarget rt = selectAppropriateReferenceTarget(jCas, rts);
      if (rt == null) {
        getMonitor().info("Unable to coreference capitalised entities '{}' as they have different existing referents", getEntityValue(group.get(0)));
      } else {
        for (Entity e : group) {
          e.setReferent(rt);
        }
      }
    }
  }

  /**
   * Returns the entity's value, falling back to its covered text when the value is null or empty.
   *
   * @param e the entity, may be null
   * @return the value or covered text; null if the entity is null
   */
  private String getEntityValue(Entity e) {
    if (e == null) {
      return null;
    }

    String val = e.getValue();
    if (Strings.isNullOrEmpty(val)) {
      val = e.getCoveredText();
    }
    return val;
  }

  /**
   * Normalises a value for comparison: trims it, treats the typographic apostrophe (U+2019) as a
   * plain apostrophe, lower-cases it independently of the default locale, and removes a trailing
   * possessive ("'s" is dropped entirely, "s'" loses only the apostrophe).
   *
   * @param value the raw value, must not be null
   * @return the normalised value
   */
  private String normalizeValue(String value) {
    // Locale.ROOT keeps the grouping key stable regardless of the JVM's default locale.
    String s = value.trim().replace('\u2019', '\'').toLowerCase(Locale.ROOT);

    if (s.endsWith("'s")) {
      s = s.substring(0, s.length() - 2);
    } else if (s.endsWith("s'")) {
      s = s.substring(0, s.length() - 1);
    }
    return s;
  }

  /**
   * Chooses the ReferenceTarget a group should share.
   *
   * <ul>
   * <li>No existing targets: a new one is created and indexed.</li>
   * <li>Exactly one existing target: it is reused.</li>
   * <li>Several existing targets and mergeReferents is set: a new target is created, every entity
   * in the document that referred to one of the existing targets is re-pointed at it, and the
   * existing targets are removed from the index.</li>
   * <li>Several existing targets and mergeReferents is not set: null is returned.</li>
   * </ul>
   *
   * @param jCas the document being processed
   * @param referenceTargets the distinct existing targets found in the group
   * @return the target to use, or null if the group should be left untouched
   */
  private ReferenceTarget selectAppropriateReferenceTarget(JCas jCas, Collection<ReferenceTarget> referenceTargets) {
    if (referenceTargets.isEmpty()) {
      return createReferenceTarget(jCas);
    }

    if (referenceTargets.size() == 1) {
      return referenceTargets.iterator().next();
    }

    if (!mergeReferents) {
      return null;
    }

    ReferenceTarget merged = createReferenceTarget(jCas);

    // Entities outside the current group may also point at the targets being replaced;
    // re-point them too so nothing is left referring to a target that is no longer indexed.
    List<Entity> allEntities = new ArrayList<>(JCasUtil.select(jCas, Entity.class));
    for (Entity entity : allEntities) {
      ReferenceTarget existing = entity.getReferent();
      if (existing != null && referenceTargets.contains(existing)) {
        entity.setReferent(merged);
      }
    }

    removeFromJCasIndex(referenceTargets);
    return merged;
  }

  /**
   * Creates a ReferenceTarget spanning the whole document text and adds it to the index.
   *
   * @param jCas the document being processed
   * @return the newly created target
   */
  private ReferenceTarget createReferenceTarget(JCas jCas) {
    String text = jCas.getDocumentText();

    ReferenceTarget rt = new ReferenceTarget(jCas);
    rt.setBegin(0);
    rt.setEnd(text == null ? 0 : text.length());
    addToJCasIndex(rt);
    return rt;
  }

  /**
   * Declares Entity as the first set passed to the action and an empty second set.
   *
   * <p>NOTE(review): this annotator creates ReferenceTarget annotations; if the second argument
   * lists produced types, confirm whether ReferenceTarget.class should be declared there.</p>
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(ImmutableSet.of(Entity.class), Collections.emptySet());
  }
}