//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
 * Creates additional entity annotations wherever the covered text of an existing entity is
 * mentioned again in the text.
 * <p>
 * This is useful when a model is used (rather than a regex) and it only finds a subset of the
 * mentions in a document.
 * <p>
 * Each new annotation is of the same class as the entity it was derived from, copies that entity's
 * value and shares its {@link ReferenceTarget} (one is created if the original has none).
 * <p>
 * If an annotation of the same type already exists on the covering text then another is not added.
 *
 * @baleen.javadoc
 */
public class MentionedAgain extends BaleenTextAwareAnnotator {

	/**
	 * Should comparisons be done case sensitively?
	 *
	 * @baleen.config true
	 */
	public static final String PARAM_CASE_SENSITIVE = "caseSensitive";
	@ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue = "true")
	protected boolean caseSensitive;

	/**
	 * Searches the text of the given block for repeated mentions of every distinct (type, value)
	 * entity found in the block's JCas, and annotates each mention that is not already covered by
	 * an entity of the same type.
	 *
	 * @param block the text block whose covered text is searched
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	@Override
	protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
		// We look through the JCas for the entities, but we only look for matches in this block
		String text = block.getCoveredText();

		Collection<Entity> list = JCasUtil.select(block.getJCas(), Entity.class);

		Set<String> existingSpans = new HashSet<>(list.size());
		Set<String> existingEntities = new HashSet<>();
		Set<Entity> entities = new HashSet<>();

		for (Entity e : list) {
			existingSpans.add(spanKey(e.getBegin(), e.getEnd(), e.getTypeName()));

			if (existingEntities.add(e.getTypeName() + "//" + e.getValue())) {
				// Only add entities of a new type and value
				entities.add(e);
			}
		}

		int flags = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;

		for (Entity e : entities) {
			String coveredText = e.getCoveredText();
			if (coveredText == null || coveredText.isEmpty()) {
				// An empty literal would match at every word boundary and produce zero-length
				// entities, so there is nothing meaningful to search for
				continue;
			}

			Pattern pattern = Pattern.compile("\\b" + Pattern.quote(coveredText) + "\\b", flags);
			Matcher matcher = pattern.matcher(text);
			while (matcher.find()) {
				foundMatch(block, matcher, e, existingSpans);
			}
		}
	}

	/**
	 * Creates a new entity for the current match unless an entity of the same type is already
	 * recorded on that span.
	 *
	 * @param block the block the matcher was run against
	 * @param matcher the matcher, positioned on the match to annotate
	 * @param e the entity whose covered text was matched; its class, value and referent are reused
	 * @param existingSpans keys of the spans already annotated; the span of a newly created entity
	 *        is added so that it is not annotated a second time
	 */
	private void foundMatch(TextBlock block, Matcher matcher, Entity e, Set<String> existingSpans) {
		// Matcher offsets are relative to the block text, whereas the keys in existingSpans are
		// built from entity begin/end offsets, so convert before comparing
		int begin = block.toDocumentOffset(matcher.start());
		int end = block.toDocumentOffset(matcher.end());

		String key = spanKey(begin, end, e.getTypeName());
		if (existingSpans.contains(key)) {
			return;
		}

		try {
			// Instantiate the same concrete entity class as the original via its JCas constructor
			Entity newEntity = e.getClass().getConstructor(JCas.class).newInstance(block.getJCas());

			newEntity.setBegin(begin);
			newEntity.setEnd(end);
			newEntity.setValue(e.getValue());

			ReferenceTarget rt = e.getReferent();
			if (rt == null) {
				// The original has no referent yet, so create one for both entities to share
				rt = new ReferenceTarget(block.getJCas());
				addToJCasIndex(rt);
				e.setReferent(rt);
			}
			newEntity.setReferent(rt);

			addToJCasIndex(newEntity);

			// Remember the span so another entity of the same type with the same covered text
			// does not annotate it again
			existingSpans.add(key);
		} catch (Exception ex) {
			getMonitor().warn("Unable to create new entitiy", ex);
		}
	}

	/**
	 * Builds the key used to record that a span is already annotated with a given type.
	 *
	 * @param begin begin offset of the span
	 * @param end end offset of the span
	 * @param typeName type name of the annotation
	 * @return the key for the span and type
	 */
	private static String spanKey(int begin, int end, String typeName) {
		return begin + "//" + end + "//" + typeName;
	}

	/**
	 * Returns an action in which {@link Entity} is the only annotation type in both type sets
	 * passed to {@link AnalysisEngineAction}.
	 */
	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(ImmutableSet.of(Entity.class), ImmutableSet.of(Entity.class));
	}
}