//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Nationality;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
* Merge Nationality entities into an adjacent following entity of any time.
* For instance [British] [Prime Minister Theresa May] would become
* [British Prime Minister Theresa May].
*/
public class MergeNationalityIntoEntity extends BaleenTextAwareAnnotator {
@Override
protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
Collection<Nationality> nationalities = block.select(Nationality.class);
if(nationalities.isEmpty())
return;
List<Entity> entities = block.select(Entity.class).stream().filter(e -> !e.getClass().equals(Nationality.class)).collect(Collectors.toList());
for(Nationality n : nationalities){
mergeEntities(block, n, entities);
}
}
private void mergeEntities(TextBlock block, Nationality n, List<Entity> entities){
for(Entity e : entities){
if(e.getBegin() < n.getEnd())
continue;
String between = block.getDocumentText().substring(n.getEnd(), e.getBegin());
if(between.trim().isEmpty()){
e.setBegin(n.getBegin());
e.setValue(e.getCoveredText());
mergeWithExisting(e, n);
return;
}
}
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(Nationality.class, Entity.class), Collections.emptySet());
}
}