//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Tries to find instances where a single location may have been split into
* several entities, and collapse them into a single entity.
*
* Examples are:
* <ul>
* <li>The Iraq-Syria border</li>
* <li>The city of Mosul</li>
* </ul>
*/
public class CollapseLocations extends BaleenAnnotator{
    /** Lower-case keyword that must end, or directly follow, the second location of a hyphenated pair. */
    private static final String BORDER = "border";

    /**
     * Runs the two collapsing passes in order: hyphenated borders first,
     * then "X of Y" constructions.
     *
     * @param jCas the document being processed
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
        collapseBorders(jCas);
        collapseOf(jCas);
    }

    /**
     * Looks at every Location together with the Location that immediately
     * follows it and hands the pair to {@link #checkBorders} to be merged
     * when they form a hyphenated border (e.g. "Iraq-Syria border").
     *
     * @param jCas the document being processed
     */
    private void collapseBorders(JCas jCas){
        // Iterate over a copy: merging changes the Location index, and
        // modifying an index while iterating over it is not well defined.
        List<Location> snapshot = new ArrayList<>(JCasUtil.select(jCas, Location.class));

        // Locations that have already been folded into a merged annotation.
        // NOTE(review): assumes mergeWithNew removes the merged annotations
        // from the index - confirm against the inherited helper.
        Set<Location> consumed = new HashSet<>();

        for(Location l1 : snapshot){
            if(consumed.contains(l1)){
                continue;
            }

            List<Location> followingLocs = JCasUtil.selectFollowing(jCas, Location.class, l1, 1);
            if(followingLocs.isEmpty()){
                continue;
            }

            Location l2 = followingLocs.get(0);
            if(checkBorders(jCas, l1, l2)){
                consumed.add(l1);
                consumed.add(l2);
            }
        }
    }

    /**
     * Merges two consecutive Locations into one new Location when they are
     * separated only by a hyphen (ignoring surrounding whitespace) and the
     * word "border" (case-insensitive) either ends the second location or is
     * the next thing after it in the document text. In the latter case the
     * new Location is extended to cover the word "border" as well.
     *
     * <p>Only the prefix is tested, so text such as "borders" also matches
     * and the new annotation then ends after "border".</p>
     *
     * @param jCas the document being processed
     * @param l1 the first (left-hand) location
     * @param l2 the location immediately following {@code l1}
     * @return {@code true} if the pair was merged, {@code false} otherwise
     */
    private boolean checkBorders(JCas jCas, Location l1, Location l2){
        String text = jCas.getDocumentText();

        // Cheap test first: most adjacent pairs are not joined by a hyphen.
        String betweenText = text.substring(l1.getEnd(), l2.getBegin());
        if(!"-".equals(betweenText.trim())){
            return false;
        }

        int mergedEnd;
        String covered = l2.getCoveredText();
        if(covered.regionMatches(true, covered.length() - BORDER.length(), BORDER, 0, BORDER.length())){
            // "border" is already part of the second location
            mergedEnd = l2.getEnd();
        }else{
            // Skip the same characters String.trim() would strip, then look
            // for "border" in place instead of lower-casing the remainder of
            // the document.
            int wordStart = l2.getEnd();
            while(wordStart < text.length() && text.charAt(wordStart) <= ' '){
                wordStart++;
            }
            if(!text.regionMatches(true, wordStart, BORDER, 0, BORDER.length())){
                return false;
            }
            mergedEnd = wordStart + BORDER.length();
        }

        Location l = new Location(jCas, l1.getBegin(), mergedEnd);
        mergeWithNew(l, l1, l2);
        return true;
    }

    /**
     * Collapses "X of Y" constructions (e.g. "city of Mosul"): when two
     * consecutive Locations are separated only by the word "of", the second
     * one is extended backwards to the start of the first, its value is
     * reset to the newly covered text, and the first is merged into it.
     *
     * @param jCas the document being processed
     */
    private void collapseOf(JCas jCas){
        String text = jCas.getDocumentText();

        // Iterate over a copy: merging changes the Location index, and
        // modifying an index while iterating over it is not well defined.
        List<Location> snapshot = new ArrayList<>(JCasUtil.select(jCas, Location.class));

        for(Location l1 : snapshot){
            List<Location> followingLocs = JCasUtil.selectFollowing(jCas, Location.class, l1, 1);
            if(followingLocs.isEmpty()){
                continue;
            }

            Location l2 = followingLocs.get(0);
            String betweenText = text.substring(l1.getEnd(), l2.getBegin());
            if("of".equals(betweenText.trim())){
                // NOTE(review): setBegin changes an index key of an annotation
                // that is presumably still indexed; this relies on UIMA
                // re-indexing it - confirm for the UIMA version in use.
                l2.setBegin(l1.getBegin());
                l2.setValue(l2.getCoveredText());
                mergeWithExisting(l2, l1);
            }
        }
    }

    /**
     * Describes this annotator to the pipeline orderer; both sets passed to
     * {@link AnalysisEngineAction} contain only {@link Location}.
     * NOTE(review): presumably the first set is the inputs and the second
     * the outputs - confirm against AnalysisEngineAction.
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(ImmutableSet.of(Location.class), ImmutableSet.of(Location.class));
    }
}