//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;
import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Identify entities that include brackets, and split them into separate coreferenced entities
*/
public class SplitBrackets extends BaleenAnnotator {
private static final Pattern ENDS_WITH_BRACKET = Pattern.compile("\\s*\\(([^\\(\\)]*?)\\)$");
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
Collection<Entity> entities = JCasUtil.select(jCas, Entity.class);
for(Entity e : entities){
String text = e.getCoveredText();
ReferenceTarget rt = e.getReferent();
if(rt == null){
rt = new ReferenceTarget(jCas);
e.setReferent(rt);
}
Matcher m = ENDS_WITH_BRACKET.matcher(text);
while(m.find()){
//Split bracket off
Entity eBracket = null;
try {
eBracket = e.getClass().getConstructor(JCas.class).newInstance(jCas);
eBracket.setBegin(e.getBegin() + m.start(1));
eBracket.setEnd(e.getBegin() + m.end(1));
eBracket.setReferent(rt);
addToJCasIndex(eBracket);
} catch (Exception ex) {
getMonitor().error("Unable to create new entity of class {}", e.getClass().getName(), ex);
}
text = text.substring(0, m.start());
m = ENDS_WITH_BRACKET.matcher(text);
}
if(text.length() != e.getCoveredText().length()){
e.setEnd(e.getBegin() + text.length());
e.setValue(e.getCoveredText());
}
}
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(Entity.class), ImmutableSet.of(Entity.class));
}
}