//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex.helpers;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import uk.gov.dstl.baleen.types.Base;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
 * An abstract base class for building regex-based annotators.
 *
 * <p>Subclasses implement {@link #create(JCas, Matcher)} and pass the regex definition to one of
 * the constructors. For every match found in a text block, the annotation returned by
 * {@code create} is given its offsets (taken from the configured matcher group), a confidence
 * (if it is a {@link Base}) and a value (if it is an {@link Entity}), and is then passed to
 * {@code addToJCasIndex}.
 *
 * @param <T> the type of entity produced.
 */
public abstract class AbstractRegexAnnotator<T extends Annotation> extends BaleenTextAwareAnnotator {

  /** The compiled pattern that is run against the text of every block. */
  private final Pattern pattern;

  /** The confidence assigned to every created annotation that is a {@link Base}. */
  private final double confidence;

  /** The index of the matcher group that supplies the annotation span and value (0 = whole match). */
  private final int matcherGroup;

  /**
   * New instance, based on the supplied pattern. Uses the whole matched regex as the entity text.
   *
   * @param pattern the regex pattern
   * @param caseSensitive true to match case sensitively; false compiles the pattern with
   *     {@link Pattern#CASE_INSENSITIVE}
   * @param confidence the confidence to assign to created entities.
   */
  protected AbstractRegexAnnotator(String pattern, boolean caseSensitive, double confidence) {
    this(Pattern.compile(pattern, caseSensitive ? 0 : Pattern.CASE_INSENSITIVE), 0, confidence);
  }

  /**
   * New instance, based on a regex pattern. Uses the whole matched regex as the entity text.
   *
   * @param pattern the regex pattern
   * @param confidence the confidence to assign to created entities.
   */
  protected AbstractRegexAnnotator(Pattern pattern, double confidence) {
    this(pattern, 0, confidence);
  }

  /**
   * New instance, based on the supplied pattern.
   *
   * @param pattern the regex pattern
   * @param matcherGroup the matcher group to use as the content of the entity
   * @param caseSensitive true to match case sensitively; false compiles the pattern with
   *     {@link Pattern#CASE_INSENSITIVE}
   * @param confidence the confidence to assign to created entities.
   */
  protected AbstractRegexAnnotator(
      String pattern, int matcherGroup, boolean caseSensitive, double confidence) {
    this(
        Pattern.compile(pattern, caseSensitive ? 0 : Pattern.CASE_INSENSITIVE),
        matcherGroup,
        confidence);
  }

  /**
   * New instance, based on a regex pattern.
   *
   * @param pattern the regex pattern
   * @param matcherGroup the matcher group to use as the content of the entity
   * @param confidence the confidence to assign to created entities.
   */
  protected AbstractRegexAnnotator(Pattern pattern, int matcherGroup, double confidence) {
    this.pattern = pattern;
    this.matcherGroup = matcherGroup;
    this.confidence = confidence;
  }

  /**
   * Create an entity, using the information from the matcher.
   *
   * <p>Note the implementor does not need to set the offset, confidence, or add to the jcas;
   * {@link #doProcessTextBlock(TextBlock)} does that for every non-null result.
   *
   * @param jCas the jcas being processed
   * @param matcher matcher (from the pattern supplied in the constructor)
   * @return an instance, or null if this is not a valid match.
   */
  protected abstract T create(JCas jCas, Matcher matcher);

  /**
   * Runs the pattern over the covered text of the block and indexes one annotation per valid
   * match.
   *
   * <p>A match is skipped when {@link #create(JCas, Matcher)} returns null, or when the
   * configured matcher group did not take part in the match (for example an optional group),
   * because such a match has no span or text to annotate.
   *
   * @param block the text block to process
   * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly
   *     by this implementation
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    String text = block.getCoveredText();
    Matcher matcher = pattern.matcher(text);

    while (matcher.find()) {
      Annotation a = create(block.getJCas(), matcher);
      if (a == null) {
        continue;
      }

      // Matcher.start(int) returns -1 when the group exists but did not participate in this
      // match; end(int) would also be -1 and group(int) would be null. Using those values would
      // index an annotation with a bogus span and a null value, so drop the match instead.
      int begin = matcher.start(matcherGroup);
      if (begin < 0) {
        continue;
      }

      // Offsets here are relative to the block's covered text; setBeginAndEnd is presumably
      // responsible for mapping them onto the document (not visible from this class).
      block.setBeginAndEnd(a, begin, matcher.end(matcherGroup));

      if (a instanceof Base) {
        ((Base) a).setConfidence(confidence);
      }

      if (a instanceof Entity) {
        ((Entity) a).setValue(matcher.group(matcherGroup));
      }

      addToJCasIndex(a);
    }
  }
}