//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ConfigUtils;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;
/**
* Process the document using user supplied regular expressions and annotating matches as a user specified type
*
* <p>The regular expression supplied by the user is run over the document content. Matches are annotated as a user specified type, which must inherit from the Entity class.
* Users can supply a confidence to assign to annotations created by this RegEx.</p>
*
*
* @baleen.javadoc
*/
public class Custom extends BaleenTextAwareAnnotator {
private Pattern p = null;
private Class<? extends Entity> et = null;
/**
* Is the regular expression case sensitive?
*
* @baleen.config false
*/
public static final String PARAM_CASE_SENSITIVE = "caseSensitive";
@ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue="false")
private boolean caseSensitive = false;
/**
* Which group in the regular expression should be used as the entity value?
*
* @baleen.config 0
*/
public static final String PARAM_GROUP = "group";
@ConfigurationParameter(name = PARAM_GROUP, defaultValue="0")
private String patternGroupString;
//Parse the patternGroup config parameter into this variable to avoid issues with parameter types
private int patternGroup;
/**
* The regular expression to search for
*
* @baleen.config
*/
public static final String PARAM_PATTERN = "pattern";
@ConfigurationParameter(name = PARAM_PATTERN, defaultValue = "")
private String pattern;
/**
* The entity type to use for matched entities
*
* @baleen.config uk.gov.dstl.baleen.types.semantic.Entity
*/
public static final String PARAM_TYPE = "type";
@ConfigurationParameter(name = PARAM_TYPE, defaultValue="uk.gov.dstl.baleen.types.semantic.Entity")
private String type = "uk.gov.dstl.baleen.types.semantic.Entity";
/**
* The entity subType to use for matched entities
*
* @baleen.config
*/
public static final String PARAM_SUB_TYPE = "subType";
@ConfigurationParameter(name = PARAM_SUB_TYPE, defaultValue="")
private String subType = "";
/**
* The confidence to assign to matched entities
*
* @baleen.config 1.0
*/
public static final String PARAM_CONFIDENCE = "confidence";
@ConfigurationParameter(name = PARAM_CONFIDENCE, defaultValue="1.0")
private String confidenceString;
//Parse the confidence config parameter into this variable to avoid issues with parameter types
private Float confidence;
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
patternGroup = ConfigUtils.stringToInteger(patternGroupString, 0);
confidence = ConfigUtils.stringToFloat(confidenceString, 1.0f);
if(caseSensitive){
p = Pattern.compile(pattern);
getMonitor().debug("The regular expression is \"{}\"", p.pattern());
}else{
p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
}
try{
et = TypeUtils.getEntityClass(type, JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance()));
}catch(UIMAException | BaleenException e){
throw new ResourceInitializationException(e);
}
}
@Override
public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
String text = block.getCoveredText();
Matcher m = p.matcher(text);
while(m.find()){
Entity ret;
try {
ret = block.newAnnotation(et, m.start(), m.end());
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
ret.setValue(m.group(patternGroup));
ret.setConfidence(confidence);
if (!Strings.isNullOrEmpty(subType)) {
ret.setSubType(subType);
}
addToJCasIndex(ret);
}
}
@Override
public void doDestroy(){
pattern = null;
et = null;
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(et));
}
}