//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.gazetteer.helpers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;

import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.resources.gazetteer.IGazetteer;
import uk.gov.dstl.baleen.types.BaleenAnnotation;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * Abstract class that builds on AbstractAhoCorasickAnnotator, but rather than searching the entire
 * document for gazetteer matches, it uses a regular expression to find potential matches and then
 * checks to see whether they appear in the Gazetteer.
 *
 * <p>Each non-empty regular expression match is looked up in the gazetteer (lower-cased first when
 * the annotator is not case sensitive). Matches that are present in the gazetteer are turned into
 * annotations, grouped by a key generated from the matched value and its aliases, and the groups
 * are then handed to {@code createReferenceTargets}.
 *
 * @baleen.javadoc
 */
public abstract class AbstractRegexAhoCorasickAnnotator extends AbstractAhoCorasickAnnotator {

  /**
   * The regular expression to check against
   *
   * @baleen.config \\b\\w*\\b
   */
  public static final String PARAM_REGEX = "regex";

  @ConfigurationParameter(name = PARAM_REGEX, defaultValue = "\\b\\w*\\b")
  protected String regex;

  /** Compiled form of {@link #regex}; assigned in {@link #doInitialize(UimaContext)}. */
  Pattern regexPattern;

  /** Constructor */
  public AbstractRegexAhoCorasickAnnotator() {
    // Do nothing
  }

  @Override
  public abstract IGazetteer configureGazetteer() throws BaleenException;

  /**
   * Initialises the parent annotator and then compiles the configured regular expression, using
   * {@link Pattern#CASE_INSENSITIVE} when the annotator is not case sensitive.
   *
   * @param aContext the UIMA context passed through to the parent initialisation
   * @throws ResourceInitializationException if the parent fails to initialise, or if the
   *     configured regular expression is not syntactically valid
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);

    // A flag value of 0 is equivalent to the single-argument Pattern.compile(regex)
    int flags = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;
    try {
      regexPattern = Pattern.compile(regex, flags);
    } catch (PatternSyntaxException e) {
      // Report a bad configuration value through the declared checked exception rather than
      // letting an unchecked exception escape from initialisation
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Finds every regular expression match in the text block, keeps those whose value is present in
   * the gazetteer, and creates an annotation for each. Annotations are grouped by the key
   * generated from the matched value and its gazetteer aliases; the groups are passed to
   * {@code createReferenceTargets} once the whole block has been scanned.
   *
   * <p>A failure while creating an individual annotation is logged and that match is skipped; it
   * does not abort processing of the remaining matches.
   *
   * @param block the text block to search
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    Map<String, List<BaleenAnnotation>> entities = new HashMap<>();

    Matcher m = regexPattern.matcher(block.getCoveredText());
    while (m.find()) {
      String match = m.group();
      if (match.isEmpty()) {
        // Patterns such as the default \b\w*\b also match the empty string at word boundaries;
        // an empty match can never be a meaningful gazetteer entry, so skip the lookup
        continue;
      }

      // NOTE(review): toLowerCase() uses the default locale; presumably this has to agree with
      // how the gazetteer normalises its own values - confirm before changing the locale
      String csValue = caseSensitive ? match : match.toLowerCase();
      if (!gazetteer.hasValue(csValue)) {
        continue;
      }

      try {
        BaleenAnnotation ent = createEntity(block, m.start(), m.end(), match, csValue);

        // The grouping key is derived from the aliases plus the matched value itself
        List<String> aliases = new ArrayList<>(Arrays.asList(gazetteer.getAliases(csValue)));
        aliases.add(csValue);
        String key = generateKey(aliases);

        entities.computeIfAbsent(key, k -> new ArrayList<>()).add(ent);
      } catch (Exception e) {
        getMonitor()
            .error(
                "Unable to create entity of type '{}' for value '{}'",
                entityType.getName(),
                match,
                e);
      }
    }

    createReferenceTargets(block, entities.values());
  }
}