//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex.helpers; import java.util.Collection; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** An abstract base class for building RegexAnnotators that only act on Noun Phrases. * If no noun phrases are found, then this annotator will act entirely on the Regex - unless the document is entirely upper case in which case no results are returned. * * Implement create and pass in the regex definition to the contractor. * * * * * @param <T> the type of entity produced. */ public abstract class AbstractRegexNPAnnotator<T extends Entity> extends BaleenAnnotator { private Pattern pattern; private double confidence; private int matcherGroup; /** New instance, based on the supplied pattern. Uses the whole matched regex as the entity text. * @param pattern the regex pattern * @param caseSensitive should this be treated a case sensitive * @param confidence the confidence to assign to created entities. */ protected AbstractRegexNPAnnotator(String pattern, boolean caseSensitive, double confidence) { this(Pattern.compile(pattern, caseSensitive ? 0 : Pattern.CASE_INSENSITIVE), 0, confidence); } /** New instance, based on a regex pattern. Uses the whole matched regex as the entity text. * @param pattern the regex pattern * @param confidence the confidence to assign to created entities. */ protected AbstractRegexNPAnnotator(Pattern pattern, double confidence) { this(pattern, 0, confidence); } /** New instance, based on the supplied pattern. * @param pattern the regex pattern * @param matcherGroup the matcher group to use as the content of the entity * @param caseSensitive should this be treated a case sensitive * @param confidence the confidence to assign to created entities. */ protected AbstractRegexNPAnnotator(String pattern, int matcherGroup, boolean caseSensitive, double confidence) { this(Pattern.compile(pattern, caseSensitive ? 0 : Pattern.CASE_INSENSITIVE), matcherGroup, confidence); } /** New instance, based on a regex pattern. * @param pattern the regex pattern * @param matcherGroup the matcher group to use as the content of the entity * @param confidence the confidence to assign to created entities. */ protected AbstractRegexNPAnnotator(Pattern pattern, int matcherGroup, double confidence) { this.pattern = pattern; this.matcherGroup = matcherGroup; this.confidence = confidence; } /** Create an entity, using the information from the matcher. * * Not the implementor does not need to set the offset, confidence, or add to the jcas. * See {@link AbstractRegexNPAnnotator} doProcess(). * * @param jCas the jcas being processed * @param matcher matcher (from the pattern supplied in the constructor) * @return an instance, or null if this is not a valid match. */ protected abstract T create(JCas jCas, Matcher matcher); @Override public void doProcess(JCas jCas) throws AnalysisEngineProcessException { Collection<PhraseChunk> chunks = JCasUtil.select(jCas, PhraseChunk.class); if(!chunks.isEmpty()){ for (PhraseChunk chunk: chunks ) { if ("NP".equals(chunk.getChunkType()) ) { createEntities(jCas, chunk.getCoveredText(), chunk.getBegin()); } } }else if(!isUpperCase(jCas.getDocumentText())){ createEntities(jCas, jCas.getDocumentText(), 0); } } private void createEntities(JCas jCas, String text, int offset){ Matcher matcher = pattern.matcher(text); while(matcher.find()){ createEntity(jCas, matcher, offset); } } private Entity createEntity(JCas jCas, Matcher matcher, int offset){ Entity a = create(jCas, matcher); if(a != null) { a.setConfidence(confidence); a.setBegin(offset + matcher.start(matcherGroup)); a.setEnd(offset + matcher.end(matcherGroup)); a.setValue(matcher.group(matcherGroup)); addToJCasIndex(a); } return a; } private static boolean isUpperCase(String s){ for(char c : s.toCharArray()){ if(Character.isLetter(c) && Character.isLowerCase(c)){ return false; } } return true; } }