// AbstractRegexNPAnnotator.java example
//
// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex.helpers;

import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/** An abstract base class for building regex annotators that only act on noun phrases.
 *
 * <p>When the document contains {@link PhraseChunk} annotations, the regex is applied to the
 * covered text of each chunk whose chunk type is {@code "NP"}; chunks of any other type are
 * ignored. When the document contains no {@link PhraseChunk} annotations at all, the regex is
 * applied to the whole document text instead - unless that text contains no lower case letters
 * (for example an entirely upper case document), in which case no results are returned.
 *
 * <p>Implement {@link #create(JCas, Matcher)} and pass the regex definition to the constructor.
 *
 * @param <T> the type of entity produced.
 */
public abstract class AbstractRegexNPAnnotator<T extends Entity> extends BaleenAnnotator {

	/** Chunk type value that marks a phrase chunk as a noun phrase. */
	private static final String NOUN_PHRASE = "NP";

	private final Pattern pattern;
	private final double confidence;
	private final int matcherGroup;

	/** New instance, based on the supplied pattern. Uses the whole matched regex as the entity text.
	 * @param pattern the regex pattern
	 * @param caseSensitive should this be treated as case sensitive
	 * @param confidence the confidence to assign to created entities.
	 */
	protected AbstractRegexNPAnnotator(String pattern, boolean caseSensitive, double confidence) {
		this(Pattern.compile(pattern, caseSensitive ? 0 : Pattern.CASE_INSENSITIVE), 0, confidence);
	}

	/** New instance, based on a regex pattern. Uses the whole matched regex as the entity text.
	 * @param pattern the regex pattern
	 * @param confidence the confidence to assign to created entities.
	 */
	protected AbstractRegexNPAnnotator(Pattern pattern, double confidence) {
		this(pattern, 0, confidence);
	}

	/** New instance, based on the supplied pattern.
	 * @param pattern the regex pattern
	 * @param matcherGroup the matcher group to use as the content of the entity
	 * @param caseSensitive should this be treated as case sensitive
	 * @param confidence the confidence to assign to created entities.
	 */
	protected AbstractRegexNPAnnotator(String pattern, int matcherGroup, boolean caseSensitive, double confidence) {
		this(Pattern.compile(pattern, caseSensitive ? 0 : Pattern.CASE_INSENSITIVE), matcherGroup, confidence);
	}

	/** New instance, based on a regex pattern.
	 * @param pattern the regex pattern
	 * @param matcherGroup the matcher group to use as the content of the entity
	 * @param confidence the confidence to assign to created entities.
	 */
	protected AbstractRegexNPAnnotator(Pattern pattern, int matcherGroup, double confidence) {
		this.pattern = pattern;
		this.matcherGroup = matcherGroup;
		this.confidence = confidence;
	}


	/** Create an entity, using the information from the matcher.
	 *
	 * Note the implementor does not need to set the offsets, confidence or value, or add the
	 * entity to the jcas; that is done by this class after this method returns.
	 * See {@link #doProcess(JCas)}.
	 *
	 * This method is not called for a match in which the configured matcher group did not
	 * participate, as there is no text to build an entity from.
	 *
	 * @param jCas the jcas being processed
	 * @param matcher matcher (from the pattern supplied in the constructor), positioned on the current match
	 * @return an instance, or null if this is not a valid match.
	 */
	protected abstract T create(JCas jCas, Matcher matcher);

	/** Applies the regex to every noun phrase chunk, or to the whole document text when the
	 * document has no phrase chunks and its text is not entirely upper case.
	 *
	 * @param jCas the jcas being processed
	 * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly here
	 */
	@Override
	public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
		Collection<PhraseChunk> chunks = JCasUtil.select(jCas, PhraseChunk.class);

		if (!chunks.isEmpty()) {
			for (PhraseChunk chunk : chunks) {
				if (NOUN_PHRASE.equals(chunk.getChunkType())) {
					// Matches are relative to the chunk text, so shift them by the chunk start.
					createEntities(jCas, chunk.getCoveredText(), chunk.getBegin());
				}
			}
			return;
		}

		// No phrase chunks at all: fall back to the raw document text.
		String text = jCas.getDocumentText();
		if (text != null && !isUpperCase(text)) {
			createEntities(jCas, text, 0);
		}
	}

	/** Runs the pattern over the text and creates an entity for each match.
	 *
	 * @param jCas the jcas being processed
	 * @param text the text to search; a null text yields no entities
	 * @param offset the position of {@code text} within the document, added to match offsets
	 */
	private void createEntities(JCas jCas, String text, int offset) {
		if (text == null) {
			return;
		}

		Matcher matcher = pattern.matcher(text);
		while (matcher.find()) {
			createEntity(jCas, matcher, offset);
		}
	}

	/** Creates, populates and indexes a single entity for the matcher's current match.
	 *
	 * @param jCas the jcas being processed
	 * @param matcher the matcher, positioned on a successful match
	 * @param offset the document offset of the text being matched
	 */
	private void createEntity(JCas jCas, Matcher matcher, int offset) {
		// An optional group that took no part in the match reports a start of -1 and a null
		// value; building an entity from that would give it a bogus span, so skip the match.
		if (matcher.start(matcherGroup) < 0) {
			return;
		}

		T entity = create(jCas, matcher);
		if (entity == null) {
			// The implementor rejected this match.
			return;
		}

		entity.setConfidence(confidence);
		entity.setBegin(offset + matcher.start(matcherGroup));
		entity.setEnd(offset + matcher.end(matcherGroup));
		entity.setValue(matcher.group(matcherGroup));

		addToJCasIndex(entity);
	}

	/** Tests whether the text is free of lower case letters.
	 *
	 * Text with no letters at all (including the empty string) is therefore reported as upper case.
	 *
	 * @param s the text to test, must not be null
	 * @return false if any char is a lower case letter, true otherwise
	 */
	private static boolean isUpperCase(String s) {
		// Index loop rather than toCharArray() to avoid copying the whole document.
		for (int i = 0; i < s.length(); i++) {
			char c = s.charAt(i);
			if (Character.isLetter(c) && Character.isLowerCase(c)) {
				return false;
			}
		}

		return true;
	}
}