/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package eu.project.ttc.engines;

import java.util.Collection;
import java.util.Iterator;

import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.project.ttc.history.TermHistory;
import eu.project.ttc.history.TermHistoryResource;
import eu.project.ttc.resources.OccurrenceFilter;
import eu.project.ttc.resources.TrueFilter;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
import eu.project.ttc.utils.OccurrenceBuffer;
import eu.project.ttc.utils.TermSuiteConstants;
import eu.project.ttc.utils.TermSuiteUtils;
import eu.project.ttc.utils.TermUtils;
import fr.univnantes.lina.uima.tkregex.LabelledAnnotation;
import fr.univnantes.lina.uima.tkregex.RegexOccurrence;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import uima.sandbox.filter.resources.FilterResource;

/**
 * Adds all token regex occurrences to the CAS and to the term index.
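 * <p>
 * A minimal configuration sketch, assuming a uimaFIT environment
 * ({@code org.apache.uima.fit.factory.AnalysisEngineFactory}). The external
 * resources required by this engine (the stop word filter, the term history,
 * and the token regex rules of the parent {@link TokenRegexAE}) still have to
 * be bound by the caller; the parameter values below are examples only:
 * <pre>{@code
 * AnalysisEngineDescription spotter = AnalysisEngineFactory.createEngineDescription(
 *         RegexSpotter.class,
 *         RegexSpotter.POST_PROCESSING_STRATEGY, OccurrenceBuffer.NO_CLEANING,
 *         RegexSpotter.LOG_OVERLAPPING_RULES, true,
 *         RegexSpotter.CONTEXTUALIZE, false);
 * }</pre>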
 *
 * @author Damien Cram
 */
public class RegexSpotter extends TokenRegexAE {

	private static final Logger LOGGER = LoggerFactory.getLogger(RegexSpotter.class);

	public static final String POST_PROCESSING_STRATEGY = "PostProcessingStrategy";
	@ConfigurationParameter(name = POST_PROCESSING_STRATEGY, mandatory = false, defaultValue = OccurrenceBuffer.NO_CLEANING)
	private String postProcessingStrategy;

	public static final String LOG_OVERLAPPING_RULES = "LogOverlappingRules";
	@ConfigurationParameter(name = LOG_OVERLAPPING_RULES, mandatory = false, defaultValue = "false")
	private boolean logOverlappingRules;

	public static final String CONTEXTUALIZE = "Contextualize";
	@ConfigurationParameter(name = CONTEXTUALIZE, mandatory = false, defaultValue = "false")
	private boolean contextualize;

	public static final String CHARACTER_FOOTPRINT_TERM_FILTER = "CharacterFootprintTermFilter";
	@ExternalResource(key = CHARACTER_FOOTPRINT_TERM_FILTER, mandatory = false)
	private OccurrenceFilter termFilter = TrueFilter.INSTANCE;

	public static final String STOP_WORD_FILTER = "StopWordFilter";
	@ExternalResource(key = STOP_WORD_FILTER, mandatory = true)
	private FilterResource stopWordFilter;

	@ExternalResource(key = TermHistoryResource.TERM_HISTORY, mandatory = true)
	private TermHistoryResource historyResource;

	/** Buffers the occurrences matched in the current document until post-processing. */
	private OccurrenceBuffer occurrenceBuffer;

	@Override
	protected void beforeRuleProcessing(JCas jCas) {
		this.occurrenceBuffer = new OccurrenceBuffer(this.postProcessingStrategy);
	}

	@Override
	public void ruleMatched(JCas jCas, RegexOccurrence occurrence) {
		String groupingKey = TermSuiteUtils.toGroupingKey(occurrence);
		TermHistory history = historyResource.getHistory();

		/*
		 * Do not keep the term if it has too many bad characters
		 */
		if (!termFilter.accept(occurrence)) {
			if (history.isWatched(groupingKey))
				history.saveEvent(
						groupingKey,
						RegexSpotter.class,
						String.format(
								"[!] Term spotted at [%d,%d] in file %s but filtered out by %s",
								occurrence.getBegin(),
								occurrence.getEnd(),
								JCasUtils.getSourceDocumentAnnotation(jCas).get().getUri(),
								termFilter.getClass()));
			return;
		}

		/*
		 * Do not keep the term if it is a single-word stop word
		 * (matched either on its covered text or on its lemma)
		 */
		WordAnnotation wa = (WordAnnotation) occurrence.getLabelledAnnotations().get(0).getAnnotation();
		if (occurrence.size() == 1
				&& (stopWordFilter.getFilters().contains(wa.getCoveredText().toLowerCase())
					|| (wa.getLemma() != null && stopWordFilter.getFilters().contains(wa.getLemma().toLowerCase())))) {
			if (history.isWatched(groupingKey))
				history.saveEvent(
						groupingKey,
						RegexSpotter.class,
						String.format(
								"[!] Term spotted at [%d,%d] in file %s but filtered out by stop word filter",
								occurrence.getBegin(),
								occurrence.getEnd(),
								JCasUtils.getSourceDocumentAnnotation(jCas).get().getUri()));
			return;
		}

		if (history.isWatched(groupingKey))
			history.saveEvent(
					groupingKey,
					RegexSpotter.class,
					String.format("Term spotted at [%d,%d] in file %s",
							occurrence.getBegin(),
							occurrence.getEnd(),
							JCasUtils.getSourceDocumentAnnotation(jCas).get().getUri()));

		/*
		 * Add the occurrence to the buffer.
		 * It will be added to the JCas only if it is not filtered out by the
		 * post-processing strategy.
		 */
		this.occurrenceBuffer.bufferize(occurrence);
	}

	@Override
	protected void allRulesFailed(JCas jCas) {
		flushOccurrenceBuffer(jCas);
	}

	/*
	 * Cleans the occurrence buffer according to the post-processing strategy
	 * and adds the remaining occurrences to the CAS.
	 */
	private void flushOccurrenceBuffer(JCas jCas) {
		/*
		 * Log a warning if the occurrence was also matched by another rule
		 */
		if (logOverlappingRules) {
			for (Collection<RegexOccurrence> doublons : this.occurrenceBuffer.findDuplicates()) {
				Iterator<RegexOccurrence> it = doublons.iterator();
				RegexOccurrence base = it.next();
				while (it.hasNext()) {
					RegexOccurrence occ = it.next();
					LOGGER.warn("Rules {} and {} overlap on occurrence [{},{}] \"{}\"",
							base.getRule().getName(),
							occ.getRule().getName(),
							occ.getBegin(),
							occ.getEnd(),
							TermUtils.collapseText(jCas.getDocumentText().substring(occ.getBegin(), occ.getEnd())));
				}
			}
		}
		this.occurrenceBuffer.cleanBuffer();
		for (RegexOccurrence occ : this.occurrenceBuffer)
			addOccurrenceToCas(jCas, occ);
		this.occurrenceBuffer.clear();
	}

	private void addOccurrenceToCas(JCas jCas, RegexOccurrence occurrence) {
		TermOccAnnotation annotation = (TermOccAnnotation) jCas
				.getCas().createAnnotation(
						jCas.getCasType(TermOccAnnotation.type),
						occurrence.getBegin(),
						occurrence.getEnd());
		StringArray patternFeature = new StringArray(jCas, occurrence.size());
		FSArray innerWords = new FSArray(jCas, occurrence.size());
		StringBuilder termLemma = new StringBuilder();
		int i = 0;
		for (LabelledAnnotation la : occurrence.getLabelledAnnotations()) {
			patternFeature.set(i, la.getLabel());
			WordAnnotation word = (WordAnnotation) la.getAnnotation();
			termLemma.append(word.getLemma());
			if (i < occurrence.size() - 1)
				termLemma.append(TermSuiteConstants.WHITESPACE);
			if (word.getRegexLabel() != null) {
				if (!word.getRegexLabel().equals(la.getLabel()))
					LOGGER.warn("Another label has already been set for WordAnnotation {}:{} [{},{}]. Ignoring the new label {} (rule: {})",
							word.getCoveredText(),
							word.getRegexLabel(),
							word.getBegin(),
							word.getEnd(),
							la.getLabel(),
							occurrence.getRule().getName());
			} else
				word.setRegexLabel(la.getLabel());
			innerWords.set(i, word);
			i++;
		}
		annotation.setWords(innerWords);
		annotation.setPattern(patternFeature);
		annotation.setSpottingRuleName(occurrence.getRule().getName());
		annotation.setTermKey(TermSuiteUtils.getGroupingKey(annotation));
		annotation.addToIndexes();
	}

	@Override
	protected void afterRuleProcessing(JCas jCas) {
		flushOccurrenceBuffer(jCas);
	}
}