/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package eu.project.ttc.engines;

import java.util.Collection;
import java.util.Iterator;

import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.project.ttc.history.TermHistory;
import eu.project.ttc.history.TermHistoryResource;
import eu.project.ttc.resources.OccurrenceFilter;
import eu.project.ttc.resources.TrueFilter;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
import eu.project.ttc.utils.OccurrenceBuffer;
import eu.project.ttc.utils.TermSuiteConstants;
import eu.project.ttc.utils.TermSuiteUtils;
import eu.project.ttc.utils.TermUtils;
import fr.univnantes.lina.uima.tkregex.LabelledAnnotation;
import fr.univnantes.lina.uima.tkregex.RegexOccurrence;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import uima.sandbox.filter.resources.FilterResource;

/**
 * Adds all token regex occurrences to the CAS and to the term index.
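 * <p>
 * A minimal configuration sketch, assuming a uimaFIT environment
 * ({@code org.apache.uima.fit.factory.AnalysisEngineFactory}). The external
 * resources required by this engine (the stop word filter, the term history,
 * and the token regex rules of the parent {@link TokenRegexAE}) still have to
 * be bound by the caller; the parameter values below are examples only:
 * <pre>{@code
 * AnalysisEngineDescription spotter = AnalysisEngineFactory.createEngineDescription(
 *         RegexSpotter.class,
 *         RegexSpotter.POST_PROCESSING_STRATEGY, OccurrenceBuffer.NO_CLEANING,
 *         RegexSpotter.LOG_OVERLAPPING_RULES, true,
 *         RegexSpotter.CONTEXTUALIZE, false);
 * }</pre>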
 *
 * @author Damien Cram
 */
public class RegexSpotter extends TokenRegexAE {

	private static final Logger LOGGER = LoggerFactory.getLogger(RegexSpotter.class);

	public static final String POST_PROCESSING_STRATEGY = "PostProcessingStrategy";
	@ConfigurationParameter(name = POST_PROCESSING_STRATEGY, mandatory = false, defaultValue = OccurrenceBuffer.NO_CLEANING)
	private String postProcessingStrategy;

	public static final String LOG_OVERLAPPING_RULES = "LogOverlappingRules";
	@ConfigurationParameter(name = LOG_OVERLAPPING_RULES, mandatory = false, defaultValue = "false")
	private boolean logOverlappingRules;

	public static final String CONTEXTUALIZE = "Contextualize";
	@ConfigurationParameter(name = CONTEXTUALIZE, mandatory = false, defaultValue = "false")
	private boolean contextualize;

	public static final String CHARACTER_FOOTPRINT_TERM_FILTER = "CharacterFootprintTermFilter";
	@ExternalResource(key = CHARACTER_FOOTPRINT_TERM_FILTER, mandatory = false)
	private OccurrenceFilter termFilter = TrueFilter.INSTANCE;

	public static final String STOP_WORD_FILTER = "StopWordFilter";
	@ExternalResource(key = STOP_WORD_FILTER, mandatory = true)
	private FilterResource stopWordFilter;

	@ExternalResource(key = TermHistoryResource.TERM_HISTORY, mandatory = true)
	private TermHistoryResource historyResource;

	/** Buffers the occurrences matched in the current document until post-processing. */
	private OccurrenceBuffer occurrenceBuffer;

	@Override
	protected void beforeRuleProcessing(JCas jCas) {
		this.occurrenceBuffer = new OccurrenceBuffer(this.postProcessingStrategy);
	}

	@Override
	public void ruleMatched(JCas jCas, RegexOccurrence occurrence) {
		String groupingKey = TermSuiteUtils.toGroupingKey(occurrence);
		TermHistory history = historyResource.getHistory();

		/*
		 * Do not keep the term if it has too many bad characters
		 */
		if (!termFilter.accept(occurrence)) {
			if (history.isWatched(groupingKey))
				history.saveEvent(
						groupingKey,
						RegexSpotter.class,
						String.format(
								"[!] Term spotted at [%d,%d] in file %s but filtered out by %s",
								occurrence.getBegin(),
								occurrence.getEnd(),
								JCasUtils.getSourceDocumentAnnotation(jCas).get().getUri(),
								termFilter.getClass()));
			return;
		}

		/*
		 * Do not keep the term if it is a single-word stop word
		 * (matched either on its covered text or on its lemma)
		 */
		WordAnnotation wa = (WordAnnotation) occurrence.getLabelledAnnotations().get(0).getAnnotation();
		if (occurrence.size() == 1
				&& (stopWordFilter.getFilters().contains(wa.getCoveredText().toLowerCase())
					|| (wa.getLemma() != null && stopWordFilter.getFilters().contains(wa.getLemma().toLowerCase())))) {
			if (history.isWatched(groupingKey))
				history.saveEvent(
						groupingKey,
						RegexSpotter.class,
						String.format(
								"[!] Term spotted at [%d,%d] in file %s but filtered out by stop word filter",
								occurrence.getBegin(),
								occurrence.getEnd(),
								JCasUtils.getSourceDocumentAnnotation(jCas).get().getUri()));
			return;
		}

		if (history.isWatched(groupingKey))
			history.saveEvent(
					groupingKey,
					RegexSpotter.class,
					String.format("Term spotted at [%d,%d] in file %s",
							occurrence.getBegin(),
							occurrence.getEnd(),
							JCasUtils.getSourceDocumentAnnotation(jCas).get().getUri()));

		/*
		 * Add the occurrence to the buffer.
		 * It will be added to the JCas only if it is not filtered out by the
		 * post-processing strategy.
		 */
		this.occurrenceBuffer.bufferize(occurrence);
	}

	@Override
	protected void allRulesFailed(JCas jCas) {
		flushOccurrenceBuffer(jCas);
	}

	/*
	 * Cleans the occurrence buffer according to the post-processing strategy
	 * and adds the remaining occurrences to the CAS.
	 */
	private void flushOccurrenceBuffer(JCas jCas) {
		/*
		 * Log a warning if the occurrence was also matched by another rule
		 */
		if (logOverlappingRules) {
			for (Collection<RegexOccurrence> doublons : this.occurrenceBuffer.findDuplicates()) {
				Iterator<RegexOccurrence> it = doublons.iterator();
				RegexOccurrence base = it.next();
				while (it.hasNext()) {
					RegexOccurrence occ = it.next();
					LOGGER.warn("Rules {} and {} overlap on occurrence [{},{}] \"{}\"",
							base.getRule().getName(),
							occ.getRule().getName(),
							occ.getBegin(),
							occ.getEnd(),
							TermUtils.collapseText(jCas.getDocumentText().substring(occ.getBegin(), occ.getEnd())));
				}
			}
		}
		this.occurrenceBuffer.cleanBuffer();
		for (RegexOccurrence occ : this.occurrenceBuffer)
			addOccurrenceToCas(jCas, occ);
		this.occurrenceBuffer.clear();
	}

	private void addOccurrenceToCas(JCas jCas, RegexOccurrence occurrence) {
		TermOccAnnotation annotation = (TermOccAnnotation) jCas
				.getCas().createAnnotation(
						jCas.getCasType(TermOccAnnotation.type),
						occurrence.getBegin(),
						occurrence.getEnd());
		StringArray patternFeature = new StringArray(jCas, occurrence.size());
		FSArray innerWords = new FSArray(jCas, occurrence.size());
		StringBuilder termLemma = new StringBuilder();
		int i = 0;
		for (LabelledAnnotation la : occurrence.getLabelledAnnotations()) {
			patternFeature.set(i, la.getLabel());
			WordAnnotation word = (WordAnnotation) la.getAnnotation();
			termLemma.append(word.getLemma());
			if (i < occurrence.size() - 1)
				termLemma.append(TermSuiteConstants.WHITESPACE);
			if (word.getRegexLabel() != null) {
				if (!word.getRegexLabel().equals(la.getLabel()))
					LOGGER.warn("Another label has already been set for WordAnnotation {}:{} [{},{}]. Ignoring the new label {} (rule: {})",
							word.getCoveredText(),
							word.getRegexLabel(),
							word.getBegin(),
							word.getEnd(),
							la.getLabel(),
							occurrence.getRule().getName());
			} else
				word.setRegexLabel(la.getLabel());
			innerWords.set(i, word);
			i++;
		}
		annotation.setWords(innerWords);
		annotation.setPattern(patternFeature);
		annotation.setSpottingRuleName(occurrence.getRule().getName());
		annotation.setTermKey(TermSuiteUtils.getGroupingKey(annotation));
		annotation.addToIndexes();
	}

	@Override
	protected void afterRuleProcessing(JCas jCas) {
		flushOccurrenceBuffer(jCas);
	}
}