/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ package eu.project.ttc.engines; import java.util.Deque; import java.util.LinkedList; import java.util.Set; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.project.ttc.resources.FixedExpressionResource; import eu.project.ttc.types.FixedExpression; import eu.project.ttc.types.TermOccAnnotation; import eu.project.ttc.types.WordAnnotation; import eu.project.ttc.utils.JCasUtils; /** * * * @author Damien Cram * */ public class FixedExpressionSpotter extends JCasAnnotator_ImplBase { private static final Logger LOGGER = LoggerFactory.getLogger(FixedExpressionSpotter.class); @ExternalResource(key=FixedExpressionResource.FIXED_EXPRESSION_RESOURCE, mandatory=true) protected FixedExpressionResource fixedExpressionResource; public static final String REMOVE_TERM_OCC_ANNOTATIONS_FROM_CAS = "RemoveTermOccFromCas"; @ConfigurationParameter(name=REMOVE_TERM_OCC_ANNOTATIONS_FROM_CAS, mandatory=false, defaultValue="false") private boolean removeTermoccAnnotationsFromCas; public static final String REMOVE_WORD_ANNOTATIONS_FROM_CAS = "RemoveWordFromCas"; @ConfigurationParameter(name=REMOVE_WORD_ANNOTATIONS_FROM_CAS, mandatory=false, defaultValue="false") private boolean removeWordAnnotationsFromCas; public static final String FIXED_EXPRESSION_MAX_SIZE = "FixedExpressionMaxSize"; @ConfigurationParameter(name=FIXED_EXPRESSION_MAX_SIZE, mandatory=false, defaultValue="5") private int maxFixedExpressionSize; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { FSIterator<Annotation> it = aJCas.getAnnotationIndex(WordAnnotation.type).iterator(); // stats int cnt = 0; // Buffer of last n annotations Deque<WordAnnotation> lastNAnnos = new LinkedList<WordAnnotation>(); while (it.hasNext()) { WordAnnotation wa = (WordAnnotation) it.next(); /* * Update the buffer with the current anno. */ lastNAnnos.addLast(wa); if(lastNAnnos.size() > maxFixedExpressionSize) lastNAnnos.removeFirst(); /* * Iterate over the buffer in the reverse order to build * candidate fixed expressions. */ LinkedList<WordAnnotation> candidateFE = Lists.newLinkedList(lastNAnnos); while(candidateFE.size() >= 2) {// needs at least size >= 2 to be a fixed expression /* * Builds the lemma for the current candidate fixed expression */ StringBuffer candidateFELemmaBuilder = new StringBuffer(); boolean first = true; for(WordAnnotation a:candidateFE) { if(!first) candidateFELemmaBuilder.append(' '); candidateFELemmaBuilder.append(a.getLemma()); first = false; } String candidateFeLemma = candidateFELemmaBuilder.toString(); /* * Tests if the current candidate fixed expression can be found * in the resource. * */ if(fixedExpressionResource.containsLemma(candidateFeLemma)) { cnt++; if(LOGGER.isTraceEnabled()) LOGGER.trace("New fixed expression spotted: {} ({}: [{},{}])", candidateFeLemma, JCasUtils.getSourceDocumentAnnotation(aJCas).get().getUri(), candidateFE.getFirst().getBegin(), candidateFE.getLast().getEnd()); FixedExpression fe = (FixedExpression)aJCas.getCas().createAnnotation( aJCas.getCasType(FixedExpression.type), candidateFE.getFirst().getBegin(), candidateFE.getLast().getEnd()); fe.addToIndexes(); } /* * Loops again without the left-most word */ candidateFE.removeFirst(); } } /* * Removes all WordAnnotation and TermOccAnnotation contained * in a FixedExpression */ Set<Annotation> trash = Sets.newHashSet(); if(removeWordAnnotationsFromCas || removeTermoccAnnotationsFromCas) { FSIterator<Annotation> feIt = aJCas.getAnnotationIndex(FixedExpression.type).iterator(); while (feIt.hasNext()) { FixedExpression fe = (FixedExpression) feIt.next(); FSIterator<Annotation> it3 = aJCas.getAnnotationIndex().iterator(); while(it3.hasNext()) { Annotation a = it3.next(); if(removeWordAnnotationsFromCas && a instanceof WordAnnotation) { if(JCasUtils.containsStrictly(fe, a)) trash.add(a); } else if(removeTermoccAnnotationsFromCas && a instanceof TermOccAnnotation) { if(JCasUtils.containsStrictly(fe, a)) trash.add(a); } } } } for(Annotation a:trash) a.removeFromIndexes(); LOGGER.debug("{} fixed expressions found in {}", cnt, JCasUtils.getSourceDocumentAnnotation(aJCas).get().getUri()); } @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { } }