/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.engines;

import java.util.List;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import com.google.common.collect.Lists;

import eu.project.ttc.resources.MateLemmatizerModel;
import eu.project.ttc.resources.MateTaggerModel;
import eu.project.ttc.types.WordAnnotation;
import is2.data.SentenceData09;
import is2.lemmatizer.Lemmatizer;
import is2.tag.Tagger;

/**
 * UIMA annotator that runs the Mate POS tagger and the Mate lemmatizer over
 * the {@link WordAnnotation}s of a CAS and stores the resulting tag and lemma
 * on each of them.
 *
 * <p>All word annotations of the CAS are submitted to Mate as one single
 * sentence, in the order given by the annotation index.</p>
 *
 * @author Damien Cram
 *
 */
public class MateLemmatizerTagger extends JCasAnnotator_ImplBase {

    /** Key of the external resource providing the Mate lemmatizer model. */
    public static final String LEMMATIZER = "Lemmatizer";
    @ExternalResource(key = LEMMATIZER, mandatory = true)
    private MateLemmatizerModel mateLemmatizerModel;

    /** Key of the external resource providing the Mate tagger model. */
    public static final String TAGGER = "Tagger";
    @ExternalResource(key = TAGGER, mandatory = true)
    private MateTaggerModel mateTaggerModel;

    /** First fake token prepended to the sentence submitted to Mate. */
    private static final String FIRST_FAKE_TOKEN = "<root>";

    /** Second fake token prepended to the sentence submitted to Mate. */
    private static final String SECOND_FAKE_TOKEN = "<root2>";

    /** Number of fake tokens prepended to the real words. */
    private static final int FAKE_TOKEN_COUNT = 2;

    /**
     * Tags and lemmatizes every {@link WordAnnotation} of the given CAS.
     *
     * <p>Does nothing when the CAS holds no word annotation.</p>
     *
     * @param jcas
     *            the CAS whose word annotations receive a tag and a lemma
     * @throws AnalysisEngineProcessException
     *             declared by the UIMA contract
     */
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {
        Lemmatizer mateLemmatizer = mateLemmatizerModel.getEngine();
        Tagger mateTagger = mateTaggerModel.getEngine();

        List<WordAnnotation> annotations = collectWordAnnotations(jcas);
        if (annotations.isEmpty()) {
            // Nothing to annotate: do not run Mate on the fake tokens alone.
            return;
        }

        SentenceData09 mateSentence = new SentenceData09();
        mateSentence.init(buildTokens(annotations));

        // Run POS tagging
        mateSentence = mateTagger.apply(mateSentence);

        // Run lemmatization
        mateSentence = mateLemmatizer.apply(mateSentence);

        /*
         * Entry j of the Mate output is written to word annotation j-1, and
         * entry 0 is ignored.
         * NOTE(review): this mapping presumes that the sentence returned by
         * Mate holds a single leading pseudo-token followed by one entry per
         * word (i.e. that one of the two prepended fake tokens has been
         * dropped by apply()) -- confirm against the Mate version in use.
         */
        for (int j = 1; j < mateSentence.length(); j++) {
            WordAnnotation wordAnnotation = annotations.get(j - 1);
            wordAnnotation.setTag(mateSentence.ppos[j]);
            wordAnnotation.setLemma(mateSentence.plemmas[j]);
        }
    }

    /**
     * Copies the word annotations of the CAS into a list, in index order, so
     * that they can later be accessed by position.
     */
    private static List<WordAnnotation> collectWordAnnotations(JCas jcas) {
        List<WordAnnotation> annotations = Lists.newArrayList();
        FSIterator<Annotation> it = jcas.getAnnotationIndex(WordAnnotation.type).iterator();
        while (it.hasNext()) {
            annotations.add((WordAnnotation) it.next());
        }
        return annotations;
    }

    /**
     * Builds the token array submitted to Mate: two fake tokens followed by
     * the covered text of each word annotation.
     */
    private static String[] buildTokens(List<WordAnnotation> annotations) {
        String[] tokens = new String[annotations.size() + FAKE_TOKEN_COUNT];

        // Prepends two fake words to prevent Mate from bugging on the two first words.
        tokens[0] = FIRST_FAKE_TOKEN;
        tokens[1] = SECOND_FAKE_TOKEN;
        for (int i = 0; i < annotations.size(); i++) {
            tokens[i + FAKE_TOKEN_COUNT] = annotations.get(i).getCoveredText();
        }
        return tokens;
    }
}