MateLemmatizerTagger.java example

Explorer
termsuite-core-master
- src
/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.engines;

import java.util.List;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import com.google.common.collect.Lists;

import eu.project.ttc.resources.MateLemmatizerModel;
import eu.project.ttc.resources.MateTaggerModel;
import eu.project.ttc.types.WordAnnotation;
import is2.data.SentenceData09;
import is2.lemmatizer.Lemmatizer;
import is2.tag.Tagger;

/**
 * UIMA annotator that runs the Mate POS tagger and the Mate lemmatizer over
 * the {@link WordAnnotation}s of a CAS, then writes the predicted tag and
 * lemma back onto each annotation.
 * <p>
 * All word annotations found in the CAS are submitted to Mate as one single
 * sentence.
 * 
 * @author Damien Cram
 *
 */
public class MateLemmatizerTagger extends JCasAnnotator_ImplBase {
	
	/** Key of the external resource providing the Mate lemmatizer. */
	public static final String LEMMATIZER = "Lemmatizer";
	@ExternalResource(key = LEMMATIZER, mandatory = true)
	private MateLemmatizerModel mateLemmatizerModel;
	
	/** Key of the external resource providing the Mate POS tagger. */
	public static final String TAGGER = "Tagger";
	@ExternalResource(key = TAGGER, mandatory = true)
	private MateTaggerModel mateTaggerModel;
	
	/**
	 * Tags and lemmatizes every {@link WordAnnotation} of the given CAS.
	 * <p>
	 * The covered text of each word annotation is sent to the Mate tagger,
	 * then to the Mate lemmatizer; the resulting predicted POS tag and lemma
	 * are set on the annotation. Nothing is done when the CAS holds no word
	 * annotation.
	 * 
	 * @param jcas
	 *            the CAS whose word annotations are to be tagged and lemmatized
	 * @throws AnalysisEngineProcessException
	 *             declared by the UIMA contract
	 */
	@Override
	public void process(JCas jcas) throws AnalysisEngineProcessException {
		Lemmatizer mateLemmatizer = mateLemmatizerModel.getEngine();
		Tagger mateTagger = mateTaggerModel.getEngine();

		/*
		 * keeps a list of annotations in memory so as to be able 
		 * to access them by index.
		 */
		List<WordAnnotation> annotations = Lists.newArrayList();
		FSIterator<Annotation> it = jcas.getAnnotationIndex(WordAnnotation.type).iterator();
		while(it.hasNext()) {
			WordAnnotation a = (WordAnnotation) it.next();
			annotations.add(a);
		}
		
		// Nothing to tag: avoid running the models on a sentence that would
		// contain only the two placeholder tokens.
		if(annotations.isEmpty()) {
			return;
		}
		
		String[] tokens = new String[annotations.size()+2];
		
		// prepends two fake words to prevent Mate from bugging on the two first words
		tokens[0] = "<root>";
		tokens[1] = "<root2>";
		for(int i = 0; i< annotations.size() ; i++) {
			tokens[i+2] = annotations.get(i).getCoveredText();
		}
		
		SentenceData09 mateSentence = new SentenceData09();
		mateSentence.init(tokens);

		// Run POS tagging
		mateSentence = mateTagger.apply(mateSentence);
		
		// Run lemmatization
		mateSentence = mateLemmatizer.apply(mateSentence);
		
		/*
		 * Writes the predictions back: entry j of the Mate sentence is mapped
		 * to the (j-1)-th word annotation, entry 0 being skipped.
		 * 
		 * NOTE(review): this mapping assumes the sentence returned by Mate
		 * holds a single leading placeholder entry followed by one entry per
		 * word annotation - confirm against the Mate version in use.
		 */
		WordAnnotation wordAnnotation;
		for(int j=1; j<mateSentence.length(); j++) {
			wordAnnotation = annotations.get(j-1);
			wordAnnotation.setTag(mateSentence.ppos[j]);
			wordAnnotation.setLemma(mateSentence.plemmas[j]);
		}
	}
}