/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cogroo.uima.ae; import java.util.ArrayList; import java.util.List; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.cas.TypeSystem; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import br.usp.pcs.lta.cogroo.entity.Sentence; import br.usp.pcs.lta.cogroo.entity.Token; import br.usp.pcs.lta.cogroo.entity.impl.runtime.MorphologicalTag; import br.usp.pcs.lta.cogroo.tools.ProcessingEngine; import cogroo.util.EntityUtils; import cogroo.util.TypedSpan; public class UimaPOSTagger extends AnnotationService implements ProcessingEngine { private Type tokenType; private Type sentenceType; private Feature posFeature; private Feature additionalContextFeature; private Feature lexemeFeature; public UimaPOSTagger() throws AnnotationServiceException { super("UimaPOSTagger"); } public void process(Sentence text) { // ************************************ // Add text to the CAS // ************************************ updateCas(text, cas); // ************************************ // Analyze text // ************************************ try { ae.process(cas); } catch (Exception e) { throw new RuntimeException("Error processing a text.", e); } // ************************************ // Extract the result using annotated CAS // ************************************ FSIterator<Annotation> tokenIterator = cas.getAnnotationIndex(tokenType) .iterator(); int index = 0; List<Token> tokens = text.getTokens(); while (tokenIterator.hasNext()) { Annotation a = tokenIterator.next(); String tag = a.getFeatureValueAsString(posFeature); tokens.get(index).setOriginalPOSTag(tag); //tokens.get(index).setMorphologicalTag(toMorphologicalTag(tag)); index++; } text.setTokens(EntityUtils.groupTokens(text.getSentence(), text.getTokens(), createSpanList(toTokensArray(tokens), toTagsArray(tokens)))); cas.reset(); } // this is from opennlp public static List<TypedSpan> createSpanList(String[] toks, String[] tags) { // initialize with the list maximum size List<TypedSpan> phrases = new ArrayList<TypedSpan>(toks.length); String startTag = ""; int startIndex = 0; boolean foundPhrase = false; for (int ci = 0, cn = tags.length; ci < cn; ci++) { String pred = tags[ci]; if(!tags[ci].startsWith("B-") && !tags[ci].startsWith("I-")) { pred = "O"; } if (pred.startsWith("B-") || (!pred.equals("I-" + startTag) && !pred.equals("O"))) { // start if (foundPhrase) { // handle the last phrases.add(new TypedSpan(startIndex, ci, startTag)); } startIndex = ci; startTag = pred.substring(2); foundPhrase = true; } else if (pred.equals("I-" + startTag)) { // middle // do nothing } else if (foundPhrase) {// end phrases.add(new TypedSpan(startIndex, ci, startTag)); foundPhrase = false; startTag = ""; } } if (foundPhrase) { // leftover phrases.add(new TypedSpan(startIndex, tags.length, startTag)); } return phrases; } @Override protected void initTypes(TypeSystem typeSystem) { sentenceType = cas.getTypeSystem().getType("opennlp.uima.Sentence"); tokenType = cas.getTypeSystem().getType("opennlp.uima.Token"); posFeature = tokenType.getFeatureByBaseName("pos"); additionalContextFeature = tokenType.getFeatureByBaseName("additionalContext"); lexemeFeature = tokenType.getFeatureByBaseName("lexeme"); } private void updateCas(Sentence sentence, JCas cas) { cas.reset(); cas.setDocumentText(sentence.getSentence()); AnnotationFS a = cas.getCas().createAnnotation(sentenceType, sentence.getOffset(), sentence.getOffset() + sentence.getSentence().length()); cas.getIndexRepository().addFS(a); for (Token t : sentence.getTokens()) { a = cas.getCas().createAnnotation(tokenType, t.getSpan().getStart() + sentence.getOffset(), t.getSpan().getEnd() + sentence.getOffset()); a.setStringValue(additionalContextFeature, t.getAdditionalContext()); a.setStringValue(lexemeFeature, t.getLexeme()); cas.getIndexRepository().addFS(a); } } private String[] toTagsArray(List<Token> tokens) { String[] tag = new String[tokens.size()]; for (int i = 0; i < tokens.size(); i++) { tag[i] = tokens.get(i).getOriginalPOSTag(); } return tag; } private String[] toTokensArray(List<Token> tokens) { String[] toks = new String[tokens.size()]; for (int i = 0; i < tokens.size(); i++) { toks[i] = tokens.get(i).getLexeme(); } return toks; } }