/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cogroo.uima.ae;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import opennlp.tools.util.Span;

import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import br.usp.pcs.lta.cogroo.entity.Sentence;
import br.usp.pcs.lta.cogroo.entity.Token;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.TokenCogroo;
import br.usp.pcs.lta.cogroo.tools.ProcessingEngine;
import cogroo.util.EntityUtils;

/**
 * Runs a UIMA analysis engine over a CoGrOO sentence to detect multi-word
 * expressions. Currently only person-name annotations are applied: the
 * tokens covered by each person span are grouped into a single token.
 */
public class UimaMultiWordExp extends AnnotationService implements
    ProcessingEngine {

  private Type tokenType;
  private Type sentenceType;
  private Type personType;
  private Type expType;

  public UimaMultiWordExp() throws AnnotationServiceException {
    super("UIMAMultiWordExp");
  }

  public void process(Sentence text) {

    // ************************************
    // Add text to the CAS
    // ************************************
    updateCas(text, cas);

    // ************************************
    // Analyze text
    // ************************************
    try {
      ae.process(cas);
    } catch (Exception e) {
      throw new RuntimeException("Error processing a text.", e);
    }

    // ************************************
    // Extract the result using annotated CAS
    // ************************************
    FSIterator<Annotation> personIterator = cas.getAnnotationIndex(personType)
        .iterator();
    // FSIterator<Annotation> expIterator = cas.getAnnotationIndex(expType)
    //     .iterator();

    List<Span> names = new ArrayList<Span>();
    // List<Span> exp = new ArrayList<Span>();

    while (personIterator.hasNext()) {
      Annotation a = personIterator.next();
      Span s = new Span(a.getBegin(), a.getEnd());
      names.add(s);
    }

    // while (expIterator.hasNext()) {
    //   Annotation a = expIterator.next();
    //   Span s = new Span(a.getBegin(), a.getEnd());
    //   exp.add(s);
    // }

    // List<Span> merged = merge(exp, names);

    text.setTokens(EntityUtils.groupTokensChar(text.getSentence(),
        text.getTokens(), names, "P"));

    cas.reset();
  }

  @Override
  protected void initTypes(TypeSystem typeSystem) {
    sentenceType = typeSystem.getType("opennlp.uima.Sentence");
    tokenType = typeSystem.getType("opennlp.uima.Token");
    personType = typeSystem.getType("opennlp.uima.Person");
    expType = typeSystem.getType("opennlp.uima.Exp");
  }

  private void updateCas(Sentence sentence, JCas cas) {
    cas.reset();
    cas.setDocumentText(sentence.getSentence());

    // Sentence annotation covering the whole text; token offsets in the
    // sentence are relative, so the sentence offset is added throughout.
    AnnotationFS a = cas.getCas().createAnnotation(sentenceType,
        sentence.getOffset(),
        sentence.getOffset() + sentence.getSentence().length());
    cas.getIndexRepository().addFS(a);

    // One token annotation per CoGrOO token, shifted by the sentence offset.
    for (Token t : sentence.getTokens()) {
      a = cas.getCas().createAnnotation(tokenType, t.getSpan().getStart()
          + sentence.getOffset(), t.getSpan().getEnd() + sentence.getOffset());
      cas.getIndexRepository().addFS(a);
    }
  }
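
  // Illustrative example (assumed behavior, not part of the original source):
  // given the sentence "Maria Silva chegou." and a person annotation covering
  // "Maria Silva", EntityUtils.groupTokensChar should turn the tokens
  // ["Maria", "Silva", "chegou", "."] into ["Maria_Silva", "chegou", "."],
  // tagging the grouped token "P". This mirrors the "_"-joining that
  // mergeTokens below performs by token index.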
  /*
  private static List<Token> groupTokens(List<Token> toks, List<Span> spans) {
    if (spans == null || spans.size() == 0) {
      return toks;
    }
    List<Token> grouped = new ArrayList<Token>(toks);
    int lastTokVisited = 0;
    List<Integer> toMerge = new ArrayList<Integer>();
    for (int i = 0; i < spans.size(); i++) {
      Span s = spans.get(i);
      boolean canStop = false;
      for (int j = lastTokVisited; j < toks.size(); j++) {
        Token t = toks.get(j);
        if (s.intersects(t.getSpan())) {
          toMerge.add(j);
          canStop = true;
        } else if (canStop) {
          lastTokVisited = j;
          break;
        }
      }
    }
    mergeTokens(grouped, toMerge);
    return grouped;
  }
  */

  /**
   * Replaces the tokens at the given indices with a single token whose lexeme
   * is the original lexemes joined by "_" and whose span covers the whole
   * group.
   */
  private static void mergeTokens(List<Token> grouped, List<Integer> toMerge) {
    if (toMerge.size() > 0) {
      StringBuilder sb = new StringBuilder();
      int s = grouped.get(toMerge.get(0)).getSpan().getStart();
      int e = grouped.get(toMerge.get(toMerge.size() - 1)).getSpan().getEnd();
      for (int i = 0; i < toMerge.size(); i++) {
        int index = toMerge.get(i);
        sb.append(grouped.get(index).getLexeme()).append("_");
      }
      String lexeme = sb.substring(0, sb.length() - 1);

      // Remove the merged tokens from the end so the earlier indices stay
      // valid, then replace the first one with the grouped token.
      for (int i = toMerge.size() - 1; i > 0; i--) {
        grouped.remove(toMerge.get(i).intValue());
      }
      grouped.set(toMerge.get(0).intValue(), new TokenCogroo(lexeme, new Span(
          s, e)));
    }
  }

  /**
   * Merges two span lists: keeps every span of the first list and only those
   * spans of the second that do not intersect any span of the first. The
   * result is sorted.
   */
  private static List<Span> merge(List<Span> first, List<Span> second) {
    List<Span> merged = new ArrayList<Span>(first.size() + second.size());

    // add all of the first
    merged.addAll(first);

    for (Span s : second) {
      boolean addS = true;
      for (Span f : first) {
        if (s.intersects(f)) {
          addS = false;
          break;
        }
      }
      if (addS) {
        merged.add(s);
      }
    }

    Collections.sort(merged);
    return merged;
  }
}
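
/*
 * Usage sketch (illustrative, not from the original source; assumes the
 * "UIMAMultiWordExp" analysis-engine descriptor loaded by the constructor is
 * available, and that the Sentence was already tokenized upstream):
 *
 *   UimaMultiWordExp mwe = new UimaMultiWordExp();
 *   mwe.process(sentence); // person-name tokens are grouped in place,
 *                          // e.g. "Maria" + "Silva" becomes "Maria_Silva"
 */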