/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cogroo.uima.ae;

import java.util.List;

import opennlp.tools.util.Span;

import org.apache.log4j.Logger;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import br.usp.pcs.lta.cogroo.entity.Chunk;
import br.usp.pcs.lta.cogroo.entity.Sentence;
import br.usp.pcs.lta.cogroo.entity.Token;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.ChunkTag;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.SyntacticTag;
import br.usp.pcs.lta.cogroo.tag.TagInterpreterI;
import br.usp.pcs.lta.cogroo.tools.ProcessingEngine;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.ChunkFunction;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Class;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.SyntacticFunction;
import cogroo.ExpandedSentence;
import cogroo.uima.interpreters.FlorestaTagInterpreter;

/**
 * Finds the head token of each chunk in a {@link Sentence} by running a UIMA
 * chunker-head-finder analysis engine over the sentence text and copying the
 * resulting head annotations ({@code "H"} chunk tags) back onto the CoGrOO
 * token/chunk model. Head tokens get a {@code *_MAIN} chunk function and their
 * morphological tag is propagated to the enclosing chunk.
 */
public class UimaChunkerHeadFinder extends AnnotationService implements
    ProcessingEngine {

  private Type tokenType;
  private Type sentenceType;
  private Feature postagFeature;
  private Type chunkType;
  private Feature chunktagFeature;

  /** Serializes CoGrOO tags into the Floresta tagset expected by the AE. */
  private TagInterpreterI floresta = new FlorestaTagInterpreter();

  protected static final Logger LOGGER = Logger
      .getLogger(UimaChunkerHeadFinder.class);

  public UimaChunkerHeadFinder() throws AnnotationServiceException {
    super("UIMAChunkerHeadFinder");
  }

  /**
   * Annotates chunk heads in the given sentence.
   * <p>
   * Populates the CAS with the sentence and its tokens (POS tag + chunk tag
   * packed into the token's pos feature), runs the analysis engine, then for
   * every resulting chunk annotation marks the intersecting token as head
   * (when the annotation's type feature is {@code "H"}) and propagates the
   * head token's morphological tag to its chunk. Chunks left without a
   * morphological tag fall back to their first token's tag.
   *
   * @param text the sentence to process; its tokens and chunks are mutated
   * @throws RuntimeException if the analysis engine fails
   */
  public void process(Sentence text) {

    ExpandedSentence extSentence = new ExpandedSentence(text);

    // Add text to the CAS
    updateCas(extSentence, cas);

    // Analyze text
    try {
      ae.process(cas);
    } catch (Exception e) {
      throw new RuntimeException("Error processing a text.", e);
    }

    // Extract the result using the annotated CAS
    List<Chunk> chunks = text.getChunks();

    FSIterator<Annotation> iterator = cas.getAnnotationIndex(chunkType)
        .iterator();

    while (iterator.hasNext()) {
      Annotation a = iterator.next();

      // the AE marks head tokens with the chunk-tag value "H"
      boolean isHead = false;
      String uimatag = a.getStringValue(chunktagFeature);
      if (uimatag != null && uimatag.equals("H")) {
        isHead = true;
      }

      Span s = new Span(a.getBegin(), a.getEnd());
      for (int i = 0; i < text.getTokens().size(); i++) {
        Token token = text.getTokens().get(i);
        if (s.intersects(extSentence.getTokenSpan(i))) {
          token.setChunkTag(create(token.getChunkTag(), isHead));
          if (isHead) {
            // the head token defines the chunk's morphology
            token.getChunk().setMorphologicalTag(token.getMorphologicalTag());
          }
          // each annotation maps to a single token — stop at the first match
          break;
        }
      }
    }

    // fallback: a chunk that got no head keeps its first token's morphology
    for (int j = 0; j < chunks.size(); j++) {
      Chunk c = chunks.get(j);
      if (c.getMorphologicalTag() == null) {
        if (c.getTokens().size() > 0) {
          c.setMorphologicalTag(c.getTokens().get(0).getMorphologicalTag());
        } else {
          LOGGER.warn("Could not set morphological tag of an empty chunk: "
              + c);
        }
      }
    }

    cas.reset();
  }

  // shared immutable "main" chunk tags, reused for every head token
  final static ChunkTag BOUNDARY_NOUN_PHRASE_MAIN = new ChunkTag();
  final static ChunkTag BOUNDARY_VERB_PHRASE_MAIN = new ChunkTag();
  final static ChunkTag INTERMEDIARY_NOUN_PHRASE_MAIN = new ChunkTag();

  static {
    BOUNDARY_NOUN_PHRASE_MAIN
        .setChunkFunction(ChunkFunction.BOUNDARY_NOUN_PHRASE_MAIN);
    BOUNDARY_VERB_PHRASE_MAIN
        .setChunkFunction(ChunkFunction.BOUNDARY_VERB_PHRASE_MAIN);
    INTERMEDIARY_NOUN_PHRASE_MAIN
        .setChunkFunction(ChunkFunction.INTERMEDIARY_NOUN_PHRASE_MAIN);
  }

  /**
   * Returns the {@code *_MAIN} variant of the given chunk tag when the token
   * is a head; otherwise returns the tag unchanged. Tags without a known main
   * equivalent (and {@code null}) pass through untouched.
   *
   * @param chunkTag the token's current chunk tag (may be {@code null})
   * @param isHead whether the token was marked as a chunk head
   * @return the (possibly shared) chunk tag to assign to the token
   */
  private ChunkTag create(ChunkTag chunkTag, boolean isHead) {
    if (isHead && chunkTag != null) {
      if (ChunkFunction.BOUNDARY_NOUN_PHRASE.equals(chunkTag
          .getChunkFunction())) {
        return BOUNDARY_NOUN_PHRASE_MAIN;
      } else if (ChunkFunction.INTERMEDIARY_NOUN_PHRASE.equals(chunkTag
          .getChunkFunction())) {
        return INTERMEDIARY_NOUN_PHRASE_MAIN;
      }
      // other functions (e.g. verb phrases) have no main equivalent here
    }
    return chunkTag;
  }

  /**
   * Maps a UIMA syntactic tag string to a {@link SyntacticTag}
   * ({@code "SUBJ"} → SUBJECT, {@code "P"} → VERB, anything else → NONE).
   * NOTE(review): currently unreferenced — kept for the disabled syntactic
   * propagation logic in {@link #process(Sentence)}.
   */
  private SyntacticTag create(String uimatag) {
    SyntacticTag s = new SyntacticTag();
    if ("SUBJ".equals(uimatag)) {
      s.setSyntacticFunction(SyntacticFunction.SUBJECT);
    } else if ("P".equals(uimatag)) {
      s.setSyntacticFunction(SyntacticFunction.VERB);
    } else {
      s.setSyntacticFunction(SyntacticFunction.NONE);
    }
    return s;
  }

  @Override
  protected void initTypes(TypeSystem typeSystem) {
    // use the provided type system instead of re-fetching it from the CAS
    sentenceType = typeSystem.getType("opennlp.uima.Sentence");

    tokenType = typeSystem.getType("opennlp.uima.Token");
    postagFeature = tokenType.getFeatureByBaseName("pos");

    chunkType = typeSystem.getType("opennlp.uima.Chunk");
    chunktagFeature = chunkType.getFeatureByBaseName("type");
  }

  /**
   * Resets the CAS and fills it with the expanded sentence: one sentence
   * annotation plus one token annotation per token, whose pos feature packs
   * {@code <postag>|<chunktag>} (chunk tag with any {@code *} removed).
   *
   * @param sentence the expanded sentence providing text and token spans
   * @param cas the CAS to (re)populate
   * @throws RuntimeException if a token serializes to an empty tag or chunk
   */
  private void updateCas(ExpandedSentence sentence, JCas cas) {
    cas.reset();

    cas.setDocumentText(sentence.getExtendedSentence());

    AnnotationFS sentenceAnnotation = cas.getCas().createAnnotation(
        sentenceType,
        sentence.getSent().getOffset(),
        sentence.getSent().getOffset()
            + sentence.getExtendedSentence().length());

    cas.getIndexRepository().addFS(sentenceAnnotation);

    for (int i = 0; i < sentence.getSent().getTokens().size(); i++) {
      Token t = sentence.getSent().getTokens().get(i);
      AnnotationFS tokenAnnotation = cas.getCas().createAnnotation(tokenType,
          sentence.getTokenSpan(i).getStart(),
          sentence.getTokenSpan(i).getEnd());

      Class c = t.getMorphologicalTag().getClazzE();
      String tag;
      if (c != null) {
        if (Class.VERB.equals(c)) {
          // verbs are tagged by finiteness rather than word class
          tag = floresta.serialize(t.getMorphologicalTag().getFinitenessE());
        } else {
          tag = floresta.serialize(c);
        }
      } else {
        // no word class available — fall back to the surface form
        tag = t.getLexeme();
      }

      String chunk = floresta.serialize(t.getChunkTag());

      if (tag == null || tag.isEmpty()) {
        throw new RuntimeException("tag was empty!");
      }
      if (chunk == null || chunk.isEmpty()) {
        throw new RuntimeException("chunk was empty!");
      }

      tokenAnnotation.setStringValue(postagFeature,
          tag + "|" + chunk.replace("*", ""));

      cas.getIndexRepository().addFS(tokenAnnotation);
    }
  }
}