package org.cogroo.ruta.uima;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.cogroo.text.Chunk;
import org.cogroo.text.Sentence;
import org.cogroo.text.SyntacticChunk;
import org.cogroo.text.Token;

// Assumption: AnnotatorUtil is the helper from the opennlp-uima artifact, which
// provides getType(TypeSystem, String) and getRequiredFeature(Type, String, String).
// Drop this import if a same-package copy of AnnotatorUtil is used instead.
import opennlp.uima.util.AnnotatorUtil;

/**
 * Adapts CoGrOO text analysis results ({@link Sentence}, {@link Token},
 * {@link Chunk} and {@link SyntacticChunk}) to UIMA CAS annotations using
 * the opennlp.uima type system.
 */
public class UimaCasAdapter {

  /**
   * The chunk type name parameter.
   */
  public static final String CHUNK_TYPE_PARAMETER = "opennlp.uima.ChunkType";

  /**
   * The chunk tag feature parameter.
   */
  public static final String CHUNK_TAG_FEATURE_PARAMETER = "opennlp.uima.ChunkTagFeature";

  private Type mSentenceType;
  private Type mTokenType;
  private Type mChunkType;

  private Feature mPosFeature;
  private Feature mChunkFeature;
  private Feature mLemmaFeature;
  private Feature mFeaturesFeature;
  private Feature mLexemeFeature;
  private Feature mChunkHead;

  private Type mSyntacticChunkType;
  private Feature mSyntacticChunkFeature;

  private boolean typesystemInitialized = false;

  /**
   * Initializes the type system. Resolves the opennlp.uima types and features
   * once and caches them; subsequent calls are no-ops.
   */
  private void typeSystemInit(TypeSystem typeSystem)
      throws AnalysisEngineProcessException {
    if (typesystemInitialized) {
      return;
    }

    // sentence type
    mSentenceType = AnnotatorUtil.getType(typeSystem, "opennlp.uima.Sentence");

    // token type
    mTokenType = AnnotatorUtil.getType(typeSystem, "opennlp.uima.Token");

    // pos feature
    mPosFeature = AnnotatorUtil.getRequiredFeature(mTokenType, "pos",
        CAS.TYPE_NAME_STRING);

    // lexeme feature
    mLexemeFeature = AnnotatorUtil.getRequiredFeature(mTokenType, "lexeme",
        CAS.TYPE_NAME_STRING);

    // lemma feature
    mLemmaFeature = AnnotatorUtil.getRequiredFeature(mTokenType, "lemma",
        CAS.TYPE_NAME_STRING);

    // features feature
    mFeaturesFeature = AnnotatorUtil.getRequiredFeature(mTokenType, "features",
        CAS.TYPE_NAME_STRING);

    // chunk type
    mChunkType = AnnotatorUtil.getType(typeSystem, "opennlp.uima.Chunk");

    // chunk tag feature
    mChunkFeature = AnnotatorUtil.getRequiredFeature(mChunkType, "chunkType",
        CAS.TYPE_NAME_STRING);

    // chunk head feature
    mChunkHead = AnnotatorUtil.getRequiredFeature(mChunkType, "head",
        "opennlp.uima.Token");

    // syntactic chunk type
    mSyntacticChunkType = AnnotatorUtil.getType(typeSystem,
        "opennlp.uima.SyntacticChunk");

    // syntactic chunk tag feature
    mSyntacticChunkFeature = AnnotatorUtil.getRequiredFeature(
        mSyntacticChunkType, "syntChunkType", CAS.TYPE_NAME_STRING);

    typesystemInitialized = true;
  }

  /**
   * Creates sentence, token, chunk and syntactic chunk annotations in the
   * given CAS for a single analyzed sentence. Token offsets in CoGrOO are
   * relative to the sentence, so they are shifted by the sentence start
   * before being written to the CAS.
   */
  public void populateCas(Sentence sentence, CAS tcas)
      throws AnalysisEngineProcessException, CASRuntimeException {
    typeSystemInit(tcas.getTypeSystem());

    // create the sentence annotation
    AnnotationFS sentenceAnn = tcas.createAnnotation(mSentenceType,
        sentence.getStart(), sentence.getEnd());
    tcas.getIndexRepository().addFS(sentenceAnn);

    int sentenceOffset = sentence.getStart();

    AnnotationFS[] tokenAnnotationArr = new AnnotationFS[sentence.getTokens()
        .size()];
    int i = 0;
    for (Token token : sentence.getTokens()) {
      // create the token annotation, shifting offsets to document coordinates
      tokenAnnotationArr[i] = tcas.createAnnotation(mTokenType,
          sentenceOffset + token.getStart(), sentenceOffset + token.getEnd());

      // add the POS tag
      tokenAnnotationArr[i].setStringValue(this.mPosFeature, token.getPOSTag());

      // add the lexeme
      tokenAnnotationArr[i].setStringValue(this.mLexemeFeature,
          token.getLexeme());

      // add the lemmas, joined into a single space-separated string
      String lemma = StringUtils.join(token.getLemmas(), " ");
      tokenAnnotationArr[i].setStringValue(this.mLemmaFeature, lemma);

      // add the morphological features
      tokenAnnotationArr[i].setStringValue(this.mFeaturesFeature,
          token.getFeatures());

      tcas.getIndexRepository().addFS(tokenAnnotationArr[i]);
      i++;
    }

    // chunks: getStart()/getEnd() are token indices, end exclusive, so the
    // annotation spans from the first token's start to the last token's end
    for (Chunk chunk : sentence.getChunks()) {
      int start = sentence.getTokens().get(chunk.getStart()).getStart()
          + sentenceOffset;
      int end = sentence.getTokens().get(chunk.getEnd() - 1).getEnd()
          + sentenceOffset;
      AnnotationFS chunkAnn = tcas.createAnnotation(mChunkType, start, end);
      chunkAnn.setStringValue(mChunkFeature, chunk.getTag());
      // link the head token, if the chunk has one
      if (chunk.getHeadIndex() >= 0) {
        chunkAnn.setFeatureValue(mChunkHead,
            tokenAnnotationArr[chunk.getHeadIndex()]);
      }
      tcas.getIndexRepository().addFS(chunkAnn);
    }

    // syntactic chunks, using the same token-index convention as above
    for (SyntacticChunk sc : sentence.getSyntacticChunks()) {
      int start = sentence.getTokens().get(sc.getStart()).getStart()
          + sentenceOffset;
      int end = sentence.getTokens().get(sc.getEnd() - 1).getEnd()
          + sentenceOffset;
      AnnotationFS syntChunkAnn = tcas.createAnnotation(mSyntacticChunkType,
          start, end);
      syntChunkAnn.setStringValue(mSyntacticChunkFeature, sc.getTag());
      tcas.getIndexRepository().addFS(syntChunkAnn);
    }
  }

  public Type getSentenceType() {
    return mSentenceType;
  }

  public Type getTokenType() {
    return mTokenType;
  }

  public Type getChunkType() {
    return mChunkType;
  }

  public Feature getPosFeature() {
    return mPosFeature;
  }

  public Feature getChunkFeature() {
    return mChunkFeature;
  }

  public Feature getLemmaFeature() {
    return mLemmaFeature;
  }

  public Feature getFeaturesFeature() {
    return mFeaturesFeature;
  }

  public Feature getLexemeFeature() {
    return mLexemeFeature;
  }

  public Feature getChunkHead() {
    return mChunkHead;
  }

  public Type getSyntacticChunkType() {
    return mSyntacticChunkType;
  }

  public Feature getSyntacticChunkFeature() {
    return mSyntacticChunkFeature;
  }
}
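
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original class): one way this adapter can be
// driven. The pipeline names below (ComponentFactory, Analyzer, DocumentImpl)
// follow the cogroo4 analyzer API, and "TypeSystem.xml" is a placeholder for
// a descriptor that declares the opennlp.uima.* types this adapter resolves;
// both are assumptions, adjust them to your setup.
//
//   // build a CAS whose type system includes opennlp.uima.Sentence/Token/...
//   TypeSystemDescription tsd = UIMAFramework.getXMLParser()
//       .parseTypeSystemDescription(
//           new XMLInputSource(new File("TypeSystem.xml")));
//   CAS cas = CasCreationUtils.createCas(tsd, null, null);
//
//   // analyze a Portuguese text with the CoGrOO pipeline
//   ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR"));
//   Analyzer pipe = factory.createPipe();
//   Document document = new DocumentImpl();
//   document.setText("O menino comeu a maçã.");
//   pipe.analyze(document);
//
//   // copy the analysis results into the CAS, one sentence at a time
//   cas.setDocumentText(document.getText());
//   UimaCasAdapter adapter = new UimaCasAdapter();
//   for (Sentence sentence : document.getSentences()) {
//     adapter.populateCas(sentence, cas);
//   }
// ---------------------------------------------------------------------------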