package edu.isistan.uima.unified.analysisengines.stanfordnlp;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.SubProgressMonitor;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.descriptor.ExternalResource;
import edu.isistan.uima.unified.analysisengines.AnnotationGenerator;
import edu.isistan.uima.unified.sharedresources.ProgressMonitorResource;
import edu.isistan.uima.unified.typesystems.srs.Document;
import edu.isistan.uima.unified.typesystems.srs.Section;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
/**
 * uimaFIT annotator that creates sentence and token annotations for the text
 * covered by every {@link Section} found inside each {@link Document}
 * annotation of the CAS.
 * <p>
 * Each section is split into lines, every line is handed to
 * {@link MaxentTagger#tokenizeText(Reader)}, and the word offsets reported by
 * the tokenizer (which are relative to the line) are translated to document
 * offsets before being passed to {@link AnnotationGenerator}.
 */
public class SentenceTokenAnnotator extends JCasAnnotator_ImplBase {
	/** Value of the "model" configuration parameter; passed to the {@link MaxentTagger} constructor. */
	@ConfigurationParameter(name="model")
	private String modelName;
	/**
	 * Tagger built in {@link #initialize(UimaContext)}; stays {@code null} when
	 * construction failed, in which case {@link #process(JCas)} does nothing.
	 */
	protected MaxentTagger mt;

	/** External resource (key "monitor") that supplies the parent progress monitor. */
	@ExternalResource(key="monitor")
	private ProgressMonitorResource monitorResource;
	/** Sub-monitor created anew at the start of every {@link #process(JCas)} call. */
	private IProgressMonitor subMonitor;

	/**
	 * Initializes the annotator and tries to build the tagger from {@code modelName}.
	 * <p>
	 * Failure to build the tagger is best-effort: the stack trace is printed and
	 * {@code mt} is left {@code null}, which turns {@link #process(JCas)} into a no-op.
	 *
	 * @param aContext the UIMA context forwarded to the superclass
	 * @throws ResourceInitializationException if the superclass initialization fails
	 */
	@Override
	public void initialize(UimaContext aContext) throws ResourceInitializationException {
		super.initialize(aContext);
		try {
			mt = new MaxentTagger(modelName);
		} catch (Exception e) {
			// Kept non-fatal on purpose: process() checks for a null tagger and returns early.
			e.printStackTrace();
		}
	}

	/**
	 * Generates sentence and token annotations for every section of every
	 * document annotation in the CAS, reporting one unit of progress per document.
	 *
	 * @param aJCas the CAS to annotate
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	@Override
	public void process(JCas aJCas) throws AnalysisEngineProcessException {
		if (mt == null) {
			return;
		}

		subMonitor = new SubProgressMonitor(monitorResource.getMonitor(), 1, SubProgressMonitor.PREPEND_MAIN_LABEL_TO_SUBTASK);
		subMonitor.subTask("Annotating sentences and tokens (Stanford)");

		AnnotationIndex<Annotation> dAnnotations = aJCas.getAnnotationIndex(Document.type);
		AnnotationIndex<Annotation> sAnnotations = aJCas.getAnnotationIndex(Section.type);

		try {
			subMonitor.beginTask(this.getClass().getSimpleName(), dAnnotations.size());
			for (Annotation dAnnotation : dAnnotations) {
				// Only the sections reachable through the sub-iterator of this document are processed.
				Iterator<Annotation> sectionIterator = sAnnotations.subiterator(dAnnotation);
				while (sectionIterator.hasNext()) {
					annotateSection((Section) sectionIterator.next(), aJCas);
				}
				subMonitor.worked(1);
			}
		} finally {
			// Close the sub-task even when annotation fails half-way through.
			subMonitor.done();
		}
	}

	/**
	 * Tokenizes one section line by line and generates the sentence and token
	 * annotations for it.
	 *
	 * @param section the section whose covered text is tokenized
	 * @param aJCas the CAS passed on to {@link AnnotationGenerator}
	 */
	private void annotateSection(Section section, JCas aJCas) {
		String sectionText = section.getCoveredText();
		String[] lines = sectionText.split("\\r?\\n");
		// Search position inside sectionText; advanced past each line so repeated lines map to their own occurrence.
		int textPos = 0;
		for (String text : lines) {
			Reader input = new StringReader(text);
			List<ArrayList<? extends HasWord>> sentences = MaxentTagger.tokenizeText(input);
			int textBegin = sectionText.indexOf(text, textPos);
			int textEnd = textBegin + text.length();
			// Tokenizer offsets are relative to the line; this shifts them to document offsets.
			int lineOffset = section.getBegin() + textBegin;
			for (ArrayList<? extends HasWord> sentence : sentences) {
				if (sentence.isEmpty()) {
					// Nothing to anchor a sentence span on; get(0) below would throw.
					continue;
				}
				Word firstWord = (Word) sentence.get(0);
				Word lastWord = (Word) sentence.get(sentence.size() - 1);
				AnnotationGenerator.generateSentence(lineOffset + firstWord.beginPosition(), lineOffset + lastWord.endPosition(), aJCas);
				for (HasWord hasWord : sentence) {
					Word word = (Word) hasWord;
					AnnotationGenerator.generateToken(lineOffset + word.beginPosition(), lineOffset + word.endPosition(), aJCas);
				}
			}
			textPos = textEnd;
		}
	}

	/** Drops the reference to the tagger and delegates to the superclass. */
	@Override
	public void destroy() {
		mt = null;
		super.destroy();
	}
}