/**
* This is a preprocessing engine for use in a UIMA pipeline. It will invoke
* functions from the Stanford POS Tagger to tokenize words and sentences
* and add part of speech tags to the pipeline.
*/
package de.unihd.dbs.uima.annotator.stanfordtagger;
import java.io.File;
import java.io.FileInputStream;
import java.io.StringReader;
import java.util.List;
import java.util.ListIterator;
import java.util.Properties;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.tagger.maxent.TaggerConfig;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
/**
 * UIMA annotator that wraps the Stanford POS Tagger. Depending on the
 * descriptor configuration it adds {@link Token} annotations (optionally
 * carrying a part-of-speech tag) and {@link Sentence} annotations to the CAS.
 *
 * @author Julian Zell
 *
 */
public class StanfordPOSTaggerWrapper extends JCasAnnotator_ImplBase {
	// class handle passed to the Logger so messages can be attributed to this component
	private Class<?> component = this.getClass();

	// definitions of what names these parameters have in the wrapper's descriptor file
	public static final String PARAM_MODEL_PATH = "model_path";
	public static final String PARAM_CONFIG_PATH = "config_path";
	public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
	public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
	public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";

	// paths read from the descriptor
	private String model_path;
	private String config_path;

	// switches for annotation parameters; an unset parameter counts as false
	private boolean annotate_tokens = false;
	private boolean annotate_sentences = false;
	private boolean annotate_partofspeech = false;

	// Maximum Entropy Tagger from the Stanford POS Tagger
	private MaxentTagger mt;

	/**
	 * Initialization method where we fill configuration values and check some
	 * prerequisites. Reads the annotation switches and the model/config paths
	 * from the descriptor and instantiates the tagger. If the model file does
	 * not exist or the tagger cannot be created, an error is logged and the
	 * JVM is terminated via {@code System.exit(-1)}.
	 *
	 * NOTE(review): {@code super.initialize(aContext)} is not invoked here;
	 * confirm that nothing relies on the context stored by the base class.
	 *
	 * @param aContext the UIMA context providing the configuration parameters
	 */
	public void initialize(UimaContext aContext) {
		// get configuration from the descriptor
		annotate_tokens = readFlag(aContext, PARAM_ANNOTATE_TOKENS);
		annotate_sentences = readFlag(aContext, PARAM_ANNOTATE_SENTENCES);
		annotate_partofspeech = readFlag(aContext, PARAM_ANNOTATE_PARTOFSPEECH);

		model_path = (String) aContext.getConfigParameterValue(PARAM_MODEL_PATH);
		config_path = (String) aContext.getConfigParameterValue(PARAM_CONFIG_PATH);

		// check if the model file exists
		if(model_path == null || !(new File(model_path)).exists()) {
			Logger.printError(component, "The supplied model file for the Stanford Tagger could not be found.");
			System.exit(-1);
		}

		// try instantiating the MaxEnt Tagger
		try {
			if(config_path != null) { // configuration exists
				Properties props = new Properties();
				FileInputStream isr = new FileInputStream(config_path);
				try {
					props.load(isr);
				} finally {
					// always release the file handle, even if loading fails
					isr.close();
				}
				mt = new MaxentTagger(model_path, new TaggerConfig(props), false);
			} else { // instantiate without configuration file
				mt = new MaxentTagger(model_path, new TaggerConfig("-model", model_path), false);
			}
		} catch(Exception e) {
			e.printStackTrace();
			Logger.printError(component, "MaxentTagger could not be instantiated with the supplied model("+model_path+") and config("+config_path+") file.");
			System.exit(-1);
		}
	}

	/**
	 * Reads a boolean switch from the descriptor without risking a
	 * NullPointerException on unboxing when the parameter is not set.
	 *
	 * @param context       the UIMA context to read from
	 * @param parameterName name of the descriptor parameter
	 * @return the configured value, or {@code false} if the parameter is unset
	 */
	private static boolean readFlag(UimaContext context, String parameterName) {
		Boolean value = (Boolean) context.getConfigParameterValue(parameterName);
		return value != null && value.booleanValue();
	}

	/**
	 * Method that gets called to process the documents' cas objects. Tokenizes
	 * and tags the document text, adds the configured annotations and finally
	 * runs a sanity check over the sentence and token offsets in the CAS.
	 *
	 * @param jcas the CAS of the document to annotate
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	public void process(JCas jcas) throws AnalysisEngineProcessException {
		// grab the document text
		String docText = jcas.getDocumentText();

		// a CAS without document text has nothing to tokenize
		if(docText != null) {
			annotateText(jcas, docText);
		}

		// TODO: DEBUG
		checkAnnotationOffsets(jcas);
	}

	/**
	 * Tokenizes the given text into sentences and words, tags every sentence
	 * and creates the Token/Sentence annotations according to the switches.
	 *
	 * @param jcas    the CAS to which annotations are added
	 * @param docText the (non-null) document text of the CAS
	 */
	private void annotateText(JCas jcas, String docText) {
		int offset = 0; // a cursor of sorts to keep up with the position in the document text

		// get [sentence-tokens[word-tokens]] from the MaxentTagger
		TokenizerFactory<Word> fac = PTBTokenizerFactory.newTokenizerFactory();
		fac.setOptions("ptb3Escaping=false,untokenizable=noneKeep");
		List<List<HasWord>> tokenArray = MaxentTagger.tokenizeText(new StringReader(docText), fac);

		// iterate over sentences in this document
		for(List<HasWord> sentenceToken : tokenArray) {
			List<TaggedWord> taggedSentence = mt.tagSentence(sentenceToken);
			ListIterator<TaggedWord> twit = taggedSentence.listIterator();

			// create a sentence object. gets added to index or discarded depending on configuration
			Sentence sentence = new Sentence(jcas);
			sentence.setBegin(offset);

			int wordCount = 0; // number of tokens of this sentence that were located in the text

			// iterate over words in this sentence
			for(HasWord wordToken : sentenceToken) {
				Token t = new Token(jcas);
				// advance the tag iterator for every word so tags stay aligned with words,
				// even when a word is discarded below
				TaggedWord tw = twit.next();

				// if pos is supposed to be added, set the tag of the current word
				if(annotate_partofspeech) {
					t.setPos(tw.tag());
				}

				String thisWord = wordToken.word();
				int position = docText.indexOf(thisWord, offset);

				if(position < 0) {
					Logger.printDetail(component, "A previously tagged token wasn't found in the document text: \"" + thisWord + "\". " +
							"This may be due to unpredictable punctuation tokenization; hence this token isn't tagged.");
					continue; // jump to next token: discards token
				}

				offset = position; // set cursor to the starting position of token in docText
				t.setBegin(offset);
				++wordCount;

				offset += thisWord.length(); // move cursor behind the word
				t.setEnd(offset);

				// add tokens to indexes.
				if(annotate_tokens) {
					t.addToIndexes();
				}
			}

			// if flag is set, also tag sentences
			if(annotate_sentences) {
				// NOTE(review): for non-empty sentences the end is placed one character
				// before the end of the last token; kept as-is, confirm this is intended.
				if(wordCount == 0) {
					sentence.setEnd(offset);
				} else {
					sentence.setEnd(offset - 1);
				}
				sentence.addToIndexes();
			}
		}
	}

	/**
	 * Debug check: walks over all Sentence and Token annotations in the CAS and
	 * terminates the JVM with exit code -1 if any of them has a negative begin
	 * or end offset, after printing the offending annotation to stderr.
	 *
	 * @param jcas the CAS whose annotation indexes are inspected
	 */
	private void checkAnnotationOffsets(JCas jcas) {
		FSIterator fsi = jcas.getAnnotationIndex(Sentence.type).iterator();
		while(fsi.hasNext()) {
			Sentence s = (Sentence) fsi.next();
			if(s.getBegin() < 0 || s.getEnd() < 0) {
				System.err.println("Sentence: " + s.getBegin() + ":" + s.getEnd() + " = " + s.getCoveredText());
				System.err.println("wrong index in text: " + jcas.getDocumentText());
				System.exit(-1);
			}
		}

		FSIterator fsi2 = jcas.getAnnotationIndex(Token.type).iterator();
		while(fsi2.hasNext()) {
			Token t = (Token) fsi2.next();
			if(t.getBegin() < 0 || t.getEnd() < 0) {
				System.err.println("In text: " + jcas.getDocumentText());
				System.err.println("Token: " + t.getBegin() + ":" + t.getEnd());
				System.exit(-1);
			}
		}
	}
}