package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
/**
* Wrapper for the maxent part of speech tagger.
*
* @author Anna Rafferty
*/
public class POSTaggerAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(POSTaggerAnnotator.class);

  /** The tagger that every sentence is handed to. */
  private final MaxentTagger pos;

  /** Sentences with more tokens than this are not passed to the tagger. */
  private final int maxSentenceLength;

  /** Number of threads requested; exactly 1 selects the plain sequential loop in {@link #annotate}. */
  private final int nThreads;

  /** Passed through unchanged as the second argument of {@code MaxentTagger.tagSentence}. */
  private final boolean reuseTags;

  /** Create a tagger annotator using the default English tagger from the models jar
   *  (and non-verbose initialization).
   */
  public POSTaggerAnnotator() {
    this(false);
  }

  /** Create a tagger annotator whose model location is read from the {@code pos.model}
   *  system property, falling back to {@code MaxentTagger.DEFAULT_JAR_PATH}.
   *
   *  @param verbose Whether to show verbose information on model loading
   */
  public POSTaggerAnnotator(boolean verbose) {
    this(System.getProperty("pos.model", MaxentTagger.DEFAULT_JAR_PATH), verbose);
  }

  /** Create a single-threaded tagger annotator with no sentence length limit
   *  (the limit is {@code Integer.MAX_VALUE}).
   *
   *  @param posLoc Location of POS tagger model
   *  @param verbose Whether to show verbose information on model loading
   */
  public POSTaggerAnnotator(String posLoc, boolean verbose) {
    this(posLoc, verbose, Integer.MAX_VALUE, 1);
  }

  /** Create a POS tagger annotator.
   *
   *  @param posLoc Location of POS tagger model (may be file path, classpath resource, or URL)
   *  @param verbose Whether to show verbose information on model loading
   *  @param maxSentenceLength Sentences longer than this length will be skipped in processing
   *  @param numThreads The number of threads for the POS tagger annotator to use
   */
  public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength, int numThreads) {
    this(loadModel(posLoc, verbose), maxSentenceLength, numThreads);
  }

  /** Create a single-threaded tagger annotator around an already loaded model,
   *  with no sentence length limit (the limit is {@code Integer.MAX_VALUE}).
   *
   *  @param model The tagger to use
   */
  public POSTaggerAnnotator(MaxentTagger model) {
    this(model, Integer.MAX_VALUE, 1);
  }

  /** Create a tagger annotator around an already loaded model. Tag reuse is
   *  switched off for annotators built this way.
   *
   *  @param model The tagger to use
   *  @param maxSentenceLength Sentences longer than this length will be skipped in processing
   *  @param numThreads The number of threads for the POS tagger annotator to use
   */
  public POSTaggerAnnotator(MaxentTagger model, int maxSentenceLength, int numThreads) {
    this.pos = model;
    this.maxSentenceLength = maxSentenceLength;
    this.nThreads = numThreads;
    this.reuseTags = false;
  }

  /** Create a tagger annotator configured from properties. The keys read are
   *  {@code <annotatorName>.model} (default {@code DefaultPaths.DEFAULT_POS_MODEL}),
   *  {@code <annotatorName>.verbose} (default false),
   *  {@code <annotatorName>.maxlen} (default {@code Integer.MAX_VALUE}),
   *  {@code <annotatorName>.nthreads} (default: the value of the plain
   *  {@code nthreads} key, or 1 if that is absent too), and
   *  {@code <annotatorName>.reuseTags} (default false).
   *
   *  @param annotatorName Prefix used to build the property keys
   *  @param props The properties to read the configuration from
   */
  public POSTaggerAnnotator(String annotatorName, Properties props) {
    String posLoc = props.getProperty(annotatorName + ".model");
    if (posLoc == null) {
      posLoc = DefaultPaths.DEFAULT_POS_MODEL;
    }
    boolean verbose = PropertiesUtils.getBool(props, annotatorName + ".verbose", false);
    this.pos = loadModel(posLoc, verbose);
    this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", Integer.MAX_VALUE);
    // The annotator-specific thread count wins; the global "nthreads" key is only a fallback.
    int defaultThreads = PropertiesUtils.getInt(props, "nthreads", 1);
    this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads", defaultThreads);
    this.reuseTags = PropertiesUtils.getBool(props, annotatorName + ".reuseTags", false);
  }

  /** Construct a {@code MaxentTagger} for the given location, wrapping the load in a
   *  {@code Timing} report when verbose output is requested.
   *
   *  @param loc Location of the POS tagger model
   *  @param verbose Whether to report the model loading through a {@code Timing}
   *  @return The newly constructed tagger
   */
  private static MaxentTagger loadModel(String loc, boolean verbose) {
    if ( ! verbose) {
      return new MaxentTagger(loc);
    }
    Timing timer = new Timing();
    timer.doing("Loading POS Model [" + loc + ']');
    MaxentTagger tagger = new MaxentTagger(loc);
    timer.done();
    return tagger;
  }

  /** Set a {@code PartOfSpeechAnnotation} on every token of every sentence in the annotation.
   *  With {@code nThreads == 1} sentences are processed in a simple loop; otherwise they
   *  are submitted to a {@code MulticoreWrapper}.
   *
   *  @param annotation The annotation to process; it must contain a {@code SentencesAnnotation}
   *  @throws RuntimeException If the annotation has no {@code SentencesAnnotation}
   */
  @Override
  public void annotate(Annotation annotation) {
    if ( ! annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      throw new RuntimeException("unable to find words/tokens in: " + annotation);
    }

    if (nThreads == 1) {
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        doOneSentence(sentence);
      }
      return;
    }

    MulticoreWrapper<CoreMap, CoreMap> wrapper = new MulticoreWrapper<>(nThreads, new POSTaggerProcessor());
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      wrapper.put(sentence);
      discardFinished(wrapper);
    }
    wrapper.join();
    discardFinished(wrapper);
  }

  /** Poll and drop every result the wrapper currently reports as available. The results
   *  are not needed: {@link #doOneSentence} writes the tags onto the tokens and returns
   *  the very sentence it was given.
   *
   *  @param wrapper The wrapper to empty of finished results
   */
  private static void discardFinished(MulticoreWrapper<CoreMap, CoreMap> wrapper) {
    while (wrapper.peek()) {
      wrapper.poll();
    }
  }

  /** Adapter that lets a {@code MulticoreWrapper} call {@link #doOneSentence}. */
  private class POSTaggerProcessor implements ThreadsafeProcessor<CoreMap, CoreMap> {

    @Override
    public CoreMap process(CoreMap sentence) {
      return doOneSentence(sentence);
    }

    /** Returns this same instance rather than a copy; the processor has no fields of its own. */
    @Override
    public ThreadsafeProcessor<CoreMap, CoreMap> newInstance() {
      return this;
    }

  }

  /** Tag the tokens of a single sentence in place. If the sentence has more than
   *  {@code maxSentenceLength} tokens, or tagging it runs out of memory, every token
   *  gets the tag {@code "X"} instead of a tagger-assigned tag.
   *
   *  @param sentence The sentence whose {@code TokensAnnotation} is to be tagged
   *  @return The same sentence object that was passed in
   */
  private CoreMap doOneSentence(CoreMap sentence) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    List<TaggedWord> tagged = null;  // stays null when the sentence is skipped or tagging fails
    if (tokens.size() <= maxSentenceLength) {
      try {
        tagged = pos.tagSentence(tokens, this.reuseTags);
      } catch (OutOfMemoryError e) {
        log.error(e); // Beware that we can now get an OOM in logging, too.
        log.warn("Tagging of sentence ran out of memory. " +
                 "Will ignore and continue: " +
                 SentenceUtils.listToString(tokens));
      }
    }

    if (tagged != null) {
      // Copy the tags over by position: the i-th tagged word supplies the tag of the i-th token.
      for (int i = 0, sz = tokens.size(); i < sz; i++) {
        tokens.get(i).set(CoreAnnotations.PartOfSpeechAnnotation.class, tagged.get(i).tag());
      }
    } else {
      for (CoreLabel token : tokens) {
        token.set(CoreAnnotations.PartOfSpeechAnnotation.class, "X");
      }
    }
    return sentence;
  }

  /** Returns an unmodifiable set holding the text, tokens, character offset begin/end
   *  and sentences annotation classes.
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  /** Returns a singleton set holding the {@code PartOfSpeechAnnotation} class. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.PartOfSpeechAnnotation.class);
  }

}