package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.types.Tags;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.TimeExpression;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.RuntimeInterruptedException;
import java.io.IOException;
import java.util.*;
/**
* This class will add NER information to an Annotation using a combination of NER models.
* It assumes that the Annotation already contains the tokenized words in sentences
* under {@code CoreAnnotations.SentencesAnnotation.class} as
* {@code List<? extends CoreLabel>}} or a
* {@code List<List<? extends CoreLabel>>} under {@code Annotation.WORDS_KEY}
* and adds NER information to each CoreLabel,
* in the {@code CoreLabel.NER_KEY} field. It uses
* the NERClassifierCombiner class in the ie package.
*
* @author Jenny Finkel
* @author Mihai Surdeanu (modified it to work with the new NERClassifierCombiner)
*/
public class NERCombinerAnnotator extends SentenceAnnotator {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(NERCombinerAnnotator.class);
private final NERClassifierCombiner ner;
private final boolean VERBOSE;
private final long maxTime;
private final int nThreads;
private final int maxSentenceLength;
public NERCombinerAnnotator(Properties properties) throws IOException {
List<String> models = new ArrayList<>();
String modelNames = properties.getProperty("ner.model");
if (modelNames == null) {
modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + "," + DefaultPaths.DEFAULT_NER_MUC_MODEL + "," + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
}
if ( ! modelNames.isEmpty()) {
models.addAll(Arrays.asList(modelNames.split(",")));
}
if (models.isEmpty()) {
// Allow for no real NER model - can just use numeric classifiers or SUTime.
// Have to unset ner.model, so unlikely that people got here by accident.
log.info("WARNING: no NER models specified");
}
boolean applyNumericClassifiers =
PropertiesUtils.getBool(properties,
NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY,
NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
boolean applyRegexner =
PropertiesUtils.getBool(properties,
NERClassifierCombiner.APPLY_GAZETTE_PROPERTY,
NERClassifierCombiner.APPLY_GAZETTE_DEFAULT);
boolean useSUTime =
PropertiesUtils.getBool(properties,
NumberSequenceClassifier.USE_SUTIME_PROPERTY,
NumberSequenceClassifier.USE_SUTIME_DEFAULT);
NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.fromString(PropertiesUtils.getString(properties,
NERClassifierCombiner.NER_LANGUAGE_PROPERTY, null), NERClassifierCombiner.NER_LANGUAGE_DEFAULT);
boolean verbose = PropertiesUtils.getBool(properties, "ner." + "verbose", false);
String[] loadPaths = models.toArray(new String[models.size()]);
Properties combinerProperties = PropertiesUtils.extractSelectedProperties(properties,
NERClassifierCombiner.DEFAULT_PASS_DOWN_PROPERTIES);
if (useSUTime) {
// Make sure SUTime parameters are included
Properties sutimeProps = PropertiesUtils.extractPrefixedProperties(properties, NumberSequenceClassifier.SUTIME_PROPERTY + ".", true);
PropertiesUtils.overWriteProperties(combinerProperties, sutimeProps);
}
NERClassifierCombiner nerCombiner = new NERClassifierCombiner(applyNumericClassifiers, nerLanguage,
useSUTime, applyRegexner, combinerProperties, loadPaths);
int nThreads = PropertiesUtils.getInt(properties, "ner.nthreads", PropertiesUtils.getInt(properties, "nthreads", 1));
long maxTime = PropertiesUtils.getLong(properties, "ner.maxtime", 0);
int maxSentenceLength = PropertiesUtils.getInt(properties, "ner.maxlen", Integer.MAX_VALUE);
VERBOSE = verbose;
this.ner = nerCombiner;
this.maxTime = maxTime;
this.nThreads = nThreads;
this.maxSentenceLength = maxSentenceLength;
}
public NERCombinerAnnotator() throws IOException, ClassNotFoundException {
this(true);
}
public NERCombinerAnnotator(boolean verbose)
throws IOException, ClassNotFoundException
{
this(new NERClassifierCombiner(new Properties()), verbose);
}
public NERCombinerAnnotator(boolean verbose, String... classifiers)
throws IOException, ClassNotFoundException
{
this(new NERClassifierCombiner(classifiers), verbose);
}
public NERCombinerAnnotator(NERClassifierCombiner ner, boolean verbose) {
this(ner, verbose, 1, 0, Integer.MAX_VALUE);
}
public NERCombinerAnnotator(NERClassifierCombiner ner, boolean verbose, int nThreads, long maxTime) {
this(ner, verbose, nThreads, maxTime, Integer.MAX_VALUE);
}
public NERCombinerAnnotator(NERClassifierCombiner ner, boolean verbose, int nThreads, long maxTime, int maxSentenceLength) {
VERBOSE = verbose;
this.ner = ner;
this.maxTime = maxTime;
this.nThreads = nThreads;
this.maxSentenceLength = maxSentenceLength;
}
@Override
protected int nThreads() {
return nThreads;
}
@Override
protected long maxTime() {
return maxTime;
}
@Override
public void annotate(Annotation annotation) {
if (VERBOSE) {
log.info("Adding NER Combiner annotation ... ");
}
super.annotate(annotation);
this.ner.finalizeAnnotation(annotation);
if (VERBOSE) {
log.info("done.");
}
}
@Override
public void doOneSentence(Annotation annotation, CoreMap sentence) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
List<CoreLabel> output; // only used if try assignment works.
if (tokens.size() <= this.maxSentenceLength) {
try {
output = this.ner.classifySentenceWithGlobalInformation(tokens, annotation, sentence);
} catch (RuntimeInterruptedException e) {
// If we get interrupted, set the NER labels to the background
// symbol if they are not already set, then exit.
output = null;
}
} else {
output = null;
}
if (output == null) {
doOneFailedSentence(annotation, sentence);
} else {
for (int i = 0, sz = tokens.size(); i < sz; ++i) {
// add the named entity tag to each token
String neTag = output.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class);
String normNeTag = output.get(i).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
tokens.get(i).setNER(neTag);
if (normNeTag != null) tokens.get(i).set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, normNeTag);
NumberSequenceClassifier.transferAnnotations(output.get(i), tokens.get(i));
}
if (VERBOSE) {
boolean first = true;
StringBuilder sb = new StringBuilder("NERCombinerAnnotator output: [");
for (CoreLabel w : tokens) {
if (first) {
first = false;
} else {
sb.append(", ");
}
sb.append(w.toShorterString("Text", "NamedEntityTag", "NormalizedNamedEntityTag"));
}
sb.append(']');
log.info(sb);
}
}
}
/** {@inheritDoc} */
@Override
public void doOneFailedSentence(Annotation annotation, CoreMap sentence) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
for (CoreLabel token : tokens) {
// add the background named entity tag to each token if it doesn't have an NER tag.
if (token.ner() == null) {
token.setNER(this.ner.backgroundSymbol());
}
}
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
// TODO: we could check the models to see which ones use lemmas
// and which ones use pos tags
if (ner.usesSUTime() || ner.appliesNumericClassifiers()) {
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.PartOfSpeechAnnotation.class,
CoreAnnotations.LemmaAnnotation.class,
CoreAnnotations.BeforeAnnotation.class,
CoreAnnotations.AfterAnnotation.class,
CoreAnnotations.TokenBeginAnnotation.class,
CoreAnnotations.TokenEndAnnotation.class,
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.SentenceIndexAnnotation.class
)));
} else {
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.BeforeAnnotation.class,
CoreAnnotations.AfterAnnotation.class,
CoreAnnotations.TokenBeginAnnotation.class,
CoreAnnotations.TokenEndAnnotation.class,
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.SentenceIndexAnnotation.class
)));
}
}
@SuppressWarnings("unchecked")
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return new HashSet<>(Arrays.asList(
CoreAnnotations.NamedEntityTagAnnotation.class,
CoreAnnotations.NormalizedNamedEntityTagAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
TimeExpression.Annotation.class,
TimeExpression.TimeIndexAnnotation.class,
CoreAnnotations.DistSimAnnotation.class,
CoreAnnotations.NumericCompositeTypeAnnotation.class,
TimeAnnotations.TimexAnnotation.class,
CoreAnnotations.NumericValueAnnotation.class,
TimeExpression.ChildrenAnnotation.class,
CoreAnnotations.NumericTypeAnnotation.class,
CoreAnnotations.ShapeAnnotation.class,
Tags.TagsAnnotation.class,
CoreAnnotations.NumerizedTokensAnnotation.class,
CoreAnnotations.AnswerAnnotation.class,
CoreAnnotations.NumericCompositeValueAnnotation.class
));
}
}