package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ie.regexp.RegexNERSequenceClassifier;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.Timing;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
/**
 * Adds named-entity (NER) information to an {@link Annotation} using a
 * combination of NER models.
 * <p>
 * The annotation must already contain its sentences under
 * {@code CoreAnnotations.SentencesAnnotation}; the tokens of each sentence are
 * read from {@code CoreAnnotations.TokensAnnotation}. For every token this
 * annotator sets the NER tag (via {@code CoreLabel.setNER}) and, when the
 * classifier produced one, the
 * {@code CoreAnnotations.NormalizedNamedEntityTagAnnotation}. The actual
 * classification is delegated to the {@link NERClassifierCombiner} class in
 * the ie package.
 *
 * @author Jenny Finkel
 * @author Mihai Surdeanu (modified it to work with the new NERClassifierCombiner)
 */
public class NERCombinerAnnotator implements Annotator {

  /** Combined classifier that performs the actual tagging. */
  private final NERClassifierCombiner ner;

  /** Timer backing the verbose progress messages. */
  private final Timing timer = new Timing();

  /** When true, progress messages and per-sentence output are written to stderr. */
  private final boolean verbose;

  /**
   * Creates a verbose annotator backed by a combiner built from empty properties.
   *
   * @throws IOException declared by the underlying combiner construction
   * @throws ClassNotFoundException declared by the underlying combiner construction
   */
  public NERCombinerAnnotator() throws IOException, ClassNotFoundException {
    this(true);
  }

  /**
   * Starts the timer and prints {@code msg} to stderr; does nothing unless verbose.
   *
   * @param msg progress message to print
   */
  private void timerStart(String msg) {
    if (verbose) {
      timer.start();
      System.err.println(msg);
    }
  }

  /** Stops the timer with the message "done."; does nothing unless verbose. */
  private void timerStop() {
    if (verbose) {
      timer.stop("done.");
    }
  }

  /**
   * Creates an annotator backed by a combiner built from empty properties.
   *
   * @param verbose whether to write progress and debug output to stderr
   * @throws IOException declared by the underlying combiner construction
   * @throws ClassNotFoundException declared by the underlying combiner construction
   */
  public NERCombinerAnnotator(boolean verbose) throws IOException, ClassNotFoundException {
    // verbose must be assigned before timerStart(), which reads it.
    this.verbose = verbose;
    timerStart("Loading NER combiner model...");
    ner = new NERClassifierCombiner(new Properties());
    timerStop();
  }

  /**
   * Creates an annotator backed by a combiner over the given classifiers.
   *
   * @param verbose whether to write progress and debug output to stderr
   * @param classifiers classifier identifiers handed to {@link NERClassifierCombiner}
   * @throws IOException declared by the underlying combiner construction
   * @throws ClassNotFoundException declared by the underlying combiner construction
   */
  public NERCombinerAnnotator(boolean verbose, String... classifiers)
      throws IOException, ClassNotFoundException {
    this.verbose = verbose;
    timerStart("Loading NER combiner model...");
    ner = new NERClassifierCombiner(classifiers);
    timerStop();
  }

  /**
   * Creates an annotator around an already constructed combiner.
   *
   * @param ner the combiner to delegate to
   * @param verbose whether to write progress and debug output to stderr
   */
  public NERCombinerAnnotator(NERClassifierCombiner ner, boolean verbose) {
    this.verbose = verbose;
    this.ner = ner;
  }

  /**
   * Creates a non-verbose annotator configured from properties.
   *
   * @param name property prefix (without the trailing dot); {@code null} selects "ner"
   * @param properties configuration source; must not be {@code null}
   */
  public NERCombinerAnnotator(String name, Properties properties) {
    this(createNERClassifierCombiner(name, properties), false);
  }

  /**
   * Builds the combiner described by {@code properties}.
   * <p>
   * Reads {@code <prefix>model} (a comma-separated list of models, falling back
   * to the three default models when the property is absent),
   * {@code <prefix>applyNumericClassifiers} and {@code <prefix>useSUTime}.
   * Model entries are trimmed and blank entries are dropped, so a value such as
   * {@code "a, b"} does not produce a model name with a leading space.
   *
   * @param name property prefix (without the trailing dot); {@code null} selects "ner"
   * @param properties configuration source; must not be {@code null}
   * @return the configured combiner
   * @throws RuntimeIOException if the combiner constructor reports a missing file
   */
  private static NERClassifierCombiner createNERClassifierCombiner(String name, Properties properties) {
    // TODO: Move function into NERClassifierCombiner?
    String prefix = (name != null) ? name + "." : "ner.";

    String modelNames = properties.getProperty(prefix + "model");
    if (modelNames == null) {
      modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + "," + DefaultPaths.DEFAULT_NER_MUC_MODEL + "," + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
    }

    List<String> models = new ArrayList<String>();
    for (String rawModelName : modelNames.split(",")) {
      String modelName = rawModelName.trim();
      if (modelName.length() > 0) {
        models.add(modelName);
      }
    }
    if (models.isEmpty()) {
      // Allow for no real NER model - can just use numeric classifiers or SUTime
      System.err.println("WARNING: no NER models specified");
    }

    try {
      // TODO: use constants for part after prefix so we can ensure consistent options
      boolean applyNumericClassifiers =
          PropertiesUtils.getBool(properties,
              prefix + "applyNumericClassifiers",
              NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
      boolean useSUTime =
          PropertiesUtils.getBool(properties,
              prefix + "useSUTime",
              NumberSequenceClassifier.USE_SUTIME_DEFAULT);
      // TODO: properties are passed in as is for number sequence classifiers (don't care about the prefix)
      return new NERClassifierCombiner(applyNumericClassifiers,
          useSUTime, properties,
          models.toArray(new String[models.size()]));
    } catch (FileNotFoundException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Tags every sentence of {@code annotation} and then lets the combiner
   * finalize the annotation.
   *
   * @param annotation the document to annotate; must contain
   *     {@code CoreAnnotations.SentencesAnnotation}
   * @throws RuntimeException if the annotation contains no sentences annotation
   */
  @Override
  public void annotate(Annotation annotation) {
    timerStart("Adding NER Combiner annotation...");
    if (!annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
    // classify tokens for each sentence
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      doOneSentence(annotation, sentence);
    }
    this.ner.finalizeAnnotation(annotation);
    // Pair the timerStart() above so verbose runs report completion.
    timerStop();
  }

  /**
   * Classifies the tokens of one sentence and copies the results onto them.
   *
   * @param annotation the enclosing document, passed to the classifier as global context
   * @param sentence the sentence whose {@code CoreAnnotations.TokensAnnotation} tokens are tagged
   * @return the same {@code sentence} instance, with its tokens updated in place
   */
  public CoreMap doOneSentence(Annotation annotation, CoreMap sentence) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> output = this.ner.classifySentenceWithGlobalInformation(tokens, annotation, sentence);

    if (verbose) {
      StringBuilder line = new StringBuilder("NERCombinerAnnotator direct output: [");
      String separator = "";
      for (CoreLabel w : output) {
        line.append(separator).append(w.toString());
        separator = ", ";
      }
      System.err.println(line.append(']'));
    }

    for (int i = 0; i < tokens.size(); ++i) {
      CoreLabel classified = output.get(i);
      CoreLabel token = tokens.get(i);
      // add the named entity tag to each token
      String neTag = classified.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      String normNeTag = classified.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
      token.setNER(neTag);
      // Only set the normalized tag when the classifier produced one.
      if (normNeTag != null) {
        token.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, normNeTag);
      }
      NumberSequenceClassifier.transferAnnotations(classified, token);
    }

    if (verbose) {
      StringBuilder line = new StringBuilder("NERCombinerAnnotator output: [");
      String separator = "";
      for (CoreLabel w : tokens) {
        line.append(separator)
            .append(w.toShorterString("Word", "NamedEntityTag", "NormalizedNamedEntityTag"));
        separator = ", ";
      }
      System.err.println(line.append(']'));
    }

    return sentence;
  }

  /**
   * Reports the annotators that must run before this one.
   *
   * @return {@code TOKENIZE_SSPLIT_POS_LEMMA} when the combiner uses SUTime or
   *     applies numeric classifiers, otherwise {@code TOKENIZE_AND_SSPLIT}
   */
  @Override
  public Set<Requirement> requires() {
    // TODO: we could check the models to see which ones use lemmas
    // and which ones use pos tags
    if (ner.usesSUTime() || ner.appliesNumericClassifiers()) {
      return TOKENIZE_SSPLIT_POS_LEMMA;
    } else {
      return TOKENIZE_AND_SSPLIT;
    }
  }

  /**
   * Reports what this annotator provides.
   *
   * @return a singleton set containing {@code NER_REQUIREMENT}
   */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(NER_REQUIREMENT);
  }
}