package edu.stanford.nlp.ie;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.*;
import java.util.stream.Collectors;
import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.DefaultPaths;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Subclass of ClassifierCombiner that behaves like a NER, by copying
* the AnswerAnnotation labels to NERAnnotation. Also, it can run additional
* classifiers (NumberSequenceClassifier, QuantifiableEntityNormalizer, SUTime)
* to recognize numeric and date/time entities, depending on flag settings.
*
* @author Mihai Surdeanu
*/
public class NERClassifierCombiner extends ClassifierCombiner<CoreLabel> {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(NERClassifierCombiner.class);
/** When true, classification also runs number-sequence recognition and quantity normalization. */
private final boolean applyNumericClassifiers;
/** Value used for applyNumericClassifiers when no setting is supplied. */
public static final boolean APPLY_NUMERIC_CLASSIFIERS_DEFAULT = true;
/** Fully prefixed property name that controls applyNumericClassifiers. */
public static final String APPLY_NUMERIC_CLASSIFIERS_PROPERTY = "ner.applyNumericClassifiers";
/** Unprefixed form of the property name; the factory method prepends its own prefix to it. */
private static final String APPLY_NUMERIC_CLASSIFIERS_PROPERTY_BASE = "applyNumericClassifiers";
/** Property that, when true, makes the combiner load the default gazette mapping. */
public static final String APPLY_GAZETTE_PROPERTY = "ner.regex";
/** Gazette augmentation is off unless explicitly requested. */
public static final boolean APPLY_GAZETTE_DEFAULT = false;
/** Language used to pick the quantity normalizer (and, in one constructor, the number sequence classifier). */
private final Language nerLanguage;
/** Language assumed when none is configured. */
public static final Language NER_LANGUAGE_DEFAULT = Language.ENGLISH;
/** Fully prefixed property name for the NER language. */
public static final String NER_LANGUAGE_PROPERTY = "ner.language";
/** Unprefixed form of the NER language property name. */
public static final String NER_LANGUAGE_PROPERTY_BASE = "language";
/** Whether SUTime is requested; passed to the number sequence classifier and to the English normalizer. */
private final boolean useSUTime;
/** Languages that can be selected for NER number handling. */
public enum Language {
  ENGLISH("English"),
  CHINESE("Chinese");

  /** Display name; {@link #fromString} compares against it ignoring case. */
  public String languageName;

  Language(String name) {
    this.languageName = name;
  }

  /**
   * Looks up a language by its name, ignoring case.
   *
   * @param name The language name to look up; may be null
   * @param defaultValue The value returned when {@code name} is null or matches no language
   * @return The first language whose name matches, otherwise {@code defaultValue}
   */
  public static Language fromString(String name, Language defaultValue) {
    if (name == null) {
      return defaultValue;
    }
    return Arrays.stream(values())
        .filter(candidate -> name.equalsIgnoreCase(candidate.languageName))
        .findFirst()
        .orElse(defaultValue);
  }
}
// todo [cdm 2015]: Could avoid constructing this if applyNumericClassifiers is false
/**
 * The number sequence classifier. It is invoked from recognizeNumberSequences and
 * finalizeAnnotation, and is built in every constructor regardless of applyNumericClassifiers.
 */
private final AbstractSequenceClassifier<CoreLabel> nsc;
/**
 * A mapping from single words to the NER tag that they should be.
 * It is an empty map when gazette augmentation is disabled, and is looked up
 * by each token's original text.
 */
private final Map<String, String> gazetteMapping;
public NERClassifierCombiner(Properties props)
throws IOException
{
super(props);
applyNumericClassifiers = PropertiesUtils.getBool(props, APPLY_NUMERIC_CLASSIFIERS_PROPERTY, APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
nerLanguage = Language.fromString(PropertiesUtils.getString(props, NER_LANGUAGE_PROPERTY, null), NER_LANGUAGE_DEFAULT);
useSUTime = PropertiesUtils.getBool(props, NumberSequenceClassifier.USE_SUTIME_PROPERTY, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
nsc = new NumberSequenceClassifier(new Properties(), useSUTime, props);
if (PropertiesUtils.getBool(props, NERClassifierCombiner.APPLY_GAZETTE_PROPERTY, NERClassifierCombiner.APPLY_GAZETTE_DEFAULT) ) {
this.gazetteMapping = readRegexnerGazette(DefaultPaths.DEFAULT_NER_GAZETTE_MAPPING);
} else {
this.gazetteMapping = Collections.emptyMap();
}
}
/**
 * Builds a combiner from the given load paths, using the default settings for
 * numeric classifiers, gazette augmentation and SUTime.
 *
 * @param loadPaths Paths passed on to the delegated constructor
 * @throws IOException Propagated from the delegated constructor
 */
public NERClassifierCombiner(String... loadPaths)
    throws IOException
{
  this(APPLY_NUMERIC_CLASSIFIERS_DEFAULT,
       APPLY_GAZETTE_DEFAULT,
       NumberSequenceClassifier.USE_SUTIME_DEFAULT,
       loadPaths);
}
/**
 * Builds a combiner from the given load paths with explicit flag settings.
 * The NER language is always the default language.
 *
 * @param applyNumericClassifiers Whether the numeric classifiers are run during classification
 * @param augmentRegexNER Whether to load the default gazette mapping
 * @param useSUTime Passed to the number sequence classifier
 * @param loadPaths Paths passed on to the superclass constructor
 * @throws IOException Declared by this constructor; propagated from construction
 */
public NERClassifierCombiner(boolean applyNumericClassifiers,
                             boolean augmentRegexNER,
                             boolean useSUTime,
                             String... loadPaths)
    throws IOException
{
  super(loadPaths);
  this.applyNumericClassifiers = applyNumericClassifiers;
  this.nerLanguage = NER_LANGUAGE_DEFAULT;
  this.useSUTime = useSUTime;
  this.nsc = new NumberSequenceClassifier(useSUTime);
  this.gazetteMapping = augmentRegexNER
      ? readRegexnerGazette(DefaultPaths.DEFAULT_NER_GAZETTE_MAPPING)
      : Collections.emptyMap();
}
/**
 * Builds a combiner with explicit flags and a language-specific number sequence classifier.
 *
 * @param applyNumericClassifiers Whether the numeric classifiers are run during classification
 * @param nerLanguage Language to use; CHINESE selects the Chinese number sequence classifier
 * @param useSUTime Passed to the number sequence classifier
 * @param augmentRegexNER Whether to load the default gazette mapping
 * @param nscProps Properties passed both to the superclass and to the number sequence classifier
 * @param loadPaths Paths passed on to the superclass constructor
 * @throws IOException Declared by this constructor; propagated from construction
 */
public NERClassifierCombiner(boolean applyNumericClassifiers,
                             Language nerLanguage,
                             boolean useSUTime,
                             boolean augmentRegexNER,
                             Properties nscProps,
                             String... loadPaths)
    throws IOException
{
  // NOTE: nscProps may contain sutime props which will not be recognized by the SeqClassifierFlags
  super(nscProps, ClassifierCombiner.extractCombinationModeSafe(nscProps), loadPaths);
  this.applyNumericClassifiers = applyNumericClassifiers;
  this.nerLanguage = nerLanguage;
  this.useSUTime = useSUTime;

  // The number sequence classifier depends on the requested language.
  this.nsc = (nerLanguage == Language.CHINESE)
      ? new ChineseNumberSequenceClassifier(new Properties(), useSUTime, nscProps)
      : new NumberSequenceClassifier(new Properties(), useSUTime, nscProps);

  this.gazetteMapping = augmentRegexNER
      ? readRegexnerGazette(DefaultPaths.DEFAULT_NER_GAZETTE_MAPPING)
      : Collections.emptyMap();
}
/**
 * Builds a combiner over already-constructed classifiers, using the default settings for
 * numeric classifiers, SUTime and gazette augmentation.
 *
 * @param classifiers The classifiers to combine
 * @throws IOException Propagated from the delegated constructor
 */
@SafeVarargs
public NERClassifierCombiner(AbstractSequenceClassifier<CoreLabel>... classifiers)
    throws IOException
{
  this(APPLY_NUMERIC_CLASSIFIERS_DEFAULT,
       NumberSequenceClassifier.USE_SUTIME_DEFAULT,
       APPLY_GAZETTE_DEFAULT,
       classifiers);
}
/**
 * Builds a combiner over already-constructed classifiers with explicit flag settings.
 * The NER language is always the default language.
 *
 * @param applyNumericClassifiers Whether the numeric classifiers are run during classification
 * @param useSUTime Passed to the number sequence classifier
 * @param augmentRegexNER Whether to load the default gazette mapping
 * @param classifiers The classifiers to combine
 * @throws IOException Declared by this constructor; propagated from construction
 */
@SafeVarargs
public NERClassifierCombiner(boolean applyNumericClassifiers,
                             boolean useSUTime,
                             boolean augmentRegexNER,
                             AbstractSequenceClassifier<CoreLabel>... classifiers)
    throws IOException
{
  super(classifiers);
  this.applyNumericClassifiers = applyNumericClassifiers;
  this.nerLanguage = NER_LANGUAGE_DEFAULT;
  this.useSUTime = useSUTime;
  this.nsc = new NumberSequenceClassifier(useSUTime);
  this.gazetteMapping = augmentRegexNER
      ? readRegexnerGazette(DefaultPaths.DEFAULT_NER_GAZETTE_MAPPING)
      : Collections.emptyMap();
}
/**
 * Builds an NERClassifierCombiner from an ObjectInputStream.
 * After the superclass has read its part, two booleans are read from the stream
 * (useSUTime, then applyNumericClassifiers). A value present in {@code props}
 * takes precedence over the stored one.
 *
 * @param ois Stream positioned at a serialized combiner
 * @param props Properties that may override the stored flags and that configure the
 *              number sequence classifier and gazette
 * @throws IOException If reading from the stream fails
 * @throws ClassCastException Declared by this constructor
 * @throws ClassNotFoundException Declared by this constructor
 */
public NERClassifierCombiner(ObjectInputStream ois, Properties props) throws IOException, ClassCastException, ClassNotFoundException {
  super(ois,props);

  // stored useSUTime, possibly overridden by the properties
  final boolean storedUseSUTime = ois.readBoolean();
  final String useSUTimeOverride = props.getProperty("ner.useSUTime");
  this.useSUTime = (useSUTimeOverride == null)
      ? storedUseSUTime
      : Boolean.parseBoolean(useSUTimeOverride);

  // stored applyNumericClassifiers, possibly overridden by the properties
  final boolean storedApplyNumeric = ois.readBoolean();
  final String applyNumericOverride = props.getProperty(APPLY_NUMERIC_CLASSIFIERS_PROPERTY);
  this.applyNumericClassifiers = (applyNumericOverride == null)
      ? storedApplyNumeric
      : Boolean.parseBoolean(applyNumericOverride);

  this.nerLanguage = NER_LANGUAGE_DEFAULT;

  // build the nsc, note that initProps should be set by ClassifierCombiner
  this.nsc = new NumberSequenceClassifier(new Properties(), useSUTime, props);

  final boolean wantGazette = PropertiesUtils.getBool(props, APPLY_GAZETTE_PROPERTY, APPLY_GAZETTE_DEFAULT);
  this.gazetteMapping = wantGazette
      ? readRegexnerGazette(DefaultPaths.DEFAULT_NER_GAZETTE_MAPPING)
      : Collections.emptyMap();
}
/**
 * Property names that createNERClassifierCombiner(String, Properties) passes down
 * to the combiner by default.
 */
public static final Set<String> DEFAULT_PASS_DOWN_PROPERTIES =
CollectionUtils.asSet("encoding", "inputEncoding", "outputEncoding", "maxAdditionalKnownLCWords","map",
"ner.combinationMode");
/**
 * Factory method that creates the NERClassifierCombiner used in NERCombinerAnnotator
 * (and, thence, in StanfordCoreNLP), passing down {@link #DEFAULT_PASS_DOWN_PROPERTIES}.
 *
 * @param name A "x.y" format property name prefix (the "x" part). This is commonly null,
 *             and then "ner" is used. If it is the empty string, then no property prefix is used.
 * @param properties Various properties, including a list in "ner.model".
 *                   The used ones start with name + "." or are in the pass-down set
 * @return An NERClassifierCombiner with the given properties
 */
public static NERClassifierCombiner createNERClassifierCombiner(String name, Properties properties) {
  final Set<String> passDown = DEFAULT_PASS_DOWN_PROPERTIES;
  return createNERClassifierCombiner(name, passDown, properties);
}
/**
 * Factory method that creates the NERClassifierCombiner used in NERCombinerAnnotator
 * (and, thence, in StanfordCoreNLP).
 *
 * @param name A "x.y" format property name prefix (the "x" part). This is commonly null,
 *             and then "ner" is used. If it is the empty string, then no property prefix is used.
 * @param passDownProperties Property names for which the property should be passed down
 *             to the NERClassifierCombiner. If null, all of {@code properties} is passed
 *             through unchanged. Pass down is useful for things like charset encoding.
 * @param properties Various properties, including a list in "ner.model".
 *             The used ones start with name + "." or are in passDownProperties
 * @return An NERClassifierCombiner with the given properties
 * @throws RuntimeIOException If constructing the combiner throws an IOException
 */
public static NERClassifierCombiner createNERClassifierCombiner(String name,
                                                                Set<String> passDownProperties,
                                                                Properties properties) {
  final String prefix;
  if (name == null) {
    prefix = "ner.";
  } else if (name.isEmpty()) {
    prefix = "";
  } else {
    prefix = name + '.';
  }

  String modelNames = properties.getProperty(prefix + "model");
  if (modelNames == null) {
    modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + ',' + DefaultPaths.DEFAULT_NER_MUC_MODEL + ',' +
        DefaultPaths.DEFAULT_NER_CONLL_MODEL;
  }

  // modelNames can still be the empty string if it was explicitly set to be empty
  final String[] models;
  if (modelNames.isEmpty()) {
    // Allow for no real NER model - can just use numeric classifiers or SUTime
    log.info("WARNING: no NER models specified");
    models = StringUtils.EMPTY_STRING_ARRAY;
  } else {
    models = modelNames.split(",");
  }

  try {
    final boolean applyNumericClassifiers = PropertiesUtils.getBool(
        properties, prefix + APPLY_NUMERIC_CLASSIFIERS_PROPERTY_BASE, APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
    final boolean useSUTime = PropertiesUtils.getBool(
        properties, prefix + NumberSequenceClassifier.USE_SUTIME_PROPERTY_BASE, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
    // the gazette switch is read under its fixed name, not under the prefix
    final boolean applyRegexner = PropertiesUtils.getBool(
        properties, APPLY_GAZETTE_PROPERTY, APPLY_GAZETTE_DEFAULT);

    final Properties combinerProperties;
    if (passDownProperties == null) {
      // if passDownProperties is null, just pass everything through
      combinerProperties = properties;
    } else {
      combinerProperties = PropertiesUtils.extractSelectedProperties(properties, passDownProperties);
      if (useSUTime) {
        // Make sure SUTime parameters are included
        Properties sutimeProps = PropertiesUtils.extractPrefixedProperties(
            properties, NumberSequenceClassifier.SUTIME_PROPERTY + ".", true);
        PropertiesUtils.overWriteProperties(combinerProperties, sutimeProps);
      }
    }

    final Language nerLanguage =
        Language.fromString(properties.getProperty(prefix + NER_LANGUAGE_PROPERTY_BASE), NER_LANGUAGE_DEFAULT);
    return new NERClassifierCombiner(applyNumericClassifiers, nerLanguage,
        useSUTime, applyRegexner, combinerProperties, models);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
/**
 * @return Whether this combiner runs the numeric classifiers during classification
 */
public boolean appliesNumericClassifiers() {
  return this.applyNumericClassifiers;
}
/**
 * @return True only when both useSUTime and applyNumericClassifiers are set: if
 *         applyNumericClassifiers is false, SUTime isn't run regardless of useSUTime
 */
public boolean usesSUTime() {
  if (!applyNumericClassifiers) {
    return false;
  }
  return useSUTime;
}
/**
 * Copies each element's AnswerAnnotation value into its NamedEntityTagAnnotation.
 *
 * @param l The elements to update in place
 */
private static <INN extends CoreMap> void copyAnswerFieldsToNERField(List<INN> l) {
  for (INN entry : l) {
    String answer = entry.get(CoreAnnotations.AnswerAnnotation.class);
    entry.set(CoreAnnotations.NamedEntityTagAnnotation.class, answer);
  }
}
/**
 * Classifies the tokens without any document or sentence context.
 *
 * @param tokens The tokens to classify
 * @return The result of {@link #classifyWithGlobalInformation} with null document and sentence
 */
@Override
public List<CoreLabel> classify(List<CoreLabel> tokens) {
  final CoreMap noDocument = null;
  final CoreMap noSentence = null;
  return classifyWithGlobalInformation(tokens, noDocument, noSentence);
}
/**
 * Runs the combined classifiers on the tokens and, when applyNumericClassifiers is set,
 * the number-sequence recognizer and the quantity normalizer. It then copies
 * AnswerAnnotation to the NER annotation and finally applies the gazette mapping, if any.
 * Failures in the numeric steps are logged and ignored, except for
 * RuntimeInterruptedException, which is rethrown.
 *
 * @param tokens The tokens to classify
 * @param document Document-level context handed to the numeric steps; may be null
 * @param sentence Sentence-level context handed to the numeric steps; may be null
 * @return The list returned by the superclass classifier, with NER annotations set
 */
@Override
public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokens, final CoreMap document, final CoreMap sentence) {
  List<CoreLabel> output = super.classify(tokens);
  if (applyNumericClassifiers) {
    try {
      // recognizes additional MONEY, TIME, DATE, and NUMBER using a set of deterministic rules
      // note: some DATE and TIME entities are recognized by our statistical NER based on MUC
      // note: this includes SUTime
      // note: requires TextAnnotation, PartOfSpeechTagAnnotation, and AnswerAnnotation
      // note: this sets AnswerAnnotation!
      recognizeNumberSequences(output, document, sentence);
    } catch (RuntimeInterruptedException e) {
      throw e;
    } catch (Exception e) {
      log.info("Ignored an exception in NumberSequenceClassifier: (result is that some numbers were not classified)");
      log.info("Tokens: " + StringUtils.joinWords(tokens, " "));
      e.printStackTrace(System.err);
    }

    // AnswerAnnotation -> NERAnnotation
    copyAnswerFieldsToNERField(output);

    try {
      // normalizes numeric entities such as MONEY, TIME, DATE, or PERCENT
      // note: this uses and sets NamedEntityTagAnnotation!
      if (nerLanguage == Language.CHINESE) {
        // For chinese there is no support for SUTime by default
        // We need to hand in document and sentence for Chinese to handle DocDate; however, since English normalization
        // is handled by SUTime, and the information is passed in recognizeNumberSequences(), English only need output.
        ChineseQuantifiableEntityNormalizer.addNormalizedQuantitiesToEntities(output, document, sentence);
      } else {
        QuantifiableEntityNormalizer.addNormalizedQuantitiesToEntities(output, false, useSUTime);
      }
    } catch (RuntimeInterruptedException e) {
      // an interruption must not be swallowed; same policy as the recognition step above
      throw e;
    } catch (Exception e) {
      log.info("Ignored an exception in QuantifiableEntityNormalizer: (result is that entities were not normalized)");
      log.info("Tokens: " + StringUtils.joinWords(tokens, " "));
      e.printStackTrace(System.err);
    } catch (AssertionError e) {
      log.info("Ignored an assertion in QuantifiableEntityNormalizer: (result is that entities were not normalized)");
      log.info("Tokens: " + StringUtils.joinWords(tokens, " "));
      e.printStackTrace(System.err);
    }
  } else {
    // AnswerAnnotation -> NERAnnotation
    copyAnswerFieldsToNERField(output);
  }

  // Apply RegexNER annotations; nothing to do when no gazette was loaded
  // cdm 2016: Used to say and do "// skip first token" but I couldn't understand why, so I removed that.
  if (!gazetteMapping.isEmpty()) {
    for (CoreLabel token : tokens) {
      final String tag = token.tag();
      // An empty tag is treated like a missing one, so charAt(0) cannot throw.
      final boolean untaggedOrNoun = tag == null || tag.isEmpty() || tag.charAt(0) == 'N';
      final String ner = token.ner();
      // NOTE(review): && binds tighter than ||, so MISC tokens are relabelled regardless of
      // their tag. The parentheses make that existing behavior explicit; confirm it is intended.
      if ((untaggedOrNoun && "O".equals(ner)) || "MISC".equals(ner)) {
        String target = gazetteMapping.get(token.originalText());
        if (target != null) {
          token.setNER(target);
        }
      }
    }
  }

  return output;
}
/**
 * Runs the number sequence classifier on a copy of the words and merges its results back.
 * A word's AnswerAnnotation is replaced only if it was unset, the background symbol, or
 * "MISC", and the classifier produced a non-background label. Other annotations are
 * transferred to every word through NumberSequenceClassifier.transferAnnotations.
 *
 * @param words The tokens to update in place
 * @param document Document-level context passed to the classifier; may be null
 * @param sentence Sentence-level context passed to the classifier and to the token copy; may be null
 */
private void recognizeNumberSequences(List<CoreLabel> words, final CoreMap document, final CoreMap sentence) {
  // we need to copy here because NumberSequenceClassifier overwrites the AnswerAnnotation
  List<CoreLabel> newWords = NumberSequenceClassifier.copyTokens(words, sentence);
  nsc.classifyWithGlobalInformation(newWords, document, sentence);

  final String background = nsc.flags.backgroundSymbol;

  // copy AnswerAnnotation back. Do not overwrite!
  // also, copy all the additional annotations generated by SUTime and NumberNormalizer
  for (int i = 0, sz = words.size(); i < sz; i++) {
    CoreLabel origWord = words.get(i);
    CoreLabel newWord = newWords.get(i);
    String before = origWord.get(CoreAnnotations.AnswerAnnotation.class);
    String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);

    boolean replaceable = before == null || before.equals(background) || before.equals("MISC");
    // A missing guess is treated as "no new label" instead of throwing a NullPointerException,
    // which would abort the copy-back for all remaining words.
    if (replaceable && newGuess != null && !newGuess.equals(background)) {
      origWord.set(CoreAnnotations.AnswerAnnotation.class, newGuess);
    }

    // transfer other annotations generated by SUTime or NumberNormalizer
    NumberSequenceClassifier.transferAnnotations(newWord, origWord);
  }
}
/**
 * Hands the annotation to the number sequence classifier's finalizeClassification.
 *
 * @param annotation The annotation to finalize
 */
public void finalizeAnnotation(Annotation annotation) {
  this.nsc.finalizeClassification(annotation);
}
/**
 * Writes this NERClassifierCombiner to an ObjectOutputStream: first the superclass part,
 * then useSUTime, then applyNumericClassifiers.
 *
 * @param oos The stream to write to
 * @throws RuntimeIOException If writing throws an IOException
 */
public void serializeClassifier(ObjectOutputStream oos) {
  try {
    // the ClassifierCombiner part comes first
    super.serializeClassifier(oos);
    // then the two flags, in the order the stream constructor reads them back
    oos.writeBoolean(this.useSUTime);
    oos.writeBoolean(this.applyNumericClassifiers);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe);
  }
}
/**
 * Static method for getting an NERClassifierCombiner from a string path.
 * The stream opened for the path is always closed, including when loading fails.
 *
 * @param loadPath Path handed to IOUtils.readStreamFromString
 * @param props Properties passed on to the stream-based loader
 * @return The loaded NERClassifierCombiner
 * @throws IOException If opening or reading the stream fails
 * @throws ClassNotFoundException Propagated from the stream-based loader
 * @throws ClassCastException Propagated from the stream-based loader
 */
public static NERClassifierCombiner getClassifier(String loadPath, Properties props) throws IOException,
    ClassNotFoundException, ClassCastException {
  ObjectInputStream ois = IOUtils.readStreamFromString(loadPath);
  try {
    return getClassifier(ois, props);
  } finally {
    // close on both the success and the failure path so the stream cannot leak
    IOUtils.closeIgnoringExceptions(ois);
  }
}
/**
 * Static method for getting an NERClassifierCombiner from an ObjectInputStream.
 * The stream is not closed here.
 *
 * @param ois Stream to read the combiner from
 * @param props Properties passed to the stream constructor
 * @return The newly constructed NERClassifierCombiner
 * @throws IOException Propagated from the constructor
 * @throws ClassNotFoundException Propagated from the constructor
 * @throws ClassCastException Propagated from the constructor
 */
public static NERClassifierCombiner getClassifier(ObjectInputStream ois, Properties props) throws IOException,
    ClassNotFoundException, ClassCastException {
  final NERClassifierCombiner loaded = new NERClassifierCombiner(ois, props);
  return loaded;
}
/**
 * Logs information about an NERClassifierCombiner: the ClassifierCombiner details
 * followed by the useSUTime and applyNumericClassifiers flags.
 *
 * @param ncc The combiner to describe
 */
public static void showNCCInfo(NERClassifierCombiner ncc) {
  final String blank = "";
  log.info(blank);
  log.info("info for this NERClassifierCombiner: ");
  ClassifierCombiner.showCCInfo(ncc);
  final String suTimeLine = "useSUTime: "+ncc.useSUTime;
  log.info(suTimeLine);
  final String numericLine = "applyNumericClassifier: "+ncc.applyNumericClassifiers;
  log.info(numericLine);
  log.info(blank);
}
/**
 * Read a gazette mapping in TokensRegex format from the given path.
 * The format is: 'case_sensitive_word \t target_ner_class' (additional info is ignored).
 * Lines with fewer than two tab-separated fields (for example blank lines) are skipped,
 * and both LF and CRLF line endings are accepted.
 *
 * @param mappingFile The mapping file to read from, as a path either on the filesystem or in your classpath.
 *
 * @return An unmodifiable mapping from word to NER tag; empty if the file could not be read.
 */
private static Map<String, String> readRegexnerGazette(String mappingFile) {
  Map<String, String> mapping = new HashMap<>();
  try {
    String contents = IOUtils.slurpReader(IOUtils.readerFromString(mappingFile.trim()));
    // "\r?\n" keeps a trailing carriage return out of the last field on CRLF files
    for (String line : contents.split("\r?\n")) {
      String[] fields = line.split("\t");
      if (fields.length < 2) {
        // blank line or an entry without a target class: nothing to map
        continue;
      }
      mapping.put(fields[0], fields[1]);
    }
  } catch (IOException e) {
    // best effort: classification proceeds with whatever was read
    log.warn("Could not read Regex mapping: " + mappingFile);
  }
  return Collections.unmodifiableMap(mapping);
}
/**
 * The main method. Command-line arguments are turned into properties, which select what to do:
 * load ("loadClassifier") or build a combiner, optionally serialize it ("serializeTo"),
 * classify text ("textFile", "textFiles"), run on test data ("testFile", "testFiles",
 * optionally with "crfToExamine"), show info ("showNCCInfo"), and classify stdin.
 *
 * @param args Command-line arguments
 * @throws Exception Propagated from any of the steps
 */
public static void main(String[] args) throws Exception {
  StringUtils.logInvocationString(log, args);
  Properties props = StringUtils.argsToProperties(args);
  SeqClassifierFlags flags = new SeqClassifierFlags(props, false); // false for print probs as printed in next code block

  // note that when loading a serialized classifier, the philosophy is override
  // any settings in props with those given in the commandline
  // so if you dumped it with useSUTime = false, and you say -useSUTime at
  // the commandline, the commandline takes precedence.
  // When building instead, null passDownProperties lets all props go through.
  String loadPath = props.getProperty("loadClassifier");
  final NERClassifierCombiner ncc = (loadPath != null)
      ? getClassifier(loadPath, props)
      : createNERClassifierCombiner("ner", null, props);

  // write the NERClassifierCombiner to the given path on disk
  String serializeTo = props.getProperty("serializeTo");
  if (serializeTo != null) {
    ncc.serializeClassifier(serializeTo);
  }

  String textFile = props.getProperty("textFile");
  if (textFile != null) {
    ncc.classifyAndWriteAnswers(textFile);
  }

  // run on multiple textFiles , based off CRFClassifier code
  String textFiles = props.getProperty("textFiles");
  if (textFiles != null) {
    List<File> inputs = Arrays.stream(textFiles.split(",")).map(File::new).collect(Collectors.toList());
    ncc.classifyFilesAndWriteAnswers(inputs);
  }

  // options for running the NERClassifierCombiner on a testFile or testFiles
  String testFile = props.getProperty("testFile");
  String testFiles = props.getProperty("testFiles");
  String crfToExamine = props.getProperty("crfToExamine");
  DocumentReaderAndWriter<CoreLabel> readerAndWriter = ncc.defaultReaderAndWriter();
  if (testFile != null || testFiles != null) {
    if (crfToExamine != null) {
      // a specific crf was requested
      ClassifierCombiner.examineCRF(ncc, crfToExamine, flags, testFile, testFiles, readerAndWriter);
    } else if (testFile != null) {
      ncc.classifyAndWriteAnswers(testFile, readerAndWriter, true);
    } else {
      List<File> testInputs = new ArrayList<>();
      for (String filename : testFiles.split(",")) {
        testInputs.add(new File(filename));
      }
      ncc.classifyFilesAndWriteAnswers(testInputs, ncc.defaultReaderAndWriter(), true);
    }
  }

  // option for showing info about the NERClassifierCombiner
  String showInfoRequested = props.getProperty("showNCCInfo");
  if (showInfoRequested != null) {
    showNCCInfo(ncc);
  }

  // option for reading in from stdin
  if (flags.readStdin) {
    ncc.classifyStdin();
  }
}
}