package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;

import java.util.*;

/**
 * This calls NumberSequenceClassifier, which is a rule-based classifier that
 * adds a NUMBER entity tag to numbers not already given another entity tag, and
 * also has additional rules for marking MONEY, TIME, and DATE. It assumes that
 * tokens already have a (POS) TagAnnotation, and an original round of NER that
 * covers MONEY and American DATE/TIME formats, such as MUC NER in
 * AnswerAnnotation, to which we add.
 *
 * @author Jenny Finkel
 */
public class NumberAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(NumberAnnotator.class);

  private static final String DEFAULT_BACKGROUND_SYMBOL = "O";

  /** Existing NER label that, like the background symbol, may be overwritten by a number label. */
  private static final String MISC_LABEL = "MISC";

  /** Suffix of the property ({@code name + "." + "background"}) that sets the background symbol. */
  public static final String BACKGROUND_SYMBOL_PROPERTY = "background";

  private final AbstractSequenceClassifier<CoreLabel> nsc;

  /** Whether progress messages and per-token results are logged. */
  private final boolean verbose;

  /** The label that marks a token as not being part of any entity. */
  private final String backgroundSymbol;

  /**
   * Creates a verbose annotator with the default background symbol and the
   * default SUTime setting of {@link NumberSequenceClassifier}.
   */
  public NumberAnnotator() {
    this(DEFAULT_BACKGROUND_SYMBOL, true, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  /**
   * Creates an annotator with the default background symbol and the default
   * SUTime setting of {@link NumberSequenceClassifier}.
   *
   * @param verbose Whether to log progress messages
   */
  public NumberAnnotator(boolean verbose) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  /**
   * Creates an annotator with the default background symbol.
   *
   * @param verbose Whether to log progress messages
   * @param useSUTime Passed on to the {@link NumberSequenceClassifier} constructor
   */
  public NumberAnnotator(boolean verbose, boolean useSUTime) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, useSUTime);
  }

  /**
   * Creates an annotator.
   *
   * @param backgroundSymbol The label treated as "no entity"
   * @param verbose Whether to log progress messages
   * @param useSUTime Passed on to the {@link NumberSequenceClassifier} constructor
   */
  public NumberAnnotator(String backgroundSymbol, boolean verbose, boolean useSUTime) {
    this.backgroundSymbol = backgroundSymbol;
    this.verbose = verbose;
    this.nsc = new NumberSequenceClassifier(useSUTime);
  }

  /**
   * Creates a non-verbose annotator configured from properties. The background
   * symbol is read from the property {@code name + ".background"} (defaulting
   * to "O"); the SUTime flag is read from
   * {@link NumberSequenceClassifier#USE_SUTIME_PROPERTY}, which is not prefixed
   * with {@code name}.
   *
   * @param name The annotator name used as the property prefix
   * @param props The properties to read the configuration from
   */
  public NumberAnnotator(String name, Properties props) {
    String property = name + "." + BACKGROUND_SYMBOL_PROPERTY;
    this.backgroundSymbol = props.getProperty(property, DEFAULT_BACKGROUND_SYMBOL);
    boolean useSUTime = PropertiesUtils.getBool(props,
        NumberSequenceClassifier.USE_SUTIME_PROPERTY,
        NumberSequenceClassifier.USE_SUTIME_DEFAULT);
    this.verbose = false;
    this.nsc = new NumberSequenceClassifier(useSUTime);
  }

  /**
   * Runs the number classifier over the annotation. If the annotation has
   * sentences, each sentence's tokens are processed in turn; otherwise, if it
   * has document-level tokens, those are processed as a single sequence.
   *
   * @param annotation The annotation to add number information to
   * @throws RuntimeException If the annotation has neither sentences nor tokens
   */
  @Override
  public void annotate(Annotation annotation) {
    if (verbose) {
      log.info("Adding number annotation ... ");
    }

    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // classify tokens for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        doOneSentenceNew(tokens, annotation, sentence);
      }
      if (verbose) {
        log.info("done. Output: " + annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    } else if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      doOneSentenceNew(tokens, annotation, null);
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  /**
   * Classifies a copy of the given tokens and merges the results back into the
   * originals. A token's NER label is replaced only when it currently has no
   * label, the background label, or "MISC", and the classifier produced a
   * non-background label for it.
   *
   * @param words The original tokens; updated in place
   * @param doc The document the tokens belong to
   * @param sentence The sentence the tokens belong to, or null when the
   *     document-level token list is being processed
   */
  private void doOneSentenceNew(List<CoreLabel> words, Annotation doc, CoreMap sentence) {
    List<CoreLabel> newWords = NumberSequenceClassifier.copyTokens(words, sentence);

    nsc.classifyWithGlobalInformation(newWords, doc, sentence);

    Iterator<? extends CoreLabel> newFLIter = newWords.iterator();
    for (CoreLabel origWord : words) {
      CoreLabel newWord = newFLIter.next();
      String before = origWord.ner();
      String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);
      if (verbose) {
        log.info(newWord);
      }

      boolean replaceable = before == null
          || before.equals(backgroundSymbol)
          || before.equals(MISC_LABEL);
      // A missing answer is treated like the background symbol: there is nothing
      // to copy over, and calling equals() on it would throw.
      boolean hasNewLabel = newGuess != null && !newGuess.equals(backgroundSymbol);
      if (replaceable && hasNewLabel) {
        origWord.setNER(newGuess);
      }

      // transfer other annotations generated by SUTime or NumberNormalizer
      NumberSequenceClassifier.transferAnnotations(newWord, origWord);
    }
  }

  /**
   * Returns the annotations this annotator declares as prerequisites: tokens
   * and sentences.
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  /**
   * Returns the annotations this annotator claims to produce.
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    // technically it adds some NER, but someone who wants full NER
    // labels will be very disappointed, so we do not claim to produce NER
    return Collections.singleton(CoreAnnotations.NumerizedTokensAnnotation.class);
  }

}