// TokensRegexAnnotator.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.CoreMapExpressionExtractor;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.EnvLookup;
import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.ling.tokensregex.*;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.*;
import java.util.Set;

/**
 * Uses TokensRegex patterns to annotate tokens.
 *
 * <p>Configuration (property names are prefixed with the annotator name, if one is given):</p>
 * <ul>
 *   <li>{@code rules} - Name of file containing extraction rules
 *      (see {@link CoreMapExpressionExtractor} and {@link edu.stanford.nlp.ling.tokensregex.SequenceMatchRules})</li>
 *   <li>{@code caseInsensitive} - whether string matches and string patterns default to case-insensitive matching</li>
 *   <li>{@code verbose} - whether to log progress messages while annotating</li>
 * </ul>
 * <p>Other options (can be set in the rules file using {@code options.xxx = ...}):</p>
 * <ul>
 *   <li>{@code setTokenOffsets} - whether to explicitly set the token offsets of individual tokens
 *      (needed for token sequence matches to work)</li>
 *   <li>{@code extractWithTokens} - whether to return unmatched tokens as well</li>
 *   <li>{@code flatten} - whether to flatten matched expressions into individual tokens</li>
 *   <li>{@code matchedExpressionsAnnotationKey} - Annotation key where matched expressions are stored as a list</li>
 * </ul>
 * <p>Multiple {@code TokensRegexAnnotator}s can be configured using the same properties file by specifying
 * a different prefix for each {@code TokensRegexAnnotator}.</p>
 *
 * @author Angel Chang
 */
public class TokensRegexAnnotator implements Annotator {

  /** Environment the extraction rules are parsed and evaluated in. */
  private final Env env;

  /** Rule-based extractor; {@code null} when no rules files were configured. */
  private final CoreMapExpressionExtractor<MatchedExpression> extractor;

  /** Runtime options; bound into {@link #env} as {@code options} by the {@link Properties}-based constructors. */
  private final Options options = new Options();

  /** Whether {@link #annotate(Annotation)} logs start/end messages. */
  private final boolean verbose;


  /**
   * Option holder for this annotator.
   * Fields are public so they can be accessed and set via reflection.
   */
  public static class Options {
    /** Annotation key under which the list of matched expressions is stored; nothing is stored when {@code null}. */
    public Class matchedExpressionsAnnotationKey;
    /** Whether to (re)assign token begin/end offsets before extraction. */
    public boolean setTokenOffsets;
    /** Whether extraction results should also include the unmatched tokens. */
    public boolean extractWithTokens;
    /** Whether matched expressions should be flattened before being returned. */
    public boolean flatten;
  }


  /**
   * Creates an annotator from the given rules files, with logging disabled.
   * Note that this constructor does not bind {@code options} into the environment.
   *
   * @param files names of the files containing the extraction rules
   */
  public TokensRegexAnnotator(String... files) {
    env = TokenSequencePattern.getNewEnv();
    extractor = CoreMapExpressionExtractor.createExtractorFromFiles(env, files);
    verbose = false;
  }

  /**
   * Creates an annotator configured from properties.
   *
   * @param name  annotator name used as the property prefix ({@code name + '.'});
   *              {@code null} means no prefix
   * @param props properties holding the configuration described in the class documentation
   * @throws RuntimeException if {@code matchedExpressionsAnnotationKey} is set but cannot be
   *                          resolved to an annotation key
   */
  public TokensRegexAnnotator(String name, Properties props) {
    String prefix = (name == null) ? "" : name + '.';
    String[] files = PropertiesUtils.getStringArray(props, prefix + "rules");
    env = TokenSequencePattern.getNewEnv();
    env.bind("options", options);
    if (PropertiesUtils.getBool(props, prefix + "caseInsensitive")) {
      System.err.println("using case insensitive!");
      env.setDefaultStringMatchFlags(NodePattern.CASE_INSENSITIVE);
      env.setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE);
    }
    // The extractor is built before the option properties are read, so the current
    // option values act as defaults and explicitly given properties override them.
    if (files.length != 0) {
      extractor = CoreMapExpressionExtractor.createExtractorFromFiles(env, files);
    } else {
      extractor = null;
    }
    verbose = PropertiesUtils.getBool(props, prefix + "verbose", false);
    options.setTokenOffsets = PropertiesUtils.getBool(props, prefix + "setTokenOffsets", options.setTokenOffsets);
    options.extractWithTokens = PropertiesUtils.getBool(props, prefix + "extractWithTokens", options.extractWithTokens);
    options.flatten = PropertiesUtils.getBool(props, prefix + "flatten", options.flatten);
    String propName = prefix + "matchedExpressionsAnnotationKey";
    String matchedExpressionsAnnotationKeyName = props.getProperty(propName);
    if (matchedExpressionsAnnotationKeyName != null) {
      options.matchedExpressionsAnnotationKey = EnvLookup.lookupAnnotationKeyWithClassname(env, matchedExpressionsAnnotationKeyName);
      if (options.matchedExpressionsAnnotationKey == null) {
        throw new RuntimeException("Cannot determine annotation key for " + propName + '=' + matchedExpressionsAnnotationKeyName);
      }
    }
  }

  /**
   * Creates an annotator configured from un-prefixed properties.
   *
   * @param props properties holding the configuration described in the class documentation
   */
  public TokensRegexAnnotator(Properties props) {
    this(null, props);
  }


  /**
   * Marks every token with consecutive token begin/end offsets, starting at the
   * annotation's own {@code TokenBeginAnnotation} (or 0 when that is absent).
   * Does nothing when the annotation carries no token list.
   *
   * @param annotation the core map whose tokens are numbered
   */
  private static void addTokenOffsets(CoreMap annotation) {
    List<? extends CoreMap> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens == null) {
      // nothing to number; avoids a NullPointerException in the loop below
      return;
    }
    Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
    int offset = (startTokenOffset == null) ? 0 : startTokenOffset;
    for (CoreMap token : tokens) {
      // each token spans [offset, offset + 1)
      token.set(CoreAnnotations.TokenBeginAnnotation.class, offset);
      offset++;
      token.set(CoreAnnotations.TokenEndAnnotation.class, offset);
    }
  }

  /**
   * Runs the extractor over one core map, honoring the {@code extractWithTokens}
   * and {@code flatten} options. Must only be called when {@link #extractor} is non-null.
   *
   * @param annotation the core map (sentence or whole document) to extract from
   * @return the extracted core maps, flattened if so configured
   */
  private List<CoreMap> extract(CoreMap annotation) {
    List<CoreMap> cms = options.extractWithTokens
        ? extractor.extractCoreMapsMergedWithTokens(annotation)
        : extractor.extractCoreMaps(annotation);
    return options.flatten ? extractor.flatten(cms) : cms;
  }

  /**
   * Applies the extraction rules to the annotation. When the annotation has sentences,
   * each sentence is processed separately; otherwise the annotation is processed as a whole.
   * If {@code matchedExpressionsAnnotationKey} is set, the matches are stored under that key
   * on each sentence and (combined) on the annotation itself.
   *
   * @param annotation the annotation to process
   */
  @Override
  public void annotate(Annotation annotation) {
    if (verbose) {
      Redwood.log(Redwood.DBG, "Adding TokensRegexAnnotator annotation...");
    }
    if (options.setTokenOffsets) {
      addTokenOffsets(annotation);
    }
    // just do nothing if no extractor is specified
    if (extractor != null) {
      List<CoreMap> allMatched;
      // Fetch once and test for null so that a present-but-null sentence list
      // falls back to whole-annotation extraction instead of failing.
      List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
      if (sentences != null) {
        allMatched = new ArrayList<>();
        for (CoreMap sentence : sentences) {
          List<CoreMap> matched = extract(sentence);
          if (matched != null && options.matchedExpressionsAnnotationKey != null) {
            allMatched.addAll(matched);
            sentence.set(options.matchedExpressionsAnnotationKey, matched);
            // tag every match with the index of the sentence it came from
            Integer sentenceIndex = sentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
            for (CoreMap cm : matched) {
              cm.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
            }
          }
        }
      } else {
        allMatched = extract(annotation);
      }
      if (options.matchedExpressionsAnnotationKey != null) {
        annotation.set(options.matchedExpressionsAnnotationKey, allMatched);
      }
    }
    if (verbose) {
      Redwood.log(Redwood.DBG, "done.");
    }
  }

  /**
   * @return the annotations this annotator needs: only {@code TokensAnnotation}
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.singleton(CoreAnnotations.TokensAnnotation.class);
  }

  /**
   * @return an empty set; what this annotator produces depends on the configured rules
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    // TODO: not sure what goes here
    return Collections.emptySet();
  }

}