package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.CoreMapExpressionExtractor;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.EnvLookup;
import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.ling.tokensregex.*;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.*;
import java.util.Set;
/**
* Uses TokensRegex patterns to annotate tokens.
*
* <p>
* Configuration:
* <ul>
* <li>{@code rules} - Name of file containing extraction rules
* (see {@link CoreMapExpressionExtractor} and {@link edu.stanford.nlp.ling.tokensregex.SequenceMatchRules})</li>
* </ul>
* Other options (can be set in rules file using {@code options.xxx = ...})
* <ul>
* <li>{@code setTokenOffsets} - whether to explicitly set the token offsets of individual tokens (needed for token sequence matches to work)</li>
* <li>{@code extractWithTokens} - whether to return unmatched tokens as well</li>
* <li>{@code flatten} - whether to flatten matched expressions into individual tokens</li>
* <li>{@code matchedExpressionsAnnotationKey} - Annotation key where matched expressions are stored as a list</li>
* </ul>
* </p>
* <p>Multiple {@code TokensRegexAnnotator}s can be configured using the same properties file by specifying
* a different prefix for each {@code TokensRegexAnnotator}</p>
*
* @author Angel Chang
*/
public class TokensRegexAnnotator implements Annotator {
private final Env env;
private final CoreMapExpressionExtractor<MatchedExpression> extractor;
private final Options options = new Options();
private final boolean verbose;
// Make public so can be accessed and set via reflection
public static class Options {
public Class matchedExpressionsAnnotationKey;
public boolean setTokenOffsets;
public boolean extractWithTokens;
public boolean flatten;
}
public TokensRegexAnnotator(String... files) {
env = TokenSequencePattern.getNewEnv();
extractor = CoreMapExpressionExtractor.createExtractorFromFiles(env, files);
verbose = false;
}
public TokensRegexAnnotator(String name, Properties props) {
String prefix = (name == null)? "": name + '.';
String[] files = PropertiesUtils.getStringArray(props, prefix + "rules");
env = TokenSequencePattern.getNewEnv();
env.bind("options", options);
if (PropertiesUtils.getBool(props, prefix+"caseInsensitive")) {
System.err.println("using case insensitive!");
env.setDefaultStringMatchFlags(NodePattern.CASE_INSENSITIVE);
env.setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE);
}
if (files.length != 0) {
extractor = CoreMapExpressionExtractor.createExtractorFromFiles(env, files);
} else {
extractor = null;
}
verbose = PropertiesUtils.getBool(props, prefix + "verbose", false);
options.setTokenOffsets = PropertiesUtils.getBool(props, prefix + "setTokenOffsets", options.setTokenOffsets);
options.extractWithTokens = PropertiesUtils.getBool(props, prefix + "extractWithTokens", options.extractWithTokens);
options.flatten = PropertiesUtils.getBool(props, prefix + "flatten", options.flatten);
String matchedExpressionsAnnotationKeyName = props.getProperty(prefix + "matchedExpressionsAnnotationKey");
if (matchedExpressionsAnnotationKeyName != null) {
options.matchedExpressionsAnnotationKey = EnvLookup.lookupAnnotationKeyWithClassname(env, matchedExpressionsAnnotationKeyName);
if (options.matchedExpressionsAnnotationKey == null) {
String propName = prefix + "matchedExpressionsAnnotationKey";
throw new RuntimeException("Cannot determine annotation key for " + propName + '=' + matchedExpressionsAnnotationKeyName);
}
}
}
public TokensRegexAnnotator(Properties props) {
this(null, props);
}
private static void addTokenOffsets(CoreMap annotation) {
// We are going to mark the token begin and token end for each token
Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
if (startTokenOffset == null) {
startTokenOffset = 0;
}
//set token offsets
int i = 0;
for (CoreMap c:annotation.get(CoreAnnotations.TokensAnnotation.class)) {
//set token begin
c.set(CoreAnnotations.TokenBeginAnnotation.class, i+startTokenOffset);
i++;
//set token end
c.set(CoreAnnotations.TokenEndAnnotation.class, i+startTokenOffset);
}
}
private List<CoreMap> extract(CoreMap annotation) {
List<CoreMap> cms;
if (options.extractWithTokens) {
cms = extractor.extractCoreMapsMergedWithTokens(annotation);
} else {
cms = extractor.extractCoreMaps(annotation);
}
if (options.flatten) {
return extractor.flatten(cms);
} else {
return cms;
}
}
@Override
public void annotate(Annotation annotation) {
if (verbose) {
Redwood.log(Redwood.DBG, "Adding TokensRegexAnnotator annotation...");
}
if (options.setTokenOffsets) {
addTokenOffsets(annotation);
}
// just do nothing if no extractor is specified
if (extractor != null) {
List<CoreMap> allMatched;
if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
allMatched = new ArrayList<>();
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
List<CoreMap> matched = extract(sentence);
if (matched != null && options.matchedExpressionsAnnotationKey != null) {
allMatched.addAll(matched);
sentence.set(options.matchedExpressionsAnnotationKey, matched);
for (CoreMap cm : matched) {
cm.set(CoreAnnotations.SentenceIndexAnnotation.class, sentence.get(CoreAnnotations.SentenceIndexAnnotation.class));
}
}
}
} else {
allMatched = extract(annotation);
}
if (options.matchedExpressionsAnnotationKey != null) {
annotation.set(options.matchedExpressionsAnnotationKey, allMatched);
}
}
if (verbose) {
Redwood.log(Redwood.DBG, "done.");
}
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
return Collections.singleton(CoreAnnotations.TokensAnnotation.class);
}
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
// TODO: not sure what goes here
return Collections.emptySet();
}
}