package edu.stanford.nlp.pipeline; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.tokensregex.CoreMapExpressionExtractor; import edu.stanford.nlp.ling.tokensregex.Env; import edu.stanford.nlp.ling.tokensregex.EnvLookup; import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; import edu.stanford.nlp.util.*; import edu.stanford.nlp.util.logging.Redwood; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Properties; import java.util.Set; /** * <p>Uses TokensRegex patterns to annotate tokens.</p> * * <p> * Configuration: * <ul> * <li><code>rules</code> - Name of file containing extraction rules * (see {@link CoreMapExpressionExtractor} and {@link edu.stanford.nlp.ling.tokensregex.SequenceMatchRules}</li> * </ul> * Other options (can be set in rules file using <code>options.xxx = ...</code>) * <ul> * <li><code>setTokenOffsets</code> - whether to explicit set the token offsets of individual tokens (needed to token sequence matches to work)</li> * <li><code>extractWithTokens</code> - whether to return unmatched tokens as well</li> * <li><code>flatten</code> - whether to flatten matched expressions into individual tokens</li> * <li><code>matchedExpressionsAnnotationKey</code> - Annotation key where matched expressions are stored as a list</li> * </ul> * </p> * <p>Multiple <code>TokensRegexAnnotator</code> can be configured using the same properties file by specifying * difference prefix for the <code>TokensRegexAnnotator</code></p> * * @author Angel Chang */ public class TokensRegexAnnotator implements Annotator { private Env env; private CoreMapExpressionExtractor extractor; private Options options = new Options(); public static class Options { public Class matchedExpressionsAnnotationKey; public boolean setTokenOffsets; public boolean extractWithTokens; public boolean flatten; } private Timing timer = new Timing(); private boolean verbose; public TokensRegexAnnotator(String... files) { env = TokenSequencePattern.getNewEnv(); extractor = CoreMapExpressionExtractor.createExtractorFromFiles(env, files); } public TokensRegexAnnotator(String name, Properties props) { String prefix = (name == null)? "":name + "."; String[] files = PropertiesUtils.getStringArray(props, prefix + "rules"); if (files == null || files.length == 0) { throw new RuntimeException("No rules specified for TokensRegexAnnotator " + name + ", check " + prefix + "rules property"); } env = TokenSequencePattern.getNewEnv(); env.bind("options", options); extractor = CoreMapExpressionExtractor.createExtractorFromFiles(env, files); verbose = PropertiesUtils.getBool(props, prefix + "verbose", verbose); options.setTokenOffsets = PropertiesUtils.getBool(props, prefix + "setTokenOffsets", options.setTokenOffsets); options.extractWithTokens = PropertiesUtils.getBool(props, prefix + "extractWithTokens", options.extractWithTokens); options.flatten = PropertiesUtils.getBool(props, prefix + "flatten", options.flatten); String matchedExpressionsAnnotationKeyName = props.getProperty(prefix + "matchedExpressionsAnnotationKey"); if (matchedExpressionsAnnotationKeyName != null) { options.matchedExpressionsAnnotationKey = EnvLookup.lookupAnnotationKey(env, matchedExpressionsAnnotationKeyName); if (options.matchedExpressionsAnnotationKey == null) { String propName = prefix + "matchedExpressionsAnnotationKey"; throw new RuntimeException("Cannot determine annotation key for " + propName + "=" + matchedExpressionsAnnotationKeyName); } } } public TokensRegexAnnotator(Properties props) { this(null, props); } public void addTokenOffsets(CoreMap annotation) { // We are going to mark the token begin and token end for each token Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class); if (startTokenOffset == null) { startTokenOffset = 0; } //set token offsets int i = 0; for (CoreMap c:annotation.get(CoreAnnotations.TokensAnnotation.class)) { //set token begin c.set(CoreAnnotations.TokenBeginAnnotation.class, i+startTokenOffset); i++; //set token end c.set(CoreAnnotations.TokenEndAnnotation.class, i+startTokenOffset); } } private List<CoreMap> extract(CoreMap annotation) { List<CoreMap> cms; if (options.extractWithTokens) { cms = extractor.extractCoreMapsMergedWithTokens(annotation); } else { cms = extractor.extractCoreMaps(annotation); } if (options.flatten) { return extractor.flatten(cms); } else { return cms; } } public void annotate(Annotation annotation) { if (verbose) { timer.start(); Redwood.log(Redwood.DBG, "Adding TokensRegexAnnotator annotation..."); } if (options.setTokenOffsets) { addTokenOffsets(annotation); } List<CoreMap> allMatched; if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) { allMatched = new ArrayList<CoreMap>(); List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { List<CoreMap> matched = extract(sentence); if (matched != null && options.matchedExpressionsAnnotationKey != null) { allMatched.addAll(matched); sentence.set(options.matchedExpressionsAnnotationKey, matched); for (CoreMap cm:matched) { cm.set(CoreAnnotations.SentenceIndexAnnotation.class, sentence.get(CoreAnnotations.SentenceIndexAnnotation.class)); } } } } else { allMatched = extract(annotation); } if (options.matchedExpressionsAnnotationKey != null) { annotation.set(options.matchedExpressionsAnnotationKey, allMatched); } if (verbose) timer.stop("done."); } @Override public Set<Requirement> requires() { return Collections.singleton(TOKENIZE_REQUIREMENT); } @Override public Set<Requirement> requirementsSatisfied() { // TODO: not sure what goes here return Collections.emptySet(); } }