package edu.stanford.nlp.ling.tokensregex; import edu.stanford.nlp.ling.tokensregex.types.Expressions; import edu.stanford.nlp.ling.tokensregex.types.Tags; import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; import edu.stanford.nlp.util.Function; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.Pair; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Holds environment variables to be used for compiling string into a pattern. * Use {@link EnvLookup} to perform actual lookup (it will provide reasonable defaults) * * <p> * Some of the types of variables to bind are: * <ul> * <li><code>SequencePattern</code> (compiled pattern)</li> * <li><code>PatternExpr</code> (sequence pattern expression - precompiled)</li> * <li><code>NodePattern</code> (pattern for matching one element)</li> * <li><code>Class</code> (binding of CoreMap attribute to java Class)</li> * </ul> * </p> */ public class Env { /** * Parser that converts a string into a SequencePattern. * @see edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser */ SequencePattern.Parser parser; /** * Mapping of variable names to their values */ Map<String, Object> variables = Generics.newHashMap(); /** * Mapping of per thread temporary variables to their values */ ThreadLocal<Map<String,Object>> threadLocalVariables = new ThreadLocal<Map<String,Object>>(); /** * Mapping of variables that can be expanded in a regular expression for strings, * to their regular expressions. * The variable name must start with "$" and include only the alphanumeric characters * (it should follow the pattern <code>$[A-Za-z0-9_]+</code>). * Each variable is mapped to a pair, consisting of the <code>Pattern</code> representing * the name of the variable to be replaced, and a <code>String</code> representing the * regular expression (escaped) that is used to replace the name of the variable. */ Map<String, Pair<Pattern,String>> stringRegexVariables = Generics.newHashMap(); /** * Default parameters (used when reading in rules for {@link SequenceMatchRules}. */ public Map<String, Object> defaults = Generics.newHashMap(); /** * Default flags to use for string regular expressions match * @see java.util.regex.Pattern#compile(String,int) */ public int defaultStringPatternFlags = 0; /** * Default flags to use for string literal match * @see NodePattern#CASE_INSENSITIVE */ public int defaultStringMatchFlags = 0; public Class sequenceMatchResultExtractor; public Class stringMatchResultExtractor; /** * Annotation key to use to getting tokens (default is CoreAnnotations.TokensAnnotation.class) */ public Class defaultTokensAnnotationKey; /** * Annotation key to use to getting text (default is CoreAnnotations.TextAnnotation.class) */ public Class defaultTextAnnotationKey; /** * List of keys indicating the per-token annotations (default is null). * If specified, each token will be annotated with the extracted results from the * {@link #defaultResultsAnnotationExtractor}. * If null, then individual tokens that are matched are not annotated. */ public List<Class> defaultTokensResultAnnotationKey; /** * List of keys indicating what fields should be annotated for the aggregated coremap. * If specified, the aggregated coremap is annotated with the extracted results from the * {@link #defaultResultsAnnotationExtractor}. * If null, then the aggregated coremap is not annotated. */ public List<Class> defaultResultAnnotationKey; /** * Annotation key to use during composite phase for storing matched sequences and to match against. */ public Class defaultNestedResultsAnnotationKey; /** * How should the tokens be aggregated when collapsing a sequence of tokens into one CoreMap */ public Map<Class, CoreMapAttributeAggregator> defaultTokensAggregators; /** * How annotations be extracted from the MatchedExpression * If the result type is a List and more than one annotation key is specified, * then the result is paired with the annotation key * Example: If annotation key is [ner,normalized] and result is [CITY,San Francisco] * then the final coremap will have ner=CITY, normalized=San Francisco * Otherwise, the result is treated as one object (all keys will be assigned that value). */ Function<MatchedExpression,?> defaultResultsAnnotationExtractor; /** * Interface for performing custom binding of values to the environment */ public static interface Binder { public void init(String prefix, Properties props); public void bind(Env env); } public Env(SequencePattern.Parser p) { this.parser = p; } public void initDefaultBindings() { bind("FALSE", Expressions.FALSE); bind("TRUE", Expressions.TRUE); bind("NIL", Expressions.NIL); bind("ENV", this); bind("tags", Tags.TagsAnnotation.class); } public Map<String, Object> getDefaults() { return defaults; } public void setDefaults(Map<String, Object> defaults) { this.defaults = defaults; } public Map<Class, CoreMapAttributeAggregator> getDefaultTokensAggregators() { return defaultTokensAggregators; } public void setDefaultTokensAggregators(Map<Class, CoreMapAttributeAggregator> defaultTokensAggregators) { this.defaultTokensAggregators = defaultTokensAggregators; } public Class getDefaultTextAnnotationKey() { return defaultTextAnnotationKey; } public void setDefaultTextAnnotationKey(Class defaultTextAnnotationKey) { this.defaultTextAnnotationKey = defaultTextAnnotationKey; } public Class getDefaultTokensAnnotationKey() { return defaultTokensAnnotationKey; } public void setDefaultTokensAnnotationKey(Class defaultTokensAnnotationKey) { this.defaultTokensAnnotationKey = defaultTokensAnnotationKey; } public List<Class> getDefaultTokensResultAnnotationKey() { return defaultTokensResultAnnotationKey; } public void setDefaultTokensResultAnnotationKey(Class... defaultTokensResultAnnotationKey) { this.defaultTokensResultAnnotationKey = Arrays.asList(defaultTokensResultAnnotationKey); } public void setDefaultTokensResultAnnotationKey(List<Class> defaultTokensResultAnnotationKey) { this.defaultTokensResultAnnotationKey = defaultTokensResultAnnotationKey; } public List<Class> getDefaultResultAnnotationKey() { return defaultResultAnnotationKey; } public void setDefaultResultAnnotationKey(Class... defaultResultAnnotationKey) { this.defaultResultAnnotationKey = Arrays.asList(defaultResultAnnotationKey); } public void setDefaultResultAnnotationKey(List<Class> defaultResultAnnotationKey) { this.defaultResultAnnotationKey = defaultResultAnnotationKey; } public Class getDefaultNestedResultsAnnotationKey() { return defaultNestedResultsAnnotationKey; } public void setDefaultNestedResultsAnnotationKey(Class defaultNestedResultsAnnotationKey) { this.defaultNestedResultsAnnotationKey = defaultNestedResultsAnnotationKey; } public Function<MatchedExpression, ?> getDefaultResultsAnnotationExtractor() { return defaultResultsAnnotationExtractor; } public void setDefaultResultsAnnotationExtractor(Function<MatchedExpression, ?> defaultResultsAnnotationExtractor) { this.defaultResultsAnnotationExtractor = defaultResultsAnnotationExtractor; } public Class getSequenceMatchResultExtractor() { return sequenceMatchResultExtractor; } public void setSequenceMatchResultExtractor(Class sequenceMatchResultExtractor) { this.sequenceMatchResultExtractor = sequenceMatchResultExtractor; } public Class getStringMatchResultExtractor() { return stringMatchResultExtractor; } public void setStringMatchResultExtractor(Class stringMatchResultExtractor) { this.stringMatchResultExtractor = stringMatchResultExtractor; } public Map<String, Object> getVariables() { return variables; } public void setVariables(Map<String, Object> variables) { this.variables = variables; } public void clearVariables() { this.variables.clear(); } public int getDefaultStringPatternFlags() { return defaultStringPatternFlags; } public void setDefaultStringPatternFlags(int defaultStringPatternFlags) { this.defaultStringPatternFlags = defaultStringPatternFlags; } public int getDefaultStringMatchFlags() { return defaultStringMatchFlags; } public void setDefaultStringMatchFlags(int defaultStringMatchFlags) { this.defaultStringMatchFlags = defaultStringMatchFlags; } private static final Pattern STRING_REGEX_VAR_NAME_PATTERN = Pattern.compile("\\$[A-Za-z0-9_]+"); public void bindStringRegex(String var, String regex) { // Enforce requirements on variable names ($alphanumeric_) if (!STRING_REGEX_VAR_NAME_PATTERN.matcher(var).matches()) { throw new IllegalArgumentException("StringRegex binding error: Invalid variable name " + var); } Pattern varPattern = Pattern.compile(Pattern.quote(var)); String replace = Matcher.quoteReplacement(regex); stringRegexVariables.put(var, new Pair<Pattern, String>(varPattern, replace)); } public String expandStringRegex(String regex) { // Replace all variables in regex String expanded = regex; for (String v:stringRegexVariables.keySet()) { Pair<Pattern,String> p = stringRegexVariables.get(v); expanded = p.first().matcher(expanded).replaceAll(p.second()); } return expanded; } public Pattern getStringPattern(String regex) { String expanded = expandStringRegex(regex); return Pattern.compile(expanded, defaultStringPatternFlags); } public void bind(String name, Object obj) { if (obj != null) { variables.put(name, obj); } else { variables.remove(name); } } public void bind(String name, SequencePattern pattern) { bind(name, pattern.getPatternExpr()); } public void unbind(String name) { bind(name, null); } public NodePattern getNodePattern(String name) { Object obj = variables.get(name); if (obj != null) { if (obj instanceof SequencePattern) { SequencePattern seqPattern = (SequencePattern) obj; if (seqPattern.getPatternExpr() instanceof SequencePattern.NodePatternExpr) { return ((SequencePattern.NodePatternExpr) seqPattern.getPatternExpr()).nodePattern; } else { throw new Error("Invalid node pattern class: " + seqPattern.getPatternExpr().getClass() + " for variable " + name); } } else if (obj instanceof SequencePattern.NodePatternExpr) { SequencePattern.NodePatternExpr pe = (SequencePattern.NodePatternExpr) obj; return pe.nodePattern; } else if (obj instanceof NodePattern) { return (NodePattern) obj; } else if (obj instanceof String) { try { SequencePattern.NodePatternExpr pe = (SequencePattern.NodePatternExpr) parser.parseNode(this, (String) obj); return pe.nodePattern; } catch (Exception pex) { throw new RuntimeException("Error parsing " + obj + " to node pattern", pex); } } else { throw new Error("Invalid node pattern variable class: " + obj.getClass() + " for variable " + name); } } return null; } public SequencePattern.PatternExpr getSequencePatternExpr(String name, boolean copy) { Object obj = variables.get(name); if (obj != null) { if (obj instanceof SequencePattern) { SequencePattern seqPattern = (SequencePattern) obj; return seqPattern.getPatternExpr(); } else if (obj instanceof SequencePattern.PatternExpr) { SequencePattern.PatternExpr pe = (SequencePattern.PatternExpr) obj; return (copy)? pe.copy():pe; } else if (obj instanceof NodePattern) { return new SequencePattern.NodePatternExpr( (NodePattern) obj); } else if (obj instanceof String) { try { return parser.parseSequence(this, (String) obj); } catch (Exception pex) { throw new RuntimeException("Error parsing " + obj + " to sequence pattern", pex); } } else { throw new Error("Invalid sequence pattern variable class: " + obj.getClass()); } } return null; } public Object get(String name) { return variables.get(name); } // Functions for storing temporary thread specific variables // that are used when running tokensregex public void push(String name, Object value) { Map<String,Object> vars = threadLocalVariables.get(); if (vars == null) { threadLocalVariables.set(vars = Generics.newHashMap()); } Stack<Object> stack = (Stack<Object>) vars.get(name); if (stack == null) { vars.put(name, stack = new Stack<Object>()); } stack.push(value); } public Object pop(String name) { Map<String,Object> vars = threadLocalVariables.get(); if (vars == null) return null; Stack<Object> stack = (Stack<Object>) vars.get(name); if (stack == null || stack.isEmpty()) { return null; } else { return stack.pop(); } } public Object peek(String name) { Map<String,Object> vars = threadLocalVariables.get(); if (vars == null) return null; Stack<Object> stack = (Stack<Object>) vars.get(name); if (stack == null || stack.isEmpty()) { return null; } else { return stack.peek(); } } }