package edu.stanford.nlp.ling.tokensregex;
import edu.stanford.nlp.ling.tokensregex.types.AssignableExpression;
import edu.stanford.nlp.ling.tokensregex.types.Expression;
import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.util.*;
import java.io.Serializable;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Rules for matching sequences using regular expressions
* <p>
* There are 2 types of rules:
* <ol>
* <li><b>Assignment rules</b> which assign a value to a variable for later use.
* </li>
* <li><b>Extraction rules</b> which specify how regular expression patterns are to be matched against text,
* which matched text expressions are to be extracted, and what value to assign to the matched expression.</li>
* </ol>
* </p>
*
* NOTE: {@code #} or {@code //} can be used to indicate one-line comments
*
* <p><b>Assignment Rules</b> are used to assign values to variables.
* The basic format is: {@code variable = value}
* </p>
* <p>
* <em>Variable Names</em>:
* <ul>
* <li>Variable names should follow the pattern [A-Za-z_][A-Za-z0-9_]*</li>
* <li>Variable names for use in regular expressions (to be expanded later) must start with {@code $}</li>
* </ul>
* </p>
* <p>
* <em>Value Types</em>:
* <table>
* <tr><th>Type</th><th>Format</th><th>Example</th><th>Description</th></tr>
* <tr><td>{@code BOOLEAN}</td><td>{@code TRUE | FALSE}</td><td>{@code TRUE}</td><td></td></tr>
* <tr><td>{@code STRING}</td><td>{@code "..."}</td><td>{@code "red"}</td><td></td></tr>
* <tr><td>{@code INTEGER}</td><td>{@code [+-]\d+}</td><td>{@code 1500}</td><td></td></tr>
* <tr><td>{@code LONG}</td><td>{@code [+-]\d+L}</td><td>{@code 1500000000000L}</td><td></td></tr>
* <tr><td>{@code DOUBLE}</td><td>{@code [+-]\d*\.\d+}</td><td>{@code 6.98}</td><td></td></tr>
* <tr><td>{@code REGEX}</td><td>{@code /.../}</td><td>{@code /[Aa]pril/}</td>
* <td>String regular expression {@link Pattern}</td></tr>
* <tr><td>{@code TOKENS_REGEX}</td><td>{@code ( [...] [...] ... ) }</td><td>{@code ( /up/ /to/ /4/ /months/ )}</td>
* <td>Tokens regular expression {@link TokenSequencePattern}</td></tr>
* <tr><td>{@code LIST}</td><td>{@code ( [item1] , [item2], ... )}</td><td>{@code ("red", "blue", "yellow" )}</td>
* <td></td></tr>
* </table>
* </p>
* <p>
* Some typical uses and examples for assignment rules include:
* <ol>
* <li>Assignment of value to variables for use in later rules</li>
* <li>Binding of text key to annotation key (as {@code Class}).
* <pre>
* tokens = { type: "CLASS", value: "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation" }
* </pre>
* </li>
* <li>Defining regular expressions macros to be embedded in other regular expressions
* <pre>
* $SEASON = "/spring|summer|fall|autumn|winter/"
* $NUM = ( [ { numcomptype:NUMBER } ] )
* </pre>
* </li>
* <li>Setting default environment variables.
* Rules are applied with respect to an environment ({@link Env}), which can be accessed using the variable {@code ENV}.
* Members of the Environment can be set as needed.
* <pre>
* # Set default parameters to be used when reading rules
* ENV.defaults["ruleType"] = "tokens"
* # Set default string pattern flags (to case-insensitive)
* ENV.defaultStringPatternFlags = 2
* # Specifies that the result should go into the {@code tokens} key (as defined above).
* ENV.defaultResultAnnotationKey = tokens
* </pre>
* </li>
* <li>Defining options</li>
* </ol>
* </p>
*
* Predefined values are:
* <table>
* <tr><th>Variable</th><th>Type</th><th>Description</th></tr>
* <tr><td>{@code ENV}</td><td>{@link Env}</td><td>The environment with respect to which the rules are applied.</td></tr>
* <tr><td>{@code TRUE}</td><td>{@code BOOLEAN}</td><td>The {@code Boolean} value {@code true}.</td></tr>
* <tr><td>{@code FALSE}</td><td>{@code BOOLEAN}</td><td>The {@code Boolean} value {@code false}.</td></tr>
* <tr><td>{@code NIL}</td><td>{@code}</td><td>The {@code null} value.</td></tr>
* <tr><td>{@code tags}</td><td>{@code Class}</td><td>The annotation key {@link edu.stanford.nlp.ling.tokensregex.types.Tags.TagsAnnotation}.</td></tr>
* </table>
* </p>
* <p><b>Extraction Rules</b> specify how regular expression patterns are to be matched against text.
* See {@link CoreMapExpressionExtractor} for more information on the types of the rules, and in what sequence the rules are applied.
* A basic rule can be specified using the following template:
* <pre>{
* # Type of the rule
* ruleType: "tokens" | "text" | "composite" | "filter",
* # Pattern to match against
* pattern: ( <TokenSequencePattern> ) | /<TextPattern>/,
* # Resulting value to go into the resulting annotation
* result: ...
*
* # More fields following...
* }
* </pre>
* Example:
* <pre>
* {
* ruleType: "tokens",
* pattern: ( /one/ ),
* result: 1
* }
* </pre>
* </p>
* Extraction rule fields (most fields are optional):
* <table>
* <tr><th>Field</th><th>Values</th><th>Example</th><th>Description</th></tr>
* <tr><td>{@code ruleType}</td><td>{@code "tokens" | "text" | "composite" | "filter" }</td>
* <td>{@code tokens}</td><td>Type of the rule (required).</td></tr>
* <tr><td>{@code pattern}</td><td>{@code <Token Sequence Pattern> = (...) | <Text Pattern> = /.../}</td>
* <td>{@code ( /winter/ /of/ $YEAR )}</td><td>Pattern to match against.
* See {@link TokenSequencePattern} and {@link Pattern} for
* how to specify patterns over tokens and strings (required).</td></tr>
* <tr><td>{@code action}</td><td>{@code <Action List> = (...)}</td>
* <td>{@code ( Annotate($0, ner, "DATE") )}</td><td>List of actions to apply when the pattern is triggered.
* Each action is a {@link Expressions TokensRegex Expression}</td></tr>
* <tr><td>{@code result}</td><td>{@code <Expression>}</td>
* <td>{@code}</td><td>Resulting value to go into the resulting annotation. See {@link Expressions} for how to specify the result.</td></tr>
* <tr><td>{@code name}</td><td>{@code STRING}</td>
* <td>{@code}</td><td>Name to identify the extraction rule.</td></tr>
* <tr><td>{@code stage}</td><td>{@code INTEGER}</td>
* <td>{@code}</td><td>Stage at which the rule is to be applied. Rules are grouped in stages, which are applied from lowest to highest.</td></tr>
* <tr><td>{@code active}</td><td>{@code Boolean}</td>
* <td>{@code}</td><td>Whether this rule is enabled (active) or not (default true).</td></tr>
* <tr><td>{@code priority}</td><td>{@code DOUBLE}</td>
* <td>{@code}</td><td>Priority of rule. Within a stage, matches from higher priority rules are preferred.</td></tr>
* <tr><td>{@code weight}</td><td>{@code DOUBLE}</td>
* <td>{@code}</td><td>Weight of rule (not currently used).</td></tr>
* <tr><td>{@code over}</td><td>{@code CLASS}</td>
* <td>{@code}</td><td>Annotation field to check pattern against.</td></tr>
* <tr><td>{@code matchFindType}</td><td>{@code FIND_NONOVERLAPPING | FIND_ALL}</td>
* <td>{@code}</td><td>Whether to find all matched expression or just the nonoverlapping ones (default {@code FIND_NONOVERLAPPING}).</td></tr>
* <tr><td>{@code matchWithResults}</td><td>{@code Boolean}</td>
* <td>{@code}</td><td>Whether results of the matches should be returned (default false).
* Set to true to access captured groups of embedded regular expressions.</td></tr>
* <tr><td>{@code matchedExpressionGroup}</td><td>{@code Integer}</td>
* <td>{@code 2}</td><td>What group should be treated as the matched expression group (default 0).</td></tr>
* </table>
*
* @author Angel Chang
* @see CoreMapExpressionExtractor
* @see TokenSequencePattern
*/
public class SequenceMatchRules {
/** Private so the class cannot be instantiated from outside; it only hosts nested types and static factory methods. */
private SequenceMatchRules() { } // static class with inner classes
/**
 * A sequence match rule.
 * This is a marker interface: it declares no methods. It is implemented by
 * {@link AssignmentRule} and {@link AnnotationExtractRule}.
 */
public interface Rule {
}
/**
 * Rule that binds a value to a variable.
 * The constructor stores the expression returned by {@code varExpr.assign(value)};
 * {@link #evaluate(Env)} evaluates that stored expression against an environment.
 */
public static class AssignmentRule implements Rule {
  /** Expression produced by {@code varExpr.assign(value)} at construction time. */
  Expression expr;

  /**
   * @param varExpr the assignable (left-hand side) expression
   * @param value   the expression to assign to it
   */
  public AssignmentRule(AssignableExpression varExpr, Expression value) {
    this.expr = varExpr.assign(value);
  }

  /**
   * Evaluates the stored assignment expression.
   *
   * @param env environment passed to the expression
   */
  public void evaluate(Env env) {
    this.expr.evaluate(env);
  }
}
/**
 * Rule that specifies how to extract sequence of MatchedExpression from an annotation (CoreMap).
 * <p>
 * The rule is a mutable bag of public settings plus two delegates:
 * {@link #extractRule} (used by {@link #extract}) and {@link #filterRule} (used by {@link #test}).
 * Both delegates are {@code null} until a rule creator fills them in.
 * </p>
 *
 * @param <S> Input type handed to {@link #extract}
 * @param <T> Output type (MatchedExpression)
 */
public static class AnnotationExtractRule<S, T extends MatchedExpression> implements Rule, ExtractRule<S,T>, Predicate<T>, Serializable {
  private static final long serialVersionUID = -2148125332223720424L;

  /** Name of the rule */
  public String name;
  /** Stage in which this rule should be applied with respect to others */
  public int stage = 1;
  /** Priority in which this rule should be applied with respect to others */
  public double priority;
  /** Weight given to the rule (how likely is this rule to fire) */
  public double weight;
  /** Annotation field to apply rule over: text or tokens or numerizedtokens */
  public Class annotationField;
  public Class tokensAnnotationField;
  /** Annotation field(s) on individual tokens to put new annotation */
  public List<Class> tokensResultAnnotationField;
  /** Annotation field(s) to put new annotation */
  public List<Class> resultAnnotationField;
  /** Annotation field for child/nested annotations */
  public Class resultNestedAnnotationField;
  public SequenceMatcher.FindType matchFindType;
  /** Which group to take as the matched expression - default is 0 */
  public int matchedExpressionGroup;
  public boolean matchWithResults;
  // TODO: Combine ruleType and isComposite
  /** Type of rule to apply: token string match, pattern string match */
  public String ruleType;
  public boolean isComposite;
  public boolean includeNested = true;  // TODO: Get parameter from somewhere....
  public boolean active = true;
  /** Actual rule performing the extraction (converting annotation to MatchedExpression) */
  public ExtractRule<S, T> extractRule;
  public Predicate<T> filterRule;
  /** Pattern - the type of which is dependent on the rule type */
  public Object pattern;
  public Expression result;

  /**
   * Copies recognized settings from an attribute map onto this rule.
   * Each value is first resolved with {@code Expressions.asObject(env, value)}.
   * Keys that are not handled by the switch below are silently ignored.
   *
   * @param env        environment used to resolve attribute values
   * @param attributes attribute name to (possibly unevaluated) value
   * @throws IllegalArgumentException if the {@code over} attribute cannot be turned into an annotation key
   * @throws ClassCastException       if a resolved value does not have the type its key requires
   */
  public void update(Env env, Map<String, Object> attributes) {
    for (Map.Entry<String, Object> stringObjectEntry : attributes.entrySet()) {
      String key = stringObjectEntry.getKey();
      Object obj = stringObjectEntry.getValue();
      switch (key) {
        case "name":
          name = (String) Expressions.asObject(env, obj);
          break;
        case "priority":
          priority = ((Number) Expressions.asObject(env, obj)).doubleValue();
          break;
        case "stage":
          stage = ((Number) Expressions.asObject(env, obj)).intValue();
          break;
        case "weight":
          weight = ((Number) Expressions.asObject(env, obj)).doubleValue();
          break;
        case "over":
          Object annoKey = Expressions.asObject(env, obj);
          if (annoKey instanceof Class) {
            annotationField = (Class) annoKey;
          } else if (annoKey instanceof String) {
            annotationField = EnvLookup.lookupAnnotationKeyWithClassname(env, (String) annoKey);
          } else if (annotationField == null) {
            // NOTE(review): this tests the current field, not annoKey. An unsupported
            // key is therefore accepted (as CoreMap.class) whenever no field was set
            // before - possibly `annoKey == null` was intended. Behavior kept as is.
            annotationField = CoreMap.class;
          } else {
            throw new IllegalArgumentException("Invalid annotation key " + annoKey);
          }
          break;
        case "active":
          active = (Boolean) Expressions.asObject(env, obj);
          break;
        case "ruleType":
          ruleType = (String) Expressions.asObject(env, obj);
          break;
        case "matchFindType":
          matchFindType = SequenceMatcher.FindType.valueOf((String) Expressions.asObject(env, obj));
          break;
        case "matchWithResults":
          matchWithResults = ((Boolean) Expressions.asObject(env, obj)).booleanValue();
          break;
        case "matchedExpressionGroup":
          matchedExpressionGroup = ((Number) Expressions.asObject(env, obj)).intValue();
          break;
      }
    }
  }

  /** Delegates to {@link #extractRule}. */
  @Override
  public boolean extract(S in, List<T> out) {
    return extractRule.extract(in, out);
  }

  /** Delegates to {@link #filterRule}. */
  @Override
  public boolean test(T obj) {
    return filterRule.test(obj);
  }

  /**
   * Compares the settings of two rules. Returns true only when both rules use
   * matched expression group 0 and agree on stage, annotation fields,
   * matchWithResults, ruleType, isComposite, active and result.
   *
   * @param aer rule to compare against
   * @return whether the compared settings all agree
   */
  public boolean isMostlyCompatible(AnnotationExtractRule<S, T> aer) {
    // TODO: Check tokensResultAnnotationField, resultAnnotationField, resultNestedAnnotationField?
    return (stage == aer.stage
        && Objects.equals(annotationField, aer.annotationField)
        && Objects.equals(tokensAnnotationField, aer.tokensAnnotationField)
        && matchedExpressionGroup == 0
        && aer.matchedExpressionGroup == 0
        && matchWithResults == aer.matchWithResults
        && Objects.equals(ruleType, aer.ruleType)
        && isComposite == aer.isComposite
        && active == aer.active
        && Objects.equals(result, aer.result));
  }

  /** @return whether {@link #pattern} is a {@link TokenSequencePattern} */
  public boolean hasTokensRegexPattern() {
    // instanceof is already false for null, so no separate null check is needed
    return pattern instanceof TokenSequencePattern;
  }

  /**
   * @return the simple class name followed by the pattern in square brackets;
   *         a missing pattern is rendered as {@code null} instead of throwing
   */
  @Override
  public String toString() {
    // String concatenation is null-safe, unlike the previous pattern.toString()
    return getClass().getSimpleName() + '[' + pattern + ']';
  }
} // end static class AnnotationExtractRule
/**
 * Builds an {@link AssignmentRule} and evaluates it against {@code env} before returning it.
 *
 * @param env    environment the rule is evaluated against
 * @param var    assignable expression naming the target
 * @param result expression providing the value
 * @return the rule, already evaluated once
 */
public static AssignmentRule createAssignmentRule(Env env, AssignableExpression var, Expression result) {
  AssignmentRule assignment = new AssignmentRule(var, result);
  assignment.evaluate(env);
  return assignment;
}
/**
 * Creates an extraction rule from a composite value.
 * The value is simplified with {@code simplifyNoTypeConversion(env)}, its attributes
 * are copied into a map of attribute name to expression, and the map is handed to
 * {@link #createExtractionRule(Env, Map)}.
 *
 * @param env environment used for simplification and rule creation
 * @param cv  composite value describing the rule
 * @return the created rule
 */
public static Rule createRule(Env env, Expressions.CompositeValue cv) {
  Expressions.CompositeValue simplified = cv.simplifyNoTypeConversion(env);
  Map<String, Object> attributes = new HashMap<>();
  for (String attributeName : simplified.getAttributes()) {
    attributes.put(attributeName, simplified.getExpression(attributeName));
  }
  return createExtractionRule(env, attributes);
}
/**
 * Creates an extraction rule from an attribute map.
 * The rule type is read from the {@code ruleType} attribute; when that resolves to
 * {@code null} and an environment is available, the environment default is used.
 *
 * @param env        environment (may be null)
 * @param attributes rule attributes
 * @return the rule built by the creator found for the rule type
 * @throws IllegalArgumentException if no creator is found for the rule type
 */
protected static AnnotationExtractRule createExtractionRule(Env env, Map<String,Object> attributes) {
  String ruleType = (String) Expressions.asObject(env, attributes.get("ruleType"));
  if (ruleType == null && env != null) {
    ruleType = (String) env.getDefaults().get("ruleType");
  }
  AnnotationExtractRuleCreator creator = lookupExtractRuleCreator(env, ruleType);
  if (creator == null) {
    throw new IllegalArgumentException("Unknown rule type: " + ruleType);
  }
  return creator.create(env, attributes);
}
/**
 * Creates an extraction rule from an explicit rule type, pattern and result.
 * A {@code null} rule type is replaced by the environment default when an
 * environment is available. The three values are packed into an attribute map
 * under the keys {@code ruleType}, {@code pattern} and {@code result}.
 *
 * @param env      environment (may be null)
 * @param ruleType rule type, or null to use the default
 * @param pattern  pattern object stored under {@code pattern}
 * @param result   result expression stored under {@code result}
 * @return the rule built by the creator found for the rule type
 * @throws IllegalArgumentException if no creator is found for the rule type
 */
public static AnnotationExtractRule createExtractionRule(Env env, String ruleType, Object pattern, Expression result) {
  if (ruleType == null && env != null) {
    ruleType = (String) env.getDefaults().get("ruleType");
  }
  AnnotationExtractRuleCreator creator = lookupExtractRuleCreator(env, ruleType);
  if (creator == null) {
    throw new IllegalArgumentException("Unknown rule type: " + ruleType);
  }
  Map<String,Object> attributes = new HashMap<>();
  attributes.put("ruleType", ruleType);
  attributes.put("pattern", pattern);
  attributes.put("result", result);
  return creator.create(env, attributes);
}
// Rule type names as they appear in the "ruleType" attribute of a rule.
public static final String COMPOSITE_RULE_TYPE = "composite";
public static final String TOKEN_PATTERN_RULE_TYPE = "tokens";
public static final String TEXT_PATTERN_RULE_TYPE = "text";
public static final String FILTER_RULE_TYPE = "filter";
// Shared creator instances. These must be declared before DEFAULT_EXTRACT_RULE_CREATOR
// and before the static initializer below, both of which read them.
public static final TokenPatternExtractRuleCreator TOKEN_PATTERN_EXTRACT_RULE_CREATOR = new TokenPatternExtractRuleCreator();
public static final CompositeExtractRuleCreator COMPOSITE_EXTRACT_RULE_CREATOR = new CompositeExtractRuleCreator();
public static final TextPatternExtractRuleCreator TEXT_PATTERN_EXTRACT_RULE_CREATOR = new TextPatternExtractRuleCreator();
public static final MultiTokenPatternExtractRuleCreator MULTI_TOKEN_PATTERN_EXTRACT_RULE_CREATOR = new MultiTokenPatternExtractRuleCreator();
/** Creator used by {@code lookupExtractRuleCreator} when the rule type is {@code null}. */
public static final AnnotationExtractRuleCreator DEFAULT_EXTRACT_RULE_CREATOR = TOKEN_PATTERN_EXTRACT_RULE_CREATOR;
/** Built-in mapping from rule type name to the creator that builds rules of that type. */
private static final Map<String, AnnotationExtractRuleCreator> registeredRuleTypes = new HashMap<>();//Generics.newHashMap();
static {
registeredRuleTypes.put(TOKEN_PATTERN_RULE_TYPE, TOKEN_PATTERN_EXTRACT_RULE_CREATOR);
registeredRuleTypes.put(COMPOSITE_RULE_TYPE, COMPOSITE_EXTRACT_RULE_CREATOR);
registeredRuleTypes.put(TEXT_PATTERN_RULE_TYPE, TEXT_PATTERN_EXTRACT_RULE_CREATOR);
// "filter" rules are built by the same creator as "tokens" rules
registeredRuleTypes.put(FILTER_RULE_TYPE, TOKEN_PATTERN_EXTRACT_RULE_CREATOR);
}
/**
 * Finds the creator for a rule type.
 * A creator bound in the environment under the rule type name takes precedence;
 * otherwise a {@code null} rule type yields the default creator and any other
 * name is looked up in the built-in registry.
 *
 * @param env      environment to consult first (may be null)
 * @param ruleType rule type name (may be null)
 * @return the creator, or null if the rule type is not registered
 */
private static AnnotationExtractRuleCreator lookupExtractRuleCreator(Env env, String ruleType) {
  if (env != null) {
    Object bound = env.get(ruleType);
    if (bound instanceof AnnotationExtractRuleCreator) {
      return (AnnotationExtractRuleCreator) bound;
    }
  }
  return (ruleType == null) ? DEFAULT_EXTRACT_RULE_CREATOR : registeredRuleTypes.get(ruleType);
}
/**
 * Creates a token pattern rule by delegating to {@link #TOKEN_PATTERN_EXTRACT_RULE_CREATOR}.
 *
 * @param env    environment
 * @param expr   pattern expression to match
 * @param result result expression
 * @return the created rule
 */
public static AnnotationExtractRule createTokenPatternRule(Env env, SequencePattern.PatternExpr expr, Expression result) {
  AnnotationExtractRule created = TOKEN_PATTERN_EXTRACT_RULE_CREATOR.create(env, expr, result);
  return created;
}
/**
 * Creates a text pattern rule by delegating to {@link #TEXT_PATTERN_EXTRACT_RULE_CREATOR}.
 *
 * @param env    environment
 * @param expr   string pattern to match
 * @param result result expression
 * @return the created rule
 */
public static AnnotationExtractRule createTextPatternRule(Env env, String expr, Expression result) {
  AnnotationExtractRule created = TEXT_PATTERN_EXTRACT_RULE_CREATOR.create(env, expr, result);
  return created;
}
/**
 * Creates a rule matching several token patterns at once by delegating to
 * {@link #MULTI_TOKEN_PATTERN_EXTRACT_RULE_CREATOR}.
 *
 * @param env      environment
 * @param template rule whose settings are copied onto the new rule
 * @param patterns patterns to match
 * @return the created rule
 */
public static AnnotationExtractRule createMultiTokenPatternRule(Env env, AnnotationExtractRule template, List<TokenSequencePattern> patterns) {
  AnnotationExtractRule created = MULTI_TOKEN_PATTERN_EXTRACT_RULE_CREATOR.create(env, template, patterns);
  return created;
}
/**
 * Base creator of {@link AnnotationExtractRule} instances.
 * It only fills in settings; it does not set the rule's pattern or its
 * extract/filter delegates.
 */
public static class AnnotationExtractRuleCreator {

  /**
   * Creates a rule initialized from the environment: the four annotation key
   * fields come from {@code EnvLookup}, and, when {@code env} is non-null, the
   * environment defaults are applied through {@code update}.
   *
   * @param env environment (may be null)
   * @return the new rule
   */
  public AnnotationExtractRule create(Env env) {
    AnnotationExtractRule rule = new AnnotationExtractRule();
    rule.resultAnnotationField = EnvLookup.getDefaultResultAnnotationKey(env);
    rule.resultNestedAnnotationField = EnvLookup.getDefaultNestedResultsAnnotationKey(env);
    rule.tokensAnnotationField = EnvLookup.getDefaultTokensAnnotationKey(env);
    rule.tokensResultAnnotationField = EnvLookup.getDefaultTokensResultAnnotationKey(env);
    if (env == null) {
      return rule;
    }
    rule.update(env, env.getDefaults());
    return rule;
  }

  /**
   * Creates a rule as {@link #create(Env)} does and then applies the given
   * attributes on top of it.
   *
   * @param env        environment (may be null)
   * @param attributes rule attributes; skipped when null
   * @return the new rule
   */
  public AnnotationExtractRule create(Env env, Map<String,Object> attributes) {
    // Start from the environment defaults
    AnnotationExtractRule rule = create(env);
    if (attributes == null) {
      return rule;
    }
    rule.update(env, attributes);
    return rule;
  }
}
/**
 * Creates a {@code SingleAnnotationExtractor} configured from a rule.
 * Name, annotation fields, priority, weight and includeNested are copied from the
 * rule as they are at the time of the call; the result annotation extractor and
 * the tokens aggregator come from the environment defaults.
 *
 * @param env environment supplying the default extractor and aggregator
 * @param r   rule whose settings are copied
 * @return the configured extractor
 */
public static MatchedExpression.SingleAnnotationExtractor createAnnotationExtractor(Env env, AnnotationExtractRule r) {
  MatchedExpression.SingleAnnotationExtractor configured = new MatchedExpression.SingleAnnotationExtractor();

  // Settings copied from the rule
  configured.name = r.name;
  configured.tokensAnnotationField = r.tokensAnnotationField;
  configured.tokensResultAnnotationField = r.tokensResultAnnotationField;
  configured.resultAnnotationField = r.resultAnnotationField;
  configured.resultNestedAnnotationField = r.resultNestedAnnotationField;
  configured.priority = r.priority;
  configured.weight = r.weight;
  configured.includeNested = r.includeNested;

  // Settings taken from the environment
  configured.resultAnnotationExtractor = EnvLookup.getDefaultResultAnnotationExtractor(env);
  configured.tokensAggregator = EnvLookup.getDefaultTokensAggregator(env);

  return configured;
}
/**
 * Creator of composite rules (rules with {@code isComposite} set).
 * The annotation field defaults to the rule's nested result annotation field, and
 * creation fails if no annotation field can be determined.
 */
public static class CompositeExtractRuleCreator extends AnnotationExtractRuleCreator {

  /** Compiles {@code expr} into a {@link TokenSequencePattern} and wires it into the rule. */
  protected static void updateExtractRule(AnnotationExtractRule r,
                                          Env env,
                                          SequencePattern.PatternExpr expr,
                                          Expression action,
                                          Expression result) {
    TokenSequencePattern compiled = TokenSequencePattern.compile(expr);
    updateExtractRule(r, env, compiled, action, result);
  }

  /**
   * Wires a compiled pattern into the rule: sets the rule's extract and filter
   * delegates, its pattern and result, and copies the rule's weight and priority
   * onto the pattern.
   */
  protected static void updateExtractRule(AnnotationExtractRule r,
                                          Env env,
                                          TokenSequencePattern pattern,
                                          Expression action,
                                          Expression result) {
    MatchedExpression.SingleAnnotationExtractor extractor = createAnnotationExtractor(env, r);
    SequenceMatchResultExtractor<CoreMap> toValue = new SequenceMatchResultExtractor<>(env, action, result);
    SequencePatternExtractRule<CoreMap,Value> valueRule =
        new SequencePatternExtractRule<>(pattern, toValue, r.matchFindType, r.matchWithResults);
    SequenceMatchedExpressionExtractor toExpression =
        new SequenceMatchedExpressionExtractor( extractor, r.matchedExpressionGroup );
    SequencePatternExtractRule<CoreMap, MatchedExpression> expressionRule =
        new SequencePatternExtractRule<>(pattern, toExpression, r.matchFindType, r.matchWithResults);

    // Recompute the value from the sequence match result kept as the expression's context
    extractor.expressionToValue = matched -> {
      if (matched == null || !(matched.context instanceof SequenceMatchResult)) {
        return null;
      }
      return toValue.apply( (SequenceMatchResult<CoreMap>) matched.context);
    };
    extractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueRule);

    r.extractRule = expressionRule;
    r.filterRule = new AnnotationMatchedFilter(extractor);
    r.pattern = pattern;
    r.result = result;
    pattern.weight = r.weight;
    pattern.priority = r.priority;
  }

  /** Marks the rule composite and makes sure it has an annotation field. */
  private static void markComposite(AnnotationExtractRule r) {
    r.isComposite = true;
    if (r.annotationField == null) {
      r.annotationField = r.resultNestedAnnotationField;
    }
    if (r.annotationField == null) {
      throw new IllegalArgumentException("Error creating composite rule: no annotation field");
    }
  }

  /**
   * Creates a composite rule for a pattern expression, with no action.
   *
   * @throws IllegalArgumentException if no annotation field can be determined
   */
  protected AnnotationExtractRule create(Env env, SequencePattern.PatternExpr expr, Expression result) {
    AnnotationExtractRule r = super.create(env, null);
    markComposite(r);
    r.ruleType = TOKEN_PATTERN_RULE_TYPE;
    updateExtractRule(r, env, expr, null, result);
    return r;
  }

  /**
   * Creates a composite rule from attributes; reads {@code pattern},
   * {@code action} and {@code result} in addition to the common settings.
   *
   * @throws IllegalArgumentException if no annotation field can be determined
   */
  @Override
  public AnnotationExtractRule create(Env env, Map<String,Object> attributes) {
    AnnotationExtractRule r = super.create(env, attributes);
    markComposite(r);
    if (r.ruleType == null) {
      r.ruleType = TOKEN_PATTERN_RULE_TYPE;
    }
    TokenSequencePattern expr = (TokenSequencePattern) Expressions.asObject(env, attributes.get("pattern"));
    Expression action = Expressions.asExpression(env, attributes.get("action"));
    Expression result = Expressions.asExpression(env, attributes.get("result"));
    updateExtractRule(r, env, expr, action, result);
    return r;
  }
}
/**
 * Creator of token pattern rules.
 * The annotation field defaults to the rule's tokens annotation field.
 */
public static class TokenPatternExtractRuleCreator extends AnnotationExtractRuleCreator {

  /** Compiles {@code expr} into a {@link TokenSequencePattern} and wires it into the rule. */
  protected static void updateExtractRule(AnnotationExtractRule r,
                                          Env env,
                                          SequencePattern.PatternExpr expr,
                                          Expression action,
                                          Expression result) {
    TokenSequencePattern compiled = TokenSequencePattern.compile(expr);
    updateExtractRule(r, env, compiled, action, result);
  }

  /**
   * Wires a compiled pattern into the rule: sets the rule's extract and filter
   * delegates, its pattern and result, and copies the rule's weight and priority
   * onto the pattern. The wrappers used depend on whether the rule has an
   * annotation field other than {@code CoreMap.class}.
   */
  protected static void updateExtractRule(AnnotationExtractRule r,
                                          Env env,
                                          TokenSequencePattern pattern,
                                          Expression action,
                                          Expression result) {
    MatchedExpression.SingleAnnotationExtractor extractor = createAnnotationExtractor(env, r);
    SequenceMatchResultExtractor<CoreMap> toValue = new SequenceMatchResultExtractor<>(env, action, result);
    SequencePatternExtractRule<CoreMap,Value> valueRule =
        new SequencePatternExtractRule<>(pattern, toValue, r.matchFindType, r.matchWithResults);
    SequenceMatchedExpressionExtractor toExpression =
        new SequenceMatchedExpressionExtractor( extractor, r.matchedExpressionGroup );
    SequencePatternExtractRule<CoreMap, MatchedExpression> expressionRule =
        new SequencePatternExtractRule<>(pattern, toExpression, r.matchFindType, r.matchWithResults);

    // Recompute the value from the sequence match result kept as the expression's context
    extractor.expressionToValue = matched -> {
      if (matched == null || !(matched.context instanceof SequenceMatchResult)) {
        return null;
      }
      return toValue.apply( (SequenceMatchResult<CoreMap>) matched.context);
    };

    boolean noSpecificField = (r.annotationField == null || r.annotationField == CoreMap.class);
    if (noSpecificField) {
      extractor.valueExtractor = new CoreMapToListFunctionApplier<>(env, valueRule);
      r.extractRule = new CoreMapToListExtractRule<>(expressionRule);
    } else {
      extractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueRule);
      r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, expressionRule);
    }

    r.filterRule = new AnnotationMatchedFilter(extractor);
    r.pattern = pattern;
    r.result = result;
    pattern.weight = r.weight;
    pattern.priority = r.priority;
  }

  /** Falls back to the tokens annotation field when the rule has no annotation field. */
  private static void defaultToTokensField(AnnotationExtractRule r) {
    if (r.annotationField == null) {
      r.annotationField = r.tokensAnnotationField;
    }
  }

  /** Creates a token pattern rule for a pattern expression, with no action. */
  protected AnnotationExtractRule create(Env env, SequencePattern.PatternExpr expr, Expression result) {
    AnnotationExtractRule r = super.create(env, null);
    defaultToTokensField(r);
    r.ruleType = TOKEN_PATTERN_RULE_TYPE;
    updateExtractRule(r, env, expr, null, result);
    return r;
  }

  /**
   * Creates a token pattern rule from attributes; reads {@code pattern},
   * {@code action} and {@code result} in addition to the common settings.
   */
  @Override
  public AnnotationExtractRule create(Env env, Map<String,Object> attributes) {
    AnnotationExtractRule r = super.create(env, attributes);
    defaultToTokensField(r);
    if (r.ruleType == null) {
      r.ruleType = TOKEN_PATTERN_RULE_TYPE;
    }
    TokenSequencePattern expr = (TokenSequencePattern) Expressions.asObject(env, attributes.get("pattern"));
    Expression action = Expressions.asExpression(env, attributes.get("action"));
    Expression result = Expressions.asExpression(env, attributes.get("result"));
    updateExtractRule(r, env, expr, action, result);
    return r;
  }
}
/**
 * Creator of rules that match several token sequence patterns through one
 * {@code MultiPatternMatcher}. Rules are created from a template rule and a list
 * of patterns; creation from an attribute map is not supported.
 */
public static class MultiTokenPatternExtractRuleCreator extends AnnotationExtractRuleCreator {

  /**
   * Wires a multi-pattern matcher into the rule: sets the rule's extract and
   * filter delegates, its pattern and result. The wrappers used depend on whether
   * the rule has an annotation field other than {@code CoreMap.class}.
   */
  protected static void updateExtractRule(AnnotationExtractRule r,
                                          Env env,
                                          MultiPatternMatcher<CoreMap> pattern,
                                          Expression action,
                                          Expression result) {
    MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r);
    SequenceMatchResultExtractor<CoreMap> valueExtractor = new SequenceMatchResultExtractor<>(env, action, result);
    MultiSequencePatternExtractRule<CoreMap,Value> valueExtractRule = new MultiSequencePatternExtractRule<>(pattern, valueExtractor);
    SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup );
    MultiSequencePatternExtractRule<CoreMap, MatchedExpression> exprExtractRule =
        new MultiSequencePatternExtractRule<>(pattern, exprExtractor);
    // Recompute the value from the sequence match result kept as the expression's context
    annotationExtractor.expressionToValue = matched -> {
      if (matched != null && matched.context instanceof SequenceMatchResult) {
        return valueExtractor.apply( (SequenceMatchResult<CoreMap>) matched.context);
      } else {
        return null;
      }
    };
    if (r.annotationField != null && r.annotationField != CoreMap.class) {
      annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule);
      r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule);
    } else {
      annotationExtractor.valueExtractor = new CoreMapToListFunctionApplier<>(env, valueExtractRule);
      r.extractRule = new CoreMapToListExtractRule<>(exprExtractRule);
    }
    r.filterRule = new AnnotationMatchedFilter(annotationExtractor);
    r.pattern = pattern;
    r.result = result;
  }

  /**
   * Creates a rule that matches all of {@code patterns}, copying its settings from
   * a template rule. Priority and weight are not copied (they are set to NaN), and
   * the rule type is always {@link #TOKEN_PATTERN_RULE_TYPE} regardless of the
   * template's rule type.
   *
   * @param env         environment
   * @param aerTemplate rule whose settings are copied
   * @param patterns    patterns combined into one multi-pattern matcher
   * @return the new rule
   */
  protected static AnnotationExtractRule create(Env env, SequenceMatchRules.AnnotationExtractRule aerTemplate, List<TokenSequencePattern> patterns) {
    AnnotationExtractRule r = new AnnotationExtractRule();
    r.stage = aerTemplate.stage;
    r.active = aerTemplate.active;
    r.priority = Double.NaN; // Priority from patterns?
    r.weight = Double.NaN; // weight from patterns?
    r.annotationField = aerTemplate.annotationField;
    r.tokensAnnotationField = aerTemplate.tokensAnnotationField;
    r.tokensResultAnnotationField = aerTemplate.tokensResultAnnotationField;
    r.resultAnnotationField = aerTemplate.resultAnnotationField;
    r.resultNestedAnnotationField = aerTemplate.resultNestedAnnotationField;
    r.matchFindType = aerTemplate.matchFindType;
    r.matchedExpressionGroup = aerTemplate.matchedExpressionGroup;
    r.matchWithResults = aerTemplate.matchWithResults;
    r.isComposite = aerTemplate.isComposite;
    r.includeNested = aerTemplate.includeNested;
    r.result = aerTemplate.result;
    if (r.annotationField == null) { r.annotationField = r.tokensAnnotationField; }
    // The template's ruleType is deliberately not copied: it was always overwritten here.
    r.ruleType = TOKEN_PATTERN_RULE_TYPE;
    MultiPatternMatcher<CoreMap> multiPatternMatcher = TokenSequencePattern.getMultiPatternMatcher(patterns);
    multiPatternMatcher.setMatchWithResult(r.matchWithResults);
    updateExtractRule(r, env, multiPatternMatcher, null, r.result);
    return r;
  }

  /**
   * Not supported for multi-pattern rules.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public AnnotationExtractRule create(Env env, Map<String,Object> attributes) {
    throw new UnsupportedOperationException();
  }
}
public static class TextPatternExtractRuleCreator extends AnnotationExtractRuleCreator {
protected static void updateExtractRule(AnnotationExtractRule r,
Env env,
String expr,
Expression action,
Expression result) {
final MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r);
Pattern pattern = env.getStringPattern(expr);
StringMatchResultExtractor valueExtractor = new StringMatchResultExtractor(env, action, result);
StringPatternExtractRule<Value> valueExtractRule = new StringPatternExtractRule<>(pattern, valueExtractor);
StringMatchedExpressionExtractor exprExtractor = new StringMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup );
StringPatternExtractRule<MatchedExpression> exprExtractRule = new StringPatternExtractRule<>(pattern, exprExtractor);
annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule);
r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule);
r.filterRule = new AnnotationMatchedFilter(annotationExtractor);
r.pattern = pattern;
r.result = result;
}
protected AnnotationExtractRule create(Env env, String expr, Expression result) {
AnnotationExtractRule r = super.create(env, null);
if (r.annotationField == null) { r.annotationField = EnvLookup.getDefaultTextAnnotationKey(env); }
r.ruleType = TEXT_PATTERN_RULE_TYPE;
updateExtractRule(r, env, expr, null, result);
return r;
}
@Override
public AnnotationExtractRule create(Env env, Map<String,Object> attributes) {
AnnotationExtractRule r = super.create(env, attributes);
if (r.annotationField == null) { r.annotationField = EnvLookup.getDefaultTextAnnotationKey(env); }
if (r.ruleType == null) { r.ruleType = TEXT_PATTERN_RULE_TYPE; }
String expr = (String) Expressions.asObject(env, attributes.get("pattern"));
Expression action = Expressions.asExpression(env, attributes.get("action"));
Expression result = Expressions.asExpression(env, attributes.get("result"));
updateExtractRule(r, env, expr, action, result);
return r;
}
}
/**
 * Predicate over matched expressions backed by a {@code SingleAnnotationExtractor}.
 * The extractor is applied to the expression's annotation:
 * <ul>
 * <li>no value object: returns false;</li>
 * <li>a value object wrapping {@code null}: returns true;</li>
 * <li>a value object wrapping a non-null value: annotates the expression and returns false.</li>
 * </ul>
 */
public static class AnnotationMatchedFilter implements Predicate<MatchedExpression>, Serializable {
  MatchedExpression.SingleAnnotationExtractor extractor;

  public AnnotationMatchedFilter(MatchedExpression.SingleAnnotationExtractor extractor) {
    this.extractor = extractor;
  }

  @Override
  public boolean test(MatchedExpression me) {
    CoreMap annotation = me.getAnnotation();
    Value extracted = extractor.apply(annotation);
    if (extracted == null) {
      return false;
    }
    if (extracted.get() == null) {
      return true;
    }
    // A real value was produced: record it on the expression
    extractor.annotate(me);
    return false;
  }
}
/**
 * Turns a string {@link MatchResult} into a {@code Value}: the optional action
 * expression is evaluated first, then the optional result expression, whose value
 * is returned.
 */
public static class StringMatchResultExtractor implements Function<MatchResult,Value> {
  Env env;
  Expression action;
  Expression result;

  public StringMatchResultExtractor(Env env, Expression action, Expression result) {
    this.env = env;
    this.action = action;
    this.result = result;
  }

  /** Creates an extractor with no action. */
  public StringMatchResultExtractor(Env env, Expression result) {
    this(env, null, result);
  }

  /**
   * @param matchResult match to evaluate the expressions against
   * @return the value of the result expression, or null if there is no result expression
   */
  @Override
  public Value apply(MatchResult matchResult) {
    if (action != null) {
      action.evaluate(env, matchResult);
    }
    if (result == null) {
      return null;
    }
    return result.evaluate(env, matchResult);
  }
}
/**
 * Turns a {@code SequenceMatchResult} into a {@code Value}: the optional action
 * expression is evaluated first, then the optional result expression, whose value
 * is returned.
 *
 * @param <T> element type of the matched sequence
 */
public static class SequenceMatchResultExtractor<T> implements Function<SequenceMatchResult<T>,Value> {
  Env env;
  Expression action;
  Expression result;

  public SequenceMatchResultExtractor(Env env, Expression action, Expression result) {
    this.env = env;
    this.action = action;
    this.result = result;
  }

  /** Creates an extractor with no action. */
  public SequenceMatchResultExtractor(Env env, Expression result) {
    this(env, null, result);
  }

  /**
   * @param matchResult match to evaluate the expressions against
   * @return the value of the result expression, or null if there is no result expression
   */
  @Override
  public Value apply(SequenceMatchResult<T> matchResult) {
    if (action != null) {
      action.evaluate(env, matchResult);
    }
    if (result == null) {
      return null;
    }
    return result.evaluate(env, matchResult);
  }
}
/**
 * Interface for a rule that extracts a list of matched items from an input.
 * It has a single abstract method, so it can be implemented with a lambda or
 * a method reference.
 *
 * @param <I> input type
 * @param <O> output type
 */
@FunctionalInterface
public interface ExtractRule<I,O> {
  /**
   * Extracts items from the input into the output list.
   *
   * @param in  input to extract from
   * @param out list that receives the extracted items
   * @return whether the rule reports that something was extracted
   */
  boolean extract(I in, List<O> out);
}
/**
 * Extraction rule that only runs the wrapped rule on inputs accepted by a filter.
 * Inputs rejected by the filter yield {@code false} without the wrapped rule
 * being invoked.
 *
 * @param <I> input type
 * @param <O> output type
 */
public static class FilterExtractRule<I,O> implements ExtractRule<I,O> {
  Predicate<I> filter;
  ExtractRule<I,O> rule;

  public FilterExtractRule(Predicate<I> filter, ExtractRule<I,O> rule) {
    this.filter = filter;
    this.rule = rule;
  }

  /** Wraps several rules; they are combined into one {@link ListExtractRule}. */
  @SafeVarargs
  public FilterExtractRule(Predicate<I> filter, ExtractRule<I,O>... rules) {
    this.filter = filter;
    this.rule = new ListExtractRule<>(rules);
  }

  @Override
  public boolean extract(I in, List<O> out) {
    // && short-circuits, so the wrapped rule is skipped for rejected input
    return filter.test(in) && rule.extract(in, out);
  }
}
/**
 * Extraction rule that runs a list of rules one after another on the same input
 * and output list. Every rule is run; the combined result is true if at least
 * one of them returned true.
 *
 * @param <I> input type
 * @param <O> output type
 */
public static class ListExtractRule<I,O> implements ExtractRule<I,O> {
  List<ExtractRule<I,O>> rules;

  /** Copies the given rules into a new list. */
  public ListExtractRule(Collection<ExtractRule<I,O>> rules) {
    this.rules = new ArrayList<>(rules);
  }

  @SafeVarargs
  public ListExtractRule(ExtractRule<I,O>... rules) {
    this.rules = new ArrayList<>(rules.length);
    Collections.addAll(this.rules, rules);
  }

  @Override
  public boolean extract(I in, List<O> out) {
    boolean anyExtracted = false;
    for (ExtractRule<I,O> each : rules) {
      // non-short-circuit OR: every rule must run even after a hit
      anyExtracted |= each.extract(in, out);
    }
    return anyExtracted;
  }

  @SafeVarargs
  public final void addRules(ExtractRule<I, O>... rules) {
    Collections.addAll(this.rules, rules);
  }

  public void addRules(Collection<ExtractRule<I,O>> rules) {
    this.rules.addAll(rules);
  }

  /**
   * Describes the contained rules as a list string. An {@link AnnotationExtractRule}
   * is described by the first non-null of its pattern, extract rule and filter
   * rule, falling back to its own {@code toString()}; any other rule is described
   * by its class name.
   */
  public String ruleList() {
    List<String> descriptions = new ArrayList<>();
    for (ExtractRule each : rules) {
      if (!(each instanceof AnnotationExtractRule)) {
        descriptions.add(each.getClass().getName());
        continue;
      }
      AnnotationExtractRule annotationRule = (AnnotationExtractRule) each;
      String description;
      if (annotationRule.pattern != null) {
        description = annotationRule.pattern.toString();
      } else if (annotationRule.extractRule != null) {
        description = annotationRule.extractRule.toString();
      } else if (annotationRule.filterRule != null) {
        description = annotationRule.filterRule.toString();
      } else {
        description = annotationRule.toString();
      }
      descriptions.add(description);
    }
    return descriptions.toString();
  }

  public String toString() {
    return "ListExtractRule[" + ruleList() + "]";
  }
}
/**
 * Extraction rule that applies another extraction rule to one annotation field of a CoreMap.
 * Input is of type CoreMap, output is templated type O.
 * @param <T> type of the annotation field
 * @param <O> output type
 */
public static class CoreMapExtractRule<T,O> implements ExtractRule<CoreMap, O>
{
  Env env;
  Class annotationField;
  ExtractRule<T,O> extractRule;

  /**
   * @param env environment on which the current CoreMap is pushed as {@code Expressions.VAR_SELF}
   *            while the nested rule runs
   * @param annotationField key used to look up the field on the CoreMap
   * @param extractRule rule applied to the looked-up field value
   */
  public CoreMapExtractRule(Env env, Class annotationField, ExtractRule<T,O> extractRule) {
    this.annotationField = annotationField;
    this.extractRule = extractRule;
    this.env = env;
  }

  /**
   * Looks up the annotation field on {@code cm} and runs the nested rule on it, with
   * {@code cm} pushed on the environment as {@code Expressions.VAR_SELF} for the duration.
   *
   * @param cm CoreMap to read the field from
   * @param out list that receives the extracted items
   * @return the nested rule's result
   */
  @Override
  public boolean extract(CoreMap cm, List<O> out) {
    // Use one reference for both push and pop so they always pair up.
    final Env scope = env;
    if (scope != null) { scope.push(Expressions.VAR_SELF, cm); }
    try {
      @SuppressWarnings("unchecked")
      T field = (T) cm.get(annotationField);
      return extractRule.extract(field, out);
    } finally {
      // Pop even when the lookup or the nested rule throws, so a failure
      // does not leave a stale VAR_SELF binding on the environment.
      if (scope != null) { scope.pop(Expressions.VAR_SELF); }
    }
  }
}
/**
 * Extraction rule that presents a single CoreMap to a list-based rule as a
 * one-element list (convenience class, for use with BasicSequenceExtractRule).
 * Input is of type CoreMap, output is templated type O.
 * @param <O> output type
 */
public static class CoreMapToListExtractRule<O> implements ExtractRule<CoreMap, O>
{
  ExtractRule<List<? extends CoreMap>,O> extractRule;

  /**
   * @param extractRule list-based rule that receives the wrapped CoreMap
   */
  public CoreMapToListExtractRule(ExtractRule<List<? extends CoreMap>,O> extractRule) {
    this.extractRule = extractRule;
  }

  /**
   * @return the wrapped rule's result for the one-element list containing {@code cm}
   */
  @Override
  public boolean extract(CoreMap cm, List<O> out) {
    List<CoreMap> wrapped = Arrays.asList(cm);
    return extractRule.extract(wrapped, out);
  }
}
/**
 * Extraction rule that tests each element of a sequence individually.
 * Input is a list of CoreMaps, output is MatchedExpression: one expression is
 * produced for every element on which the extractor yields a non-null value.
 */
public static class BasicSequenceExtractRule implements ExtractRule< List<? extends CoreMap>, MatchedExpression>
{
  MatchedExpression.SingleAnnotationExtractor extractor;

  /**
   * @param extractor applied to each element, and used to create the matched expressions
   */
  public BasicSequenceExtractRule(MatchedExpression.SingleAnnotationExtractor extractor) {
    this.extractor = extractor;
  }

  /**
   * @param seq sequence whose elements are tested one by one
   * @param out list that receives one matched expression per accepted element
   * @return {@code true} if at least one element produced a matched expression
   */
  @Override
  public boolean extract(List<? extends CoreMap> seq, List<MatchedExpression> out) {
    boolean extracted = false;
    for (int idx = 0; idx < seq.size(); idx++) {
      if (extractor.apply(seq.get(idx)) == null) {
        continue;  // the extractor produced nothing for this element
      }
      // The interval covers exactly this element: [idx, idx + 1).
      // NOTE(review): the index interval is passed as the FIRST argument here, whereas
      // SequenceMatchedExpressionExtractor passes its interval second - confirm which is intended.
      out.add(extractor.createMatchedExpression(
          Interval.toInterval(idx, idx + 1, Interval.INTERVAL_OPEN_END), null));
      extracted = true;
    }
    return extracted;
  }
}
/**
 * Extraction rule driven by a single {@link SequencePattern}. As an {@code ExtractRule} it
 * collects one output per {@code find()} hit; as a {@code Function} it produces one output
 * when {@code matches()} succeeds on the sequence.
 *
 * @param <T> element type of the sequences
 * @param <O> output type
 */
public static class SequencePatternExtractRule<T,O> implements ExtractRule< List<? extends T>, O>, Function<List<? extends T>, O> {
  SequencePattern<T> pattern;
  Function<SequenceMatchResult<T>, O> extractor;
  SequenceMatcher.FindType findType = null;
  boolean matchWithResult = false;

  /** Compiles {@code regex} with {@code env} and uses the resulting pattern. */
  public SequencePatternExtractRule(Env env, String regex, Function<SequenceMatchResult<T>, O> extractor) {
    this.extractor = extractor;
    this.pattern = SequencePattern.compile(env, regex);
  }

  /** Uses an already compiled pattern with the default find type and match settings. */
  public SequencePatternExtractRule(SequencePattern<T> p, Function<SequenceMatchResult<T>, O> extractor) {
    this.pattern = p;
    this.extractor = extractor;
  }

  /**
   * @param p compiled pattern
   * @param extractor converts each match into an output
   * @param findType find type set on the matcher in {@code extract}; {@code null} leaves the matcher's own setting
   * @param matchWithResult forwarded to {@code SequenceMatcher.setMatchWithResult}
   */
  public SequencePatternExtractRule(SequencePattern<T> p, Function<SequenceMatchResult<T>, O> extractor,
                                    SequenceMatcher.FindType findType, boolean matchWithResult) {
    this.pattern = p;
    this.extractor = extractor;
    this.findType = findType;
    this.matchWithResult = matchWithResult;
  }

  /**
   * Adds one output to {@code out} for every successive {@code find()} hit in {@code seq}.
   *
   * @return {@code true} if at least one match was found; {@code false} for a {@code null} sequence
   */
  @Override
  public boolean extract(List<? extends T> seq, List<O> out) {
    if (seq == null) {
      return false;
    }
    SequenceMatcher<T> matcher = pattern.getMatcher(seq);
    if (findType != null) {
      matcher.setFindType(findType);
    }
    matcher.setMatchWithResult(matchWithResult);
    int found = 0;
    while (matcher.find()) {
      // The matcher itself serves as the match result handed to the extractor.
      out.add(extractor.apply(matcher));
      found++;
    }
    return found > 0;
  }

  /**
   * @return the extractor's output when {@code matches()} succeeds on {@code seq};
   *         {@code null} when it does not or when {@code seq} is {@code null}
   */
  @Override
  public O apply(List<? extends T> seq) {
    if (seq == null) {
      return null;
    }
    SequenceMatcher<T> matcher = pattern.getMatcher(seq);
    matcher.setMatchWithResult(matchWithResult);
    return matcher.matches() ? extractor.apply(matcher) : null;
  }
} // end static class
/**
 * Extraction rule driven by a {@link MultiPatternMatcher}: outputs are built from the
 * matches returned by {@code findNonOverlappingMaxScore}.
 *
 * @param <T> element type of the sequences
 * @param <O> output type
 */
public static class MultiSequencePatternExtractRule<T,O> implements ExtractRule< List<? extends T>, O>, Function<List<? extends T>, O> {
  MultiPatternMatcher<T> matcher;
  Function<SequenceMatchResult<T>, O> extractor;

  /**
   * @param matcher matcher used to find the matches
   * @param extractor converts each match into an output
   */
  public MultiSequencePatternExtractRule(MultiPatternMatcher<T> matcher, Function<SequenceMatchResult<T>, O> extractor) {
    this.matcher = matcher;
    this.extractor = extractor;
  }

  /**
   * Adds one output to {@code out} per match returned by the matcher, in returned order.
   *
   * @return {@code true} if the matcher returned at least one match; {@code false} for a {@code null} sequence
   */
  @Override
  public boolean extract(List<? extends T> seq, List<O> out) {
    if (seq == null) {
      return false;
    }
    List<SequenceMatchResult<T>> matches = matcher.findNonOverlappingMaxScore(seq);
    for (SequenceMatchResult<T> match : matches) {
      out.add(extractor.apply(match));
    }
    return !matches.isEmpty();
  }

  /**
   * @return the extractor's output for the first match returned by the matcher, or
   *         {@code null} when there is no match or {@code seq} is {@code null}
   */
  @Override
  public O apply(List<? extends T> seq) {
    if (seq == null) {
      return null;
    }
    List<SequenceMatchResult<T>> matches = matcher.findNonOverlappingMaxScore(seq);
    return matches.isEmpty() ? null : extractor.apply(matches.get(0));
  }
}
/**
 * Extraction rule that matches a {@link Pattern} against a string. As an {@code ExtractRule}
 * it collects one output per {@code find()} hit; as a {@code Function} it produces one output
 * when the pattern matches the entire string.
 *
 * @param <O> output type
 */
public static class StringPatternExtractRule<O> implements ExtractRule<String, O>, Function<String, O> {
  private final Pattern pattern;
  private final Function<MatchResult, O> extractor;

  /**
   * @param pattern compiled pattern to match with
   * @param extractor converts each match into an output
   */
  public StringPatternExtractRule(Pattern pattern, Function<MatchResult, O> extractor) {
    this.pattern = pattern;
    this.extractor = extractor;
  }

  /** Same as the four-argument constructor with {@code addWordBoundaries} set to {@code false}. */
  public StringPatternExtractRule(Env env, String regex, Function<MatchResult, O> extractor) {
    this(env, regex, extractor, false);
  }

  /** Compiles {@code regex} directly with {@link Pattern#compile(String)}, without an environment. */
  public StringPatternExtractRule(String regex, Function<MatchResult, O> extractor) {
    this(null, regex, extractor, false);
  }

  /**
   * @param env environment asked for the pattern via {@code getStringPattern}; when {@code null}
   *            the regex is compiled with {@link Pattern#compile(String)}
   * @param regex regular expression source
   * @param extractor converts each match into an output
   * @param addWordBoundaries when {@code true}, the whole regex is required to start and end
   *                          on a word boundary ({@code \b})
   */
  public StringPatternExtractRule(Env env, String regex, Function<MatchResult, O> extractor,
                                  boolean addWordBoundaries) {
    this.extractor = extractor;
    // Wrap in a non-capturing group before adding the boundaries: alternation binds looser
    // than concatenation, so "\\b" + "a|b" + "\\b" would parse as (\ba)|(b\b) and anchor only
    // one side of each alternative. A non-capturing group leaves group numbering unchanged.
    final String effectiveRegex = addWordBoundaries ? "\\b(?:" + regex + ")\\b" : regex;
    this.pattern = (env != null) ? env.getStringPattern(effectiveRegex) : Pattern.compile(effectiveRegex);
  }

  /**
   * Adds one output to {@code out} for every successive {@code find()} hit in {@code str}.
   *
   * @return {@code true} if at least one match was found; {@code false} for a {@code null} string
   */
  @Override
  public boolean extract(String str, List<O> out) {
    if (str == null) {
      return false;
    }
    boolean extracted = false;
    Matcher m = pattern.matcher(str);
    while (m.find()) {
      // The live Matcher is passed as the MatchResult; its state changes on the next find().
      out.add(extractor.apply(m));
      extracted = true;
    }
    return extracted;
  }

  /**
   * @return the extractor's output when the pattern matches all of {@code str};
   *         {@code null} when it does not or when {@code str} is {@code null}
   */
  @Override
  public O apply(String str) {
    if (str == null) {
      return null;
    }
    Matcher m = pattern.matcher(str);
    return m.matches() ? extractor.apply(m) : null;
  }
} // end static class StringPatternExtractRule
/**
 * Builds a {@link MatchedExpression} from one capture group of a string match,
 * using that group's start and end offsets as the interval.
 */
public static class StringMatchedExpressionExtractor implements Function<MatchResult, MatchedExpression>
{
  MatchedExpression.SingleAnnotationExtractor extractor;
  int group = 0;

  /**
   * @param extractor used to create the matched expression
   * @param group capture group whose offsets delimit the expression
   */
  public StringMatchedExpressionExtractor(MatchedExpression.SingleAnnotationExtractor extractor, int group) {
    this.group = group;
    this.extractor = extractor;
  }

  /**
   * @param matched string match to read the group offsets from
   * @return expression created with the group's [start, end) interval as first argument
   *         and {@code null} as second
   */
  @Override
  public MatchedExpression apply(MatchResult matched) {
    int from = matched.start(group);
    int to = matched.end(group);
    return extractor.createMatchedExpression(
        Interval.toInterval(from, to, Interval.INTERVAL_OPEN_END), null);
  }
}
/**
 * Builds a {@link MatchedExpression} from one capture group of a sequence match,
 * filling in priority and weight from the match when the expression has none.
 */
public static class SequenceMatchedExpressionExtractor implements Function<SequenceMatchResult<CoreMap>, MatchedExpression>
{
  MatchedExpression.SingleAnnotationExtractor extractor;
  int group = 0;

  /**
   * @param extractor used to create the matched expression
   * @param group capture group whose offsets delimit the expression
   */
  public SequenceMatchedExpressionExtractor(MatchedExpression.SingleAnnotationExtractor extractor, int group) {
    this.group = group;
    this.extractor = extractor;
  }

  /**
   * @param matched sequence match to read the group offsets, priority and score from
   * @return expression created with {@code null} as first argument and the group's
   *         [start, end) interval as second
   */
  @Override
  public MatchedExpression apply(SequenceMatchResult<CoreMap> matched) {
    int begin = matched.start(group);
    int end = matched.end(group);
    MatchedExpression expr = extractor.createMatchedExpression(
        null, Interval.toInterval(begin, end, Interval.INTERVAL_OPEN_END));
    // NaN marks "not set": only then fall back to the values carried by the match.
    if (Double.isNaN(expr.priority)) {
      expr.priority = matched.priority();
    }
    if (Double.isNaN(expr.weight)) {
      expr.weight = matched.score();
    }
    if (group != 0) {
      // Save context so value evaluation can happen
      expr.context = matched.toBasicSequenceMatchResult();
    }
    return expr;
  }
}
/**
 * Function on a CoreMap that looks up one annotation field and applies another function to it.
 *
 * @param <T> type of the annotation field
 * @param <O> output type
 */
public static class CoreMapFunctionApplier<T,O> implements Function<CoreMap, O>
{
  Env env;
  Class annotationField;
  Function<T,O> func;

  /**
   * @param env environment on which the current CoreMap is pushed as {@code Expressions.VAR_SELF}
   *            while {@code func} runs; may be {@code null}, in which case nothing is pushed
   * @param annotationField key used to look up the field on the CoreMap
   * @param func function applied to the looked-up field value
   * @throws IllegalArgumentException if {@code annotationField} is {@code null}
   */
  public CoreMapFunctionApplier(Env env, Class annotationField, Function<T,O> func) {
    if (annotationField == null) {
      throw new IllegalArgumentException("Annotation field cannot be null");
    }
    this.annotationField = annotationField;
    this.func = func;
    this.env = env;
  }

  /**
   * @param cm CoreMap to read the field from
   * @return the result of applying {@code func} to the field value
   */
  @Override
  public O apply(CoreMap cm) {
    // Use one reference for both push and pop so they always pair up.
    final Env scope = env;
    if (scope != null) { scope.push(Expressions.VAR_SELF, cm); }
    try {
      @SuppressWarnings("unchecked")
      T field = (T) cm.get(annotationField);
      return func.apply(field);
    } finally {
      // Pop even when the lookup or the function throws, so a failure
      // does not leave a stale VAR_SELF binding on the environment.
      if (scope != null) { scope.pop(Expressions.VAR_SELF); }
    }
  }
}
/**
 * Function on a CoreMap that hands the CoreMap, wrapped in a one-element list,
 * to a list-based function.
 *
 * @param <O> output type
 */
public static class CoreMapToListFunctionApplier<O> implements Function<CoreMap, O>
{
  Env env;
  Function<List<? extends CoreMap>,O> func;

  /**
   * @param env environment on which the current CoreMap is pushed as {@code Expressions.VAR_SELF}
   *            while {@code func} runs; may be {@code null}, in which case nothing is pushed
   * @param func list-based function that receives the wrapped CoreMap
   */
  public CoreMapToListFunctionApplier(Env env, Function<List<? extends CoreMap>,O> func) {
    this.func = func;
    this.env = env;
  }

  /**
   * @param cm CoreMap to wrap
   * @return the result of applying {@code func} to the one-element list containing {@code cm}
   */
  @Override
  public O apply(CoreMap cm) {
    // Use one reference for both push and pop so they always pair up.
    final Env scope = env;
    if (scope != null) { scope.push(Expressions.VAR_SELF, cm); }
    try {
      return func.apply(Collections.singletonList(cm));
    } finally {
      // Pop even when the function throws, so a failure does not leave
      // a stale VAR_SELF binding on the environment.
      if (scope != null) { scope.pop(Expressions.VAR_SELF); }
    }
  }
}
}