package edu.stanford.nlp.ling.tokensregex;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.parser.ParseException;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParseException;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.ling.tokensregex.types.Expression;
import edu.stanford.nlp.ling.tokensregex.types.Tags;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.function.Predicate;
/**
* <p>Represents a list of assignment and extraction rules over sequence patterns.
* See {@link SequenceMatchRules} for the syntax of rules.
* </p>
*
* <p>Assignment rules are used to assign a value to a variable for later use in
* extraction rules or for expansions in patterns.</p>
* <p>Extraction rules are used to extract text/tokens matching regular expressions.
* Extraction rules are grouped into stages, with each stage consisting of the following:
* <ol>
* <li>Matching of rules over <b>text</b> and <b>tokens</b>. These rules are applied directly on the <b>text</b> and <b>tokens</b> fields of the {@code CoreMap}.</li>
* <li>Matching of <b>composite</b> rules. Matched expressions are merged, and composite rules
* are applied recursively until no more changes to the matched expressions are detected.</li>
* <li><b>Filtering</b> of an invalid expression. In the final phase, a final filtering stage filters out invalid expressions.</li>
* </ol>
* The different stages are numbered and are applied in numeric order.
* </p>
*
* @author Angel Chang
* @see SequenceMatchRules
*/
public class CoreMapExpressionExtractor<T extends MatchedExpression> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CoreMapExpressionExtractor.class);

  /** Global verbosity flag; settable via {@link #setVerbose(boolean)} or the env binding {@code "verbose"}. */
  private static boolean verbose = false;

  // TODO: Remove templating of MatchedExpressions<?> (keep for now until TimeExpression rules can be decoupled)

  /** Environment used for binding variables and applying rules (may be null). */
  private final Env env;
  /* Keeps temporary tags created by extractor */
  private boolean keepTags = false;
  /* Collapses extraction rules - use with care */
  private boolean collapseExtractionRules = false;
  /** Annotation key under which the token list is stored (looked up from the environment). */
  private final Class<CoreAnnotation<List<? extends CoreMap>>> tokensAnnotationKey;
  /** Extraction stages keyed by stage id; applied in ascending numeric order. */
  private final Map<Integer, Stage<T>> stages;

  /**
   * Describes one stage of extraction.
   * @param <T>
   */
  public static class Stage<T> {
    /** Whether to clear matched expressions from previous stages or not */
    boolean clearMatched = false;
    /**
     * Limit the number of iterations for which the composite rules are applied
     * (prevents badly formed rules from iterating forever)
     */
    int limitIters = 50;
    /**
     * Stage id (stages are applied in numeric order from low to high)
     */
    int stageId;
    /** Rules to extract matched expressions directly from tokens */
    SequenceMatchRules.ExtractRule<CoreMap, T> basicExtractRule;
    /** Rules to extract composite expressions (grouped in stages) */
    SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule;
    /** Filtering rule */
    Predicate<T> filterRule;

    /**
     * Appends {@code rule} to {@code origRule}, reusing the existing list rule when possible,
     * otherwise wrapping both in a new {@code ListExtractRule}.
     */
    private static <I,O> SequenceMatchRules.ExtractRule<I,O> addRule(SequenceMatchRules.ExtractRule<I, O> origRule,
                                                                     SequenceMatchRules.ExtractRule<I, O> rule) {
      SequenceMatchRules.ListExtractRule<I,O> r;
      if (origRule instanceof SequenceMatchRules.ListExtractRule) {
        r = (SequenceMatchRules.ListExtractRule<I,O>) origRule;
      } else {
        r = new SequenceMatchRules.ListExtractRule<>();
        if (origRule != null)
          r.addRules(origRule);
      }
      r.addRules(rule);
      return r;
    }

    private void addCompositeRule(SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> rule) {
      compositeExtractRule = addRule(compositeExtractRule, rule);
    }

    private void addBasicRule(SequenceMatchRules.ExtractRule<CoreMap, T> rule) {
      basicExtractRule = addRule(basicExtractRule, rule);
    }

    /** Adds a filter rule, OR-ing it with any existing filter via a {@code DisjFilter}. */
    private void addFilterRule(Predicate<T> rule) {
      Filters.DisjFilter<T> r;
      if (filterRule instanceof Filters.DisjFilter) {
        r = (Filters.DisjFilter<T>) filterRule;
        r.addFilter(rule);
      } else {
        if (filterRule == null) {
          r = new Filters.DisjFilter<>(rule);
        } else {
          r = new Filters.DisjFilter<>(filterRule, rule);
        }
        filterRule = r;
      }
    }
  }

  /**
   * Creates an empty instance with no rules.
   */
  public CoreMapExpressionExtractor() {
    this(null);
  }

  /**
   * Creates a default instance with the specified environment.
   * (use the default tokens annotation key as specified in the environment)
   * @param env Environment to use for binding variables and applying rules
   */
  public CoreMapExpressionExtractor(Env env) {
    this.stages = new HashMap<>();//Generics.newHashMap();
    this.env = env;
    this.tokensAnnotationKey = EnvLookup.getDefaultTokensAnnotationKey(env);
    this.collapseExtractionRules = false;
    if (env != null) {
      this.collapseExtractionRules = Objects.equals((Boolean) env.get("collapseExtractionRules"), true);
      if (env.get("verbose") != null) {
        // NOTE: the env binding intentionally toggles the class-wide verbose flag
        verbose = Objects.equals((Boolean) env.get("verbose"), true);
      }
    }
  }

  /**
   * Creates an instance with the specified environment and list of rules
   * @param env Environment to use for binding variables and applying rules
   * @param rules List of rules for this extractor
   */
  public CoreMapExpressionExtractor(Env env, List<SequenceMatchRules.Rule> rules) {
    this(env);
    appendRules(rules);
  }

  /**
   * Add specified rules to this extractor.
   * Assignment rules are evaluated immediately against the environment; extraction rules
   * are grouped into their declared stages (creating stages as needed, seeded from the
   * environment defaults {@code stage.clearMatched} and {@code stage.limitIters}).
   *
   * @param rules
   */
  public void appendRules(List<SequenceMatchRules.Rule> rules) {
    if (verbose)
      log.info("Read " + rules.size() + " rules");
    // Put rules into stages
    if (collapseExtractionRules) {
      rules = collapse(rules);
      if (verbose)
        log.info("Collapsing into " + rules.size() + " rules");
    }
    for (SequenceMatchRules.Rule r:rules) {
      if (r instanceof SequenceMatchRules.AssignmentRule) {
        // Nothing to do
        // Assignments are added to environment as they are parsed
        ((SequenceMatchRules.AssignmentRule) r).evaluate(env);
      } else if (r instanceof SequenceMatchRules.AnnotationExtractRule) {
        SequenceMatchRules.AnnotationExtractRule aer = (SequenceMatchRules.AnnotationExtractRule) r;
        Stage<T> stage = stages.get(aer.stage);
        if (stage == null) {
          stages.put(aer.stage, stage = new Stage<>());
          stage.stageId = aer.stage;
          Boolean clearMatched = (Boolean) env.getDefaults().get("stage.clearMatched");
          if (clearMatched != null) {
            stage.clearMatched = clearMatched;
          }
          Integer limitIters = (Integer) env.getDefaults().get("stage.limitIters");
          if (limitIters != null) {
            stage.limitIters = limitIters;
          }
        }
        if (aer.active) {
          if (SequenceMatchRules.FILTER_RULE_TYPE.equals(aer.ruleType)) {
            stage.addFilterRule(aer);
          } else {
            if (aer.isComposite) {
//            if (SequenceMatchRules.COMPOSITE_RULE_TYPE.equals(aer.ruleType)) {
              stage.addCompositeRule(aer);
            } else {
              stage.addBasicRule(aer);
            }
          }
        } else {
          log.debug("Ignoring inactive rule: " + aer.name); // used to be INFO but annoyed Chris/users
        }
      }
    }
  }

  /** Builds a single multi-pattern rule from a template rule and the collected patterns. */
  private SequenceMatchRules.AnnotationExtractRule createMergedRule(SequenceMatchRules.AnnotationExtractRule aerTemplate, List<TokenSequencePattern> patterns) {
    return SequenceMatchRules.createMultiTokenPatternRule(env, aerTemplate, patterns);
  }

  /**
   * Collapses consecutive, mostly-compatible token-regex rules into single merged rules
   * (fewer rules means fewer matcher passes). Non-collapsible rules are passed through unchanged.
   */
  private List<SequenceMatchRules.Rule> collapse(List<SequenceMatchRules.Rule> rules) {
    List<SequenceMatchRules.Rule> collapsed = new ArrayList<>();
    List<TokenSequencePattern> patterns = null;
    SequenceMatchRules.AnnotationExtractRule aerTemplate = null;
    for (SequenceMatchRules.Rule rule:rules) {
      boolean ruleHandled = false;
      if (rule instanceof SequenceMatchRules.AnnotationExtractRule) {
        SequenceMatchRules.AnnotationExtractRule aer = (SequenceMatchRules.AnnotationExtractRule) rule;
        if (aer.hasTokensRegexPattern()) {
          if (aerTemplate == null || aerTemplate.isMostlyCompatible(aer)) {
            if (aerTemplate == null) {
              aerTemplate = aer;
            }
            if (patterns == null) {
              patterns = new ArrayList<>();
            }
            patterns.add((TokenSequencePattern) aer.pattern);
            ruleHandled = true;
          }
        }
      }

      // Did we handle this rule?
      if (!ruleHandled) {
        // Flush any pending run of collapsible rules before emitting this one
        if (aerTemplate != null) {
          SequenceMatchRules.AnnotationExtractRule merged = createMergedRule(aerTemplate, patterns);
          collapsed.add(merged);
          aerTemplate = null;
          patterns = null;
        }
        collapsed.add(rule);
      }
    }
    // Flush the trailing run, if any
    if (aerTemplate != null) {
      SequenceMatchRules.AnnotationExtractRule merged = createMergedRule(aerTemplate, patterns);
      collapsed.add(merged);
    }
    return collapsed;
  }

  public Env getEnv() {
    return env;
  }

  /**
   * Replaces all stages with a single stage 1 using the given rules.
   */
  public void setExtractRules(SequenceMatchRules.ExtractRule<CoreMap, T> basicExtractRule,
                              SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule,
                              Predicate<T> filterRule)
  {
    Stage<T> stage = new Stage<>();
    stage.basicExtractRule = basicExtractRule;
    stage.compositeExtractRule = compositeExtractRule;
    stage.filterRule = filterRule;
    this.stages.clear();
    this.stages.put(1, stage);
  }

  /**
   * Creates an extractor using the specified environment, and reading the rules from the given filenames.
   * @param env
   * @param filenames
   * @throws RuntimeException
   */
  public static <M extends MatchedExpression> CoreMapExpressionExtractor<M> createExtractorFromFiles(Env env, String... filenames) throws RuntimeException {
    return createExtractorFromFiles(env, Arrays.asList(filenames));
  }

  /**
   * Creates an extractor using the specified environment, and reading the rules from the given filenames.
   * @param env
   * @param filenames
   * @throws RuntimeException
   */
  public static <M extends MatchedExpression> CoreMapExpressionExtractor<M> createExtractorFromFiles(Env env, List<String> filenames) throws RuntimeException {
    CoreMapExpressionExtractor<M> extractor = new CoreMapExpressionExtractor<>(env);
    for (String filename:filenames) {
      try {
        if (verbose)
          log.info("Reading TokensRegex rules from " + filename);
        BufferedReader br = IOUtils.readerFromString(filename);
        try {
          TokenSequenceParser parser = new TokenSequenceParser();
          parser.updateExpressionExtractor(extractor, br);
        } finally {
          // Always release the reader, even if parsing fails; close errors are deliberately ignored
          IOUtils.closeIgnoringExceptions(br);
        }
      } catch (Exception ex) {
        throw new RuntimeException("Error parsing file: " + filename, ex);
      }
    }
    return extractor;
  }

  /**
   * Creates an extractor using the specified environment, and reading the rules from the given filename.
   * @param env
   * @param filename
   * @throws RuntimeException
   */
  public static CoreMapExpressionExtractor createExtractorFromFile(Env env, String filename) throws RuntimeException {
    return createExtractorFromFiles(env, Collections.singletonList(filename));
  }

  /**
   * Creates an extractor using the specified environment, and reading the rules from the given string
   * @param env
   * @param str
   * @throws IOException, ParseException
   */
  public static CoreMapExpressionExtractor createExtractorFromString(Env env, String str) throws IOException, ParseException, TokenSequenceParseException {
    TokenSequenceParser parser = new TokenSequenceParser();
    CoreMapExpressionExtractor extractor = parser.getExpressionExtractor(env, new StringReader(str));
    return extractor;
  }

  /**
   * Evaluates the expression bound to the given variable name in the environment.
   * @throws RuntimeException if no expression is bound to the variable
   */
  public Value getValue(String varname)
  {
    Expression expr = (Expression) env.get(varname);
    if (expr != null) {
      return expr.evaluate(env);
    } else {
      throw new RuntimeException("Unable to get expression for variable " + varname);
    }
  }

  private List<CoreMap> extractCoreMapsToList(List<CoreMap> res, CoreMap annotation) {
    List<T> exprs = extractExpressions(annotation);
    for (T expr : exprs) {
      res.add(expr.getAnnotation());
    }
    return res;
  }

  /**
   * Returns list of coremaps that matches the specified rules.
   *
   * @param annotation
   */
  public List<CoreMap> extractCoreMaps(CoreMap annotation) {
    List<CoreMap> res = new ArrayList<>();
    return extractCoreMapsToList(res, annotation);
  }

  /**
   * Returns list of merged tokens and original tokens.
   *
   * @param annotation
   */
  public List<CoreMap> extractCoreMapsMergedWithTokens(CoreMap annotation) {
    List<CoreMap> res = extractCoreMaps(annotation);
    Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
    if (startTokenOffset == null) {
      startTokenOffset = 0;
    }
    // Offsets in the matched expressions are document-relative; shift to sentence-relative
    final Integer startTokenOffsetFinal = startTokenOffset;
    List<CoreMap> merged = CollectionUtils.mergeListWithSortedMatchedPreAggregated(
      annotation.get(tokensAnnotationKey), res, (CoreMap in) -> Interval.toInterval(in.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffsetFinal,
              in.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffsetFinal)
    );
    return merged;
  }

  /** Flattens merged coremaps back into their underlying tokens (using this extractor's tokens key). */
  public List<CoreMap> flatten(List<CoreMap> cms) {
    return flatten(cms, tokensAnnotationKey);
  }

  /** Replaces each coremap that carries the given key with the token list stored under that key. */
  private static List<CoreMap> flatten(List<CoreMap> cms, Class key) {
    List<CoreMap> res = new ArrayList<>();
    for (CoreMap cm : cms) {
      if (cm.get(key) != null) {
        res.addAll( (List<CoreMap>) cm.get(key));
      } else {
        res.add(cm);
      }
    }
    return res;
  }

  /**
   * Recursively strips temporary tags from every coremap reachable from the collection.
   * The {@code cleaned} identity map guards against cycles and repeated visits.
   */
  private void cleanupTags(Collection objs, Map<Object, Boolean> cleaned) {
    for (Object obj:objs) {
      if (!cleaned.containsKey(obj)) {
        cleaned.put(obj, false);
        if (obj instanceof CoreMap) {
          cleanupTags((CoreMap) obj, cleaned);
        } else if (obj instanceof Collection) {
          cleanupTags((Collection) obj, cleaned);
        }
        cleaned.put(obj, true);
      }
    }
  }

  private void cleanupTags(CoreMap cm) {
    cleanupTags(cm, new IdentityHashMap<>());
  }

  /** Removes {@code Tags.TagsAnnotation} from this coremap and, recursively, from its values. */
  private void cleanupTags(CoreMap cm, Map<Object, Boolean> cleaned) {
    cm.remove(Tags.TagsAnnotation.class);
    for (Class key:cm.keySet()) {
      Object obj = cm.get(key);
      if (!cleaned.containsKey(obj)) {
        cleaned.put(obj, false);
        if (obj instanceof CoreMap) {
          cleanupTags((CoreMap) obj, cleaned);
        } else if (obj instanceof Collection) {
          cleanupTags((Collection) obj, cleaned);
        }
        cleaned.put(obj, true);
      }
    }
  }

  /**
   * Repeatedly applies the composite rule over the merged token/expression sequence until a
   * fixed point is reached (no new extractions) or {@code limit} iterations have run.
   * Returns the final merged sequence paired with the surviving matched expressions.
   */
  private Pair<List<? extends CoreMap>, List<T>> applyCompositeRule(
          SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule,
          List<? extends CoreMap> merged,
          List<T> matchedExpressions, int limit) {
    // Apply higher order rules
    boolean done = false;
    // Limit of number of times rules are applied just in case
    int maxIters = limit;
    int iters = 0;
    while (! done) {
      List<T> newExprs = new ArrayList<>();
      boolean extracted = compositeExtractRule.extract(merged, newExprs);
      if (verbose && extracted) log.info("applyCompositeRule() extracting with " + compositeExtractRule + " from " + merged + " gives " + newExprs);
      if (extracted) {
        annotateExpressions(merged, newExprs);
        newExprs = MatchedExpression.removeNullValues(newExprs);
        if ( ! newExprs.isEmpty()) {
          newExprs = MatchedExpression.removeNested(newExprs);
          newExprs = MatchedExpression.removeOverlapping(newExprs);
          merged = MatchedExpression.replaceMerged(merged, newExprs);
          // Favor newly matched expressions over older ones
          newExprs.addAll(matchedExpressions);
          matchedExpressions = MatchedExpression.removeNested(newExprs);
          matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
        } else {
          // Everything extracted was dropped; treat as no progress so we can terminate
          extracted = false;
        }
      }
      done = ! extracted;
      iters++;
      if (maxIters > 0 && iters >= maxIters) {
        if (verbose) {
          log.warn("Aborting application of composite rules: Maximum iteration " + maxIters + " reached");
        }
        break;
      }
    }
    return new Pair<>(merged, matchedExpressions);
  }

  private static class CompositeMatchState<T> {
    List<? extends CoreMap> merged;
    List<T> matched;
    int iters;

    private CompositeMatchState(List<? extends CoreMap> merged, List<T> matched, int iters) {
      this.merged = merged;
      this.matched = matched;
      this.iters = iters;
    }
  }

  /**
   * Extracts matched expressions from the annotation by running every stage in ascending
   * stage-id order: basic rules, then composite rules to a fixed point, then the filter rule.
   * Temporary tags are removed afterwards unless {@link #keepTemporaryTags()} was called.
   *
   * @param annotation Annotation holding the text/tokens to run the rules over
   * @return Sorted list of matched expressions (nested expressions first)
   */
  public List<T> extractExpressions(CoreMap annotation) {
    // Extract potential expressions
    List<T> matchedExpressions = new ArrayList<>();
    List<Integer> stageIds = new ArrayList<>(stages.keySet());
    Collections.sort(stageIds);
    for (int stageId : stageIds) {
      Stage<T> stage = stages.get(stageId);
      SequenceMatchRules.ExtractRule<CoreMap, T> basicExtractRule = stage.basicExtractRule;
      if (stage.clearMatched) {
        matchedExpressions.clear();
      }
      if (basicExtractRule != null) {
        basicExtractRule.extract(annotation, matchedExpressions);
        if (verbose) {
          log.info("extractExpressions() extracting with " + basicExtractRule + " from " + annotation + " gives " + matchedExpressions);
        }
        annotateExpressions(annotation, matchedExpressions);
        matchedExpressions = MatchedExpression.removeNullValues(matchedExpressions);
        matchedExpressions = MatchedExpression.removeNested(matchedExpressions);
        matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
      }

      // Merge the matched expressions into the token stream so composite rules can see them
      List<? extends CoreMap> merged = MatchedExpression.replaceMergedUsingTokenOffsets(annotation.get(tokensAnnotationKey), matchedExpressions);
      SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule = stage.compositeExtractRule;
      if (compositeExtractRule != null) {
        Pair<List<? extends CoreMap>, List<T>> p = applyCompositeRule(
                compositeExtractRule, merged, matchedExpressions, stage.limitIters);
        merged = p.first();
        matchedExpressions = p.second();
      }
      matchedExpressions = filterInvalidExpressions(stage.filterRule, matchedExpressions);
    }
    Collections.sort(matchedExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
    if (!keepTags) {
      cleanupTags(annotation);
    }
    return matchedExpressions;
  }

  /**
   * Fills in the annotation for each expression (from the whole-annotation context),
   * discarding expressions whose annotation could not be extracted.
   */
  private void annotateExpressions(CoreMap annotation, List<T> expressions) {
    // TODO: Logging can be excessive
    List<T> toDiscard = new ArrayList<>();
    for (T te : expressions) {
      // Add attributes and all
      if (te.annotation == null) {
        try {
          boolean extractOkay = te.extractAnnotation(env, annotation);
          if (verbose && extractOkay) {
            log.info("annotateExpressions() matched " + te + " from " + annotation);
          }
          if (!extractOkay) {
            // Things didn't turn out so well
            toDiscard.add(te);
            log.warn("Error extracting annotation from " + te /*+ ", " + te.getExtractErrorMessage() */);
          }
        } catch (Exception ex) {
          if (verbose) {
            log.warn("Error extracting annotation from " + te);
            log.warn(ex);
          }
        }
      }
    }
    expressions.removeAll(toDiscard);
  }

  /**
   * Fills in the annotation for each expression (from a list of chunks),
   * discarding expressions whose annotation could not be extracted.
   */
  private void annotateExpressions(List<? extends CoreMap> chunks, List<T> expressions) {
    // TODO: Logging can be excessive
    List<T> toDiscard = new ArrayList<>();
    for (T te : expressions) {
      // Add attributes and all
      try {
        boolean extractOkay = te.extractAnnotation(env, chunks);
        if (verbose && extractOkay) {
          log.info("annotateExpressions() matched " + te + " from " + chunks);
        }
        if (!extractOkay) {
          // Things didn't turn out so well
          toDiscard.add(te);
          log.warn("Error extracting annotation from " + te /*+ ", " + te.getExtractErrorMessage() */);
        }
      } catch (Exception ex) {
        if (verbose) {
          log.warn("Error extracting annotation from " + te);
          log.warn(ex);
        }
      }
    }
    expressions.removeAll(toDiscard);
  }

  /**
   * Drops expressions matched by the filter rule. Note the inverted sense: the filter rule
   * returns {@code true} for INVALID expressions, so an expression is kept when the test fails.
   */
  private List<T> filterInvalidExpressions(Predicate<T> filterRule, List<T> expressions) {
    if (filterRule == null) return expressions;
    if (expressions.isEmpty()) return expressions;
    int nfiltered = 0;
    List<T> kept = new ArrayList<>(expressions.size());   // Approximate size
    for (T expr : expressions) {
      if (!filterRule.test(expr)) {
        kept.add(expr);
      } else {
        nfiltered++;
//        logger.warning("Filtering out " + expr.getText());
      }
    }
    if (nfiltered > 0 && verbose) {
      log.debug("Filtered " + nfiltered);
    }
    return kept;
  }

  /**
   * Keeps the temporary tags on the sentence after extraction has finished.
   * This can have potentially unexpected results if you run the same sentence through multiple extractors;
   * but, it makes the extraction process 20+% faster.
   *
   * @return This object
   */
  public CoreMapExpressionExtractor keepTemporaryTags() {
    this.keepTags = true;
    return this;
  }

  public static void setVerbose(boolean v) {
    verbose = v;
  }

}