TokenSequencePattern.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.ling.tokensregex;

import java.util.*;

import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.util.*;

/**
 * Token Sequence Pattern for regular expressions over sequences of tokens (each represented as a <code>CoreMap</code>).
 * Sequences over tokens can be matched like strings.
 * <p>
 * To use:
 * </p>
 * <pre><code>
 *   TokenSequencePattern p = TokenSequencePattern.compile("....");
 *   TokenSequenceMatcher m = p.getMatcher(tokens);
 *   while (m.find()) ....
 * </code></pre>
 *
 * <p>
 * Supports the following:
 * <ul>
 *  <li>Concatenation: <code>X Y</code></li>
 *  <li>Or: <code>X | Y</code></li>
 *  <li>And: {@code X & Y}</li>
 *  <li>Groups:
 *     <ul>
 *     <li>capturing: <code>(X)</code> (with numeric group id)</li>
 *     <li>capturing: <code>(?$var X)</code> (with group name "$var")</li>
 *     <li>noncapturing: <code>(?:X)</code></li>
 *     </ul>
 *  Capturing groups can be retrieved with group id or group variable, as matched string
 *     (<code>m.group()</code>) or list of tokens (<code>m.groupNodes()</code>).
 *  <ul>
 *     <li>To retrieve group using id: <code>m.group(id)</code> or <code>m.groupNodes(id)</code>
 *     <br> NOTE: Capturing groups are indexed from left to right, starting at one.  Group zero is the entire matched sequence.
 *     </li>
 *     <li>To retrieve group using bound variable name: <code>m.group("$var")</code> or <code>m.groupNodes("$var")</code>
 *     </li>
 *  </ul>
 *  See {@link SequenceMatchResult} for more accessor functions to retrieve matches.
 * </li>
 * <li>Greedy Quantifiers:  <code>X+, X?, X*, X{n,m}, X{n}, X{n,}</code></li>
 * <li>Reluctant Quantifiers: <code>X+?, X??, X*?, X{n,m}?, X{n}?, X{n,}?</code></li>
 * <li>Back references: <code>\captureid</code> </li>
 * <li>Value binding for groups: {@code [pattern] => [value]}.
 *   Value for matched expression can be accessed using {@code m.groupValue()}
 *   <br>Example: {@code ( one => 1 | two => 2 | three => 3 | ...)}
 * </li>
 * </ul>
 *
 * <p>
 * Individual tokens are marked by <code>"[" TOKEN_EXPR "]" </code>
 * <br>Possible <code>TOKEN_EXPR</code>:
 * </p>
 * <ul>
 * <li> All specified token attributes match:
 * <br> For Strings:
 *     <code> { lemma:/.../; tag:"NNP" } </code> = attributes that need to all match.
 *     If only one attribute, the {} can be dropped.
 * <br> See {@link edu.stanford.nlp.ling.AnnotationLookup AnnotationLookup} for a list of predefined token attribute names.
 * <br> Additional attributes can be bound using the environment (see below).
 * <br> NOTE: <code>/.../</code> used for regular expressions,
 *            <code>"..."</code> for exact string matches
 * <br> For Numbers:
 *      <code>{ word>=2 }</code>
 * <br> NOTE: Relation can be {@code ">=", "<=", ">", "<",} or {@code "=="}
 * <br> Others:
 *      <code>{ word::IS_NUM } , { word::IS_NIL } </code> or
 *      <code>{ word::NOT_EXISTS }, { word::NOT_NIL } </code> or <code> { word::EXISTS } </code>
 * </li>
 * <li>Short hand for just word/text match:
 *     <code> /.../ </code>  or  <code>"..." </code>
 * </li>
 * <li>
 *  Negation:
 *     <code> !{...} </code>
 * </li>
 * <li>
 *  Conjunction or Disjunction:
 *     <code> {...} &amp; {...} </code>   or  <code> {...} | {...} </code>
 * </li>
 * </ul>
 *
 * <p>
 * Special tokens:
 *   Any token: <code>[]</code>
 * </p>
 *
 * <p>
 * String pattern match across multiple tokens:
 *   <code>(?m){min,max} /pattern/</code>
 * </p>
 *
 * <p>
 * Special expressions: indicated by double braces: <code>{{ expr }}</code>
 *   <br> See {@link edu.stanford.nlp.ling.tokensregex.types.Expressions} for syntax.
 * </p>
 *
 * <p>
 * Binding of variables for use in compiling patterns:
 * </p>
 * <ol>
 * <li> Use  {@code Env env = TokenSequencePattern.getNewEnv()} to create a new environment for binding </li>
 * <li> Bind string to attribute key (Class) lookup:
 *    {@code env.bind("numtype", CoreAnnotations.NumericTypeAnnotation.class);}
 * </li>
 * <li> Bind patterns / strings for compiling patterns
 *    <pre><code>
 *    // Bind string for later compilation using: compile("/it/ /was/ $RELDAY");
 *    env.bind("$RELDAY", "/today|yesterday|tomorrow|tonight|tonite/");
 *    // Bind pre-compiled pattern for later compilation using: compile("/it/ /was/ $RELDAY");
 *    env.bind("$RELDAY", TokenSequencePattern.compile(env, "/today|yesterday|tomorrow|tonight|tonite/"));
 *    </code></pre>
 * </li>
 * <li> Bind custom node pattern functions (currently no arguments are supported)
 *    <pre><code>
 *    // Bind node pattern so we can do patterns like: compile("... temporal::IS_TIMEX_DATE ...");
 *    //   (TimexTypeMatchNodePattern is a NodePattern that implements some custom logic)
 *    env.bind("::IS_TIMEX_DATE", new TimexTypeMatchNodePattern(SUTime.TimexType.DATE));
 *   </code></pre>
 * </li>
 * </ol>
 *
 * <p>
 * Actions (partially implemented)
 * </p>
 * <ul>
 * <li> {@code pattern ==> action} </li>
 * <li> Supported action:
 *    <code> &amp;annotate( { ner="DATE" } ) </code> </li>
 * <li> Not applied automatically, associated with a pattern.</li>
 * <li> To apply, call <code>pattern.getAction().apply(match, groupid)</code></li>
 * </ul>
 *
 * @author Angel Chang
 * @see TokenSequenceMatcher
 */
public class TokenSequencePattern extends SequencePattern<CoreMap> {

  private static final long serialVersionUID = -4760710834202406916L;

  public static final TokenSequencePattern ANY_NODE_PATTERN = TokenSequencePattern.compile(ANY_NODE_PATTERN_EXPR);

  private static final Env DEFAULT_ENV = getNewEnv();

  public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern) {
    super(patternStr, nodeSequencePattern);
  }

  public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern,
                                 SequenceMatchAction<CoreMap> action) {
    super(patternStr, nodeSequencePattern, action);
  }

  public static Env getNewEnv() {
    Env env =  new Env(new TokenSequenceParser());
    env.initDefaultBindings();
    return env;
  }

  /**
   * Compiles a regular expression over tokens into a TokenSequencePattern
   * using the default environment.
   *
   * @param string Regular expression to be compiled
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(String string)
  {
    return compile(DEFAULT_ENV, string);
  }

  /**
   * Compiles a regular expression over tokens into a TokenSequencePattern
   * using the specified environment.
   *
   * @param env Environment to use
   * @param string Regular expression to be compiled
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(Env env, String string)
  {
    try {
//      SequencePattern.PatternExpr nodeSequencePattern = TokenSequenceParser.parseSequence(env, string);
//      return new TokenSequencePattern(string, nodeSequencePattern);
      // TODO: Check token sequence parser?
      Pair<PatternExpr, SequenceMatchAction<CoreMap>> p = env.parser.parseSequenceWithAction(env, string);
      return new TokenSequencePattern(string, p.first(), p.second());

    } catch (Exception ex) {
      throw new RuntimeException("When parsing " + string + "\t\t" + ex);
    }
  }

  /**
   * Compiles a sequence of regular expressions into a TokenSequencePattern
   * using the default environment.
   *
   * @param strings List of regular expression to be compiled
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(String... strings)
  {
    return compile(DEFAULT_ENV, strings);
  }

  /**
   * Compiles a sequence of regular expressions into a TokenSequencePattern
   * using the specified environment.
   *
   * @param env Environment to use
   * @param strings List of regular expression to be compiled
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(Env env, String... strings)
  {
    try {
      List<SequencePattern.PatternExpr> patterns = new ArrayList<>();
      for (String string:strings) {
        // TODO: Check token sequence parser?
        SequencePattern.PatternExpr pattern = env.parser.parseSequence(env, string);
        patterns.add(pattern);
      }
      SequencePattern.PatternExpr nodeSequencePattern = new SequencePattern.SequencePatternExpr(patterns);
      return new TokenSequencePattern(StringUtils.join(strings), nodeSequencePattern);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Compiles a PatternExpr into a TokenSequencePattern.
   *
   * @param nodeSequencePattern A sequence pattern expression (before translation into a NFA)
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(SequencePattern.PatternExpr nodeSequencePattern)
  {
    return new TokenSequencePattern(null, nodeSequencePattern);
  }

  /**
   * Returns a TokenSequenceMatcher that can be used to match this pattern
   * against the specified list of tokens.
   *
   * @param tokens List of tokens to match against
   * @return TokenSequenceMatcher
   */
  @Override
  public TokenSequenceMatcher getMatcher(List<? extends CoreMap> tokens) {
    return new TokenSequenceMatcher(this, tokens);
  }

  /**
   * Returns a TokenSequenceMatcher that can be used to match this pattern
   * against the specified list of tokens.
   *
   * @param tokens List of tokens to match against
   * @return TokenSequenceMatcher
   */
  public TokenSequenceMatcher matcher(List<? extends CoreMap> tokens) {
    return getMatcher(tokens);
  }

  /** Returns a String representation of the TokenSequencePattern.
   *
   * @return A String representation of the TokenSequencePattern
   */
  @Override
  public String toString(){
    return this.pattern();
  }


  /**
   * Create a multi-pattern matcher for matching across multiple TokensRegex patterns.
   * @param patterns Collection of input patterns
   * @return a MultiPatternMatcher
   */
  public static MultiPatternMatcher<CoreMap> getMultiPatternMatcher(Collection<TokenSequencePattern> patterns) {
    return new MultiPatternMatcher<>(
            new MultiPatternMatcher.BasicSequencePatternTrigger<>(
                    new CoreMapNodePatternTrigger(patterns)
            ), patterns);
  }

  /**
   * Create a multi-pattern matcher for matching across multiple TokensRegex patterns.
   * @param patterns input patterns
   * @return a MultiPatternMatcher
   */
  public static MultiPatternMatcher<CoreMap> getMultiPatternMatcher(TokenSequencePattern... patterns) {
    return new MultiPatternMatcher<>(
            new MultiPatternMatcher.BasicSequencePatternTrigger<>(
                    new CoreMapNodePatternTrigger(patterns)
            ), patterns);
  }

}