// TokensRegexNERAnnotatorITest.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;
import junit.framework.TestCase;

import java.io.File;
import java.io.PrintWriter;
import java.util.List;
import java.util.Properties;

/**
 * Integration tests for {@link TokensRegexNERAnnotator} (test cases taken from RegexNERAnnotator).
 *
 * <p>Every test first runs a shared {@code tokenize, ssplit, pos, lemma, ner} pipeline over
 * the input text and then applies a {@code TokensRegexNERAnnotator} on top of the result.
 * The "basic" tests at the bottom use the mapping file at {@link #MAPPING}; the others write
 * their patterns to a temporary mapping file.
 *
 * @author Angel Chang
 */
public class TokensRegexNERAnnotatorITest extends TestCase {
  /** Annotator name, used as the prefix of every property passed to the annotator. */
  private static final String REGEX_ANNOTATOR_NAME = "tokensregexner";
  /** Mapping file used by the tests ported from RegexNERAnnotatorITest. */
  private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";

  // Shared across all tests; created once by the first call to setUp().
  private static StanfordCoreNLP pipeline;
  private static Annotator caseless;
  private static Annotator cased;
  private static Annotator annotator;

  /**
   * Lazily builds the shared pipeline and the annotators backed by {@link #MAPPING}.
   * The work is done only on the first invocation; later invocations see a non-null
   * {@code pipeline} and return immediately.
   */
  @Override
  public void setUp() throws Exception {
    synchronized(TokensRegexNERAnnotatorITest.class) {
      if (pipeline == null) {  // Hack so we don't load the pipeline fresh for every test
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        pipeline = new StanfordCoreNLP(props);
        // Basic caseless and cased tokens regex annotators
        caseless = new TokensRegexNERAnnotator(MAPPING, true);
        cased = new TokensRegexNERAnnotator(MAPPING);
        annotator = cased;
      }
    }
  }

  // Helper methods

  /**
   * Creates an annotator named {@link #REGEX_ANNOTATOR_NAME} configured from {@code props}.
   *
   * @param props properties, keyed with the {@code tokensregexner.} prefix
   * @return the newly constructed annotator
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props)
  {
    return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
  }

  /**
   * Creates an annotator for the given patterns using otherwise empty properties.
   *
   * @param patterns one mapping-file row per element; the columns of a row are tab-joined
   * @param ignoreCase value written to the {@code tokensregexner.ignorecase} property
   * @return the newly constructed annotator
   * @throws Exception if the temporary mapping file cannot be created or written
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase) throws Exception
  {
    return getTokensRegexNerAnnotator(new Properties(), patterns, ignoreCase);
  }

  /**
   * Writes {@code patterns} to a temporary mapping file (one tab-separated row per line),
   * points {@code props} at that file, and creates an annotator from the result.
   *
   * <p>Note that {@code props} is modified: the {@code tokensregexner.mapping} and
   * {@code tokensregexner.ignorecase} properties are set on it.
   *
   * @param props properties to extend and pass to the annotator
   * @param patterns one mapping-file row per element; the columns of a row are tab-joined
   * @param ignoreCase value written to the {@code tokensregexner.ignorecase} property
   * @return the newly constructed annotator
   * @throws Exception if the temporary mapping file cannot be created or written
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props, String[][] patterns, boolean ignoreCase) throws Exception
  {
    // The suffix must carry its own dot; createTempFile does not insert one.
    File tempFile = File.createTempFile("tokensregexnertest.patterns", ".txt");
    tempFile.deleteOnExit();
    String mappingPath = tempFile.getAbsolutePath();
    // try-with-resources so the writer is closed (and flushed) even if a row fails to print
    try (PrintWriter pw = IOUtils.getPrintWriter(mappingPath)) {
      for (String[] p : patterns) {
        pw.println(StringUtils.join(p, "\t"));
      }
    }
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping", mappingPath);
    props.setProperty(REGEX_ANNOTATOR_NAME + ".ignorecase", String.valueOf(ignoreCase));
    return getTokensRegexNerAnnotator(props);
  }

  /**
   * Wraps {@code text} in an {@link Annotation} and runs the shared pipeline over it.
   *
   * @param text raw text to annotate
   * @return the annotated document
   */
  protected static Annotation createDocument(String text) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    return annotation;
  }

  /**
   * Helper method, checks that each token is tagged with the expected NER type.
   *
   * @param tokens tokens to inspect
   * @param tags expected NER tag for each token, in order; must have one entry per token
   */
  private static void checkNerTags(List<CoreLabel> tokens, String... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      assertEquals("Mismatch for token tag NER " + i + " " + tokens.get(i),
                   tags[i], tokens.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class));
    }
  }

  /**
   * Helper method, checks that each token carries the expected value under {@code key}.
   *
   * @param tokens tokens to inspect
   * @param key string-valued annotation key to read from each token
   * @param tags expected value for each token, in order ({@code null} meaning the key is
   *             expected to be unset); must have one entry per token
   */
  private static void checkTags(List<CoreLabel> tokens, Class<? extends CoreAnnotation<String>> key, String... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      assertEquals("Mismatch for token tag " + key + " " + i + " " + tokens.get(i),
        tags[i], tokens.get(i).get(key));
    }
  }

  /**
   * Helper method, re-annotate each token with specified tag
   *
   * @param tokens tokens to modify
   * @param key string-valued annotation key to set on each token
   * @param tags new value for each token, in order; must have one entry per token
   */
  private static void reannotate(List<CoreLabel> tokens, Class<? extends CoreAnnotation<String>> key, String ... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      tokens.get(i).set(key, tags[i]);
    }
  }

  /**
   * Tests for TokensRegex syntax: a pattern that refers to the existing NER tag of a token
   * ({@code {ner:LOCATION}}), applied with both a cased and a caseless annotator.
   */
  public void testTokensRegexSyntax() throws Exception {
    String[][] regexes =
      new String[][]{
        new String[]{"( /University/ /of/ [ {ner:LOCATION} ] )", "SCHOOL"}
        // TODO: TokensRegex literal string patterns ignores ignoreCase settings
        //new String[]{"( University of [ {ner:LOCATION} ] )", "SCHOOL"}
    };
    Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);

    String str = "University of Alaska is located in Alaska.";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    // The pipeline tags the first "Alaska" as ORGANIZATION, so the pattern does not apply yet
    checkNerTags(tokens,
      "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O");

    // Force the first "Alaska" to LOCATION so that the pattern can match on the second pass
    reannotate(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,
            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    annotatorCased.annotate(document);

    checkNerTags(tokens,
      "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");

    // Try lowercase
    Annotator annotatorCaseless = getTokensRegexNerAnnotator(regexes, true);

    str = "university of alaska is located in alaska.";
    document = createDocument(str);
    tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    checkNerTags(tokens,
      "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    // The cased annotator must leave the lowercase text untouched ...
    annotatorCased.annotate(document);
    checkNerTags(tokens,
      "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    // ... while the caseless one tags it
    annotatorCaseless.annotate(document);
    checkNerTags(tokens,
      "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
  }

  /**
   * Tests for TokensRegex syntax with match group: only the tokens of the capture group
   * ("Mud") are expected to be tagged, not the whole match ("the movie Mud").
   */
  public void testTokensRegexMatchGroup() throws Exception {
    // NOTE(review): the last column ("1") presumably selects the capture group to annotate
    // under the annotator's default header; confirm against TokensRegexNERAnnotator.
    String[][] regexes =
      new String[][]{
        new String[]{"( /the/? /movie/ (/[A-Z].*/+) )", "MOVIE", "", "0", "1"}
      };
    Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);

    String str = "the movie Mud was very muddy";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens,
      "O", "O", "MOVIE", "O", "O", "O");

  }

  /**
   * Tests for TokensRegexNer annotator annotating other fields: a mapping header with a
   * {@code normalized} column should fill the normalized NER tag alongside the NER tag.
   */
  public void testTokensRegexNormalizedAnnotate() throws Exception {
    Properties props = new Properties();
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.header", "pattern,ner,normalized,overwrite,priority,group");

    String[][] regexes =
      new String[][]{
        new String[]{"blue",  "COLOR", "B", "", "0"},
        new String[]{"red",   "COLOR", "R", "", "0"},
        new String[]{"green", "COLOR", "G", "", "0"}
      };
    Annotator annotatorCased = getTokensRegexNerAnnotator(props, regexes, false);

    String str = "These are all colors: blue, red, and green.";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkTags(tokens, CoreAnnotations.TextAnnotation.class, "These", "are", "all", "colors", ":", "blue", ",", "red", ",", "and", "green", ".");
    checkTags(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,  "O", "O", "O", "O", "O", "COLOR", "O", "COLOR", "O", "O", "COLOR", "O");
    checkTags(tokens, CoreAnnotations.NormalizedNamedEntityTagAnnotation.class,  null, null, null, null, null, "B", null, "R", null, null, "G", null);
  }

  /**
   * Custom string-valued annotation key, referenced by class name from
   * {@link #testTokensRegexCustomAnnotate()}.
   */
  public static class TestAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * Tests for TokensRegexNer annotator annotating other fields with custom key mapping:
   * the {@code test} header column is mapped to {@link TestAnnotation} through the
   * {@code mapping.field.test} property.
   */
  public void testTokensRegexCustomAnnotate() throws Exception {

    Properties props = new Properties();
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.header", "pattern,test,overwrite,priority,group");
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.field.test", "edu.stanford.nlp.pipeline.TokensRegexNERAnnotatorITest$TestAnnotation");
    String[][] regexes =
      new String[][]{
        new String[]{"test", "TEST", "", "0"}
      };
    // ignoreCase is true here, so this annotator is the caseless variant
    Annotator annotatorCaseless = getTokensRegexNerAnnotator(props, regexes, true);

    String str = "Marking all test as test";
    Annotation document = createDocument(str);
    annotatorCaseless.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkTags(tokens, CoreAnnotations.TextAnnotation.class, "Marking", "all", "test", "as", "test");
    checkTags(tokens, TestAnnotation.class, null, null, "TEST", null, "TEST");
  }

  /**
   * Basic tests from RegexNERAnnotatorITest: entries of the shared mapping file should
   * be applied on top of the tags produced by the pipeline.
   */
  public void testBasicMatching() throws Exception {
    String str = "President Barack Obama lives in Chicago , Illinois , " +
    "and is a practicing Christian .";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens,
      "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE",
      "O", "O", "O", "O", "O", "IDEOLOGY", "O");

  }

  /**
   * The ORGANIZATION on Ontario Bank should not be overwritten since Ontario (STATE_OR_PROVINCE)
   * does not span Ontario Bank. Nevertheless, by the special Chinese KBP 2016 hack, the LOCATION on Ontario Lake
   * should be overwritten.  Native American Church will overwrite ORGANIZATION with
   * RELIGION.
   */
  public void testOverwrite() throws Exception {
    String str = "I like Ontario Bank and Ontario Lake , and I like the Native American Church , too .";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens, "O", "O", "ORGANIZATION", "ORGANIZATION", "O", "STATE_OR_PROVINCE", "LOCATION", "O", "O", "O", "O", "O", "RELIGION",
      "RELIGION", "RELIGION", "O", "O", "O");

  }

  /**
   * In the mapping file, Christianity is assigned a higher priority than Early Christianity,
   * and so Early should not be marked as RELIGION.
   */
  public void testPriority() throws Exception {
    String str = "Christianity is of higher regex priority than Early Christianity . ";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    checkNerTags(tokens, "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O");
  }


  /**
   * Test that if there are no annotations at all, the annotator
   * throws an exception.  We are happy if we can catch an exception
   * and continue, and if we don't get any exceptions, we throw an
   * exception of our own.
   */
  public void testEmptyAnnotation() throws Exception {
    try {
      annotator.annotate(new Annotation(""));
    } catch(RuntimeException e) {
      // Expected: the document was never run through the pipeline, so it has no tokens.
      return;
    }
    fail("Never expected to get this far... the annotator should have thrown an exception by now");
  }

}