NERCombinerAnnotatorITest.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.ColumnTabDocumentReaderWriter;
import edu.stanford.nlp.util.CoreMap;
import junit.framework.TestCase;

import java.io.StringReader;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;


/**
 * @author Angel Chang
 * @author John Bauer
 */
public class NERCombinerAnnotatorITest extends TestCase {

  public static final String NER_3CLASS = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL;
  public static final String NER_7CLASS = DefaultPaths.DEFAULT_NER_MUC_MODEL;
  public static final String NER_MISCCLASS = DefaultPaths.DEFAULT_NER_CONLL_MODEL;

  private static NERCombinerAnnotator nerAnnotator = null;
  private static AnnotationPipeline unthreadedPipeline = null;
  private static AnnotationPipeline threaded4Pipeline = null;

  /**
   * Creates the tagger annotator if it isn't already created
   */
  @Override
  public void setUp()
    throws Exception
  {
    synchronized(NERCombinerAnnotatorITest.class) {
      if (nerAnnotator == null) {
        nerAnnotator = new NERCombinerAnnotator(false, NER_3CLASS, NER_7CLASS, NER_MISCCLASS);

        Properties props = new Properties();
        props.setProperty("ner.applyNumericClassifiers", "false");
        props.setProperty("ner.useSUTime", "false");
        props.setProperty("ner.model", NER_3CLASS);
        NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
        NERCombinerAnnotator threaded4Annotator = new NERCombinerAnnotator(ner, false, 4, -1);

        threaded4Pipeline = new AnnotationPipeline();
        threaded4Pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
        threaded4Pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
        threaded4Pipeline.addAnnotator(threaded4Annotator);

        NERCombinerAnnotator unthreadedAnnotator = new NERCombinerAnnotator(ner, false, 1, -1);
        unthreadedPipeline = new AnnotationPipeline();
        unthreadedPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
        unthreadedPipeline.addAnnotator(new WordsToSentencesAnnotator(false));
        unthreadedPipeline.addAnnotator(unthreadedAnnotator);
      }
    }
  }

  public void testPipelineAnnotator() {
    Annotation document = new Annotation(TEXT);
    unthreadedPipeline.annotate(document);
    verifyAnswers(ANSWERS, document);
  }

  public void testThreadedAnnotator() {
    Annotation document = new Annotation(TEXT);
    threaded4Pipeline.annotate(document);
    verifyAnswers(ANSWERS, document);

    document = new Annotation(TEXT + TEXT + TEXT);
    threaded4Pipeline.annotate(document);
    verifyAnswers(ANSWERS, document);
  }

  public static void verifyAnswers(String[][] expected, Annotation document) {
    int sentenceIndex = 0;
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      assertEquals(expected[sentenceIndex % expected.length].length, tokens.size());

      int token = 0;
      for (CoreLabel word : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        assertEquals(expected[sentenceIndex % expected.length][token], word.ner());
        ++token;
      }
      ++sentenceIndex;
    }
  }

  public static final String TEXT = "John Bauer used to work at Stanford.  He worked there for 4 years.  John left in August 2014.  ";

  public static final String[][] ANSWERS = {
    { "PERSON", "PERSON", "O", "O", "O", "O", "ORGANIZATION", "O" },
    { "O", "O", "O", "O", "O", "O", "O" },
    { "PERSON", "O", "O", "O", "O", "O" }
  };

  private static Iterator<Annotation> getTestData(String inputString, boolean includeAnswer)
  {
    ColumnTabDocumentReaderWriter<CoreMap> colReader = new ColumnTabDocumentReaderWriter<>();
    if (includeAnswer) {
      colReader.init("word=0,tag=1,answer=2");
    } else {
      colReader.init("word=0,tag=1");
    }
    StringReader strReader = new StringReader(inputString);
    return colReader.getDocIterator(strReader);
  }

  private static void checkAnnotation(String goldInputString) throws Exception
  {
    // Use separate sets for gold and test since the NER annotator may write stuff to the AnswerAnnotation
    Iterator<Annotation> goldDocs = getTestData(goldInputString, true);
    Iterator<Annotation> testDocs = getTestData(goldInputString, false);
    int k = 0;
    while (testDocs.hasNext()) {
      Annotation goldDoc = goldDocs.next();
      Annotation testDoc = testDocs.next();
      nerAnnotator.annotate(testDoc);
      List<CoreLabel> goldTokens = goldDoc.get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> testTokens = testDoc.get(CoreAnnotations.TokensAnnotation.class);
      assertEquals("token number", goldTokens.size(), testTokens.size());
      for (int i = 0; i < goldTokens.size(); i++) {
        CoreLabel goldToken = goldTokens.get(i);
        CoreLabel testToken = testTokens.get(i);
        //System.err.println("POS: " + testToken.get(CoreAnnotations.PartOfSpeechAnnotation.class));
        String goldNer = goldToken.get(CoreAnnotations.AnswerAnnotation.class);
        String testNer = testToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        //System.err.println("Ner tag for token " + i + " doc " + k +", GOLD: " + goldNer + ", TEST:" + testNer);
        assertEquals("Ner tag for token " + i + " (\"" + testToken.word() + "\") doc " + k, goldNer, testNer);
      }
      k++;
    }

  }

  public void testCombinedAnnotation() throws Exception
  {
    StringBuilder sb = new StringBuilder();
    sb.append("EU\tNNP\tORGANIZATION\n");
    sb.append("rejects\tVBZ\tO\n");
    sb.append("German\tNNP\tMISC\n");
    sb.append("call\tNN\tO\n");
    sb.append("to\tTO\tO\n");
    sb.append("boycott\tVB\tO\n");
    sb.append("British\tNNP\tMISC\n");
    sb.append("lamb\tNN\tO\n");
    sb.append(".\t.\tO\n");
    sb.append("Peter\tNNP\tPERSON\n");
    sb.append("Blackburn\tNNP\tPERSON\n");
    sb.append("BRUSSELS\tNNP\tLOCATION\n");
    sb.append("1996-08-22\tCD\tDATE\n");
    sb.append("It\tPRP\tO\n");
    sb.append("is\tVBZ\tO\n");
    sb.append("bright\tJJ\tO\n");
    sb.append("during\tIN\tO\n");
    sb.append("the\tDT\tDATE\n");
    sb.append("day\tNN\tDATE\n");
    sb.append(".\t.\tO\n");
    sb.append("It\tPRP\tO\n");
    sb.append("was\tVBZ\tO\n");
    sb.append("2\tJJ\tDURATION\n");
    sb.append("days\tIN\tDURATION\n");
    sb.append("before\tDT\tO\n");
    sb.append("the\tNN\tO\n");
    sb.append("meeting\tNN\tO\n");
    sb.append(".\t.\tO\n");

    checkAnnotation(sb.toString());
  }

}