/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.eval;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.formats.LeipzigDoccatSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;

/**
 * This test ensures that the existing SourceForge models perform
 * as they are expected to.
 * <p>
 * To run these tests the external Leipzig sentences file is needed:
 * leipzig/eng_news_2010_300K-sentences.txt; this file can be
 * obtained from the Leipzig corpus project.
 * <p>
 * And all the SourceForge models:<br>
 * - models-sf/en-sent.bin<br>
 * - models-sf/en-token.bin<br>
 * - models-sf/en-ner-date.bin<br>
 * - models-sf/en-ner-location.bin<br>
 * - models-sf/en-ner-money.bin<br>
 * - models-sf/en-ner-organization.bin<br>
 * - models-sf/en-ner-percentage.bin<br>
 * - models-sf/en-ner-person.bin<br>
 * - models-sf/en-ner-time.bin<br>
 * - models-sf/en-chunker.bin<br>
 * - models-sf/en-pos-maxent.bin<br>
 * - models-sf/en-pos-perceptron.bin<br>
 * - models-sf/en-parser-chunking.bin<br>
 */
public class SourceForgeModelEval {
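
  // Note: all tests resolve their input relative to EvalUtil.getOpennlpDataDir();
  // the Leipzig sentences file and the SourceForge models listed in the class
  // Javadoc are expected to be present under that directory.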

  @BeforeClass
  public static void ensureTestDataIsCorrect() throws IOException {
    MessageDigest digest = EvalUtil.createDigest();

    try (ObjectStream<String> lines = new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")), StandardCharsets.UTF_8)) {

      String line;
      while ((line = lines.read()) != null) {
        digest.update(line.getBytes(StandardCharsets.UTF_8));
      }

      Assert.assertEquals(new BigInteger("248567841356936801447294643695012852392"),
          new BigInteger(1, digest.digest()));
    }
  }

  @Test
  public void evalSentenceModel() throws IOException {

    SentenceModel model = new SentenceModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));

    MessageDigest digest = EvalUtil.createDigest();

    SentenceDetector sentenceDetector = new SentenceDetectorME(model);

    StringBuilder text = new StringBuilder();

    try (ObjectStream<DocumentSample> lineBatches = new LeipzigDoccatSampleStream("en", 25,
        new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")))) {

      DocumentSample lineBatch;
      while ((lineBatch = lineBatches.read()) != null) {
        text.append(String.join(" ", lineBatch.getText())).append(" ");
      }
    }

    String[] sentences = sentenceDetector.sentDetect(text.toString());

    for (String sentence : sentences) {
      digest.update(sentence.getBytes(StandardCharsets.UTF_8));
    }

    Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
        new BigInteger(1, digest.digest()));
  }

  @Test
  public void evalTokenModel() throws IOException {

    // The input stream is currently tokenized; ideally it would be detokenized
    // first (or the stream extended to accept a tokenizer, so the whitespace
    // tokenizer can be passed in) and then tokenized again here.

    TokenizerModel model = new TokenizerModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));

    MessageDigest digest = EvalUtil.createDigest();

    Tokenizer tokenizer = new TokenizerME(model);

    try (ObjectStream<DocumentSample> lines = new LeipzigDoccatSampleStream("en", 1,
        WhitespaceTokenizer.INSTANCE,
        new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")))) {

      DocumentSample line;
      while ((line = lines.read()) != null) {
        String[] tokens = tokenizer.tokenize(String.join(" ", line.getText()));
        for (String token : tokens) {
          digest.update(token.getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"),
        new BigInteger(1, digest.digest()));
  }

  private ObjectStream<DocumentSample> createLineWiseStream() throws IOException {
    return new LeipzigDoccatSampleStream("en", 1,
        new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")));
  }
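
  // evalNameFinder runs one name finder model over the line-wise Leipzig
  // stream and digests the type, start, and end of every span it finds, so
  // the expected hash pins down the model's output exactly.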
  private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
      throws IOException {

    MessageDigest digest = EvalUtil.createDigest();

    TokenNameFinder nameFinder = new NameFinderME(model);

    try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {

      DocumentSample line;
      while ((line = lines.read()) != null) {
        Span[] names = nameFinder.find(line.getText());
        for (Span name : names) {
          digest.update((name.getType() + name.getStart()
              + name.getEnd()).getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
  }

  @Test
  public void evalNerDateModel() throws IOException {
    TokenNameFinderModel dateModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin"));

    evalNameFinder(dateModel, new BigInteger("116570003910213570906062355532299200317"));
  }

  @Test
  public void evalNerLocationModel() throws IOException {
    TokenNameFinderModel locationModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-location.bin"));

    evalNameFinder(locationModel, new BigInteger("44810593886021404716125849669208680993"));
  }

  @Test
  public void evalNerMoneyModel() throws IOException {
    TokenNameFinderModel moneyModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin"));

    evalNameFinder(moneyModel, new BigInteger("65248897509365807977219790824670047287"));
  }

  @Test
  public void evalNerOrganizationModel() throws IOException {
    TokenNameFinderModel organizationModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-organization.bin"));

    evalNameFinder(organizationModel, new BigInteger("50454559690338630659278005157657197233"));
  }

  @Test
  public void evalNerPercentageModel() throws IOException {
    TokenNameFinderModel percentageModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-percentage.bin"));

    evalNameFinder(percentageModel, new BigInteger("320996882594215344113023719117249515343"));
  }

  @Test
  public void evalNerPersonModel() throws IOException {
    TokenNameFinderModel personModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-person.bin"));

    evalNameFinder(personModel, new BigInteger("143619582249937129618340838626447763744"));
  }

  @Test
  public void evalNerTimeModel() throws IOException {
    TokenNameFinderModel timeModel = new TokenNameFinderModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin"));

    evalNameFinder(timeModel, new BigInteger("282941772380683328816791801782579055940"));
  }

  @Test
  public void evalChunkerModel() throws IOException {

    MessageDigest digest = EvalUtil.createDigest();

    POSTagger tagger = new POSTaggerME(new POSModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));

    Chunker chunker = new ChunkerME(new ChunkerModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin")));

    try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {

      DocumentSample line;
      while ((line = lines.read()) != null) {
        POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));

        String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
        for (String chunk : chunks) {
          digest.update(chunk.getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"),
        new BigInteger(1, digest.digest()));
  }
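
  // evalPosModel is shared by the maxent and perceptron POS tests below; it
  // tags every line-wise sample with the given model and digests the emitted
  // tags for comparison against the expected hash.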
  private void evalPosModel(POSModel model, BigInteger expectedHash) throws IOException {

    // The input stream is already line-wise (one sentence per line) and
    // tokenized, so it can be tagged here directly.

    MessageDigest digest = EvalUtil.createDigest();

    POSTagger tagger = new POSTaggerME(model);

    try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {

      DocumentSample line;
      while ((line = lines.read()) != null) {
        String[] tags = tagger.tag(line.getText());
        for (String tag : tags) {
          digest.update(tag.getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
  }

  @Test
  public void evalMaxentModel() throws IOException {
    POSModel maxentModel = new POSModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));

    evalPosModel(maxentModel, new BigInteger("231995214522232523777090597594904492687"));
  }

  @Test
  public void evalPerceptronModel() throws IOException {
    POSModel perceptronModel = new POSModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));

    evalPosModel(perceptronModel, new BigInteger("209440430718727101220960491543652921728"));
  }

  @Test
  public void evalParserModel() throws IOException {

    ParserModel model = new ParserModel(
        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));

    MessageDigest digest = EvalUtil.createDigest();

    Parser parser = ParserFactory.create(model);

    try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {

      DocumentSample line;
      while ((line = lines.read()) != null) {
        Parse[] parse = ParserTool.parseLine(String.join(" ", line.getText()), parser, 1);

        if (parse.length > 0) {
          StringBuffer sb = new StringBuffer();
          parse[0].show(sb);
          digest.update(sb.toString().getBytes(StandardCharsets.UTF_8));
        }
        else {
          digest.update("empty".getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(new BigInteger("312218841713337505306598301082074515847"),
        new BigInteger(1, digest.digest()));
  }
}