/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.formats.LeipzigDoccatSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
* These tests ensure that the existing SourceForge models perform
* as expected.
* <p>
* To run these tests, the external Leipzig sentences file is needed:
* leipzig/eng_news_2010_300K-sentences.txt. This file can be
* obtained from the Leipzig Corpora Collection. <br>
* <p>
* In addition, all the SourceForge models are required:<br>
* - models-sf/en-sent.bin<br>
* - models-sf/en-token.bin<br>
* - models-sf/en-ner-date.bin<br>
* - models-sf/en-ner-location.bin<br>
* - models-sf/en-ner-money.bin<br>
* - models-sf/en-ner-organization.bin<br>
* - models-sf/en-ner-percentage.bin<br>
* - models-sf/en-ner-person.bin<br>
* - models-sf/en-ner-time.bin<br>
* - models-sf/en-chunker.bin<br>
* - models-sf/en-pos-maxent.bin<br>
* - models-sf/en-pos-perceptron.bin<br>
* - models-sf/en-parser-chunking.bin<br>
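* <p>
* The base directory containing the Leipzig data and the models is resolved
* via {@code EvalUtil.getOpennlpDataDir()}; how that directory is configured
* (for example through a system property or the build setup) is assumed to be
* handled externally and is not part of this class.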
*/
public class SourceForgeModelEval {
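/**
 * Verifies, before any evaluation runs, that the Leipzig input file on disk
 * matches the expected content by comparing its digest against a known value.
 */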
@BeforeClass
public static void ensureTestDataIsCorrect() throws IOException {
MessageDigest digest = EvalUtil.createDigest();
try (ObjectStream<String> lines = new PlainTextByLineStream(
new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")), StandardCharsets.UTF_8)) {
String line;
while ((line = lines.read()) != null) {
digest.update(line.getBytes(StandardCharsets.UTF_8));
}
Assert.assertEquals(new BigInteger("248567841356936801447294643695012852392"),
new BigInteger(1, digest.digest()));
}
}
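/**
 * Evaluates the en-sent.bin model: reads the Leipzig text in batches of 25 lines,
 * runs sentence detection over the concatenated text and digests the detected sentences.
 */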
@Test
public void evalSentenceModel() throws IOException {
SentenceModel model = new SentenceModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));
MessageDigest digest = EvalUtil.createDigest();
SentenceDetector sentenceDetector = new SentenceDetectorME(model);
StringBuilder text = new StringBuilder();
try (ObjectStream<DocumentSample> lineBatches = new LeipzigDoccatSampleStream("en", 25,
new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")))) {
DocumentSample lineBatch;
while ((lineBatch = lineBatches.read()) != null) {
text.append(String.join(" ", lineBatch.getText())).append(" ");
}
}
String[] sentences = sentenceDetector.sentDetect(text.toString());
for (String sentence : sentences) {
digest.update(sentence.getBytes(StandardCharsets.UTF_8));
}
Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
new BigInteger(1, digest.digest()));
}
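/**
 * Evaluates the en-token.bin model by re-tokenizing each whitespace-tokenized
 * Leipzig line and digesting the resulting tokens.
 */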
@Test
public void evalTokenModel() throws IOException {
// The input stream is already whitespace-tokenized; ideally it would be
// detokenized first (or the stream extended to accept a tokenizer, so that a
// whitespace tokenizer could be passed in) and then re-tokenized here.
TokenizerModel model = new TokenizerModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));
MessageDigest digest = EvalUtil.createDigest();
Tokenizer tokenizer = new TokenizerME(model);
try (ObjectStream<DocumentSample> lines = new LeipzigDoccatSampleStream("en", 1,
WhitespaceTokenizer.INSTANCE,
new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")))) {
DocumentSample line;
while ((line = lines.read()) != null) {
String[] tokens = tokenizer.tokenize(String.join(" ", line.getText()));
for (String token : tokens) {
digest.update(token.getBytes(StandardCharsets.UTF_8));
}
}
}
Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"),
new BigInteger(1, digest.digest()));
}
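/**
 * Creates a stream over the Leipzig file that yields one document sample per line.
 */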
private ObjectStream<DocumentSample> createLineWiseStream() throws IOException {
return new LeipzigDoccatSampleStream("en", 1,
new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")));
}
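/**
 * Runs the given name finder model over the line-wise Leipzig stream and compares
 * a digest of the found spans (type, start and end) against the expected hash.
 */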
private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
throws IOException {
MessageDigest digest = EvalUtil.createDigest();
TokenNameFinder nameFinder = new NameFinderME(model);
try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
DocumentSample line;
while ((line = lines.read()) != null) {
Span[] names = nameFinder.find(line.getText());
for (Span name : names) {
digest.update((name.getType() + name.getStart()
+ name.getEnd()).getBytes(StandardCharsets.UTF_8));
}
}
}
Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
}
@Test
public void evalNerDateModel() throws IOException {
TokenNameFinderModel dateModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin"));
evalNameFinder(dateModel, new BigInteger("116570003910213570906062355532299200317"));
}
@Test
public void evalNerLocationModel() throws IOException {
TokenNameFinderModel locationModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-location.bin"));
evalNameFinder(locationModel, new BigInteger("44810593886021404716125849669208680993"));
}
@Test
public void evalNerMoneyModel() throws IOException {
TokenNameFinderModel moneyModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin"));
evalNameFinder(moneyModel, new BigInteger("65248897509365807977219790824670047287"));
}
@Test
public void evalNerOrganizationModel() throws IOException {
TokenNameFinderModel organizationModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-organization.bin"));
evalNameFinder(organizationModel, new BigInteger("50454559690338630659278005157657197233"));
}
@Test
public void evalNerPercentageModel() throws IOException {
TokenNameFinderModel percentageModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-percentage.bin"));
evalNameFinder(percentageModel, new BigInteger("320996882594215344113023719117249515343"));
}
@Test
public void evalNerPersonModel() throws IOException {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-person.bin"));
evalNameFinder(personModel, new BigInteger("143619582249937129618340838626447763744"));
}
@Test
public void evalNerTimeModel() throws IOException {
TokenNameFinderModel timeModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin"));
evalNameFinder(timeModel, new BigInteger("282941772380683328816791801782579055940"));
}
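/**
 * Evaluates the en-chunker.bin model: each line is POS tagged with the perceptron
 * tagger and then chunked, and the resulting chunk tags are digested.
 */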
@Test
public void evalChunkerModel() throws IOException {
MessageDigest digest = EvalUtil.createDigest();
POSTagger tagger = new POSTaggerME(new POSModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));
Chunker chunker = new ChunkerME(new ChunkerModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin")));
try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
DocumentSample line;
while ((line = lines.read()) != null) {
POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));
String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
for (String chunk : chunks) {
digest.update(chunk.getBytes(StandardCharsets.UTF_8));
}
}
}
Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"),
new BigInteger(1, digest.digest()));
}
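/**
 * Runs the given POS model over the line-wise Leipzig stream and compares a digest
 * of the predicted tags against the expected hash.
 */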
private void evalPosModel(POSModel model, BigInteger expectedHash) throws IOException {
// The input stream is already tokenized and split line-wise, so it can be
// tagged here directly without further sentence detection.
MessageDigest digest = EvalUtil.createDigest();
POSTagger tagger = new POSTaggerME(model);
try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
DocumentSample line;
while ((line = lines.read()) != null) {
String[] tags = tagger.tag(line.getText());
for (String tag : tags) {
digest.update(tag.getBytes(StandardCharsets.UTF_8));
}
}
}
Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
}
@Test
public void evalMaxentModel() throws IOException {
POSModel maxentModel = new POSModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));
evalPosModel(maxentModel, new BigInteger("231995214522232523777090597594904492687"));
}
@Test
public void evalPerceptronModel() throws IOException {
POSModel perceptronModel = new POSModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));
evalPosModel(perceptronModel, new BigInteger("209440430718727101220960491543652921728"));
}
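/**
 * Evaluates the en-parser-chunking.bin model: parses each line and digests the
 * textual representation of the top parse (or the marker "empty" if no parse is returned).
 */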
@Test
public void evalParserModel() throws IOException {
ParserModel model = new ParserModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));
MessageDigest digest = EvalUtil.createDigest();
Parser parser = ParserFactory.create(model);
try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
DocumentSample line;
while ((line = lines.read()) != null) {
Parse[] parse = ParserTool.parseLine(String.join(" ", line.getText()), parser, 1);
if (parse.length > 0) {
StringBuffer sb = new StringBuffer();
parse[0].show(sb);
digest.update(sb.toString().getBytes(StandardCharsets.UTF_8));
} else {
digest.update("empty".getBytes(StandardCharsets.UTF_8));
}
}
}
Assert.assertEquals(new BigInteger("312218841713337505306598301082074515847"),
new BigInteger(1, digest.digest()));
}
}