/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.sentdetect;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.Test;
import opennlp.tools.formats.ResourceAsStreamFactory;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.InsufficientTrainingDataException;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
/**
* Tests for the {@link SentenceDetectorME} class.
*/
public class SentenceDetectorMETest {
@Test
public void testSentenceDetector() throws IOException {
InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
"/opennlp/tools/sentdetect/Sentences.txt");
TrainingParameters mlParams = new TrainingParameters();
mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
SentenceModel sentdetectModel = SentenceDetectorME.train(
"en", new SentenceSampleStream(new PlainTextByLineStream(in,
StandardCharsets.UTF_8)), factory, mlParams);
Assert.assertEquals("en", sentdetectModel.getLanguage());
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
// Tests sentence detector with sentDetect method
String sampleSentences1 = "This is a test. There are many tests, this is the second.";
String[] sents = sentDetect.sentDetect(sampleSentences1);
Assert.assertEquals(sents.length,2);
Assert.assertEquals(sents[0],"This is a test.");
Assert.assertEquals(sents[1],"There are many tests, this is the second.");
double[] probs = sentDetect.getSentenceProbabilities();
Assert.assertEquals(probs.length,2);
String sampleSentences2 = "This is a test. There are many tests, this is the second";
sents = sentDetect.sentDetect(sampleSentences2);
Assert.assertEquals(sents.length,2);
probs = sentDetect.getSentenceProbabilities();
Assert.assertEquals(probs.length,2);
Assert.assertEquals(sents[0],"This is a test.");
Assert.assertEquals(sents[1],"There are many tests, this is the second");
String sampleSentences3 = "This is a \"test\". He said \"There are many tests, this is the second.\"";
sents = sentDetect.sentDetect(sampleSentences3);
Assert.assertEquals(sents.length,2);
probs = sentDetect.getSentenceProbabilities();
Assert.assertEquals(probs.length,2);
Assert.assertEquals(sents[0],"This is a \"test\".");
Assert.assertEquals(sents[1],"He said \"There are many tests, this is the second.\"");
String sampleSentences4 = "This is a \"test\". I said \"This is a test.\" Any questions?";
sents = sentDetect.sentDetect(sampleSentences4);
Assert.assertEquals(sents.length,3);
probs = sentDetect.getSentenceProbabilities();
Assert.assertEquals(probs.length,3);
Assert.assertEquals(sents[0],"This is a \"test\".");
Assert.assertEquals(sents[1],"I said \"This is a test.\"");
Assert.assertEquals(sents[2],"Any questions?");
String sampleSentences5 = "This is a one sentence test space at the end. ";
sents = sentDetect.sentDetect(sampleSentences5);
Assert.assertEquals(1, sentDetect.getSentenceProbabilities().length);
Assert.assertEquals(sents[0],"This is a one sentence test space at the end.");
String sampleSentences6 = "This is a one sentences test with tab at the end. ";
sents = sentDetect.sentDetect(sampleSentences6);
Assert.assertEquals(sents[0],"This is a one sentences test with tab at the end.");
String sampleSentences7 = "This is a test. With spaces between the two sentences.";
sents = sentDetect.sentDetect(sampleSentences7);
Assert.assertEquals(sents[0],"This is a test.");
Assert.assertEquals(sents[1],"With spaces between the two sentences.");
String sampleSentences9 = "";
sents = sentDetect.sentDetect(sampleSentences9);
Assert.assertEquals(0, sents.length);
String sampleSentences10 = " "; // whitespaces and tabs
sents = sentDetect.sentDetect(sampleSentences10);
Assert.assertEquals(0, sents.length);
String sampleSentences11 = "This is test sentence without a dot at the end and spaces ";
sents = sentDetect.sentDetect(sampleSentences11);
Assert.assertEquals(sents[0],"This is test sentence without a dot at the end and spaces");
probs = sentDetect.getSentenceProbabilities();
Assert.assertEquals(1, probs.length);
String sampleSentence12 = " This is a test.";
sents = sentDetect.sentDetect(sampleSentence12);
Assert.assertEquals(sents[0],"This is a test.");
String sampleSentence13 = " This is a test";
sents = sentDetect.sentDetect(sampleSentence13);
Assert.assertEquals(sents[0],"This is a test");
// Test that sentPosDetect also works
Span[] pos = sentDetect.sentPosDetect(sampleSentences2);
Assert.assertEquals(pos.length,2);
probs = sentDetect.getSentenceProbabilities();
Assert.assertEquals(probs.length,2);
Assert.assertEquals(new Span(0, 15), pos[0]);
Assert.assertEquals(new Span(16, 56), pos[1]);
}
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
"/opennlp/tools/sentdetect/SentencesInsufficient.txt");
TrainingParameters mlParams = new TrainingParameters();
mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
SentenceDetectorME.train("en",
new SentenceSampleStream(
new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams);
}
}