/*
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.opennlp;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.util.JCasUtil.select;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.jcas.JCas;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations;
import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness;
public class OpenNlpSegmenterTest
{
@Test
public void testItalian() throws Exception
{
final String language = "it";
final String variant = "maxent";
final String text = "Questo è un test. E un altro ancora.";
final String[] sentences = { "Questo è un test.", "E un altro ancora." };
final String[] tokens = { "Questo", "è", "un", "test", ".", "E", "un", "altro", "ancora",
"." };
runTest(language, variant, text, sentences, tokens);
runTestWithModelsLocation(language, variant, text, sentences, tokens);
}
@Ignore("We don't have these models integrated yet")
@Test
public void testPortugueseCogroo() throws Exception
{
final String text = "Este é um teste. E mais uma.";
final String[] sentences = { "Este é um teste.", "E mais uma." };
final String[] tokens = { "Este", "é", "um", "teste", ".", "E", "mais", "uma", "." };
runTest("pt", "cogroo", text, sentences, tokens);
}
@Test
public void runHarness()
throws Throwable
{
AnalysisEngineDescription aed = createEngineDescription(OpenNlpSegmenter.class);
SegmenterHarness.run(aed, (language, variant) -> {
AssumeResource.assumeResource(OpenNlpSegmenter.class, "sentence", language,
"maxent");
},
"de.1", "en.7", "en.9", "ar.1", "zh.1", "zh.2");
}
private JCas runTest(String aLanguage, String aVariant, String aDocument, String[] sentences,
String[] tokens)
throws Exception
{
AssumeResource.assumeResource(OpenNlpSegmenter.class, "sentence", aLanguage, aVariant);
AnalysisEngine engine = createEngine(OpenNlpSegmenter.class,
OpenNlpSegmenter.PARAM_VARIANT, aVariant);
// Cannot use TestRunner because that uses TokenBuilder to create a segmentation.
JCas jcas = engine.newJCas();
jcas.setDocumentLanguage(aLanguage);
jcas.setDocumentText(aDocument);
engine.process(jcas);
AssertAnnotations.assertSentence(sentences, select(jcas, Sentence.class));
AssertAnnotations.assertToken(tokens, select(jcas, Token.class));
return jcas;
}
private JCas runTestWithModelsLocation(final String aLanguage, final String variant,
final String aDocument, final String[] sentences, final String[] tokens)
throws Exception
{
AssumeResource.assumeResource(OpenNlpSegmenter.class, "sentence", aLanguage, variant);
final AnalysisEngine engine = createEngine(OpenNlpSegmenter.class,
OpenNlpSegmenter.PARAM_VARIANT, variant,
OpenNlpSegmenter.PARAM_SEGMENTATION_MODEL_LOCATION,
"classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/sentence-" + aLanguage + "-"
+ variant + ".bin", OpenNlpSegmenter.PARAM_TOKENIZATION_MODEL_LOCATION,
"classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/token-" + aLanguage + "-"
+ variant + ".bin");
// Cannot use TestRunner because that uses TokenBuilder to create a segmentation.
JCas jcas = engine.newJCas();
jcas.setDocumentLanguage(aLanguage);
jcas.setDocumentText(aDocument);
engine.process(jcas);
AssertAnnotations.assertSentence(sentences, select(jcas, Sentence.class));
AssertAnnotations.assertToken(tokens, select(jcas, Token.class));
return jcas;
}
@Test
public void testZoning() throws Exception
{
SegmenterHarness.testZoning(OpenNlpSegmenter.class);
}
@Rule
public DkproTestContext testContext = new DkproTestContext();
}