/*
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.opennlp;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.util.JCasUtil.select;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.jcas.JCas;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;
import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource;
/**
 * Integration tests for {@code OpenNlpChunker}: runs a segmenter / POS tagger / chunker
 * pipeline over a sample sentence and verifies the produced {@link Chunk} annotations as
 * well as the tagset metadata recorded in the CAS.
 */
public class OpenNlpChunkerTest
{
    @Rule
    public TestName name = new TestName();

    /** Prints a visual separator before each test so console logs are easy to scan. */
    @Before
    public void printSeparator()
    {
        System.out.println("\n=== " + name.getMethodName() + " =====================");
    }

    @Test
    public void testEnglish()
        throws Exception
    {
        JCas jcas = runTest("en", null, "We need a very complicated example sentence, which " +
                "contains as many constituents and dependencies as possible.");

        // Expected chunk spans produced by the default English model.
        String[] expectedChunks = {
                "[ 0, 2]NC(NP) (We)",
                "[ 3, 7]VC(VP) (need)",
                "[ 8, 43]NC(NP) (a very complicated example sentence)",
                "[ 45, 50]NC(NP) (which)",
                "[ 51, 59]VC(VP) (contains)",
                "[ 60, 62]PC(PP) (as)",
                "[ 63, 97]NC(NP) (many constituents and dependencies)",
                "[ 98,100]PC(PP) (as)",
                "[101,109]ADJC(ADJP) (possible)" };

        // Full tag inventory the model declares, and tags without a type mapping (none).
        String[] expectedTagset = { "ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP", "PP", "PRT",
                "SBAR", "UCP", "VP" };
        String[] unmapped = {};

        assertChunks(expectedChunks, select(jcas, Chunk.class));
        assertTagset(Chunk.class, "conll2000", expectedTagset, jcas);
        assertTagsetMapping(Chunk.class, "conll2000", unmapped, jcas);
    }

    @Test
    public void testEnglishIxa()
        throws Exception
    {
        JCas jcas = runTest("en", "perceptron-ixa", "We need a very complicated example sentence, "
                + "which contains as many constituents and dependencies as possible.");

        // The IXA perceptron model labels "as" as SBAR here, unlike the default model.
        String[] expectedChunks = {
                "[ 0, 2]NC(NP) (We)",
                "[ 3, 7]VC(VP) (need)",
                "[ 8, 43]NC(NP) (a very complicated example sentence)",
                "[ 45, 50]NC(NP) (which)",
                "[ 51, 59]VC(VP) (contains)",
                "[ 60, 62]O(SBAR) (as)",
                "[ 63, 97]NC(NP) (many constituents and dependencies)",
                "[ 98,100]PC(PP) (as)",
                "[101,109]ADJC(ADJP) (possible)" };

        String[] expectedTagset = { "ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP", "PP", "PRT",
                "SBAR", "UCP", "VP" };
        String[] unmapped = {};

        assertChunks(expectedChunks, select(jcas, Chunk.class));
        assertTagset(Chunk.class, "conll2000", expectedTagset, jcas);
        assertTagsetMapping(Chunk.class, "conll2000", unmapped, jcas);
    }

    @Ignore("We don't have these models integrated yet")
    @Test
    public void testPortuguese()
        throws Exception
    {
        JCas jcas = runTest("pt", "cogroo", "Precisamos de uma frase exemplo muito complicado, que "
                + "contém o maior número de eleitores e dependências possível.");

        // NOTE(review): these expectations are placeholders copied from the English test;
        // they need to be replaced once the Portuguese models are integrated.
        String[] expectedChunks = new String[] {
                "[ 0, 43]Chunk(NP) (We need a very complicated example sentence)",
                "[ 43, 44]Chunk(O) (,)",
                "[ 45,109]Chunk(NP) (which contains as many constituents and dependencies as possible)",
                "[109,110]Chunk(O) (.)" };

        String[] expectedTagset = new String[] { "ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP", "O",
                "PP", "PRT", "SBAR", "UCP", "VP" };
        // String[] unmappedChunk = new String[] { "#", "$", "''", "-LRB-", "-RRB-", "``" };

        assertChunks(expectedChunks, select(jcas, Chunk.class));
        assertTagset(Chunk.class, "conll2000", expectedTagset, jcas);
        // FIXME assertTagsetMapping(Chunk.class, "conll2000", unmappedChunk, jcas);
    }

    /**
     * Runs the segmenter/tagger/chunker pipeline over the given text.
     *
     * @param aLanguage document language code (e.g. {@code "en"}).
     * @param aVariant  chunker model variant, or {@code null} for the default model.
     * @param aText     document text to process.
     * @return the processed CAS.
     */
    private JCas runTest(String aLanguage, String aVariant, String aText)
        throws Exception
    {
        // Skip this test when the required model is not available in the local environment.
        AssumeResource.assumeResource(OpenNlpChunker.class, "chunker", aLanguage,
                aVariant == null ? "default" : aVariant);

        // Assemble the aggregate pipeline: sentence/token segmentation, POS tagging, chunking.
        AnalysisEngineDescription pipeline = createEngineDescription(
                createEngineDescription(OpenNlpSegmenter.class),
                createEngineDescription(OpenNlpPosTagger.class),
                createEngineDescription(OpenNlpChunker.class,
                        OpenNlpChunker.PARAM_VARIANT, aVariant,
                        OpenNlpChunker.PARAM_PRINT_TAGSET, true));

        AnalysisEngine engine = createEngine(pipeline);
        JCas jcas = engine.newJCas();
        jcas.setDocumentLanguage(aLanguage);
        jcas.setDocumentText(aText);
        engine.process(jcas);
        return jcas;
    }
}