/*
 * Copyright 2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.treetagger;

import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertChunks;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTagset;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.junit.Assert.assertEquals;

import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.testing.factory.TokenBuilder;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.junit.Assume;
import org.junit.Rule;
import org.junit.Test;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner;

public class TreeTaggerChunkerTest
{
    @Test
    public void testEnglish()
        throws Exception
    {
        JCas jcas = runTest("en", null, "We need a very complicated example sentence, which "
                + "contains as many constituents and dependencies as possible .");

        String[] chunks = new String[] {
                "[  0,  2]NC(NC) (We)",
                "[  3,  7]VC(VC) (need)",
                "[  8, 44]NC(NC) (a very complicated example sentence,)",
                "[ 45, 50]NC(NC) (which)",
                "[ 51, 59]VC(VC) (contains)",
                "[ 60, 62]PC(PC) (as)",
                "[ 63, 97]NC(NC) (many constituents and dependencies)",
                "[ 98,100]PC(PC) (as)",
                "[101,109]ADJC(ADJC) (possible)",
                "[110,111]O(O) (.)" };

        String[] chunkTags = new String[] { "ADJC", "ADVC", "CONJC", "INTJ", "LST", "NC", "O",
                "PC", "PRT", "SBAR", "VC", "that" };

        // String[] unmappedChunk = new String[] { "#", "$", "''", "-LRB-", "-RRB-", "``" };

        assertChunks(chunks, select(jcas, Chunk.class));
        assertTagset(Chunk.class, "tt", chunkTags, jcas);
        // FIXME assertTagsetMapping(Chunk.class, "conll2000", unmappedChunk, jcas);
    }

    @Test
    public void testGerman()
        throws Exception
    {
        JCas jcas = runTest("de", null, "Wir brauchen ein sehr kompliziertes Beispiel , welches "
                + "möglichst viele Konstituenten und Dependenzen beinhaltet .");

        String[] chunks = new String[] {
                "[  0,  3]NC(NC) (Wir)",
                "[  4, 12]VC(VC) (brauchen)",
                "[ 13, 44]NC(NC) (ein sehr kompliziertes Beispiel)",
                "[ 45, 46]O(0) (,)",
                "[ 47, 54]NC(NC) (welches)",
                "[ 55, 64]O(0) (möglichst)",
                "[ 65, 84]NC(NC) (viele Konstituenten)",
                "[ 85, 88]O(0) (und)",
                "[ 89,100]NC(NC) (Dependenzen)",
                "[101,111]VC(VC) (beinhaltet)",
                "[112,113]O(0) (.)" };

        String[] chunkTags = new String[] { "0", "NC", "PC", "VC" };

        // String[] unmappedChunk = new String[] { "#", "$", "''", "-LRB-", "-RRB-", "``" };

        assertChunks(chunks, select(jcas, Chunk.class));
        assertTagset(Chunk.class, "tt", chunkTags, jcas);
        // FIXME assertTagsetMapping(Chunk.class, "conll2000", unmappedChunk, jcas);
    }

    @Test
    public void testFrench()
        throws Exception
    {
        JCas jcas = runTest("fr", null, "Nous avons besoin d' une phrase par exemple très "
                + "compliqué , qui contient des constituants que de nombreuses dépendances et que "
                + "possible .");

        String[] chunks = new String[] {
                "[  0, 17]VC(VN) (Nous avons besoin)",
                "[ 18, 20]PC(PP) (d')",
                "[ 21, 31]NC(NP) (une phrase)",
                "[ 32, 35]PC(PP) (par)",
                "[ 36, 43]NC(NP) (exemple)",
                "[ 44, 60]ADJC(AP) (très compliqué ,)",
                "[ 61, 64]NC(NP) (qui)",
                "[ 65, 73]VC(VN) (contient)",
                "[ 74, 90]NC(NP) (des constituants)",
                "[ 91, 94]O(Ssub) (que)",
                "[ 95,120]NC(NP) (de nombreuses dépendances)",
                "[121,127]O(COORD) (et que)",
                "[128,136]ADJC(AP) (possible)",
                "[137,138]O(0) (.)" };

        String[] chunkTags = new String[] { "0", "AP", "AdP", "COORD", "NP", "PONCT:S", "PP",
                "Sint", "Srel", "Ssub", "VN", "VPinf", "VPpart" };

        // String[] unmappedChunk = new String[] { "#", "$", "''", "-LRB-", "-RRB-", "``" };

        assertChunks(chunks, select(jcas, Chunk.class));
        assertTagset(Chunk.class, "ftb", chunkTags, jcas);
        // FIXME assertTagsetMapping(Chunk.class, "conll2000", unmappedChunk, jcas);
    }

    /**
     * Runs a POS tagger / chunker aggregate on the given text via the shared {@link TestRunner}.
     */
    private JCas runTest(String aLanguage, String aVariant, String aText)
        throws Exception
    {
        checkModelsAndBinary(aLanguage);

        AnalysisEngineDescription tagger = createEngineDescription(TreeTaggerPosTagger.class);
        AnalysisEngineDescription chunker = createEngineDescription(TreeTaggerChunker.class,
                TreeTaggerChunker.PARAM_VARIANT, aVariant,
                TreeTaggerChunker.PARAM_PRINT_TAGSET, true);
        AnalysisEngineDescription aggregate = createEngineDescription(tagger, chunker);

        return TestRunner.runTest(aggregate, aLanguage, aText);
    }

    /**
     * Alternative helper that builds the pipeline manually and compares chunk types and values
     * position by position.
     */
    private JCas runTest(String aLanguage, String aText, String[] aLemmas, String[] aTags,
            String[] aTagClasses)
        throws Exception
    {
        AnalysisEngine tagger = createEngine(TreeTaggerPosTagger.class);
        AnalysisEngine chunker = createEngine(TreeTaggerChunker.class,
                TreeTaggerPosTagger.PARAM_PRINT_TAGSET, true);

        JCas aJCas = JCasFactory.createJCas();
        aJCas.setDocumentLanguage(aLanguage);

        TokenBuilder<Token, Annotation> tb = TokenBuilder.create(Token.class, Annotation.class);
        tb.buildTokens(aJCas, aText);

        tagger.process(aJCas);
        chunker.process(aJCas);

        // test Chunk annotations
        if (aTagClasses != null && aTags != null) {
            int i = 0;
            for (Chunk posAnnotation : select(aJCas, Chunk.class)) {
                System.out.println(posAnnotation.getChunkValue() + ": ["
                        + posAnnotation.getCoveredText() + "]");
                assertEquals("In position " + i, aTagClasses[i],
                        posAnnotation.getType().getShortName());
                assertEquals("In position " + i, aTags[i], posAnnotation.getChunkValue());
                i++;
            }
            assertEquals(aTags.length, i);
        }

        return aJCas;
    }

    /**
     * Skips the test if the chunker model or the TreeTagger binary package is not available on
     * the classpath.
     */
    private void checkModelsAndBinary(String lang)
    {
        Assume.assumeTrue(getClass().getResource(
                "/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/chunker-" + lang + "-le.bin") != null);

        Assume.assumeTrue(getClass().getResource(
                "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null);
    }

    @Rule
    public DkproTestContext testContext = new DkproTestContext();
}