/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.hunpos; import static org.apache.commons.lang.StringUtils.repeat; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.testing.util.HideOutput; import org.apache.uima.jcas.JCas; import org.junit.Assume; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class HunPosTaggerTest { // @Test // public void testCatalan() // throws Exception // { // runTest("ca", null, "Aquesta és una prova .", // new String[] { "Pd-nsn--n-a", "Vcr3s", "N-msan", "Z" }, // new String[] { "POS", "POS", "POS", "POS" }); // } @Test public void testCroatian() throws Exception { runTest("hr", null, "Ovo je test .", new String[] { "Pd-nsn--n-a", "Vcr3s", "N-msan", "Z" }, new String[] { "POS", "POS", "POS", "POS" }); } @Test public void testDanish() throws Exception { runTest("da", null, "Dette er en test .", new String[] { "PD", "VA", "PI", "NC", "XP" }, new String[] { "PRON", "VERB", "PRON", "NOUN", "PUNCT" }); } @Test public void testEnglish() throws Exception { runTest("en", null, "This is a test .", new String[] { "DT", "VBZ", "DT", "NN", "." }, new String[] { "DET", "VERB", "DET", "NOUN", "PUNCT" }); runTest("en", null, "A neural net .", new String[] { "DT", "JJ", "NN", "." }, new String[] { "DET", "ADJ", "NOUN", "PUNCT" }); runTest("en", null, "John is purchasing oranges .", new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, new String[] { "PROPN", "VERB", "VERB", "NOUN", "PUNCT" }); } @Test public void testFarsi() throws Exception { runTest("fa", null, "این یک تست است . \n", new String[] { "DET", "PRO", "N_SING", "V_COP", "DELM" }, new String[] { "DET", "PRON", "NOUN", "VERB", "PUNCT" }); } @Test public void testGerman() throws Exception { runTest("de", null, "Das ist ein Test .", new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, new String[] { "PRON", "VERB", "DET", "NOUN", "PUNCT" }); } @Test public void testHungarian() throws Exception { runTest("hu", null, "Ez egy teszt .", new String[] { "NOUN", "ART", "NOUN", "PUNCT" }, new String[] { "POS", "POS", "POS", "POS" }); } @Test public void testPortuguese() throws Exception { runTest("pt", null, "Este é um teste .", new String[] {"pron-det", "v-fin", "art", "n", "punc" }, new String[] { "PRON", "VERB", "DET", "NOUN", "PUNCT" }); runTest("pt", "tbchp", "Este é um teste .", new String[] { "D", "SR-P", "D-UM", "N", "." }, new String[] { "POS", "POS", "POS", "POS", "POS" }); runTest("pt", "mm", "Este é um teste .", new String[] { "PROSUB", "V", "ART", "N", "." }, new String[] { "POS", "POS", "POS", "POS", "POS" }); runTest("pt", "bosque", "Este é um teste .", new String[] { "pron-det", "v-fin", "art", "n", "punc" }, new String[] { "PRON", "VERB", "DET", "NOUN", "PUNCT" }); } @Test public void testRussian() throws Exception { runTest("ru", null, "Это тест .", new String[] { "A", "S", "PUNC" }, new String[] { "POS", "POS", "POS" }); } @Test public void testSlovenian() throws Exception { runTest("sl", null, "To je test .", new String[] { "zaimek-kazalni", "glagol-pomožni", "samostalnik-občno_ime", "PUNC" }, new String[] { "POS", "POS", "POS", "POS" }); } @Test public void testSwedish() throws Exception { runTest("sv", null, "Detta är ett test .", new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "DL_MAD" }, new String[] { "X", "X", "X", "X", "X" }); runTest("sv", "paroletags", "Detta är ett test .", new String[] { "PF@NS0@S", "V@IPAS", "DI@NS@S", "NCNSN@IS", "FE" }, new String[] { "POS", "POS", "POS", "POS", "POS" }); runTest("sv", "suctags", "Detta är ett test .", new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "DL_MAD" }, new String[] { "X", "X", "X", "X", "X" }); // runTest("sv", "suc2x", "Detta är ett test .", // new String[] { "PN_NEU_SIN_DEF_SUB@OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "MAD" }, // new String[] { "O", "O", "O", "O", "O" }); } @Test // @Ignore("Platform specific") public void testOddCharacters() throws Exception { runTest("en", null, "² § ¶ § °", new String[] { "NNP", "NNP", "NNP", "NNP", "NNP" }, new String[] { "PROPN", "PROPN", "PROPN", "PROPN", "PROPN"}); } /** * Generate a very large document and test it. * @throws Exception if an error occurs. */ @Test @Ignore("Takes too long") public void hugeDocumentTest() throws Exception { // Start Java with -Xmx512m boolean run = Runtime.getRuntime().maxMemory() > (500000000); if (!run) { System.out.println("Test requires more heap than available, skipping"); } Assume.assumeTrue(run); String text = "This is a test .\n"; int reps = 4000000 / text.length(); String testString = repeat(text, " ", reps); AnalysisEngineDescription engine = createEngineDescription(HunPosTagger.class); JCas jcas = TestRunner.runTest(engine, "en", testString); List<POS> actualTags = new ArrayList<POS>(select(jcas, POS.class)); assertEquals(reps * 5, actualTags.size()); // test POS annotations String[] expectedTags = new String[] { "DT", "VBZ", "DT", "NN", "." }; String[] expectedTagClasses = new String[] { "ART", "V", "ART", "NN", "PUNC" }; for (int i = 0; i < actualTags.size(); i++) { POS posAnnotation = actualTags.get(i); assertEquals("In position "+i, expectedTagClasses[i%5], posAnnotation.getType().getShortName()); assertEquals("In position "+i, expectedTags[i%5], posAnnotation.getPosValue()); } System.out.println("Successfully tagged document with " + testString.length() + " characters"); } /** * Test using the same AnalysisEngine multiple times. * @throws Exception if an error occurs. */ @Test @Ignore("Takes too long") public void multiDocumentTest() throws Exception { String testDocument = "This is a test ."; String[] tags = new String[] { "DT", "VBZ", "DT", "NN", "." }; String[] tagClasses = new String[] { "ART", "V", "ART", "NN", "PUNC" }; AnalysisEngine engine = createEngine(HunPosTagger.class); HideOutput hideOut = new HideOutput(); try { for (int n = 0; n < 100; n++) { JCas aJCas = TestRunner.runTest(engine, "en", testDocument); AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); } } finally { engine.destroy(); hideOut.restoreOutput(); } } private JCas runTest(String language, String variant, String testDocument, String[] tags, String[] tagClasses) throws Exception { AnalysisEngine engine = createEngine(HunPosTagger.class, HunPosTagger.PARAM_VARIANT, variant, HunPosTagger.PARAM_PRINT_TAGSET, true); JCas jcas = TestRunner.runTest(engine, language, testDocument); AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); return jcas; } @Rule public TestName name = new TestName(); @Before public void printSeparator() { System.out.println("\n=== " + name.getMethodName() + " ====================="); } }