/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.lingpipe;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.util.JCasUtil.select;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;
import org.junit.Rule;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner;
/**
 * Regression test for {@link LingPipePosTagger} on English input. Each check runs the tagger on a
 * pre-tokenized sentence and compares the produced {@link POS} annotations against pinned
 * expectations. For the explicitly selected variants, the tagset and tagset mapping are
 * additionally checked through {@link AssertAnnotations}.
 */
public class LingPipePosTaggerTest
{
    /** Sentence used by every check that names a model variant explicitly. */
    private static final String FOX_SENTENCE = "The quick brown fox jumps over the lazy dog . \n";

    @Rule
    public DkproTestContext testContext = new DkproTestContext();

    /**
     * Exercises the tagger with no variant set ({@code null}) and with the
     * {@code general-brown}, {@code bio-genia} and {@code bio-medpost} variants.
     *
     * @throws Exception
     *             if engine creation or processing fails.
     */
    @Test
    public void testEnglish()
        throws Exception
    {
        // --- no variant specified -------------------------------------------------------------
        String[] simpleTags = { "DT", "BEZ", "AT", "NN", "." };
        String[] simpleClasses = { "DET", "VERB", "DET", "NOUN", "PUNCT" };
        runTest("en", null, "This is a test .", simpleTags, simpleClasses);

        String[] neuralTags = { "AT", "JJ", "NN", "." };
        String[] neuralClasses = { "DET", "ADJ", "NOUN", "PUNCT" };
        runTest("en", null, "A neural net .", neuralTags, neuralClasses);

        String[] johnTags = { "NP", "BEZ", "VBG", "NNS", "." };
        String[] johnClasses = { "PROPN", "VERB", "VERB", "NOUN", "PUNCT" };
        runTest("en", null, "John is purchasing oranges .", johnTags, johnClasses);

        // --- general-brown --------------------------------------------------------------------
        // The expectation below pins the tagger's actual output even though it is linguistically
        // WRONG: "jumps" comes out as "NNS".
        String[] brownFoxTags = { "AT", "JJ", "JJ", "NN", "NNS", "IN", "AT", "JJ", "NN", "." };
        String[] brownFoxClasses = { "DET", "ADJ", "ADJ", "NOUN", "NOUN", "ADP", "DET", "ADJ",
                "NOUN", "PUNCT" };
        JCas brownCas = runTest("en", "general-brown", FOX_SENTENCE, brownFoxTags,
                brownFoxClasses);

        String[] expectedBrownTagset = { "'", "''", "(", ")", "*", ",", "--", ".", ":", "ABL",
                "ABN", "ABX",
                "AP", "AP$", "AT", "BE", "BED", "BEDZ", "BEG", "BEM", "BEN", "BER", "BEZ", "CC",
                "CD", "CD$", "CS", "DO", "DOD", "DOZ", "DT", "DT$", "DTI", "DTS", "DTX", "EX",
                "HV", "HVD", "HVG", "HVN", "HVZ", "IN", "JJ", "JJ$", "JJR", "JJS", "JJT", "MD",
                "NIL", "NN", "NN$", "NNS", "NNS$", "NP", "NP$", "NPS", "NPS$", "NR", "NR$", "NRS",
                "OD", "PN", "PN$", "PP$", "PP$$", "PPL", "PPLS", "PPO", "PPS", "PPSS", "QL", "QLP",
                "RB", "RB$", "RBR", "RBT", "RN", "RP", "TL", "TO", "UH", "VB", "VBD", "VBG", "VBN",
                "VBZ", "WDT", "WP$", "WPO", "WPS", "WQL", "WRB", "``" };
        String[] expectedBrownUnmapped = { "'", "''", "*", "--", "AP$", "DT$", "JJ$", "NIL",
                "``" };
        AssertAnnotations.assertTagset(POS.class, "brown", expectedBrownTagset, brownCas);
        AssertAnnotations.assertTagsetMapping(POS.class, "brown", expectedBrownUnmapped, brownCas);

        // --- bio-genia ------------------------------------------------------------------------
        String[] geniaFoxTags = { "DT", "RB", "VBN", "NN", "NNS", "IN", "DT", "NN", "NN", "." };
        String[] geniaFoxClasses = { "DET", "ADV", "VERB", "NOUN", "NOUN", "ADP", "DET", "NOUN",
                "NOUN", "PUNCT" };
        JCas geniaCas = runTest("en", "bio-genia", FOX_SENTENCE, geniaFoxTags, geniaFoxClasses);

        String[] expectedPtbTagset = { "", "''", "(", ")", ",", "-", ".", ":", "CC", "CD", "CT",
                "DT", "EX",
                "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "N", "NN", "NNP", "NNPS", "NNS", "PDT",
                "POS", "PP", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "VB", "VBD",
                "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "XT", "``" };
        String[] expectedPtbUnmapped = { "", "CT", "N", "XT" };
        AssertAnnotations.assertTagset(POS.class, "ptb", expectedPtbTagset, geniaCas);
        AssertAnnotations.assertTagsetMapping(POS.class, "ptb", expectedPtbUnmapped, geniaCas);

        // --- bio-medpost ----------------------------------------------------------------------
        String[] medpostFoxTags = { "DD", "NN", "JJ", "NN", "NNS", "II", "DD", "NN", "NN", "." };
        String[] medpostFoxClasses = { "DET", "NOUN", "ADJ", "NOUN", "NOUN", "ADP", "DET", "NOUN",
                "NOUN", "PUNCT" };
        JCas medpostCas = runTest("en", "bio-medpost", FOX_SENTENCE, medpostFoxTags,
                medpostFoxClasses);

        String[] expectedMedpostTagset = { "''", "(", ")", ",", ".", ":", "CC", "CC+", "CS",
                "CS+", "CSN",
                "CST", "DB", "DD", "EX", "GE", "II", "II+", "JJ", "JJ+", "JJR", "JJT", "MC", "NN",
                "NN+", "NNP", "NNS", "PN", "PND", "PNG", "PNR", "RR", "RR+", "RRR", "RRT", "SYM",
                "TO", "VBB", "VBD", "VBG", "VBI", "VBN", "VBZ", "VDB", "VDD", "VDN", "VDZ", "VHB",
                "VHD", "VHG", "VHI", "VHZ", "VM", "VVB", "VVD", "VVG", "VVGJ", "VVGN", "VVI",
                "VVN", "VVNJ", "VVZ", "``" };
        String[] expectedMedpostUnmapped = { "CC+", "CS+", "II+", "JJ+", "NN+", "RR+" };
        AssertAnnotations.assertTagset(POS.class, "medpost", expectedMedpostTagset, medpostCas);
        AssertAnnotations.assertTagsetMapping(POS.class, "medpost", expectedMedpostUnmapped,
                medpostCas);
    }

    /**
     * Creates a {@link LingPipePosTagger} engine for the given variant (with
     * {@code PARAM_PRINT_TAGSET} set to {@code true}), runs it on the document via
     * {@link TestRunner}, and asserts the resulting {@link POS} annotations.
     *
     * @param language
     *            language code handed to the test runner.
     * @param variant
     *            value for {@code PARAM_VARIANT}; may be {@code null}.
     * @param testDocument
     *            text to be tagged.
     * @param tags
     *            expected original tags, in document order.
     * @param tagClasses
     *            expected mapped tag classes, in document order.
     * @return the CAS produced by the run, for further assertions by the caller.
     * @throws Exception
     *             if engine creation or processing fails.
     */
    private JCas runTest(String language, String variant, String testDocument, String[] tags,
            String[] tagClasses)
        throws Exception
    {
        AnalysisEngine tagger = createEngine(LingPipePosTagger.class,
                LingPipePosTagger.PARAM_VARIANT, variant,
                LingPipePosTagger.PARAM_PRINT_TAGSET, true);

        JCas result = TestRunner.runTest(tagger, language, testDocument);
        AssertAnnotations.assertPOS(tagClasses, tags, select(result, POS.class));
        return result;
    }
}