/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.matetools;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.util.JCasUtil.select;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.junit.Assume;
import org.junit.Rule;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.hunpos.HunPosTagger;
import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations;
import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.testing.TagsetDescriptionStripper;
import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner;
public class MateParserTest
{
@Test
public void testGerman()
throws Exception
{
JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches "
+ "möglichst viele Konstituenten und Dependenzen beinhaltet .");
String[] dependencies = {
"[ 0, 3]Dependency(SB,basic) D[0,3](Wir) G[4,12](brauchen)",
"[ 4, 12]ROOT(--,basic) D[4,12](brauchen) G[4,12](brauchen)",
"[ 13, 16]Dependency(NK,basic) D[13,16](ein) G[36,44](Beispiel)",
"[ 17, 21]Dependency(MO,basic) D[17,21](sehr) G[22,35](kompliziertes)",
"[ 22, 35]Dependency(NK,basic) D[22,35](kompliziertes) G[36,44](Beispiel)",
"[ 36, 44]DOBJ(OA,basic) D[36,44](Beispiel) G[4,12](brauchen)",
"[ 45, 46]Dependency(--,basic) D[45,46](,) G[36,44](Beispiel)",
"[ 47, 54]Dependency(SB,basic) D[47,54](welches) G[101,111](beinhaltet)",
"[ 55, 64]Dependency(MO,basic) D[55,64](möglichst) G[65,70](viele)",
"[ 65, 70]Dependency(NK,basic) D[65,70](viele) G[71,84](Konstituenten)",
"[ 71, 84]DOBJ(OA,basic) D[71,84](Konstituenten) G[101,111](beinhaltet)",
"[ 85, 88]Dependency(CD,basic) D[85,88](und) G[71,84](Konstituenten)",
"[ 89,100]CONJ(CJ,basic) D[89,100](Dependenzen) G[85,88](und)",
"[101,111]Dependency(RC,basic) D[101,111](beinhaltet) G[36,44](Beispiel)",
"[112,113]Dependency(--,basic) D[112,113](.) G[101,111](beinhaltet)" };
String[] posTags = { "$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART",
"APZR", "ART", "CARD", "END", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", "MID",
"NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT", "PPOSS",
"PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU",
"PWAT", "PWAV", "PWS", "STPOS", "STR", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP",
"VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" };
String[] dependencyTags = { "--", "AC", "ADC", "AG", "AMS", "APP", "AVC", "CC", "CD", "CJ",
"CM", "CP", "CVC", "DA", "DM", "END", "EP", "JU", "MNR", "MO", "NG", "NK", "NMC",
"OA", "OA2", "OC", "OG", "OP", "PAR", "PD", "PG", "PH", "PM", "PNC", "RC", "RE",
"RS", "SB", "SBP", "SP", "SVP", "UC", "VO" };
AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class));
AssertAnnotations.assertTagset(POS.class, "stts", posTags, jcas);
AssertAnnotations.assertTagset(Dependency.class, "negra", dependencyTags, jcas);
}
@Test
public void testEnglish()
throws Exception
{
JCas jcas = runTest("en", "We need a very complicated example sentence , which " +
"contains as many constituents and dependencies as possible .");
String[] dependencies = {
"[ 0, 2]Dependency(SBJ,basic) D[0,2](We) G[3,7](need)",
"[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)",
"[ 8, 9]Dependency(NMOD,basic) D[8,9](a) G[35,43](sentence)",
"[ 10, 14]Dependency(AMOD,basic) D[10,14](very) G[15,26](complicated)",
"[ 15, 26]Dependency(NMOD,basic) D[15,26](complicated) G[35,43](sentence)",
"[ 27, 34]Dependency(NMOD,basic) D[27,34](example) G[35,43](sentence)",
"[ 35, 43]Dependency(OBJ,basic) D[35,43](sentence) G[3,7](need)",
"[ 44, 45]Dependency(P,basic) D[44,45](,) G[35,43](sentence)",
"[ 46, 51]Dependency(SBJ,basic) D[46,51](which) G[52,60](contains)",
"[ 52, 60]Dependency(NMOD,basic) D[52,60](contains) G[35,43](sentence)",
"[ 61, 63]Dependency(AMOD,basic) D[61,63](as) G[64,68](many)",
"[ 64, 68]Dependency(NMOD,basic) D[64,68](many) G[69,81](constituents)",
"[ 69, 81]Dependency(OBJ,basic) D[69,81](constituents) G[52,60](contains)",
"[ 82, 85]Dependency(COORD,basic) D[82,85](and) G[69,81](constituents)",
"[ 86, 98]Dependency(CONJ,basic) D[86,98](dependencies) G[82,85](and)",
"[ 99,101]Dependency(NMOD,basic) D[99,101](as) G[69,81](constituents)",
"[102,110]Dependency(PMOD,basic) D[102,110](possible) G[99,101](as)",
"[111,112]Dependency(P,basic) D[111,112](.) G[3,7](need)" };
String[] posTags = { "#", "$", "''", "(", ")", ",", ".", ":", "CC", "CD", "DT", "END",
"EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS", "LS", "MD", "MID", "NIL", "NN",
"NNP", "NNPS", "NNS", "PDT", "POS", "PRF", "PRP", "PRP$", "RB", "RBR", "RBS", "RP",
"STPOS", "STR", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
"WP", "WP$", "WRB", "``" };
String[] dependencyTags = { "ADV", "ADV-GAP", "AMOD", "AMOD-GAP", "APPO", "BNF", "CONJ",
"COORD", "DEP", "DEP-GAP", "DIR", "DIR-GAP", "DIR-OPRD", "DIR-PRD", "DTV",
"DTV-GAP", "END", "EXT", "EXT-GAP", "EXTR", "EXTR-GAP", "GAP-LGS", "GAP-LOC",
"GAP-LOC-PRD", "GAP-MNR", "GAP-NMOD", "GAP-OBJ", "GAP-OPRD", "GAP-PMOD", "GAP-PRD",
"GAP-PRP", "GAP-PUT", "GAP-SBJ", "GAP-SUB", "GAP-TMP", "GAP-VC", "HMOD", "HYPH",
"IM", "LGS", "LOC", "LOC-MNR", "LOC-OPRD", "LOC-PRD", "LOC-TMP", "MNR", "MNR-PRD",
"MNR-TMP", "NAME", "NMOD", "OBJ", "OPRD", "P", "PMOD", "POSTHON", "PRD", "PRD-PRP",
"PRD-TMP", "PRN", "PRP", "PRT", "PUT", "ROOT", "SBJ", "SUB", "SUFFIX", "TITLE",
"TMP", "VC", "VOC" };
AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class));
AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas);
AssertAnnotations.assertTagset(Dependency.class, "conll2008", dependencyTags, jcas);
}
@Test
public void testFrench()
throws Exception
{
JCas jcas = runTest("fr", "Nous avons besoin d'une phrase par exemple très "
+ "compliqué, qui contient des constituants que de nombreuses dépendances et que "
+ "possible .");
String[] dependencies = {
"[ 0, 4]Dependency(suj,basic) D[0,4](Nous) G[5,10](avons)",
"[ 5, 10]ROOT(root,basic) D[5,10](avons) G[5,10](avons)",
"[ 11, 17]Dependency(obj,basic) D[11,17](besoin) G[5,10](avons)",
"[ 18, 23]Dependency(mod,basic) D[18,23](d'une) G[5,10](avons)",
"[ 24, 30]Dependency(obj,basic) D[24,30](phrase) G[18,23](d'une)",
"[ 31, 34]Dependency(dep,basic) D[31,34](par) G[24,30](phrase)",
"[ 35, 42]Dependency(obj,basic) D[35,42](exemple) G[31,34](par)",
"[ 43, 47]Dependency(mod,basic) D[43,47](très) G[48,58](compliqué,)",
"[ 48, 58]Dependency(mod,basic) D[48,58](compliqué,) G[35,42](exemple)",
"[ 59, 62]Dependency(suj,basic) D[59,62](qui) G[63,71](contient)",
"[ 63, 71]Dependency(mod_rel,basic) D[63,71](contient) G[24,30](phrase)",
"[ 72, 75]Dependency(det,basic) D[72,75](des) G[76,88](constituants)",
"[ 76, 88]Dependency(obj,basic) D[76,88](constituants) G[63,71](contient)",
"[ 89, 92]Dependency(dep,basic) D[89,92](que) G[76,88](constituants)",
"[ 93, 95]Dependency(det,basic) D[93,95](de) G[107,118](dépendances)",
"[ 96,106]Dependency(mod,basic) D[96,106](nombreuses) G[107,118](dépendances)",
"[107,118]Dependency(obj,basic) D[107,118](dépendances) G[89,92](que)",
"[119,121]Dependency(coord,basic) D[119,121](et) G[89,92](que)",
"[122,125]Dependency(dep_coord,basic) D[122,125](que) G[119,121](et)",
"[126,134]Dependency(obj,basic) D[126,134](possible) G[122,125](que)",
"[135,136]Dependency(ponct,basic) D[135,136](.) G[5,10](avons)" };
String[] posMapped = { "PRON", "VERB", "NOUN", "ADP", "NOUN", "ADP", "NOUN", "ADV", "ADJ",
"PRON", "VERB", "DET", "NOUN", "CONJ", "DET", "ADJ", "NOUN", "CONJ", "CONJ", "ADJ",
"PUNCT" };
String[] posOriginal = { "CLS", "V", "NC", "P", "NC", "P", "NC", "ADV", "ADJ",
"PROREL", "V", "DET", "NC", "CS", "DET", "ADJ", "NC", "CC", "CS", "ADJ", "PONCT" };
String[] posTags = { "ADJ", "ADJWH", "ADV", "ADVWH",
"CC", "CLO", "CLR", "CLS", "CS", "DET", "DETWH", "END", "ET", "I", "MID", "NC",
"NPP", "P", "P+D", "P+PRO", "PONCT", "PREF", "PRO", "PROREL", "PROWH", "STPOS",
"STR", "V", "VIMP", "VINF", "VPP", "VPR", "VS" };
String[] depTags = { "END", "a_obj", "aff", "arg", "ato", "ats", "aux_caus", "aux_pass",
"aux_tps", "comp", "coord", "de_obj", "dep", "dep_coord", "det", "missinghead",
"mod", "mod_rel", "obj", "obj1", "p_obj", "ponct", "root", "suj" };
String[] unmappedPos = { "END", "MID", "STPOS", "STR" };
String[] unmappedDep = { "END", "comp", "missinghead", "obj1", "root" };
AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class));
AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class));
AssertAnnotations.assertTagset(POS.class, "melt", posTags, jcas);
AssertAnnotations.assertTagsetMapping(POS.class, "melt", unmappedPos, jcas);
AssertAnnotations.assertTagset(Dependency.class, "ftb", depTags, jcas);
AssertAnnotations.assertTagsetMapping(Dependency.class, "ftb", unmappedDep, jcas);
}
@Test
public void testFarsi()
throws Exception
{
JCas jcas = runTest(
"fa",
"parsper",
"ما به عنوان مثال جمله بسیار پیچیده، که شامل به عنوان بسیاری از مولفه ها و وابستگی ها که ممکن است نیاز دارید .");
String[] dependencies = {
"[ 0, 2]Dependency(nsubj,basic) D[0,2](ما) G[39,43](شامل)",
"[ 3, 5]Dependency(prep,basic) D[3,5](به) G[39,43](شامل)",
"[ 6, 11]Dependency(pobj,basic) D[6,11](عنوان) G[3,5](به)",
"[ 12, 16]Dependency(pobj,basic) D[12,16](مثال) G[3,5](به)",
"[ 17, 21]Dependency(pobj,basic) D[17,21](جمله) G[3,5](به)",
"[ 22, 27]Dependency(advmod,basic) D[22,27](بسیار) G[28,35](پیچیده،)",
"[ 28, 35]Dependency(acomp,basic) D[28,35](پیچیده،) G[39,43](شامل)",
"[ 36, 38]Dependency(rel,basic) D[36,38](که) G[39,43](شامل)",
"[ 39, 43]ROOT(root,basic) D[39,43](شامل) G[39,43](شامل)",
"[ 44, 46]Dependency(prep,basic) D[44,46](به) G[39,43](شامل)",
"[ 47, 52]Dependency(pobj,basic) D[47,52](عنوان) G[44,46](به)",
"[ 53, 59]Dependency(pobj,basic) D[53,59](بسیاری) G[47,52](عنوان)",
"[ 60, 62]Dependency(prep,basic) D[60,62](از) G[53,59](بسیاری)",
"[ 63, 68]Dependency(pobj,basic) D[63,68](مولفه) G[60,62](از)",
"[ 69, 71]Dependency(pobj,basic) D[69,71](ها) G[60,62](از)",
"[ 72, 73]Dependency(cc,basic) D[72,73](و) G[53,59](بسیاری)",
"[ 74, 81]Dependency(pobj,basic) D[74,81](وابستگی) G[88,92](ممکن)",
"[ 82, 84]Dependency(pobj,basic) D[82,84](ها) G[74,81](وابستگی)",
"[ 85, 87]Dependency(complm,basic) D[85,87](که) G[88,92](ممکن)",
"[ 88, 92]Dependency(conj,basic) D[88,92](ممکن) G[53,59](بسیاری)",
"[ 93, 96]Dependency(pobj,basic) D[93,96](است) G[44,46](به)",
"[ 97,101]Dependency(pobj,basic) D[97,101](نیاز) G[44,46](به)",
"[102,107]Dependency(pobj,basic) D[102,107](دارید) G[44,46](به)",
"[108,109]Dependency(punct,basic) D[108,109](.) G[39,43](شامل)" };
String[] posTags = { "ADJ", "ADV", "CLITIC", "CON", "DELM", "DET", "END", "FW", "INT",
"MID", "N", "NUM", "P", "PREV", "PRO", "STPOS", "STR", "V" };
String[] depTags = { "END", "acc", "acomp", "acomp/pc", "advcl", "advcl/cop", "advcl/pc",
"advmod", "advmod/pc", "amod", "amod/cop", "amod/pc", "appos", "appos/pc", "aux",
"auxpass", "cc", "ccomp", "ccomp/cop", "ccomp/pc", "ccomp/pc/cop", "ccomp\\cpobj",
"ccomp\\nsubj", "ccomp\\pobj", "ccomp\\poss", "complm", "conj", "conj/cop",
"conj/pc", "conj\\pobj", "conj\\poss", "cop", "cpobj", "cpobj/pc", "cprep", "dep",
"dep-top", "dep-voc", "dep/pc", "det", "dobj", "dobj/acc", "dobj/pc", "fw", "lvc",
"lvc/pc", "mark", "mwe", "mwe/pc", "neg", "nn", "nn/cop", "npadvmod", "nsubj",
"nsubj/pc", "nsubjpass", "nsubjpass/pc", "num", "number", "parataxis",
"parataxis/cop", "parataxis/pc", "pobj", "pobj/cop", "pobj/pc", "poss", "poss/acc",
"poss/cop", "poss/pc", "preconj", "predet", "prep", "prep/det", "prep/pc",
"prep/pobj", "prt", "punct", "quantmod", "rcmod", "rcmod/cop", "rcmod/pc",
"rcmod\\amod", "rcmod\\pobj", "rcmod\\poss", "rel", "root", "root/cop", "root/pc",
"root\\conj", "root\\pobj", "root\\poss", "tmod", "xcomp" };
String[] unmappedPos = { };
String[] origPos = { "PRO", "P", "N_SING", "N_SING", "N_SING", "ADV", "ADJ", "CON", "ADJ",
"P", "N_SING", "ADJ", "P", "N_SING", "N_SING", "CON", "N_SING", "N_PL", "CON",
"ADJ", "V_COP", "N_SING", "V_PRS", "DELM" };
String[] mappedPos = { "PRON", "ADP", "NOUN", "NOUN", "NOUN", "ADV", "ADJ", "CONJ", "ADJ",
"ADP", "NOUN", "ADJ", "ADP", "NOUN", "NOUN", "CONJ", "NOUN", "NOUN", "CONJ", "ADJ",
"VERB", "NOUN", "VERB", "PUNCT" };
AssertAnnotations.assertPOS(mappedPos, origPos, JCasUtil.select(jcas, POS.class));
AssertAnnotations.assertDependencies(dependencies, JCasUtil.select(jcas, Dependency.class));
AssertAnnotations.assertTagset(POS.class, "upc-reduced", posTags, jcas);
AssertAnnotations.assertTagsetMapping(POS.class, "upc-reduced", unmappedPos, jcas);
AssertAnnotations.assertTagset(Dependency.class, "updt", depTags, jcas);
// FIXME AssertAnnotations.assertTagsetMapping(Dependency.class, "ftb", new String[] {},
// jcas);
}
private JCas runTest(String aLanguage, String aVariant, String aText)
throws Exception
{
AssumeResource.assumeResource(MateSemanticRoleLabeler.class, "parser", aLanguage, aVariant);
AnalysisEngineDescription engine = getEngines(aLanguage, aVariant);
if (aLanguage.startsWith("dummy-")) {
aLanguage = aLanguage.substring("dummy-".length());
}
return TestRunner.runTest(engine, aLanguage, aText);
}
public static AnalysisEngineDescription getEngines(String aLanguage, String aVariant)
throws ResourceInitializationException
{
List<AnalysisEngineDescription> engines = new ArrayList<AnalysisEngineDescription>();
if ("fa".equals(aLanguage) || "sv".equals(aLanguage)) {
engines.add(createEngineDescription(HunPosTagger.class));
}
else {
engines.add(createEngineDescription(MatePosTagger.class));
}
engines.add(createEngineDescription(TagsetDescriptionStripper.class));
engines.add(createEngineDescription(MateParser.class,
MateParser.PARAM_VARIANT, aVariant,
MateParser.PARAM_PRINT_TAGSET, true));
return createEngineDescription(engines
.toArray(new AnalysisEngineDescription[engines.size()]));
}
private JCas runTest(String aLanguage, String aText)
throws Exception
{
Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 2000000000);
AssumeResource.assumeResource(MateSemanticRoleLabeler.class, "parser", aLanguage, null);
AnalysisEngineDescription aggregate = createEngineDescription(
createEngineDescription(MatePosTagger.class),
createEngineDescription(MateParser.class));
return TestRunner.runTest(aggregate, aLanguage, aText);
}
@Rule
public DkproTestContext testContext = new DkproTestContext();
}