/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.corenlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.commons.lang.ArrayUtils; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.jcas.JCas; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class CoreNlpDependencyParserTest { private static final String[] GERMAN_POS_TAGS = { "$,", "$.", "$[", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", "NE", "NN", "PDAT", "PDS", "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT", "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; private static final String[] STANFORD_DEPENDENCY_TAGS = { "acomp", "advcl", "advmod", "amod", "appos", "aux", "auxpass", "cc", "ccomp", "conj", "cop", "csubj", "csubjpass", "dep", "det", "discourse", "dobj", "expl", "iobj", "mark", "mwe", "neg", "nn", "npadvmod", "nsubj", "nsubjpass", "num", "number", "parataxis", "pcomp", "pobj", "poss", "possessive", "preconj", "predet", "prep", "prt", "punct", "quantmod", "rcmod", "root", "tmod", "vmod", "xcomp" }; private static final String[] UNIVERSAL_DEPENDENCY_TAGS = { "acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux", "auxpass", "case", "cc", "cc:preconj", "ccomp", "compound", "compound:prt", "conj", "cop", "csubj", "csubjpass", "dep", "det", "det:predet", "discourse", "dobj", "expl", "iobj", "list", "mark", "mwe", "neg", "nmod", "nmod:npmod", "nmod:poss", "nmod:tmod", "nsubj", "nsubjpass", "nummod", "parataxis", "punct", "root", "xcomp" }; private static final String[] PTB_POS_TAGS = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; private static final String[] UNIVERSAL_POS_TAGS = { "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X" }; private static final String[] CORENLP34_POS_TAGS = { "A", "ADJ", "ADJWH", "ADV", "ADVWH", "C", "CC", "CL", "CLO", "CLR", "CLS", "CS", "DET", "DETWH", "ET", "I", "N", "NC", "NPP", "P", "PREF", "PRO", "PROREL", "PROWH", "PUNC", "V", "VIMP", "VINF", "VPP", "VPR", "VS" }; @Test public void testEnglishStanfordDependencies() throws Exception { JCas jcas = runTest("en", "sd", "We need a very complicated example sentence , which " + "contains as many constituents and dependencies as possible ."); String[] dependencies = { "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", "[ 44, 45]PUNCT(punct,basic) D[44,45](,) G[35,43](sentence)", "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[69,81](constituents)", "[111,112]PUNCT(punct,basic) D[111,112](.) G[3,7](need)" }; String[] unmappedDep = {}; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "stanford341", STANFORD_DEPENDENCY_TAGS, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @Test public void testEnglishUniversalDependencies() throws Exception { JCas jcas = runTest("en", "ud", "We need a very complicated example sentence , which " + "contains as many constituents and dependencies as possible ."); String[] dependencies = { "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", "[ 27, 34]NN(compound,basic) D[27,34](example) G[35,43](sentence)", "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", "[ 44, 45]PUNCT(punct,basic) D[44,45](,) G[35,43](sentence)", "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", "[ 52, 60]Dependency(acl:relcl,basic) D[52,60](contains) G[35,43](sentence)", "[ 61, 63]PREP(case,basic) D[61,63](as) G[69,81](constituents)", "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", "[ 69, 81]Dependency(nmod:as,basic) D[69,81](constituents) G[52,60](contains)", "[ 82, 85]CC(cc,basic) D[82,85](and) G[69,81](constituents)", "[ 86, 98]CONJ(conj:and,basic) D[86,98](dependencies) G[69,81](constituents)", "[ 99,101]PREP(case,basic) D[99,101](as) G[102,110](possible)", "[102,110]Dependency(acl,basic) D[102,110](possible) G[69,81](constituents)", "[111,112]PUNCT(punct,basic) D[111,112](.) G[3,7](need)" }; String[] unmappedDep = { "acl:relcl", "cc:preconj", "compound:prt", "det:predet", "nmod:npmod", "nmod:poss", "nmod:tmod" }; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", UNIVERSAL_DEPENDENCY_TAGS, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test public void testEnglishWsjSd() throws Exception { JCas jcas = runTest("en", "wsj-sd", "We need a very complicated example sentence , which " + "contains as many constituents and dependencies as possible ."); String[] dependencies = { "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", "[ 44, 45]PUNCT(punct,basic) D[44,45](,) G[35,43](sentence)", "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", "[ 61, 63]PREP(prep,basic) D[61,63](as) G[52,60](contains)", "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", "[ 69, 81]POBJ(pobj,basic) D[69,81](constituents) G[61,63](as)", "[ 82, 85]CC(cc,basic) D[82,85](and) G[69,81](constituents)", "[ 86, 98]CONJ(conj:and,basic) D[86,98](dependencies) G[69,81](constituents)", "[ 99,101]PREP(prep,basic) D[99,101](as) G[69,81](constituents)", "[102,110]POBJ(pobj,basic) D[102,110](possible) G[99,101](as)", "[111,112]PUNCT(punct,basic) D[111,112](.) G[3,7](need)" }; String[] unmappedDep = {}; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "stanford341", STANFORD_DEPENDENCY_TAGS, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @Test public void testEnglishWsjUd() throws Exception { JCas jcas = runTest("en", "wsj-ud", "We need a very complicated example sentence , which " + "contains as many constituents and dependencies as possible ."); String[] dependencies = { "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", "[ 27, 34]NN(compound,basic) D[27,34](example) G[35,43](sentence)", "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", "[ 44, 45]PUNCT(punct,basic) D[44,45](,) G[35,43](sentence)", "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", "[ 52, 60]Dependency(acl:relcl,basic) D[52,60](contains) G[35,43](sentence)", "[ 61, 63]PREP(case,basic) D[61,63](as) G[69,81](constituents)", "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", "[ 69, 81]Dependency(nmod:as,basic) D[69,81](constituents) G[52,60](contains)", "[ 82, 85]CC(cc,basic) D[82,85](and) G[69,81](constituents)", "[ 86, 98]CONJ(conj:and,basic) D[86,98](dependencies) G[69,81](constituents)", "[ 99,101]PREP(case,basic) D[99,101](as) G[102,110](possible)", "[102,110]Dependency(acl,basic) D[102,110](possible) G[69,81](constituents)", "[111,112]PUNCT(punct,basic) D[111,112](.) G[3,7](need)" }; String[] depTags = { "acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux", "auxpass", "case", "cc", "cc:preconj", "ccomp", "compound", "compound:prt", "conj", "cop", "csubj", "csubjpass", "dep", "det", "det:predet", "discourse", "dobj", "expl", "iobj", "mark", "mwe", "neg", "nmod", "nmod:npmod", "nmod:poss", "nmod:tmod", "nsubj", "nsubjpass", "nummod", "parataxis", "punct", "root", "xcomp" }; String[] unmappedDep = { "acl:relcl", "cc:preconj", "compound:prt", "det:predet", "nmod:npmod", "nmod:poss", "nmod:tmod" }; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test public void testFrenchUniversalDependencies() throws Exception { JCas jcas = runTest("fr", "ud", "Nous avons besoin d' une phrase par exemple très " + "compliqué , qui contient des constituants que de nombreuses dépendances et que " + "possible ."); String[] dependencies = { "[ 0, 4]ROOT(root,basic) D[0,4](Nous) G[0,4](Nous)", "[ 5, 10]NN(compound,basic) D[5,10](avons) G[0,4](Nous)", "[ 11, 17]DEP(dep,basic) D[11,17](besoin) G[5,10](avons)", "[ 18, 20]MWE(mwe,basic) D[18,20](d') G[11,17](besoin)", "[ 21, 24]DET(det,basic) D[21,24](une) G[25,31](phrase)", "[ 25, 31]PREP(case,basic) D[25,31](phrase) G[18,20](d')", "[ 32, 35]PREP(case,basic) D[32,35](par) G[25,31](phrase)", "[ 36, 43]PREP(case,basic) D[36,43](exemple) G[32,35](par)", "[ 44, 48]ADVMOD(advmod,basic) D[44,48](très) G[49,58](compliqué)", "[ 49, 58]AMOD(amod,basic) D[49,58](compliqué) G[36,43](exemple)", "[ 59, 60]PUNCT(punct,basic) D[59,60](,) G[49,58](compliqué)", "[ 61, 64]APPOS(appos,basic) D[61,64](qui) G[59,60](,)", "[ 65, 73]APPOS(appos,basic) D[65,73](contient) G[61,64](qui)", "[ 74, 77]DET(det,basic) D[74,77](des) G[78,90](constituants)", "[ 78, 90]Dependency(nmod,basic) D[78,90](constituants) G[65,73](contient)", "[ 91, 94]DEP(dep,basic) D[91,94](que) G[78,90](constituants)", "[ 95, 97]DET(det,basic) D[95,97](de) G[109,120](dépendances)", "[ 98,108]AMOD(amod,basic) D[98,108](nombreuses) G[109,120](dépendances)", "[109,120]Dependency(nmod,basic) D[109,120](dépendances) G[91,94](que)", "[121,123]CC(cc,basic) D[121,123](et) G[109,120](dépendances)", "[124,127]MWE(mwe,basic) D[124,127](que) G[121,123](et)", "[128,136]AMOD(amod,basic) D[128,136](possible) G[124,127](que)", "[137,138]CONJ(conj:et,basic) D[137,138](.) G[109,120](dépendances)" }; String[] depTags = { "acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "cop", "csubj", "dep", "det", "discourse", "dobj", "expl", "iobj", "mark", "mwe", "name", "neg", "nmod", "nmod:poss", "nsubj", "nsubjpass", "nummod", "parataxis", "punct", "root", "xcomp" }; String[] unmappedDep = { "acl:relcl", "nmod:poss" }; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "corenlp34", CORENLP34_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "universal", UNIVERSAL_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test public void testGermanUniversalDependencies() throws Exception { JCas jcas = runTest("de", "ud", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); String[] dependencies = { "[ 0, 3]NSUBJ(nsubj,basic) D[0,3](Wir) G[4,12](brauchen)", "[ 4, 12]ROOT(root,basic) D[4,12](brauchen) G[4,12](brauchen)", "[ 13, 16]DET(det,basic) D[13,16](ein) G[36,44](Beispiel)", "[ 17, 21]ADVMOD(advmod,basic) D[17,21](sehr) G[22,35](kompliziertes)", "[ 22, 35]AMOD(amod,basic) D[22,35](kompliziertes) G[36,44](Beispiel)", "[ 36, 44]DOBJ(dobj,basic) D[36,44](Beispiel) G[4,12](brauchen)", "[ 45, 46]PUNCT(punct,basic) D[45,46](,) G[4,12](brauchen)", "[ 47, 54]NSUBJ(nsubj,basic) D[47,54](welches) G[101,111](beinhaltet)", "[ 55, 64]ADVMOD(advmod,basic) D[55,64](möglichst) G[65,70](viele)", "[ 65, 70]AMOD(amod,basic) D[65,70](viele) G[71,84](Konstituenten)", "[ 71, 84]DOBJ(dobj,basic) D[71,84](Konstituenten) G[101,111](beinhaltet)", "[ 85, 88]CC(cc,basic) D[85,88](und) G[71,84](Konstituenten)", "[ 89,100]CONJ(conj:und,basic) D[89,100](Dependenzen) G[71,84](Konstituenten)", "[101,111]Dependency(acl,basic) D[101,111](beinhaltet) G[4,12](brauchen)", "[112,113]PUNCT(punct,basic) D[112,113](.) G[4,12](brauchen)" }; String[] depTags = { "acl", "advcl", "advmod", "amod", "appos", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "cop", "csubj", "csubjpass", "dep", "det", "dobj", "expl", "iobj", "mark", "mwe", "name", "neg", "nmod", "nmod:poss", "nsubj", "nsubjpass", "nummod", "parataxis", "punct", "root", "xcomp" }; String[] unmappedDep = { "nmod:poss" }; String[] depParserPosTags = { "$,", "$.", "$[", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", "NE", "NN", "PDAT", "PDS", "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "stts", GERMAN_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "stts", depParserPosTags, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test public void testChineseCtbConllDependencies() throws Exception { JCas jcas = runTest("zh", "ctb-conll", "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); String[] dependencies = { "[ 0, 2]Dependency(SUB,basic) D[0,2](我们) G[3,5](需要)", "[ 3, 5]ROOT(root,basic) D[3,5](需要) G[3,5](需要)", "[ 6, 8]Dependency(AMOD,basic) D[6,8](一个) G[12,14](复杂)", "[ 9, 11]Dependency(AMOD,basic) D[9,11](非常) G[12,14](复杂)", "[ 12, 14]Dependency(DEP,basic) D[12,14](复杂) G[15,16](的)", "[ 15, 16]Dependency(NMOD,basic) D[15,16](的) G[17,19](句子)", "[ 17, 19]Dependency(OBJ,basic) D[17,19](句子) G[3,5](需要)", "[ 20, 22]Dependency(VMOD,basic) D[20,22](例如) G[26,28](包含)", "[ 23, 25]Dependency(SUB,basic) D[23,25](其中) G[26,28](包含)", "[ 26, 28]Dependency(VMOD,basic) D[26,28](包含) G[3,5](需要)", "[ 29, 31]Dependency(NMOD,basic) D[29,31](许多) G[32,34](成分)", "[ 32, 34]Dependency(SUB,basic) D[32,34](成分) G[43,45](依赖)", "[ 35, 36]Dependency(VMOD,basic) D[35,36](和) G[43,45](依赖)", "[ 37, 40]Dependency(DEP,basic) D[37,40](尽可能) G[41,42](的)", "[ 41, 42]Dependency(VMOD,basic) D[41,42](的) G[43,45](依赖)", "[ 43, 45]Dependency(VMOD,basic) D[43,45](依赖) G[26,28](包含)", "[ 46, 47]Dependency(P,basic) D[46,47](。) G[3,5](需要)" }; String[] depTags = { "AMOD", "DEP", "NMOD", "OBJ", "P", "PMOD", "PRD", "ROOT", "SBAR", "SUB", "VC", "VMOD" }; String[] posTags = { "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", "OD", "ON", "P", "PN", "PU", "SB", "SP", "URL", "VA", "VC", "VE", "VV", "X" }; String[] unmappedDep = {}; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); // There are some minor differences between the tags produced by the POS tagger and the // tags expected by the parser model. We need a better test here that makes these differences // more visible and at the same time doesn't fail. //AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, // jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "conll", depTags, jcas); AssertAnnotations.assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, "conll", unmappedDep, jcas); } @Test public void testChinesePtbConllDependencies() throws Exception { JCas jcas = runTest("zh", "ptb-conll", "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); // This output is bogus because the tagger we use here produced ctb tags and the model // expects ptb tags. However, I didn't find any pos tagger model for chinese that produces // the ptb tags... String[] dependencies = { "[ 0, 2]ROOT(root,basic) D[0,2](我们) G[0,2](我们)", "[ 3, 5]Dependency(COORD,basic) D[3,5](需要) G[0,2](我们)", "[ 6, 8]Dependency(COORD,basic) D[6,8](一个) G[3,5](需要)", "[ 9, 11]Dependency(COORD,basic) D[9,11](非常) G[6,8](一个)", "[ 12, 14]Dependency(COORD,basic) D[12,14](复杂) G[9,11](非常)", "[ 15, 16]Dependency(COORD,basic) D[15,16](的) G[12,14](复杂)", "[ 17, 19]Dependency(COORD,basic) D[17,19](句子) G[15,16](的)", "[ 20, 22]Dependency(COORD,basic) D[20,22](例如) G[17,19](句子)", "[ 23, 25]Dependency(COORD,basic) D[23,25](其中) G[20,22](例如)", "[ 26, 28]Dependency(COORD,basic) D[26,28](包含) G[23,25](其中)", "[ 29, 31]Dependency(NMOD,basic) D[29,31](许多) G[32,34](成分)", "[ 32, 34]Dependency(VMOD,basic) D[32,34](成分) G[41,42](的)", "[ 35, 36]Dependency(COORD,basic) D[35,36](和) G[32,34](成分)", "[ 37, 40]Dependency(CONJ,basic) D[37,40](尽可能) G[35,36](和)", "[ 41, 42]Dependency(COORD,basic) D[41,42](的) G[26,28](包含)", "[ 43, 45]Dependency(NMOD,basic) D[43,45](依赖) G[46,47](。)", "[ 46, 47]Dependency(VMOD,basic) D[46,47](。) G[41,42](的)" }; String[] depTags = { "AMOD", "APPO", "CONJ", "COORD", "DEP", "IM", "NAME", "NMOD", "P", "PMOD", "PRN", "PRT", "ROOT", "SUB", "SUFFIX", "VC", "VMOD" }; String[] posTags = { "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", "OD", "ON", "P", "PN", "PU", "SB", "SP", "URL", "VA", "VC", "VE", "VV", "X" }; String[] unmappedDep = {}; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); // There are some minor differences between the tags produced by the POS tagger and the // tags expected by the parser model. We need a better test here that makes these differences // more visible and at the same time doesn't fail. //AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, // jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "conll2008", depTags, jcas); AssertAnnotations.assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, "conll2008", unmappedDep, jcas); } @Test public void testChineseUniversalDependencies() throws Exception { JCas jcas = runTest("zh", "ud", "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); String[] dependencies = { "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](我们) G[3,5](需要)", "[ 3, 5]ROOT(root,basic) D[3,5](需要) G[3,5](需要)", "[ 6, 8]DEP(dep,basic) D[6,8](一个) G[17,19](句子)", "[ 9, 11]ADVMOD(advmod,basic) D[9,11](非常) G[12,14](复杂)", "[ 12, 14]AMOD(amod,basic) D[12,14](复杂) G[17,19](句子)", "[ 15, 16]MARK(mark,basic) D[15,16](的) G[12,14](复杂)", "[ 17, 19]NSUBJ(nsubj,basic) D[17,19](句子) G[26,28](包含)", "[ 20, 22]ADVMOD(advmod,basic) D[20,22](例如) G[26,28](包含)", "[ 23, 25]NSUBJ(nsubj,basic) D[23,25](其中) G[26,28](包含)", "[ 26, 28]CCOMP(ccomp,basic) D[26,28](包含) G[3,5](需要)", "[ 29, 31]DEP(dep,basic) D[29,31](许多) G[32,34](成分)", "[ 32, 34]CONJ(conj:和,basic) D[32,34](成分) G[43,45](依赖)", "[ 35, 36]CC(cc,basic) D[35,36](和) G[43,45](依赖)", "[ 37, 40]ADVMOD(advmod:dvp,basic) D[37,40](尽可能) G[43,45](依赖)", "[ 41, 42]MARK(mark,basic) D[41,42](的) G[37,40](尽可能)", "[ 43, 45]DOBJ(dobj,basic) D[43,45](依赖) G[26,28](包含)", "[ 46, 47]PUNCT(punct,basic) D[46,47](。) G[3,5](需要)" }; String[] depTags = { "acl", "advcl:loc", "advmod", "advmod:dvp", "advmod:loc", "advmod:rcomp", "amod", "amod:ordmod", "appos", "aux:asp", "aux:ba", "aux:modal", "aux:prtmod", "auxpass", "case", "cc", "ccomp", "compound:nn", "compound:vc", "conj", "cop", "dep", "det", "discourse", "dobj", "erased", "etc", "mark", "mark:clf", "name", "neg", "nmod", "nmod:assmod", "nmod:poss", "nmod:prep", "nmod:range", "nmod:tmod", "nmod:topic", "nsubj", "nsubj:xsubj", "nsubjpass", "nummod", "parataxis:prnmod", "punct", "root", "xcomp" }; String[] posTags = { "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", "OD", "ON", "P", "PN", "PU", "SB", "SP", "URL", "VA", "VC", "VE", "VV", "X" }; String[] unmappedDep = { "advcl:loc", "advmod:dvp", "advmod:loc", "advmod:rcomp", "amod:ordmod", "aux:asp", "aux:ba", "aux:modal", "aux:prtmod", "compound:nn", "compound:vc", "erased", "etc", "mark:clf", "nmod:assmod", "nmod:poss", "nmod:prep", "nmod:range", "nmod:tmod", "nmod:topic", "nsubj:xsubj", "parataxis:prnmod" }; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); // There are some minor differences between the tags produced by the POS tagger and the // tags expected by the parser model. We need a better test here that makes these differences // more visible and at the same time doesn't fail. //AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, // jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); AssertAnnotations.assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, "universal", unmappedDep, jcas); } @Test public void testEnglishPtbConllDependencies() throws Exception { JCas jcas = runTest("en", "ptb-conll", "We need a very complicated example sentence , which " + "contains as many constituents and dependencies as possible ."); String[] dependencies = { "[ 0, 2]Dependency(VMOD,basic) D[0,2](We) G[3,7](need)", "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", "[ 8, 9]Dependency(NMOD,basic) D[8,9](a) G[35,43](sentence)", "[ 10, 14]Dependency(AMOD,basic) D[10,14](very) G[15,26](complicated)", "[ 15, 26]Dependency(NMOD,basic) D[15,26](complicated) G[35,43](sentence)", "[ 27, 34]Dependency(NMOD,basic) D[27,34](example) G[35,43](sentence)", "[ 35, 43]Dependency(VMOD,basic) D[35,43](sentence) G[3,7](need)", "[ 44, 45]Dependency(P,basic) D[44,45](,) G[35,43](sentence)", "[ 46, 51]Dependency(VMOD,basic) D[46,51](which) G[52,60](contains)", "[ 52, 60]Dependency(NMOD,basic) D[52,60](contains) G[35,43](sentence)", "[ 61, 63]Dependency(VMOD,basic) D[61,63](as) G[52,60](contains)", "[ 64, 68]Dependency(NMOD,basic) D[64,68](many) G[69,81](constituents)", "[ 69, 81]Dependency(PMOD,basic) D[69,81](constituents) G[61,63](as)", "[ 82, 85]Dependency(COORD,basic) D[82,85](and) G[69,81](constituents)", "[ 86, 98]Dependency(CONJ,basic) D[86,98](dependencies) G[82,85](and)", "[ 99,101]Dependency(NMOD,basic) D[99,101](as) G[69,81](constituents)", "[102,110]Dependency(PMOD,basic) D[102,110](possible) G[99,101](as)", "[111,112]Dependency(P,basic) D[111,112](.) G[3,7](need)" }; String[] depTags = { "AMOD", "APPO", "CONJ", "COORD", "DEP", "IM", "NAME", "NMOD", "P", "PMOD", "PRN", "PRT", "ROOT", "SUB", "SUFFIX", "VC", "VMOD" }; String[] unmappedDep = {}; String[] posTags = { "#", "$", "''", "(", ")", ",", ".", ":", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); // There are some minor differences between the tags produced by the POS tagger and the // tags expected by the parser model. We need a better test here that makes these differences // more visible and at the same time doesn't fail. //AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", posTags, jcas); AssertAnnotations.assertTagset(Dependency.class, "conll", depTags, jcas); AssertAnnotations.assertTagsetMapping(Dependency.class, "conll", unmappedDep, jcas); } private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) throws Exception { AssumeResource.assumeResource(CoreNlpDependencyParser.class, "depparser", aLanguage, aVariant); AggregateBuilder aggregate = new AggregateBuilder(); aggregate.add(createEngineDescription(CoreNlpPosTagger.class)); Object[] params = new Object[] { CoreNlpDependencyParser.PARAM_VARIANT, aVariant, CoreNlpDependencyParser.PARAM_PRINT_TAGSET, true}; params = ArrayUtils.addAll(params, aExtraParams); aggregate.add(createEngineDescription(CoreNlpDependencyParser.class, params)); return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); } @Rule public DkproTestContext testContext = new DkproTestContext(); }