/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.penntree; import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertConstituents; import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.junit.Assert.assertEquals; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; public class PennTreebankCombinedReaderTest { @Test public void test() throws Exception { CollectionReader reader = createReader(PennTreebankCombinedReader.class, PennTreebankCombinedReader.PARAM_SOURCE_LOCATION, "src/test/resources/stanfordPennTrees/stanford-english-trees-first2.mrg"); JCas jcas = JCasFactory.createJCas(); reader.getNext(jcas.getCas()); String text = "Al Qaida Endorses George W. Bush for President\n" + "Al-Qaeda tries to incite more violence in Iraq\n"; String[] sentences = { "Al Qaida Endorses George W. Bush for President", "Al-Qaeda tries to incite more violence in Iraq" }; String[] tokens1 = { "Al", "Qaida", "Endorses", "George", "W.", "Bush", "for", "President" }; String[] constituentMapped1 = { "Constituent 0,46", "Constituent 0,8", "Constituent 18,32", "Constituent 33,46", "Constituent 37,46", "Constituent 9,46", "ROOT 0,46" }; String[] constituentOriginal1 = { "NP 0,8", "NP 18,32", "NP 37,46", "PP 33,46", "ROOT 0,46", "S 0,46", "VP 9,46" }; String[] tokens2 = { "Al-Qaeda", "tries", "to", "incite", "more", "violence", "in", "Iraq" }; String[] constituentMapped2 = { "Constituent 47,55", "Constituent 47,93", "Constituent 56,93", "Constituent 62,93", "Constituent 65,93", "Constituent 72,85", "Constituent 86,93", "Constituent 89,93", "ROOT 47,93" }; String[] constituentOriginal2 = { "NP 47,55", "NP 72,85", "NP 89,93", "PP 86,93", "ROOT 47,93", "S 47,93", "S 62,93", "VP 56,93", "VP 62,93", "VP 65,93" }; assertEquals(text, jcas.getDocumentText()); assertSentence(sentences, select(jcas, Sentence.class)); Sentence[] actualSentences = select(jcas, Sentence.class).toArray(new Sentence[0]); assertToken(tokens1, selectCovered(Token.class, actualSentences[0])); assertConstituents(constituentMapped1, constituentOriginal1, selectCovered(Constituent.class, actualSentences[0])); assertToken(tokens2, selectCovered(Token.class, actualSentences[1])); assertConstituents(constituentMapped2, constituentOriginal2, selectCovered(Constituent.class, actualSentences[1])); } @Test public void testWithDirectSpeech() throws Exception { CollectionReader reader = createReader(PennTreebankCombinedReader.class, PennTreebankCombinedReader.PARAM_LANGUAGE, "en", PennTreebankCombinedReader.PARAM_SOURCE_LOCATION, "src/test/resources/stanfordPennTrees/tree_with_direct_speech.mrg"); JCas jcas = JCasFactory.createJCas(); reader.getNext(jcas.getCas()); String[] sentences = { "`` And what do you know ? ''" }; String[] tokens = { "``", "And", "what", "do", "you", "know", "?", "''" }; String[] posMapped = { "PUNCT", "CONJ", "PRON", "VERB", "PRON", "VERB", "PUNCT", "PUNCT" }; String[] posOriginal = { "``", "CC", "WP", "VBP", "PRP", "VB", ".", "''" }; String[] constituentMapped = { "NP 15,18", "ROOT 0,28", "SBARQ 0,28", "SQ 12,23", "VP 19,23", "WHNP 7,11" }; String[] constituentOriginal = { "NP 15,18", "ROOT 0,28", "SBARQ 0,28", "SQ 12,23", "VP 19,23", "WHNP 7,11" }; assertSentence(sentences, select(jcas, Sentence.class)); assertToken(tokens, select(jcas, Token.class)); assertPOS(posMapped, posOriginal, select(jcas, POS.class)); assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); } @Test public void testWithParentheses() throws Exception { CollectionReader reader = createReader(PennTreebankCombinedReader.class, PennTreebankCombinedReader.PARAM_LANGUAGE, "en", PennTreebankCombinedReader.PARAM_SOURCE_LOCATION, "src/test/resources/stanfordPennTrees/tree_with_parentheses.mrg"); JCas jcas = JCasFactory.createJCas(); reader.getNext(jcas.getCas()); String[] sentences = { "( CNN ) ." }; String[] tokens = { "(", "CNN", ")", "." }; String[] posMapped = { "PUNCT", "PROPN", "PUNCT", "PUNCT" }; String[] posOriginal = { "-LRB-", "NNP", "-RRB-", "." }; String[] constituentMapped = { "FRAG 0,9", "NP 2,5", "ROOT 0,9" }; String[] constituentOriginal = { "FRAG 0,9", "NP 2,5", "ROOT 0,9" }; assertSentence(sentences, select(jcas, Sentence.class)); assertToken(tokens, select(jcas, Token.class)); assertPOS(posMapped, posOriginal, select(jcas, POS.class)); assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); } @Rule public TestName name = new TestName(); @Before public void printSeparator() { System.out.println("\n=== " + name.getMethodName() + " ====================="); } @Before public void setupLogging() { System.setProperty("org.apache.uima.logger.class", "org.apache.uima.util.impl.Log4jLogger_impl"); } }