/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.penntree;
import static org.junit.Assert.assertEquals;
import static org.apache.uima.fit.util.JCasUtil.*;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;
import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations;
/**
 * Tests for {@link PennTreebankChunkedReader}. Each test reads one file from
 * {@code src/test/resources/pennTreebankChunkedReaderTestFiles/} into a fresh {@link JCas} and
 * checks the sentence, token, part-of-speech and chunk annotations found in it.
 */
public class PennTreebankChunkedReaderTest
{
    /** Test file shared by most of the tests in this class. */
    private static final String GENERAL_TEST_FILE = "generalTest.pos";

    /**
     * With all annotation types enabled, the general test file is expected to yield one sentence,
     * 32 tokens, 32 part-of-speech tags and 8 chunks.
     */
    @Test
    public void testCountsOfAnnotations()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE);

        assertEquals(1, select(jcas, Sentence.class).size());
        assertEquals(32, select(jcas, Token.class).size());
        assertEquals(32, select(jcas, POS.class).size());
        assertEquals(8, select(jcas, Chunk.class).size());
    }

    /**
     * Checks the original part-of-speech tags and their mapped counterparts for every token of
     * the general test file.
     */
    @Test
    public void testPartOfSpeechTagAssignment()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE);

        String[] posOriginal = { "DT", "NN", "IN", "JJ", "NNS", "VBG", "IN", "NNP", "NNP", "NNP",
                "VBD", "PRP", "VBZ", "VBN", "DT", "$", "CD", "CD", "NN", "NN", "IN", "JJS", "IN",
                "NNP", "NNP", "NNP", "POS", "NN", "CC", "NN", "NNS", "." };

        String[] posMapped = { "DET", "NOUN", "ADP", "ADJ", "NOUN", "VERB", "ADP", "PROPN",
                "PROPN", "PROPN", "VERB", "PRON", "VERB", "VERB", "DET", "PUNCT", "NUM", "NUM",
                "NOUN", "NOUN", "ADP", "ADJ", "ADP", "PROPN", "PROPN", "PROPN", "X", "NOUN",
                "CONJ", "NOUN", "NOUN", "PUNCT" };

        AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class));
    }

    /**
     * Checks the covered text of every token read from the general test file.
     */
    @Test
    public void testTokenBoundaries()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE);

        String[] tokens = { "A", "consortium", "of", "private", "investors", "operating", "as",
                "LJH", "Funding", "Co.", "said", "it", "has", "made", "a", "$", "409", "million",
                "cash", "bid", "for", "most", "of", "L.J.", "Hooker", "Corp.", "'s",
                "real-estate", "and", "shopping-center", "holdings", "." };

        AssertAnnotations.assertToken(tokens, select(jcas, Token.class));
    }

    /**
     * Checks the part-of-speech tags produced for the test file containing erroneously joined
     * tokens and tags: three tags ({@code DT}, {@code NNS}, {@code NNS}) are expected.
     */
    @Test
    public void testErroneouslyJoinedTokensWithCorrectedTag()
        throws Exception
    {
        JCas jcas = readTestFile("erroneouslyJoinedTokensAndTheirTags.pos");

        String[] posOriginal = { "DT", "NNS", "NNS" };
        String[] posMapped = { "DET", "NOUN", "NOUN" };

        AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class));
    }

    /**
     * Checks the offsets and covered text of the chunks read from the general test file.
     * <p>
     * NOTE(review): despite the method name, the assertion is on {@link Chunk} annotations, not
     * on tokens. The expected chunks do include the dashed words "real-estate" and
     * "shopping-center". The name is kept so that existing test filters keep working.
     */
    @Test
    public void testDashedWordsTokenization()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE);

        String[] chunks = {
                "[ 0, 12]Chunk(null) (A consortium)",
                "[ 16, 33]Chunk(null) (private investors)",
                "[ 47, 62]Chunk(null) (LJH Funding Co.)",
                "[ 68, 70]Chunk(null) (it)",
                "[ 80,104]Chunk(null) (a $ 409 million cash bid)",
                "[109,113]Chunk(null) (most)",
                "[117,149]Chunk(null) (L.J. Hooker Corp. 's real-estate)",
                "[154,178]Chunk(null) (shopping-center holdings)" };

        AssertAnnotations.assertChunks(chunks, select(jcas, Chunk.class));
    }

    /**
     * If several part-of-speech tags exist for a token, only one is annotated: the first one
     * mentioned.
     */
    @Test
    public void testTokensWithSeveralPossiblePOSTags()
        throws Exception
    {
        JCas jcas = readTestFile("severalPOSToken.pos");

        String[] posOriginal = { "VBG", "NN" };
        String[] posMapped = { "VERB", "NOUN" };

        AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class));
    }

    /**
     * With token reading switched off, no tokens are expected, and no part-of-speech tags or
     * chunks either, while the sentence is still annotated.
     */
    @Test
    public void testSuppressedTokenAnnotation()
        throws Exception
    {
        // Reading POS and chunks stays enabled here, yet neither is expected to be annotated
        // once tokens are suppressed.
        JCas jcas = readTestFile(GENERAL_TEST_FILE, false, true, true, true);

        assertEquals(1, select(jcas, Sentence.class).size());
        assertEquals(0, select(jcas, Token.class).size());
        assertEquals(0, select(jcas, POS.class).size());
        assertEquals(0, select(jcas, Chunk.class).size());
    }

    /**
     * With part-of-speech reading switched off, only the POS annotations are expected to be
     * missing.
     */
    @Test
    public void testSuppressedPosAnnotation()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE, true, false, true, true);

        assertEquals(1, select(jcas, Sentence.class).size());
        assertEquals(32, select(jcas, Token.class).size());
        assertEquals(0, select(jcas, POS.class).size());
        assertEquals(8, select(jcas, Chunk.class).size());
    }

    /**
     * With sentence reading switched off, only the sentence annotation is expected to be missing.
     */
    @Test
    public void testSuppressedSentenceAnnotations()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE, true, true, false, true);

        assertEquals(0, select(jcas, Sentence.class).size());
        assertEquals(32, select(jcas, Token.class).size());
        assertEquals(32, select(jcas, POS.class).size());
        assertEquals(8, select(jcas, Chunk.class).size());
    }

    /**
     * With chunk reading switched off, only the chunk annotations are expected to be missing.
     */
    @Test
    public void testSuppressedChunkAnnotations()
        throws Exception
    {
        JCas jcas = readTestFile(GENERAL_TEST_FILE, true, true, true, false);

        assertEquals(1, select(jcas, Sentence.class).size());
        assertEquals(32, select(jcas, Token.class).size());
        assertEquals(32, select(jcas, POS.class).size());
        assertEquals(0, select(jcas, Chunk.class).size());
    }

    /**
     * Reads the given test file with tokens, part-of-speech tags, sentences and chunks all
     * enabled.
     *
     * @param aFile
     *            pattern selecting the file below the test resource folder.
     * @return a new JCas holding the first document delivered by the reader.
     * @throws Exception
     *             if the reader cannot be created or reading fails.
     */
    private static JCas readTestFile(String aFile)
        throws Exception
    {
        return readTestFile(aFile, true, true, true, true);
    }

    /**
     * Creates a {@link PennTreebankChunkedReader} for the given file, reads its first document
     * into a new JCas and releases the reader again.
     *
     * @param aFile
     *            pattern selecting the file below the test resource folder.
     * @param readToken
     *            value passed as {@code PARAM_READ_TOKEN}.
     * @param readPos
     *            value passed as {@code PARAM_READ_POS}.
     * @param readSent
     *            value passed as {@code PARAM_READ_SENTENCE}.
     * @param readChunk
     *            value passed as {@code PARAM_READ_CHUNK}.
     * @return a new JCas holding the first document delivered by the reader.
     * @throws Exception
     *             if the reader cannot be created or reading fails.
     */
    private static JCas readTestFile(String aFile, boolean readToken, boolean readPos,
            boolean readSent, boolean readChunk)
        throws Exception
    {
        CollectionReader reader = CollectionReaderFactory.createReader(
                PennTreebankChunkedReader.class,
                PennTreebankChunkedReader.PARAM_LANGUAGE, "en",
                PennTreebankChunkedReader.PARAM_SOURCE_LOCATION,
                "src/test/resources/pennTreebankChunkedReaderTestFiles/",
                PennTreebankChunkedReader.PARAM_POS_TAG_SET, "ptb",
                PennTreebankChunkedReader.PARAM_READ_TOKEN, readToken,
                PennTreebankChunkedReader.PARAM_READ_POS, readPos,
                PennTreebankChunkedReader.PARAM_READ_SENTENCE, readSent,
                PennTreebankChunkedReader.PARAM_READ_CHUNK, readChunk,
                PennTreebankChunkedReader.PARAM_PATTERNS, aFile);

        try {
            JCas jcas = JCasFactory.createJCas();
            reader.getNext(jcas.getCas());
            return jcas;
        }
        finally {
            // Release the reader even when reading fails, so that no reader outlives its test.
            reader.destroy();
        }
    }
}