/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.conll;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertChunks;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.util.JCasUtil.select;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.JCasIterable;
import org.apache.uima.jcas.JCas;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;
public class Conll2000ReaderTest
{
    // Expected surface text of the three sentences in the test corpus.
    private static final String[] EXPECTED_SENTENCES = {
            "Confidence in the pound is widely expected to take another sharp dive if trade "
                    + "figures for September , due for release tomorrow , fail to show a substantial "
                    + "improvement from July and August 's near-record deficits .",
            "Chancellor of the Exchequer Nigel Lawson 's restated commitment to a firm "
                    + "monetary policy has helped to prevent a freefall in sterling over the past "
                    + "week .",
            "But analysts reckon underlying support for sterling has been eroded by the "
                    + "chancellor 's failure to announce any new policy measures in his Mansion "
                    + "House speech last Thursday ." };

    // Expected chunk annotations: [begin,end]mappedType(originalTag) (covered text).
    private static final String[] EXPECTED_CHUNKS = {
            "[ 0, 10]NC(NP) (Confidence)",
            "[ 11, 13]PC(PP) (in)",
            "[ 14, 23]NC(NP) (the pound)",
            "[ 24, 50]VC(VP) (is widely expected to take)",
            "[ 51, 69]NC(NP) (another sharp dive)",
            "[ 70, 72]O(SBAR) (if)",
            "[ 73, 86]NC(NP) (trade figures)",
            "[ 87, 90]PC(PP) (for)",
            "[ 91,100]NC(NP) (September)",
            "[103,106]ADJC(ADJP) (due)",
            "[107,110]PC(PP) (for)",
            "[111,118]NC(NP) (release)",
            "[119,127]NC(NP) (tomorrow)",
            "[130,142]VC(VP) (fail to show)",
            "[143,168]NC(NP) (a substantial improvement)",
            "[169,173]PC(PP) (from)",
            "[174,189]NC(NP) (July and August)",
            "[190,213]NC(NP) ('s near-record deficits)",
            "[228,230]PC(PP) (of)",
            "[231,244]NC(NP) (the Exchequer)",
            "[245,257]NC(NP) (Nigel Lawson)",
            "[258,280]NC(NP) ('s restated commitment)",
            "[281,283]PC(PP) (to)",
            "[284,306]NC(NP) (a firm monetary policy)",
            "[307,328]VC(VP) (has helped to prevent)",
            "[329,339]NC(NP) (a freefall)",
            "[340,342]PC(PP) (in)",
            "[343,351]NC(NP) (sterling)",
            "[352,356]PC(PP) (over)",
            "[357,370]NC(NP) (the past week)",
            "[378,386]NC(NP) (analysts)",
            "[387,393]VC(VP) (reckon)",
            "[394,412]NC(NP) (underlying support)",
            "[413,416]PC(PP) (for)",
            "[417,425]NC(NP) (sterling)",
            "[426,441]VC(VP) (has been eroded)",
            "[442,444]PC(PP) (by)",
            "[445,459]NC(NP) (the chancellor)",
            "[460,470]NC(NP) ('s failure)",
            "[471,482]VC(VP) (to announce)",
            "[483,506]NC(NP) (any new policy measures)",
            "[507,509]PC(PP) (in)",
            "[510,534]NC(NP) (his Mansion House speech)",
            "[535,548]NC(NP) (last Thursday)" };

    // Expected coarse-grained (mapped) POS tags, one per token, in token order.
    private static final String[] EXPECTED_POS_MAPPED = {
            "NOUN", "ADP", "DET", "NOUN", "VERB", "ADV", "VERB", "ADP", "VERB",
            "DET", "ADJ", "NOUN", "ADP", "NOUN", "NOUN", "ADP", "PROPN", "PUNCT", "ADJ", "ADP",
            "NOUN", "NOUN", "PUNCT", "VERB", "ADP", "VERB", "DET", "ADJ", "NOUN", "ADP",
            "PROPN", "CONJ", "PROPN", "X", "ADJ", "NOUN", "PUNCT", "PROPN", "ADP", "DET",
            "PROPN", "PROPN", "PROPN", "X", "VERB", "NOUN", "ADP", "DET", "NOUN", "ADJ", "NOUN",
            "VERB", "VERB", "ADP", "VERB", "DET", "NOUN", "ADP", "NOUN", "ADP", "DET", "ADJ",
            "NOUN", "PUNCT", "CONJ", "NOUN", "VERB", "VERB", "NOUN", "ADP", "NOUN", "VERB",
            "VERB", "VERB", "ADP", "DET", "NOUN", "X", "NOUN", "ADP", "VERB", "DET", "ADJ",
            "NOUN", "NOUN", "ADP", "PRON", "PROPN", "PROPN", "NOUN", "ADJ", "PROPN", "PUNCT" };

    // Expected original (Penn Treebank) POS tags as read from the CoNLL file.
    private static final String[] EXPECTED_POS_ORIGINAL = {
            "NN", "IN", "DT", "NN", "VBZ", "RB", "VBN", "TO", "VB", "DT", "JJ",
            "NN", "IN", "NN", "NNS", "IN", "NNP", ",", "JJ", "IN", "NN", "NN", ",", "VB", "TO",
            "VB", "DT", "JJ", "NN", "IN", "NNP", "CC", "NNP", "POS", "JJ", "NNS", ".", "NNP",
            "IN", "DT", "NNP", "NNP", "NNP", "POS", "VBN", "NN", "TO", "DT", "NN", "JJ", "NN",
            "VBZ", "VBN", "TO", "VB", "DT", "NN", "IN", "NN", "IN", "DT", "JJ", "NN", ".", "CC",
            "NNS", "VBP", "VBG", "NN", "IN", "NN", "VBZ", "VBN", "VBN", "IN", "DT", "NN", "POS",
            "NN", "TO", "VB", "DT", "JJ", "NN", "NNS", "IN", "PRP$", "NNP", "NNP", "NN", "JJ",
            "NNP", "." };

    /**
     * Reads the bundled CoNLL-2000 sample file and verifies that sentences, chunks, and
     * part-of-speech annotations (both mapped and original tags) come out as expected.
     */
    @Test
    public void conll2000test()
        throws Exception
    {
        // Configure the reader for the single English test file using the conll2000 chunk tagset.
        CollectionReaderDescription readerDescription = createReaderDescription(
                Conll2000Reader.class,
                Conll2000Reader.PARAM_LANGUAGE, "en",
                Conll2000Reader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/2000/",
                Conll2000Reader.PARAM_PATTERNS, "chunk2000_test.conll",
                Conll2000Reader.PARAM_CHUNK_TAG_SET, "conll2000");

        // The test corpus contains exactly one document; take the first CAS produced.
        JCas cas = new JCasIterable(readerDescription).iterator().next();

        assertSentence(EXPECTED_SENTENCES, select(cas, Sentence.class));
        assertChunks(EXPECTED_CHUNKS, select(cas, Chunk.class));
        assertPOS(EXPECTED_POS_MAPPED, EXPECTED_POS_ORIGINAL, select(cas, POS.class));
    }
}