/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.testing.harness; import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.ArrayUtils; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.junit.Assert; import org.junit.internal.AssumptionViolatedException; import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; public final class SegmenterHarness { public static final TestData[] DATA = new TestData[] { new TestData("de.1", "de", "Herr Frank M. Meier hat einen Hund.", new String[] { "Herr", "Frank", "M.", "Meier", "hat", "einen", "Hund", "."}, new String[] { "Herr Frank M. Meier hat einen Hund." }), new TestData("de.2", "de", "Ich bin ein blöder Hund.", new String[] { "Ich", "bin", "ein", "blöder", "Hund", "." }, new String[] { "Ich bin ein blöder Hund." }), new TestData("de.3", "de", "Mein Name ist Hans.", new String[] { "Mein", "Name", "ist", "Hans", "." }, new String[] { "Mein Name ist Hans." }), // DKPRO-CORE-ASL-98: BreakIteratorSegmenter turns hypens to separate tokens new TestData("de.4", "de", "ihre Negativbei- spiele immer", new String[] { "ihre", "Negativbei-", "spiele", "immer" }, new String[] { "ihre Negativbei- spiele immer" }), new TestData("en.1", "en", "Sadler, A.L. Cha-No-Yu: The Japanese Tea Ceremony.", new String[] { "Sadler", ",", "A.L.", "Cha-No-Yu", ":", "The", "Japanese", "Tea", "Ceremony", "."}, new String[] { "Sadler, A.L. Cha-No-Yu: The Japanese Tea Ceremony." } ), new TestData("en.2", "en", "I love the UIMA toolkit. 1989 is the year in which the Berlin wall fell.", new String[] { "I", "love", "the", "UIMA", "toolkit", ".", "1989", "is", "the", "year", "in", "which", "the", "Berlin", "wall", "fell", "." }, new String[] { "I love the UIMA toolkit.", "1989 is the year in which the Berlin wall fell." }), new TestData("en.3", "en", "I'm not a girl.", new String[] { "I", "'m", "not", "a", "girl", "." }, new String[] { "I'm not a girl." }), new TestData("en.4", "en", "I am a stupid dog.", new String[] { "I", "am", "a", "stupid", "dog", "." }, new String[] { "I am a stupid dog." }), new TestData("en.5", "en", "Georg \"Bullseye\" Logal is a though guy.", new String[] { "Georg", "\"", "Bullseye", "\"", "Logal", "is", "a", "though", "guy", "." }, new String[] { "Georg \"Bullseye\" Logal is a though guy." }), new TestData("en.6", "en", "This doesn't compute.", new String[] { "This", "does", "n't", "compute", "." }, new String[] { "This doesn't compute." }), new TestData("en.7", "en", "based on\n 'Carnival of Souls', written by [...] and directed by [...].", new String[] { "based", "on", "'", "Carnival", "of", "Souls", "'", ",", "written", "by", "[", "...", "]", "and", "directed", "by", "[", "...", "]", "." }, new String[] { "based on\n 'Carnival of Souls', written by [...] and directed by [...]." }), new TestData("en.8", "en", ", , ,", new String[] { ",", ",", "," }, new String[] { ", , ," }), new TestData("en.9", "en", "How to tokenize smileys? This is a good example. >^,,^< :0 3:[", new String[] { "How", "to", "tokenize", "smileys", "?", "This", "is", "a", "good", "example.", ">^,,^<", ":0", "3:[" }, new String[] { "How to tokenize smileys?", "This is a good example.", ">^,,^< :0 3:[" }), // Sombody who can read arabic, please check this // Covering the following sub-Saharan countries with vast areas very new TestData("ar.1", "ar", "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا", new String[] { "تغطي", "الصحراء", "الكبرى", "الدول", "التالية", "مساحات", "شاسعة", "جدا" }, new String[] { "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا" }), // While the stanford parser should come with a proper tokenizer // for Chinese (because it can parse chinese text), this does not // seem to be the right one or I am using it wrong. The associated // test cases do not work. Maybe debugging the command below // would help to find out how to use it. // They use command to parse it: java -mx1g -cp "stanford-parser.jar" // edu.stanford.nlp.parser.lexparser.LexicalizedParser -tLPP // edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -sentences // newline -escaper // edu.stanford.nlp.trees.international.pennchinese.ChineseEscaper // -outputFormat "penn,typedDependencies" -outputFormatOptions // "removeTopBracket" xinhuaFactoredSegmenting.ser.gz sampleInput.txt. new TestData("zh.1", "zh", "服务业成为广东经济转型升级的重要引擎。", new String[] {"服务业", "成为", "广东", "经济", "转型", "升级", "的", "重要", "引擎", "。"}, new String[] {"服务业成为广东经济转型升级的重要引擎。"}), new TestData("zh.2", "zh", "中国离世界技术品牌有多远?", new String[] {"中国", "离", "世界", "技术", "品牌", "有", "多远", "?" }, new String[] { "中国离世界技术品牌有多远?" }) }; private SegmenterHarness() { // No instances } @FunctionalInterface public static interface AssumeResourcePredicate { void assume(String aLanguage, String aVariant) throws AssumptionViolatedException, IOException; } public static void run(AnalysisEngineDescription aAed, String... aIgnoreIds) throws Throwable { run(aAed, null, aIgnoreIds); } public static void run(AnalysisEngineDescription aAed, AssumeResourcePredicate aCheck, String... aIgnoreIds) throws Throwable { // No automatic downloading from repository during testing. This makes sure we fail if // models are not properly added as test dependencies. if (offline) { System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); } offline = true; AnalysisEngine ae = createEngine(aAed); JCas jCas = ae.newJCas(); List<String> results = new ArrayList<String>(); try { for (TestData td : DATA) { System.out.printf("== %s ==%n", td.id); jCas.reset(); if (aCheck != null) { try { aCheck.assume(td.language, null); } catch (AssumptionViolatedException e) { results.add(String.format("%s skipped", td.id)); continue; } } jCas.setDocumentLanguage(td.language); jCas.setDocumentText(td.text); boolean failed = false; try { ae.process(jCas); AssertAnnotations.assertSentence(td.sentences, select(jCas, Sentence.class)); AssertAnnotations.assertToken(td.tokens, select(jCas, Token.class)); results.add(String.format("%s OK", td.id)); } catch (Throwable e) { failed = true; if (!ArrayUtils.contains(aIgnoreIds, td.id)) { results.add(String.format("%s FAIL", td.id)); throw e; } else { results.add(String.format("%s FAIL - Known, ignored", td.id)); } } if (!failed && ArrayUtils.contains(aIgnoreIds, td.id)) { results.add(String.format("%s FAIL", td.id)); Assert.fail(td.id + " passed but was expected to fail"); } } } finally { System.out.println("=== RESULTS ==="); for (String r : results) { System.out.println(r); } } } public static void testZoning(Class<? extends SegmenterBase> aSegmenter) throws Exception { testZoning(aSegmenter, "en"); } public static void testZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) throws Exception { testLaxZoning(aSegmenter, aLanguage); testStrictZoning(aSegmenter, aLanguage); testOufOfBoundsZones(aSegmenter, aLanguage); } public static void testLaxZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) throws Exception { // No automatic downloading from repository during testing. This makes sure we fail if // models are not properly added as test dependencies. if (offline) { System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); } offline = true; String[] sentences = { "A a a a .", "A a a a -", "B b b b .", "B b b b -", "C c c c .", "C c c c -" }; String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "B", "b", "b", "b", ".", "B", "b", "b", "b", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", "-" }; JCas jcas = JCasFactory.createJCas(); jcas.setDocumentLanguage(aLanguage); // 1 1 2 2 3 3 4 4 5 5 6 // 0 5 0 5 0 5 0 5 0 5 0 5 0 // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jcas.setDocumentText("A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"); // |------------------| |------------------| new Paragraph(jcas, 0, 19).addToIndexes(); new Paragraph(jcas, 40, 59).addToIndexes(); AnalysisEngine ae = createEngine(aSegmenter, SegmenterBase.PARAM_STRICT_ZONING, false, SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); ae.process(jcas); assertToken(tokens, select(jcas, Token.class)); assertSentence(sentences, select(jcas, Sentence.class)); } public static void testOufOfBoundsZones(Class<? extends SegmenterBase> aSegmenter, String aLanguage) throws Exception { // No automatic downloading from repository during testing. This makes sure we fail if // models are not properly added as test dependencies. if (offline) { System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); } offline = true; // 1 1 2 2 3 3 4 4 5 5 6 // 0 5 0 5 0 5 0 5 0 5 0 5 0 // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- String text = "A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"; // |------------------| |------------------| // non-strict zoning { String[] sentences = { "A a a a .", "A a a a -", "B b b b .", "B b b b -", "C c c c .", "C c c c -" }; String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "B", "b", "b", "b", ".", "B", "b", "b", "b", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", "-" }; JCas jcas = JCasFactory.createJCas(); jcas.setDocumentLanguage(aLanguage); jcas.setDocumentText(text); new Paragraph(jcas, 0, 19).addToIndexes(); new Paragraph(jcas, 40, 65).addToIndexes(); AnalysisEngine ae = createEngine(aSegmenter, SegmenterBase.PARAM_STRICT_ZONING, false, SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); ae.process(jcas); assertToken(tokens, select(jcas, Token.class)); assertSentence(sentences, select(jcas, Sentence.class)); } // strict zoning { String[] sentences = { "A a a a .", "A a a a -", "C c c c .", "C c c c -" }; String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", "-" }; JCas jcas = JCasFactory.createJCas(); jcas.setDocumentLanguage(aLanguage); jcas.setDocumentText(text); new Paragraph(jcas, 0, 19).addToIndexes(); new Paragraph(jcas, 40, 65).addToIndexes(); AnalysisEngine ae = createEngine(aSegmenter, SegmenterBase.PARAM_STRICT_ZONING, true, SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); ae.process(jcas); assertToken(tokens, select(jcas, Token.class)); assertSentence(sentences, select(jcas, Sentence.class)); } } public static void testStrictZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) throws Exception { // No automatic downloading from repository during testing. This makes sure we fail if // models are not properly added as test dependencies. if (offline) { System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); } offline = true; String[] sentences = { "A a a a .", "A a a a -", "C c c c .", "C c c c -" }; String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", "-" }; JCas jcas = JCasFactory.createJCas(); jcas.setDocumentLanguage(aLanguage); // 1 1 2 2 3 3 4 4 5 5 6 // 0 5 0 5 0 5 0 5 0 5 0 5 0 // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jcas.setDocumentText("A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"); // |------------------| |------------------| new Paragraph(jcas, 0, 19).addToIndexes(); new Paragraph(jcas, 40, 59).addToIndexes(); AnalysisEngine ae = createEngine(aSegmenter, SegmenterBase.PARAM_STRICT_ZONING, true, SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); ae.process(jcas); assertToken(tokens, select(jcas, Token.class)); assertSentence(sentences, select(jcas, Sentence.class)); } static class TestData { final String id; final String language; final String text; final String[] sentences; final String[] tokens; public TestData(String aId, String aLanguage, String aText, String[] aTokens, String[] aSentences) { id = aId; language = aLanguage; text = aText; sentences = aSentences; tokens = aTokens; } } private static boolean offline = true; public static void autoloadModelsOnNextTestRun() { offline = false; } }