package wikokit.base.wikt.multi.en; import wikokit.base.wikt.multi.en.WEtymologyEn; import wikokit.base.wikt.multi.en.WPOSEn; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikt.constant.POS; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.*; import wikokit.base.wikt.util.LangText; import wikokit.base.wikt.util.POSText; public class WPOSEnTest { public WPOSEnTest() { } @BeforeClass public static void setUpClass() throws Exception { } @AfterClass public static void tearDownClass() throws Exception { } @Before public void setUp() { } @After public void tearDown() { } @Test public void testSplitToPOSSections() { System.out.println("splitToPOSSections"); String s1, s2, s3, s4, s1_result, s2_result; String source_text, result1, result2; LangText source_lt; LangText[] etymology_sections; POSText[] result; String page_title; // case -1: empty => unknown // // ===Verb=== source_text = "===It's not a Part Of Speech Name===\n" + "text which do not describe POS"; source_lt = new LangText(LanguageType.en); source_lt.text = new StringBuffer(source_text); page_title = "pos_en_word-1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(1, result.length); assertEquals(POS.unknown, result[0].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(source_text)); // case 0: verb // // ===Verb=== source_text = "===Verb===\n" + "===It's not a Part Of Speech Name==="; source_lt = new LangText(LanguageType.en); source_lt.text = new StringBuffer(source_text); page_title = "pos_en_word0"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(1, result.length); assertEquals(POS.verb, result[0].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase("===It's not a Part Of Speech Name===")); // case 1: noun and verb // // ===Noun=== // {{en-noun}} // ===Verb=== s1 = "===Noun===\n" + "{{en-noun}}\n"; s2 = "===Verb==="; source_text = s1 + s2; s1_result = "{{en-noun}}\n"; s2_result = ""; source_lt = new LangText(LanguageType.en); source_lt.text = new StringBuffer(source_text); page_title = "pos_en_word1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(2, result.length); assertEquals(POS.noun, result[0].getPOSType()); assertEquals(POS.verb, result[1].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s1_result)); assertTrue(result[1].getText().toString().equalsIgnoreCase(s2_result)); // todo case with two etymologies // case 2: noun and verb // // ===Etymology 1=== // ====Pronunciation==== // ====Noun==== // {{en-noun}} // ====Usage notes==== // ====Synonyms==== // // ===Verb=== // ===Etymology 2=== // ====Pronunciation==== // ====Noun==== // ====Verb==== // Conjugation s1 = "===Etymology 1===\n" + "====Pronunciation====\n" + "====Noun====\n"; s2 = "{{en-noun}}\n" + "====Usage notes====\n" + "====Synonyms====\n" + "\n"; s3 = "===Verb===\n" + "===Etymology 2===\n" + "====Pronunciation====\n" + "====Noun====\n" + "====Verb====\n"; s4 = "Conjugation\n" + "\n"; source_text = s1 + s2 + s3 + s4; source_lt = new LangText(LanguageType.en); source_lt.text = new StringBuffer(source_text); page_title = "pos_en_word3"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(4, result.length); assertEquals(POS.noun, result[0].getPOSType()); assertEquals(POS.verb, result[1].getPOSType()); assertEquals(POS.noun, result[2].getPOSType()); assertEquals(POS.verb, result[3].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s2)); assertTrue(result[1].getText().toString().equalsIgnoreCase("")); assertTrue(result[2].getText().toString().equalsIgnoreCase("")); assertTrue(result[3].getText().toString().equalsIgnoreCase(s4)); } @Test public void testSplitToPOSSections_phrase() { System.out.println("splitToPOSSections_phrase"); String s1, s2, s3, s4, s1_result, s2_result; String source_text, result1, result2; LangText source_lt; LangText[] etymology_sections; POSText[] result; String page_title; //==Swedish== //===Phrase=== s1 = "===Phrase===\n"; s2 = "'''[[var]] [[är]] [[toaletten]]?'''\n"; s3 = "# [[where is the toilet]]?"; source_text = s1 + s2 + s3; s1_result = s2 + s3; source_lt = new LangText(LanguageType.sw); source_lt.text = new StringBuffer(source_text); page_title = "pos_en_word1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(1, result.length); assertEquals(POS.phrase, result[0].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s1_result)); } // Adjective // // Other headers in use (http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained/POS_headers#Other_headers_in_use) // Adjectival noun な-Adjectives Japanese "quasi-adjective", probably should be Adjective. // Quasi-adjective な-Adjectives Japanese, probably should be Adjective. @Test public void testSplitToPOSSections_adjective() { System.out.println("splitToPOSSections_adjective"); String s1, s2, s3, s4, s5, s6, s1_result, s2_result, s3_result; String source_text, result1, result2; LangText source_lt; LangText[] etymology_sections; POSText[] result; String page_title; s1 = "===Adjective===\n"; s2 = "{{infl|lv|adjective}}\n"; s3 = "===Adjectival noun===\n"; s4 = "some text\n"; s5 = "===Quasi-adjective===\n"; s6 = "some text2"; source_text = s1 + s2 + s3 + s4 + s5 + s6; s1_result = s2; s2_result = s4; s3_result = s6; source_lt = new LangText(LanguageType.sw); source_lt.text = new StringBuffer(source_text); page_title = "pos_en_word1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(3, result.length); assertEquals(POS.adjective, result[0].getPOSType()); assertEquals(POS.adjective, result[1].getPOSType()); assertEquals(POS.adjective, result[2].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s1_result)); assertTrue(result[1].getText().toString().equalsIgnoreCase(s2_result)); assertTrue(result[2].getText().toString().equalsIgnoreCase(s3_result)); } // POS of foreign words in English Wiktionary. @Test public void testSplitToPOSSections_foreign() { System.out.println("splitToPOSSections_foreign"); String s1, s2, s1_result, s2_result; String source_text; LangText source_lt; LangText[] etymology_sections; POSText[] result; String page_title; // case 1: "Verb form" and "Participle" in Russian word article // s1 = "===Verb form===\n" + "'''испо́льзуем''' (ispól'zujem)\n"; s2 = "===Participle===\n" + "# [[is used]]"; source_text = s1 + s2; s1_result = "'''испо́льзуем''' (ispól'zujem)\n"; s2_result = "# [[is used]]"; source_lt = new LangText(LanguageType.ru); source_lt.text = new StringBuffer(source_text); page_title = "pos_ru_word-1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(2, result.length); assertEquals(POS.verb, result[0].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s1_result)); assertEquals(POS.verb, result[0].getPOSType()); assertTrue(result[1].getText().toString().equalsIgnoreCase(s2_result)); } // POS of foreign words in English Wiktionary. @Test public void testSplitToPOSSections_onePOS() { System.out.println("splitToPOSSections_onePOS"); String s1, s1_result; LangText source_lt; LangText[] etymology_sections; POSText[] result; String page_title; // case 1: "Verb form" and "Participle" in Russian word article // s1 = "===Verb form===\n" + "'''испо́льзуем''' (ispól'zujem)\n"; s1_result = "'''испо́льзуем''' (ispól'zujem)\n"; source_lt = new LangText(LanguageType.ru); source_lt.text = new StringBuffer(s1); page_title = "pos_ru_word-1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(1, result.length); assertEquals(POS.verb, result[0].getPOSType()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s1_result)); } // One POS. Tests sub-function cutHeaderFromAlonePOSSection. @Test public void testSplitToPOSSections_onePOS_cutHeaderFromAlonePOSSection() { System.out.println("splitToPOSSections_onePOS_cutHeaderFromAlonePOSSection"); String s_header, s_noun, source_text; LangText source_lt; LangText[] etymology_sections; POSText[] result; String page_title; s_header = "[[Image:Ryanair.arp.750pix.jpg|thumb|right|250 px|Boeing 737 airplane.]]\n" + "===Etymology===\n" + "From {{term|aeroplane||lang=en}}\n" + "\n" + "===Pronunciation===\n" + "* {{audio|en-us-airplane.ogg|Audio (US)}}\n" + "\n" + "===Noun===\n"; s_noun = "{{en-noun}}\n" + "# {{US}} A powered heavier-than air [[aircraft]] with fixed [[wing]]s.\n" + "\n" + "====Synonyms====\n" + "* [[aeroplane]].\n" + "\n" + "\n"; source_text = s_header + s_noun; source_lt = new LangText(LanguageType.en); source_lt.text = new StringBuffer(source_text); page_title = "pos_word-1"; etymology_sections = WEtymologyEn.splitToEtymologySections(page_title, source_lt); result = WPOSEn.splitToPOSSections(page_title, etymology_sections); assertEquals(1, result.length); assertEquals(POS.noun, result[0].getPOSType()); System.out.println("pos_section="+result[0].getText().toString()); assertTrue(result[0].getText().toString().equalsIgnoreCase(s_noun)); } }