package wikokit.base.wikt.multi.ru; import wikokit.base.wikt.multi.ru.WTranslationRu; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.*; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikt.util.POSText; import wikokit.base.wikt.word.WTranslation; import wikokit.base.wikt.word.WTranslationEntry; import wikokit.base.wikt.util.WikiWord; import wikokit.base.wikt.util.WikiText; import wikokit.base.wikt.constant.POS; public class WTranslationRuTest { public static String samolyot_text, samolyot_with_header, kolokolchik_text, kolokolchik_text_1_translation_box, kosa_text_1_translation_box_without_header, unfinished_template, translation_without_pipe, page_end, empty_translation, empty_translation_with_category, absent_translation_block; public WTranslationRuTest() { } @BeforeClass public static void setUpClass() throws Exception { } @AfterClass public static void tearDownClass() throws Exception { } @Before public void setUp() { // 1 translation box with 21 translations samolyot_text = "{{перев-блок||\n" + "|en=[[airplane]], [[plane]], [[aircraft]]\n" + // 0 "|bg=[[самолет]], [[аероплан]]\n" + "|hu=[[repülőgép]]\n" + "|da=[[flyvemaskine]], [[fly]]\n" + "|is=[[flugvél]](f)(-ar,-ar)\n" + "|es=[[avión]] {{m}}\n" + "|it=[[aereo]]\n" + "|ko=[[비행기]]\n" + // 7 "|de=[[Flugzeug]] {{n}}\n" + "|nl=[[vliegtuig]]\n" + "|no=[[fly]]\n" + "|os=[[хæдтæхæг]]\n" + "|pl=[[samolot]]\n" + "|tr=[[uçak]], [[tayyare]]\n" + "|uk=[[літак]], [[аероплан]]\n" + "|fi=[[lentokone]]\n" + "|fr=[[avion]] {{m}}\n" + "|cs=[[letadlo]]\n" + "|eo=[[aeroplano]], [[avio]], [[aviadilo]]\n" + "|et=[[lennuk]]\n" + //"|ja=[[飛行機]] (ひこうき, хйко:ки)\n" + // 20 "}}"; samolyot_with_header = "=== Перевод ===\n" + samolyot_text; kolokolchik_text = "text before \n" + "===Перевод===\n" + "{{перев-блок|звонок|\n" + "|en=[[little]] [[bell]], [[handbell]], [[doorbell]]\n" + "|de=[[Glöckchen]], [[Schelle]], [[Klingel]]\n" + "|os=[[мыр-мыраг]], [[хъуытаз]] {{m}}\n" + "|fr=[[sonnette]], [[clochette]], [[clarine]]; (у скота) [[sonnaille]]\n" + "}}\n" + "{{перев-блок|оркестровый инструмент|\n" + "|en=[[glockenspiel]]\n" + "}}\n" + "\n" + "{{перев-блок|цветок\n" + "|en=[[bluebell]], [[bellflower]], [[campanula]]\n" + "|os=[[дзæнгæрæг]], [[къæрцгæнæг]]\n" + "|fr=[[campanule]], [[clochette]]\n" + "}}\n" + "\n" + "===Библиография===\n" + "*\n" + "\n{{categ|category1|category2|lang=}}" + "\n" + "[[Категория:Музыкальные инструменты]]\n"; kolokolchik_text_1_translation_box = "{{перев-блок|цветок\n" + "|en=[[bluebell]], [[bellflower]], [[campanula]]\n" + "|os=[[дзæнгæрæг]], [[къæрцгæнæг]]\n" + "|fr=[[campanule]], [[clochette]]\n" + "}}\n"; kosa_text_1_translation_box_without_header = "{{перев-блок\n" + "|en=[[braid]], [[plait]], [[pigtail]], [[queue]]\n" + "|de=[[Zopf]] {{m}} -es, Zöpfe\n" + "|fr=[[natte]] {{f}}; [[couette]] {{f}}, [[tresse]] <i>f</i>\n" + "}}\n"; unfinished_template = "\n" + "{{unfinished\n" + "|m=\n" + "|p=1\n" + "|s=\n" + "|e=\n" + "}}"; page_end = "\n" + "*{{theza|самолёт}}\n" + "\n" + "{{длина слова|7}}\n" + "\n" + "[[Категория:Авиация]]\n" + "\n" + "[[ar:самолёт]]"; empty_translation_with_category = "=== Перевод ===\n" + "{{перев-блок||\n" + "|en=\n" + "}}\n" + "\n" + "[[Категория:Россиянки]]\n" + unfinished_template; empty_translation = "=== Перевод ===\n" + "{{перев-блок||\n" + "|en=[[]]\n" + "|de=[[]]\n" + "}}\n"; translation_without_pipe = "=== Перевод ===\n" + "{{перев-блок\n" + "|en=[[test_translation]]\n" + "}}\n"; absent_translation_block = "=== Перевод ===\n" + "*\n" + "\n" + "[[Категория:Россиянки]]\n" + unfinished_template; } @After public void tearDown() { } @Test public void testParse_3_meanings() { System.out.println("parse__3_meanings"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "колокольчик"; LanguageType lang_section = LanguageType.ru; // Russian word POSText pt = new POSText(POS.noun, kolokolchik_text); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(3, result.length ); assertTrue(result[0].getHeader().equalsIgnoreCase( "звонок" )); assertTrue(result[1].getHeader().equalsIgnoreCase( "оркестровый инструмент" )); assertTrue(result[2].getHeader().equalsIgnoreCase( "цветок" )); // "{{перев-блок|оркестровый инструмент|\n" + // "|en=[[glockenspiel]]\n" + WikiText[] wt_orchestra = result[1].getTranslationIntoLanguage(LanguageType.en); assertEquals(1, wt_orchestra.length ); assertTrue(wt_orchestra[0].getVisibleText(). equalsIgnoreCase( "glockenspiel" ) ); WikiWord[] ww_orchestra = wt_orchestra[0].getWikiWords(); assertEquals(1, ww_orchestra.length ); assertTrue(ww_orchestra[0].getWordLink(). equalsIgnoreCase( "glockenspiel" ) ); assertTrue(ww_orchestra[0].getWordVisible().equalsIgnoreCase( "glockenspiel" ) ); } @Test public void testParse_1_meaning() { System.out.println("parse_1_meaning"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; LanguageType lang_section = LanguageType.ru; // Russian word POSText pt = new POSText(POS.noun, samolyot_with_header); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(1, result.length ); assertEquals(0, result[0].getHeader().length()); // only one meaning, the translation box has no header // "|tr=[[uçak]], [[tayyare]]\n" WikiText[] wt_samolyot = result[0].getTranslationIntoLanguage(LanguageType.tr); assertEquals(2, wt_samolyot.length ); assertTrue(wt_samolyot[0].getVisibleText().equalsIgnoreCase( "uçak" ) ); assertTrue(wt_samolyot[1].getVisibleText().equalsIgnoreCase( "tayyare" ) ); } // kolokolchik_text_1_translation_box /*цветок |en=[[bluebell]], [[bellflower]], [[campanula]]" + |os=[[дзæнгæрæг]], [[къæрцгæнæг]]" + |fr=[[campanule]], [[clochette]]*/ @Test public void testParseOneTranslationBox_test2() { System.out.println("parseOneTranslationBox_test2"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "колокольчик"; WTranslation result = WTranslationRu.parseOneTranslationBox(wikt_lang, page_title, kolokolchik_text_1_translation_box); assertTrue(null != result); assertTrue(result.getHeader().equalsIgnoreCase( "цветок" ) ); WTranslationEntry[] trans_all = result.getTranslations(); assertEquals(3, trans_all.length); { // 2. Osetian // |os=[[дзæнгæрæг]], [[къæрцгæнæг]] WTranslationEntry trans_os = trans_all[1]; assertEquals(LanguageType.os, trans_os.getLanguage()); WikiText[] wt_os = trans_os.getWikiPhrases(); assertEquals(2, wt_os.length); assertTrue(wt_os[0].getVisibleText().equalsIgnoreCase( "дзæнгæрæг" ) ); assertTrue(wt_os[1].getVisibleText().equalsIgnoreCase( "къæрцгæнæг" ) ); } } /* translation_without_pipe * "{{перев-блок" instead of "{{перев-блок||" */ @Test public void testParse_without_pipe() { System.out.println("without_pipe"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "test_translation"; LanguageType lang_section = LanguageType.ru; // Russian word POSText pt = new POSText(POS.noun, translation_without_pipe); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(1, result.length ); assertEquals(0, result[0].getHeader().length()); // only one meaning, the translation box has no header // "|en=[[test_translation]]\n" WikiText[] wt_samolyot = result[0].getTranslationIntoLanguage(LanguageType.en); assertEquals(1, wt_samolyot.length ); assertTrue(wt_samolyot[0].getVisibleText().equalsIgnoreCase( "test_translation" ) ); } // tests skipping of unfinished_template @Test public void testParseTranslation_unfinished_template() { System.out.println("parse_1_meaning"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; LanguageType lang_section = LanguageType.ru; // Russian word String s = samolyot_with_header + unfinished_template; POSText pt = new POSText(POS.noun, s); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(1, result.length ); assertEquals(0, result[0].getHeader().length()); // only one meaning, the translation box has no header // "|tr=[[uçak]], [[tayyare]]\n" WikiText[] wt_samolyot = result[0].getTranslationIntoLanguage(LanguageType.tr); assertEquals(2, wt_samolyot.length ); assertTrue(wt_samolyot[0].getVisibleText().equalsIgnoreCase( "uçak" ) ); assertTrue(wt_samolyot[1].getVisibleText().equalsIgnoreCase( "tayyare" ) ); } // tests skipping of unfinished_template @Test public void testParseTranslation_without_unfinished_template() { System.out.println("parse_1_meaning"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; LanguageType lang_section = LanguageType.ru; // Russian word String s = samolyot_with_header + page_end; POSText pt = new POSText(POS.noun, s); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(1, result.length ); assertEquals(0, result[0].getHeader().length()); // only one meaning, the translation box has no header // the last translation entry shoud be without "[[en:..]]", etc. // "|et=[[lennuk]]\n" WikiText[] wt_samolyot_et = result[0].getTranslationIntoLanguage(LanguageType.et); assertEquals(1, wt_samolyot_et.length); assertTrue(wt_samolyot_et[0].getVisibleText().equalsIgnoreCase( "lennuk" ) ); } /* tests categories before unfinished_template, e.g. {{перев-блок|| |en= }} [[Категория:Россиянки]] {{unfinished|p=1}}{{длина слова|11}}*/ @Test public void testParseTranslation_categories_before_unfinished_template() { System.out.println("parse_categories_before_unfinished_template"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; LanguageType lang_section = LanguageType.ru; // Russian word String s = empty_translation_with_category; POSText pt = new POSText(POS.noun, s); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(0, result.length ); } /* tests categories before unfinished_template, e.g. {{перев-блок|| |en=[[]] }} */ @Test public void testParseTranslation_empty_wiki_links() { System.out.println("parse_empty_wiki_links"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; LanguageType lang_section = LanguageType.ru; // Russian word String s = empty_translation; POSText pt = new POSText(POS.noun, s); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(0, result.length ); } /** At least one translation block should exists (for Russian words in Russian Wiktionary). * ===Перевод=== * [[Категория:Травы]] {{unfinished|p=1|s=1|e=1}} */ @Test public void testParseTranslation_absentTranslationBlock () { System.out.println("parse_absentTranslationBlock"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; LanguageType lang_section = LanguageType.ru; // Russian word String s = absent_translation_block; POSText pt = new POSText(POS.noun, s); WTranslation[] result = WTranslationRu.parse(wikt_lang, lang_section, page_title, pt); assertEquals(0, result.length ); } @Test public void testParseOneTranslationBox_utf8_korean() { System.out.println("parseOneTranslationBox_utf8_korean"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; WTranslation result = WTranslationRu.parseOneTranslationBox(wikt_lang, page_title, samolyot_text); assertTrue(null != result); assertEquals(0, result.getHeader().length()); // 1 meaning, the translation has no header (summary meaning) WTranslationEntry[] trans_all = result.getTranslations(); assertEquals(20, trans_all.length); { // 1. English // "|en=[[airplane]], [[plane]], [[aircraft]]\n" + WTranslationEntry trans_en = trans_all[0]; assertEquals(LanguageType.en, trans_en.getLanguage()); WikiText[] ww_en = trans_en.getWikiPhrases(); assertEquals(3, ww_en.length); assertTrue(ww_en[0].getVisibleText().equalsIgnoreCase( "airplane" ) ); assertTrue(ww_en[1].getVisibleText().equalsIgnoreCase( "plane" ) ); assertTrue(ww_en[2].getVisibleText().equalsIgnoreCase( "aircraft" ) ); } { // 2. Korean //"|ko=[[비행기]]\n" WTranslationEntry trans_ko = trans_all[7]; assertEquals(LanguageType.ko, trans_ko.getLanguage()); WikiText[] ww_ko = trans_ko.getWikiPhrases(); assertEquals(1, ww_ko.length); assertTrue(ww_ko[0].getVisibleText().equalsIgnoreCase( "비행기" ) ); } } /** To parse additional comments, e.g. {{f}}, {{n}}, * also visible != word link, e.g. * "|ja=[[飛行機]] (ひこうき, хйко:ки)\n" */ @Test public void testParseOneTranslationBox_comma_in_comments() { System.out.println("parseOneTranslationBox_comma_in_comments"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "самолёт"; WTranslation result = WTranslationRu.parseOneTranslationBox(wikt_lang, page_title, samolyot_text); assertTrue(null != result); WTranslationEntry[] trans_all = result.getTranslations(); assertEquals(21, trans_all.length); { // 1. Japanese // "|ja=[[飛行機]] (ひこうき, хйко:ки)\n" + // work link = 飛行機 // word visible = 飛行機 (ひこうき, хйко:ки) WTranslationEntry trans_ja = trans_all[20]; assertEquals(LanguageType.ja, trans_ja.getLanguage()); WikiText[] wt_ja = trans_ja.getWikiPhrases(); assertEquals(1, wt_ja.length); assertTrue(wt_ja[0].getVisibleText().equalsIgnoreCase( "飛行機" ) ); } // todo parse additional comments, e.g. {{f}}, {{n}}, // ... } // test one translation box without the header (article: "коса") @Test public void testParseOneTranslationBox_without_header() { System.out.println("parseOneTranslationBox_comma_in_comments"); LanguageType wikt_lang = LanguageType.ru; // Russian Wiktionary String page_title = "коса"; WTranslation result = WTranslationRu.parseOneTranslationBox(wikt_lang, page_title, kosa_text_1_translation_box_without_header); assertTrue(null != result); WTranslationEntry[] trans_all = result.getTranslations(); assertEquals(3, trans_all.length); { // {{перев-блок // |en=[[braid]], [[plait]], [[pigtail]], [[queue]] // |de=[[Zopf]] {{m}} -es, Zöpfe // work link = Zopf // word visible = Zopf m -es, Zöpfe // |fr=[[natte]] {{f}}; [[couette]] {{f}}, [[tresse]] <i>f</i>\n WTranslationEntry trans_de = trans_all[1]; assertEquals(LanguageType.de, trans_de.getLanguage()); WikiText[] wt_de = trans_de.getWikiPhrases(); assertEquals(2, wt_de.length); WikiWord[] ww_de = wt_de[0].getWikiWords(); assertTrue(null != ww_de); assertEquals(1, ww_de.length); assertTrue(ww_de[0].getWordLink().equalsIgnoreCase( "Zopf" ) ); assertTrue(wt_de[0].getVisibleText().equalsIgnoreCase( "Zopf {{m}} -es" ) ); // "Zopf m. -es" will be better, but later } } }