package wikokit.base.wikt.multi.ru;
import wikokit.base.wikt.multi.ru.WLanguageRu;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikt.util.LangText;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.junit.Assert.*;
public class WLanguageRuTest {
public WLanguageRuTest() {
}
@BeforeClass
public static void setUpClass() throws Exception {
}
@AfterClass
public static void tearDownClass() throws Exception {
}
@Before
public void setUp() {
}
@After
public void tearDown() {
}
/**
* Test of splitToLanguageSections method, of class WLanguageRu.
* test words: самолёт, stitch, султан, бор, тафта, кит, акушер
*/
@Test
public void testSplitToLanguageSections() {
System.out.println("splitToLanguageSections");
//StringBuffer[] expResult = null;
//assertEquals(expResult, result);
String source_text;
LangText[] result;
StringBuffer s;
// simple two-letter code: ru
source_text = "Before {{-ru-}} Russian";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(1, result.length);
//assertTrue(result[0].text.toString().equalsIgnoreCase("Before Russian"));
assertEquals("Before Russian", result[0].text.toString());
// simple two-letter code: ru bg tg
source_text = "Before {{-ru-}} Russian {{-bg-}} Bulgarian {{-tg-}} Tajik After";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(3, result.length);
// more than two-letter code: ain slovio zh
source_text = "{{-ru-}} Russian {{-ain-}} Ainu {{-slovio-la-}} Slovio {{-zh-}} Chinese";
result = WLanguageRu.splitToLanguageSections("test_word2", new StringBuffer(source_text));
assertEquals(4, result.length); // fifth language is unknown
// unknown letter code (only one): should be omitted (with text)
source_text = "Before {{-unknown-}} Unknown lang";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(0, result.length);
// without letter code: let's think that this is a Russian word
source_text = "Before without lang tag";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(1, result.length);
// unknown letter code (last): should be omitted (with text)
source_text = "{{-ru-}} Russian {{-slovio-}} Slovio {{-unknown-}} Unknown lang";
result = WLanguageRu.splitToLanguageSections("test_word2", new StringBuffer(source_text));
assertEquals(2, result.length);
// unknown letter code (in the middle): should be omitted (with text)
source_text = "{{-ru-}} Russian {{-unknown-}} Unknown lang {{-slovio-}} Slovio";
result = WLanguageRu.splitToLanguageSections("test_word2", new StringBuffer(source_text));
assertEquals(2, result.length);
/* // sorry, we are skipping SQL JDBC in this project
// only one language (Russian)
s = new StringBuffer(
PageTableBase.getArticleText(connect_ruwikt, "самолёт"));
result = WLanguageRu.splitToLanguageSections("самолёт", s);
assertEquals(1, result.length);
// only one language (English)
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "roast"));
result = WLanguageRu.splitToLanguageSections("roast", s);
assertEquals(1, result.length);
// two languages: ru I & II + uk I & II
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "султан"));
result = WLanguageRu.splitToLanguageSections("султан", s);
assertEquals(2, result.length);
// complex: 2 Russian homonyms and three other languages
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "бор"));
result = WLanguageRu.splitToLanguageSections("бор", s);
assertTrue(result.length >= 3); // now 4, in db 3
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "тафта"));
result = WLanguageRu.splitToLanguageSections("тафта", s);
assertEquals(3, result.length);
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "кит"));
result = WLanguageRu.splitToLanguageSections("кит", s);
assertEquals(4, result.length);
// three
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "акушер"));
result = WLanguageRu.splitToLanguageSections("акушер", s);
assertEquals(3, result.length);
// translingual, INTernational: sin (trigonometric)
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "sin"));
result = WLanguageRu.splitToLanguageSections("sin", s);
assertEquals(11, result.length);
s = new StringBuffer(PageTableBase.getArticleText(connect_ruwikt, "0"));
result = WLanguageRu.splitToLanguageSections("0", s);
assertEquals(1, result.length);
*/
}
// {{заголовок|ka|add=}} - yes, this is language delimiter
// {{заголовок|be|add=I}}- no, this is not a laguage, but a POS delimiter
@Test
public void testSplitToLanguageSections_with_special_header() {
System.out.println("splitToLanguageSections_with_special_header");
//StringBuffer[] expResult = null;
//assertEquals(expResult, result);
String source_text;
LangText[] result;
// simple two-letter code: Georgia "|ka|"
source_text = "Before {{заголовок|ka|add=}} Georgia";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(1, result.length);
//assertTrue(result[0].text.toString().equalsIgnoreCase("Before Russian"));
assertEquals("Before Georgia", result[0].text.toString());
// simple two-letter code: Georgia "|ka}}"
source_text = "Before {{заголовок|ka}} Georgia";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(1, result.length);
//assertTrue(result[0].text.toString().equalsIgnoreCase("Before Russian"));
assertEquals("Before Georgia", result[0].text.toString());
// {{заголовок|be|add=I}}
// no, this is not a laguage, but a POS delimiter (next level)
source_text = "Before {{заголовок|be|add=I}} shah";
result = WLanguageRu.splitToLanguageSections("shah", new StringBuffer(source_text));
assertEquals(0, result.length);
// only one Belarusian language with two POS
// {{-be-}}
// {{заголовок|be|add=I}}
// {{заголовок|be|add=II}}
source_text = "{{-be-}} skip me {{заголовок|be|add=I}} Belarusian 1 {{заголовок|be|add=II}} second";
result = WLanguageRu.splitToLanguageSections("shah", new StringBuffer(source_text));
assertEquals(1, result.length);
// two = Russian and Belarusian (with two POS)
// {{-ru-}}
// {{-be-}}
// {{заголовок|be|add=I}}
// {{заголовок|be|add=II}}
source_text = "{{-ru-}} Ru {{-be-}} skip me {{заголовок|be|add=I}} Belarusian 1 {{заголовок|be|add=II}} second";
result = WLanguageRu.splitToLanguageSections("shah", new StringBuffer(source_text));
assertEquals(2, result.length);
}
// = {{-ru-}} =
// {{заголовок|add=I}} - no, this is not a laguage, but a POS delimiter
// {{заголовок|add=II}} - also language delimiter
@Test
public void testSplitToLanguageSections_with_special_header_without_language_code() {
System.out.println("splitToLanguageSections_with_special_header_without_language_code");
String source_text;
LangText[] result;
// only one Russian language with two POS
// = {{-ru-}} =
// {{заголовок|add=I}}
// {{заголовок|add=II}}
source_text = "= {{-ru-}} = skip me {{заголовок|add=I}} Russian 1 {{заголовок|add=II}} second";
result = WLanguageRu.splitToLanguageSections("vsduti", new StringBuffer(source_text));
assertEquals(1, result.length);
}
// {{заголовок|ka|add=}} - yes, this is language delimiter
// {{заголовок|be|add=I}}- no, this is not a laguage, but a POS delimiter
// {{заголовок|de|add=|aare}} - also language delimiter
@Test
public void testSplitToLanguageSections_with_special_header_also() {
System.out.println("splitToLanguageSections_with_special_header_also");
String source_text;
LangText[] result;
// {{заголовок|de|add=|aare}} - also language delimiter
// simple two-letter code: German "|de|"
source_text = "Before {{заголовок|de|add=|aare}} German";
result = WLanguageRu.splitToLanguageSections("test_word_Aare", new StringBuffer(source_text));
assertEquals(1, result.length);
assertEquals(LanguageType.de, result[0].getLanguage());
}
// {{-de-|schwalbe}}
@Test
public void testSplitToLanguageSections_header_with_parameter() {
System.out.println("splitToLanguageSections_header_with_parameter");
//StringBuffer[] expResult = null;
//assertEquals(expResult, result);
String source_text;
LangText[] result;
StringBuffer s;
// simple two-letter code: Georgia "|ka|"
source_text = "Before {{-de-|schwalbe}} Eine Rauchschwalbe";
result = WLanguageRu.splitToLanguageSections("test_word1", new StringBuffer(source_text));
assertEquals(1, result.length);
//assertTrue(result[0].text.toString().equalsIgnoreCase("Before Russian"));
assertEquals("Before Eine Rauchschwalbe", result[0].text.toString());
}
}