/*
* CategoryBlackListTest.java
* JUnit based test
*/
package wikipedia.kleinberg;
import wikipedia.language.Encodings;
import junit.framework.*;
import wikipedia.sql.*;
import wikipedia.util.*;
import wikipedia.data.ArticleIdAndTitle;
import java.sql.*;
import java.util.*;
public class CategoryBlackListTest extends TestCase {
public Connect connect, connect_ru;
public SessionHolder session;
int categories_max_steps;
static Map<String,Set<String>> m_out = new HashMap<String,Set<String>>();
static Map<String,Set<String>> m_in = new HashMap<String,Set<String>>();
/**
 * Creates the test case.
 *
 * @param testName name of the test, forwarded unchanged to the TestCase constructor
 */
public CategoryBlackListTest(String testName) {
super(testName);
}
/**
 * Prepares the fixture for one test: opens both database connections,
 * builds a fresh session and resets the category step limit to 99.
 *
 * @throws Exception declared by the overridden TestCase method
 */
protected void setUp() throws Exception {
    this.categories_max_steps = 99;

    this.connect = new Connect();
    this.connect.Open(Connect.WP_HOST, Connect.WP_DB, Connect.WP_USER, Connect.WP_PASS);

    this.connect_ru = new Connect();
    this.connect_ru.Open(Connect.WP_RU_HOST, Connect.WP_RU_DB,
                         Connect.WP_RU_USER, Connect.WP_RU_PASS);

    this.session = new SessionHolder();
    this.session.initObjects();
}
/**
 * Closes both database connections opened in setUp().
 *
 * @throws Exception whatever Connect.Close() propagates
 */
protected void tearDown() throws Exception {
    try {
        connect.Close();
    } finally {
        // Close the second connection even when closing the first one failed,
        // so that a failure does not leak the connection.
        connect_ru.Close();
    }
}
/**
 * Builds the suite for this class.
 *
 * @return a TestSuite constructed from CategoryBlackListTest.class
 */
public static Test suite() {
    return new TestSuite(CategoryBlackListTest.class);
}
/**
 * Calls getCategoryUpIteratively() for several pages of the Russian database
 * with different step limits and checks how many categories are collected.
 */
public void testGetCategoryUpIteratively_ru() {
    System.out.println("getCategoryUpIteratively_ru");
    int page_id;
    String page_title;
    String[] categories;

    session.connect = connect_ru;
    Encodings e = session.connect.enc;

    // Step limit 1: the page "XX_век" has the category "XX_век" (XX century).
    page_title = e.EncodeFromJava("XX_век");
    page_id = PageTable.getIDByTitle(connect_ru, page_title);
    session.category_black_list.setMaxSteps(1);
    categories = session.category_black_list.getCategoryUpIteratively(page_id, null);
    assertEquals(1, categories.length);
    // The returned title is compared with the EncodeFromJava() form of the literal.
    assertEquals(e.EncodeFromJava("XX_век"), categories[0]);

    // Step limit 20. The title is passed through EncodeFromJava() like every
    // other Cyrillic title that this class hands to PageTable.getIDByTitle().
    page_title = e.EncodeFromJava("1917_год");
    page_id = PageTable.getIDByTitle(connect_ru, page_title);
    session.category_black_list.setMaxSteps(20);
    categories = session.category_black_list.getCategoryUpIteratively(page_id, null);
    if (null == categories || 11 >= categories.length) {
        System.out.println("\nDid you run the script maintenance/refreshLinks.php?\n");
    }
    // Fail with an assertion (not a NullPointerException) when nothing was returned.
    assertNotNull(categories);
    assertTrue(11 < categories.length);
    // For reference:
    // result: 1917 XX_век Века Календарь History Время Страны_и_народы Всё
    // Календарь: History | Время
    // History: Всё | Страны_и_народы
    // Время: Всё
    // Страны и народы: Всё

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    page_title = e.EncodeFromJava("Жданов,_Василий_Александрович"); // Russian artist
    page_id = PageTable.getIDByTitle(connect_ru, page_title);
    session.category_black_list.setMaxSteps(20);
    // The return value is not inspected here; the check is on getTotalCategoriesPassed().
    session.category_black_list.getCategoryUpIteratively(page_id, null);
    assertTrue(19 <= // 10
            session.category_black_list.getTotalCategoriesPassed());
}
/**
 * Redirect case (1917 -> 1917_год): starting from the page "1917",
 * getCategoryUpIteratively() with a limit of 20 steps must return
 * more than 11 categories.
 */
public void testGetCategoryUpIteratively_Redirect_ru() {
    System.out.println("testGetCategoryUpIteratively_Redirect_ru");

    session.connect = connect_ru;
    session.skipTitlesWithSpaces(true);
    session.randomPages(false);

    final int redirectId = PageTable.getIDByTitle(connect_ru, "1917");
    session.category_black_list.setMaxSteps(20);
    final String[] found =
            session.category_black_list.getCategoryUpIteratively(redirectId, null);

    assertNotNull(found);
    assertTrue(found.length > 11);
}
/** Tests getCategoryUpIteratively() for the case when parent categories repeat,
 * i.e. the same category is reached along more than one path.
 * Checks session.category_nodes after the call.
 *
 * Parents of the pages and categories involved:
 *   Аккреция            -> Астрофизика
 *   Вырожденный газ     -> Астрофизика | Квантовые явления | Незавершённые статьи по физике
 *   Астрофизика         -> Физика | Астрономия
 *   Физика              -> Естественные науки
 *   Астрономия          -> Естественные науки | Космос
 *   Естественные науки  -> Науки | Природа
 *   Космос              -> Природа -> Всё
 *   Науки               -> Наука -> Всё
 *
 * The same graph as a diagram (drawn with JavE; numbers are page ids):
 * <pre>
 *                    Аккреция (article)
 *                          |
 *                  Астрофизика (category)
 *                    .-'        `-.
 *          Физика (8101)      Астрономия (8124)
 *                  `.           .'         `-.
 *          Естественные науки (13791)    Космос (16935)
 *              .-'             `-.         .-'
 *        Науки (7935)         Природа (12522)
 *             |                     |
 *        Наука (10553)              |
 *               `-.              .-'
 *                    Всё (16350)
 * </pre>
 */
public void testGetCategoryUpIterativelyComplex_ru() {
    System.out.println("GetCategoryUpIterativelyComplex_ru");

    // Take the encoder from connect_ru directly: at this point the session
    // has not been Init()-ed with connect_ru yet.
    Encodings e = connect_ru.enc;

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    // Related pages: Вырожденный_газ, Астрофизика, Аккреция
    String page_title = e.EncodeFromJava("Аккреция");
    int page_id = PageTable.getIDByTitle(connect_ru, page_title);
    session.category_black_list.setMaxSteps(99);
    // The returned array is not needed; the assertions below inspect
    // session.category_nodes after the call.
    session.category_black_list.getCategoryUpIteratively(page_id, null);

    // The node of the category "Науки" must exist and have exactly 2 entries in links_in.
    String category_name = e.EncodeFromJava("Науки");
    int category_id = PageTable.getCategoryIDByTitle(connect_ru, category_name);
    assertTrue(session.category_nodes.containsKey(category_id));
    Category c = session.category_nodes.get(category_id);
    assertEquals(2, c.links_in.length);

    // Two categories, "Физика" (8101) and "Астрономия" (8124), refer (link_in)
    // to the category "Естественные науки" (page_id=13791).
    String cn = e.EncodeFromJava("Естественные_науки");
    int cn_id = PageTable.getCategoryIDByTitle(connect_ru, cn);
    assertTrue(session.category_nodes.containsKey(cn_id));
    Category c2 = session.category_nodes.get(cn_id);
    assertEquals(2, c2.links_in.length);
}
/**
 * Checks the first-level category titles that inBlackList() collects into
 * the list argument for the English pages "Sucrose" and "Rice".
 */
public void testInBlackList_FirstLevelCategories_en () {
    System.out.println("testInBlackList_FirstLevelCategories_en");

    session.Init(connect, session.category_black_list.en, categories_max_steps);
    session.randomPages(false);
    session.skipTitlesWithSpaces(false);

    // "Sucrose" categories: Disaccharides|Sweeteners
    String latin1 = Encodings.FromTo("Sucrose", "UTF8", "ISO8859_1");
    int page_id = PageTable.getIDByTitle(connect, latin1);
    List<String> titles_level_1_cats = new ArrayList<String>();
    // Only the titles written into titles_level_1_cats are checked; the return value is ignored.
    session.category_black_list.inBlackList(page_id, titles_level_1_cats, session.source_article_id);
    int[] id_level_1_cats = Category.getIDByTitle(session.connect, titles_level_1_cats);
    assertEquals(2, id_level_1_cats.length);
    String category  = PageTable.getTitleByID(session.connect, id_level_1_cats[0]);
    String category2 = PageTable.getTitleByID(session.connect, id_level_1_cats[1]);
    // JUnit order is (expected, actual), so a failure message names the right side as expected.
    assertEquals("Disaccharides", category);
    assertEquals("Sweeteners", category2);

    // "Rice" has 10 categories: Domesticated animals | Endangered species | Critically endangered species | Staple foods | Cereals | Grains | Grasses | Model organisms | Rice | Tropical agriculture
    // "Rice" has 7 live links categories: Domesticated animals | Endangered species | Critically endangered species | | Cereals | Grains | Grasses | Model organisms | |
    latin1 = Encodings.FromTo("Rice", "UTF8", "ISO8859_1");
    page_id = PageTable.getIDByTitle(connect, latin1);
    titles_level_1_cats.clear();
    session.category_black_list.inBlackList(page_id, titles_level_1_cats, session.source_article_id);
    id_level_1_cats = Category.getIDByTitle(session.connect, titles_level_1_cats);
    assertTrue(10 <= id_level_1_cats.length); // 5 7 10
}
/**
 * Checks the first-level category titles that inBlackList() collects into
 * the list argument for the Russian page "Домра".
 */
public void testInBlackList_FirstLevelCategories_ru () {
    System.out.println("testInBlackList_FirstLevelCategories_ru");

    // Take the encoder from connect_ru directly: at this point the session
    // has not been Init()-ed with connect_ru yet.
    Encodings e = connect_ru.enc;

    // "Домра" has categories:
    // "Щипковые музыкальные инструменты"
    // "Музыкальные инструменты народов России"
    // NOTE(review): this encoded list is built but never read; the assertions
    // below compare against the plain literals. Confirm which form
    // PageTable.getTitleByID() returns and use the list if it is the encoded one.
    List<String> domra_categories = new ArrayList<String>();
    domra_categories.add( e.EncodeFromJava("Музыкальные_инструменты_народов_России"));
    domra_categories.add( e.EncodeFromJava("Щипковые_музыкальные_инструменты") );

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    String page_title = e.EncodeFromJava("Домра");
    int page_id = PageTable.getIDByTitle(connect_ru, page_title);
    List<String> titles_level_1_cats = new ArrayList<String>();
    // Only the titles written into titles_level_1_cats are checked; the return value is ignored.
    session.category_black_list.inBlackList(page_id, titles_level_1_cats, session.source_article_id);
    int[] id_level_1_cats = Category.getIDByTitle(session.connect, titles_level_1_cats);
    assertEquals(2, id_level_1_cats.length);

    // JUnit order is (expected, actual).
    String category = PageTable.getTitleByID(session.connect, id_level_1_cats[0]);
    assertEquals("Музыкальные_инструменты_народов_России", category);
    String category2 = PageTable.getTitleByID(session.connect, id_level_1_cats[1]);
    assertEquals("Щипковые_музыкальные_инструменты", category2);
}
/**
 * Puts a single category into the black list and checks that inBlackList()
 * returns it for an English page.
 */
public void testInBlackList_en () {
    System.out.println("testInBlackList_en");

    // "Centuries" is the only black-listed category here,
    // so inBlackList() is expected to return it for "20th_century".
    String category = "Centuries"; // Years Centuries
    String page_title = "20th_century";
    List<String> my_list = new ArrayList<String>();
    my_list.add(category);

    categories_max_steps = 99;
    session.Init(connect, my_list, categories_max_steps);
    session.randomPages(false);

    String latin1 = Encodings.FromTo(page_title, "UTF8", "ISO8859_1");
    int page_id = PageTable.getIDByTitle(connect, latin1);
    String result = session.category_black_list.inBlackList(page_id, null, session.source_article_id);
    // JUnit order is (expected, actual).
    assertEquals(category, result);
    // 284 is the result of "depth-first search" and 5 - "breadth-first search"
    //assertTrue(284 == session.category_black_list.passed_steps || 5 == session.category_black_list.passed_steps);
}
/**
 * Checks inBlackList() on the Russian database: a black-listed hit, an
 * unknown page id, a null black list, and a second black-listed hit.
 */
public void testInBlackList_ru () {
    System.out.println("testInBlackList_ru");

    // Take the encoder from connect_ru directly: at this point the session
    // has not been Init()-ed with connect_ru yet.
    Encodings e = connect_ru.enc;

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    // "XX_век" is expected to hit "Века" with both counters equal to 1.
    String page_title = e.EncodeFromJava("XX_век");
    int page_id = PageTable.getIDByTitle(connect_ru, page_title);
    String result = session.category_black_list.inBlackList(page_id, null, session.source_article_id);
    assertEquals(e.EncodeFromJava("Века"), result);
    assertEquals(1, session.category_black_list.getPassedSteps());
    assertEquals(1, session.category_black_list.getTotalCategoriesPassed());

    // A page id of -999 yields null.
    result = session.category_black_list.inBlackList(-999, null, session.source_article_id);
    assertNull(result);

    // With a null black list nothing is reported, even for the page that matched above.
    session.Init(connect_ru, null, categories_max_steps);
    result = session.category_black_list.inBlackList(page_id, null, session.source_article_id);
    assertNull(result);

    // "Домра" is expected to hit "Страны".
    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    page_title = e.EncodeFromJava("Домра");
    page_id = PageTable.getIDByTitle(connect_ru, page_title);
    result = session.category_black_list.inBlackList(page_id, null, session.source_article_id);
    assertEquals(e.EncodeFromJava("Страны"), result);
}
/**
 * Checks that redirect pages are filtered as well.
 * A redirect page itself does not have any categories.
 */
public void testInBlackList_RedirectPage_ru () {
    System.out.println("testInBlackList_RedirectPage_ru");
    Encodings e = connect_ru.enc;

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    // "2000" has category "2000 год" -> "XX век" -> "Века"
    String page_title = e.EncodeFromJava("2000");
    int page_id = PageTable.getIDByTitle(connect_ru, page_title);
    String result = session.category_black_list.inBlackList(page_id, null, session.source_article_id);
    // JUnit order is (expected, actual).
    assertEquals(e.EncodeFromJava("Века"), result);
}
/**
 * Checks DeleteUsingBlackList(ArticleIdAndTitle[]) with a local two-entry
 * black list: of four titles, two are expected to remain.
 */
public void testDeleteUsingBlackList_String_ru () {
    System.out.println("test String[] DeleteUsingBlackList_ru (String[])");
    Encodings e = connect_ru.enc;

    // A local black list is used instead of session.category_black_list.ru.
    List<String> local_black_list_ru = new ArrayList<String>();
    local_black_list_ru.add( e.EncodeFromJava("Века") );
    local_black_list_ru.add( e.EncodeFromJava("Страны") );

    session.Init(connect_ru, local_black_list_ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    // NOTE(review): these titles are passed and later compared as plain Java
    // strings, without EncodeFromJava(); presumably createByTitle() handles
    // the encoding itself. Confirm.
    String[] titles_with_blacklist = {"XX_век", "Домра", "Гитара", "Контрабас"};
    ArticleIdAndTitle[] categories_with_blacklist = ArticleIdAndTitle.createByTitle (session.connect, m_out, m_in, titles_with_blacklist);
    ArticleIdAndTitle[] cleaned_categories = session.category_black_list.DeleteUsingBlackList(categories_with_blacklist);

    String[] cleaned_categories_str = new String[cleaned_categories.length];
    for (int i = 0; i < cleaned_categories.length; i++) {
        cleaned_categories_str[i] = cleaned_categories[i].title;
    }

    // "Гитара" and "Контрабас" remain; "XX_век" and "Домра" are removed.
    assertEquals(2, cleaned_categories.length);
    assertTrue(StringUtil.containsIgnoreCase(cleaned_categories_str, "Гитара"));
    assertTrue(StringUtil.containsIgnoreCase(cleaned_categories_str, "Контрабас"));
}
/**
 * Checks DeleteUsingBlackList(ArticleIdAndTitle[]) when one of the input
 * titles ("1917") is a redirect page: only "Броузер" is expected to remain.
 */
public void testDeleteUsingBlackList_String_Redirect_ru () {
    System.out.println("test String[] DeleteUsingBlackList_ru (String[]) with redirect page");
    Encodings e = connect_ru.enc;

    // A local black list is used instead of session.category_black_list.ru.
    List<String> local_black_list_ru = new ArrayList<String>();
    local_black_list_ru.add( e.EncodeFromJava("Века") );
    local_black_list_ru.add( e.EncodeFromJava("Страны") );

    session.Init(connect_ru, local_black_list_ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    String[] titles_with_blacklist = {"1917", "Броузер"};
    ArticleIdAndTitle[] categories_with_blacklist = ArticleIdAndTitle.createByTitle (session.connect, m_out, m_in, titles_with_blacklist);
    ArticleIdAndTitle[] cleaned_categories = session.category_black_list.DeleteUsingBlackList(categories_with_blacklist);

    List<String> l = ArticleIdAndTitle.getTitles (cleaned_categories);
    // toArray(String[]) already returns String[]; no cast is needed.
    String[] cleaned_categories_str = l.toArray(new String[0]);

    // "Броузер" remains; "1917" is removed.
    assertEquals(1, cleaned_categories_str.length);
    assertTrue(StringUtil.containsIgnoreCase(cleaned_categories_str, "Броузер"));
}
/**
 * Checks DeleteUsingBlackList(boolean, ArticleIdAndTitle[], int) with a local
 * two-entry black list: of three articles only "Гитара" is expected to remain.
 */
public void testDeleteUsingBlackList_ru () {
    System.out.println("testDeleteUsingBlackList_ru");
    Encodings e = connect_ru.enc;

    // A local black list is used instead of session.category_black_list.ru.
    List<String> local_black_list_ru = new ArrayList<String>();
    local_black_list_ru.add( e.EncodeFromJava("Века") );
    local_black_list_ru.add( e.EncodeFromJava("Страны") );

    session.Init(connect_ru, local_black_list_ru, categories_max_steps);
    session.skipTitlesWithSpaces(false);
    session.randomPages(false);

    String[] titles = {"XX_век", "Домра", "Гитара"};
    // Sized from the titles array so the two cannot drift apart.
    Article[] articles = new Article[titles.length];
    for (int i = 0; i < titles.length; i++) {
        Article a = new Article();
        a.page_title = e.EncodeFromJava( titles[i] );
        a.page_id = PageTable.getIDByTitle(connect_ru, a.page_title);
        articles[i] = a;
    }
    ArticleIdAndTitle[] aid_with_blacklist = ArticleIdAndTitle.create( articles );

    boolean b_rand = false;
    ArticleIdAndTitle[] cleaned_categories = session.category_black_list.DeleteUsingBlackList(b_rand, aid_with_blacklist, -1);
    // "Гитара" remains; "XX_век" and "Домра" are removed.
    assertEquals(1, cleaned_categories.length);
    assertTrue(cleaned_categories[0].id == articles[2].page_id);

    // Same input with 1 as the last argument: still one element is expected.
    cleaned_categories = session.category_black_list.DeleteUsingBlackList(b_rand, aid_with_blacklist, 1);
    assertEquals(1, cleaned_categories.length);
}
}