/*
* AuthoritiesTest.java
* JUnit based test
*/
package wikipedia.kleinberg;
import wikipedia.language.Encodings;
import wikipedia.sql.*;
import wikipedia.util.*;
import wikipedia.clustering.*;
import junit.framework.*;
import java.util.*;
import java.text.*;
public class AuthoritiesTest extends TestCase {
// Scratch article; created in setUp() and not referenced by the tests visible in this file.
Article node;
// Object under test; a fresh instance is created in setUp() for every test.
Authorities auth;
// Fixture articles keyed by page_id, filled in setUp(); only the disabled
// testGetLargestXNodes refers to this map.
Map<Integer, Article> nodes;
// Database connections opened in setUp() with the Connect.WP_* and Connect.WP_RU_* settings
// respectively, and closed in tearDown().
Connect connect, connect_ru;
// Dump helper whose GraphViz output files (.dot/.bat/.sh) are configured in setUp().
DumpToGraphViz dump;
// Session passed to the library calls; initialised in setUp() with 'dump' attached.
SessionHolder session;
/**
 * Creates the test case; the name is handed straight to the TestCase constructor.
 *
 * @param testName name forwarded to junit.framework.TestCase
 */
public AuthoritiesTest(final String testName) {
    super(testName);
}
/**
 * Builds the fixture used by every test: two database connections, the
 * Authorities object under test, the GraphViz dump files, four sample
 * articles (stored in 'nodes' by page_id) and an initialised session.
 */
protected void setUp() throws java.lang.Exception {
    // connections configured from the Connect.WP_* and Connect.WP_RU_* constants
    connect = new Connect();
    connect.Open(Connect.WP_HOST, Connect.WP_DB, Connect.WP_USER, Connect.WP_PASS);
    connect_ru = new Connect();
    connect_ru.Open(Connect.WP_RU_HOST, Connect.WP_RU_DB, Connect.WP_RU_USER, Connect.WP_RU_PASS);

    auth = new Authorities();

    // GraphViz output goes to the "graphviz" sub-directory of the home directory
    dump = new DumpToGraphViz();
    dump.file_dot.setFileInHomeDir("graphviz", "test.dot", "Cp866", true);
    dump.file_bat.setFileInHomeDir("graphviz", "bat_ruwiki.bat", "Cp866", true);
    dump.file_sh.setFileInHomeDir("graphviz", "bat_ruwiki.sh", "Cp1251", true);

    node = new Article();

    // sample articles: page_id and x value of each, kept in parallel arrays
    final int[] fixture_ids = {18991, 10484, 3321, 1121};
    final float[] fixture_x = {12.f, 2.f, 7.f, 7.f};

    nodes = new HashMap<Integer, Article>();
    for (int k = 0; k < fixture_ids.length; k++) {
        Article fixture = new Article();
        fixture.page_id = fixture_ids[k];
        fixture.x = fixture_x[k];
        nodes.put(fixture.page_id, fixture);
    }

    session = new SessionHolder();
    session.initObjects();
    session.dump = dump;
}
/**
 * Closes both database connections opened in setUp().
 *
 * The second connection is closed in a finally block so that an exception
 * thrown while closing the first one cannot leak it. The null guards keep
 * the method safe if it is ever invoked after a partially completed setUp().
 */
protected void tearDown() throws java.lang.Exception {
    try {
        if (null != connect) {
            connect.Close();
        }
    } finally {
        if (null != connect_ru) {
            connect_ru.Close();
        }
    }
}
/**
 * Returns a suite built from AuthoritiesTest.class.
 */
public static junit.framework.Test suite() {
    return new junit.framework.TestSuite(AuthoritiesTest.class);
}
/**
* Test of getLargestXNodes method, of class wikipedia.Authorities.
*/
/* public void testGetLargestXNodes() {
int n_largest_authority = 3;
Article[] nodes_sorted = auth.getLargestXNodes(null, n_largest_authority);
assertEquals(null, nodes_sorted);
nodes_sorted = auth.getLargestXNodes(nodes, n_largest_authority);
assertEquals(nodes_sorted.length, n_largest_authority);
assertEquals(nodes_sorted[0].x, 12.f);
assertEquals(nodes_sorted[1].x, 7.f);
assertEquals(nodes_sorted[2].x, 7.f);
}
*/
/**
 * Checks getTitles(): null is expected for an empty list, and the bare
 * title for a list holding a single article.
 */
public void testGetTitles() {
    System.out.println("GetTitles");

    final String separator = ",";
    final List<Article> articles = new ArrayList<Article>();
    final Article only_article = new Article();
    only_article.page_title = "Title1";

    // empty list
    assertNull(auth.getTitles(articles, separator));

    // one article: no delimiter is expected in the result
    articles.add(only_article);
    assertEquals(only_article.page_title, auth.getTitles(articles, separator));
}
/**
 * Checks SynonymsToString(): the source word comes first, followed by the
 * article titles, all joined with the delimiter.
 *
 * The assertions pass the expected value first, as JUnit's
 * assertEquals(expected, actual) requires, so a failure message reports
 * the two values the right way round.
 */
public void testSynonymsToString() {
    System.out.println("SynonymsToString");

    final String delimiter = ",";
    final String source = "source_synonym";
    final List<Article> articles = new ArrayList<Article>();

    // null list: smoke check only (the call must not throw); the expected
    // return value is not specified by this test, so nothing is asserted
    auth.SynonymsToString("", null, delimiter);

    // empty list: only the source word
    assertEquals("source_synonym", auth.SynonymsToString(source, articles, delimiter));

    // one article: "Title1"
    Article a1 = new Article();
    a1.page_title = "Title1";
    articles.add(a1);
    assertEquals("source_synonym,Title1", auth.SynonymsToString(source, articles, delimiter));

    // two articles: "Title1", "Title2"
    Article a2 = new Article();
    a2.page_title = "Title2";
    articles.add(a2);
    assertEquals("source_synonym,Title1,Title2", auth.SynonymsToString(source, articles, delimiter));
}
/**
 * Builds a base set for the article "Контрабас" through the connect_ru
 * connection, iterates the weights, and checks that exactly five authorities
 * can be extracted via the hubs that point to the source article.
 *
 * When session.dump is set, the hub titles and the 20 best authority titles
 * are also written to a text file under test_kleinberg/ru/.
 *
 * The concrete titles are not asserted: the notes kept below record results
 * observed earlier, which changed over time.
 */
public void testgetAllHubs() {
    final int root_set_size = 10;           // previously tried: 300 and -1
    final int increment = 2;                // previously tried: 50
    final int categories_max_steps = 5;
    final float eps_error = 0.001f;         // error to stop the iteration
    final String title_delimiter = " | ";

    Encodings e = connect_ru.enc;
    String article = e.EncodeFromJava("Контрабас"); // other candidates: Интернационализация, Преферанс
    String article_fn = StringUtilRegular.encodeRussianToLatinitsa(article, Encodings.enc_java_default, Encodings.enc_int_default);

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
    session.randomPages(false);
    session.skipTitlesWithSpaces(false);

    if (null != session.dump) {
        String fs = System.getProperty("file.separator");
        String sub_dir = "test_kleinberg" + fs + "ru" + fs;
        dump.file.setFileInHomeDir(sub_dir, article_fn + ".txt", "Cp1251", true);
    }

    List<String> rated_synonyms = new ArrayList<String>();
    Map<Integer, Article> base_nodes = LinksBaseSet.CreateBaseSet(article, rated_synonyms, session, root_set_size, increment);

    // the returned iteration count is not checked by this test
    auth.Iterate(base_nodes, eps_error, session);

    List<Article> hubs = auth.getAllHubsSortedByY(base_nodes, session.source_article_id);
    if (null != session.dump) {
        dump.file.PrintNL("\nhubs (sorted by Y) pointed to the source article:\n" +
                auth.getTitles(hubs, title_delimiter));
        dump.file.Flush();
    }

    // Five best authorities.
    // Titles observed earlier (they differ between runs / database states):
    //   Джаз | Виолончель | Сайкобилли | Жданов | Бас-гитара
    //   Музыка | Фортепиано | Гармония | Скрипка | Пианист
    //   Импровизация | Викисклад | Система_Хорнбостеля_—_Закса | Музыка | Скрипка
    // which is why only the size is asserted.
    List<Article> top_authorities = auth.getAuthoritiesSortedByX(base_nodes, hubs, 5);
    assertNotNull("getAuthoritiesSortedByX returned null", top_authorities);
    assertEquals("number of authorities", 5, top_authorities.size());

    // Twenty best authorities: dumped only, not asserted.
    // Earlier observations:
    //   first order (x = 0.0814):    Гитара | Скрипка | 1917 | XX_век
    //   second order (x = 0.074893): Валторна | Альт | Гобой | Шум |
    //       Синтезатор_(музыкальный_инструмент) | Электроника | Домра
    List<Article> synonyms = auth.getAuthoritiesSortedByX(base_nodes, hubs, 20);
    String synonyms_titles = auth.getTitles(synonyms, title_delimiter);
    if (null != session.dump) {
        // heading fixed: this list holds authorities, not hubs
        dump.file.PrintNL("authorities (sorted by X) selected as synonyms:\n" + synonyms_titles);
        dump.file.Flush();
    }
}
/**
 * Test of Calculate method, of class Authorities.
 *
 * At present only the session initialisation runs; the base-set creation
 * and the Calculate call are disabled (kept as a comment at the end of the
 * method), so nothing is asserted.
 */
public void testCalculate() {
    // Simple test values. Timings noted for larger settings:
    //   root_set_size = 100, increment = 30   science: 5 min
    //   root_set_size = 150, increment = 50   6 min
    //   root_set_size = 200, increment = 50   Kleinberg default values; Science: 11 min (? 25 min + failed)
    final int root_set_size = 1;
    final int increment = 2;
    final int categories_max_steps = 99;

    final String article = "Контрабас";    // alternative: Плазма

    session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);

    // earlier location: "data/synonyms_ru/"
    final String directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/ru/";

    // The locals above are only needed by the disabled part below.
    /* English variant:
    article = "Science"; // 39 minutes or 2 366,359 sec vertices:12407 edges:410156 if root_set_size = 200; increment = 50;
    session.Init(connect, session.category_black_list.en);
    directory = "data/synonyms_en/";
    */
    /* Disabled calculation:
    dump.file.SetDir(directory);
    dump.file.SetFilename(article + ".txt");
    dump.file.Open(true, "Cp1251");
    HashMap<Integer, Article> base_nodes = LinksBaseSet.CreateBaseSet(article, session, root_set_size, increment);
    n_synonyms = 300;
    eps_error = 0.001f;
    List<Article> synonyms = auth.Calculate(base_nodes, eps_error, n_synonyms, session);
    */
}
/**
 * Runs the synonym search for a word list and appends the results to dump files.
 *
 * For a word the method builds a base set (LinksBaseSet.CreateBaseSet), runs
 * auth.Calculate and, when synonyms are returned, writes timing and graph
 * statistics plus the synonyms to dump.file, appends dot commands to the
 * .bat/.sh files and dumps category clusters of the synonyms.
 *
 * The method contains no assertions; it can fail only if a called method throws.
 *
 * NOTE(review): the innermost loop body always ends with a labelled break
 * (see the comment there), so at most one (root_set_size, increment)
 * combination is executed per word.
 */
public void testAppendSynonyms() {
int root_set_size, increment, n_synonyms;
long t_start, t_end;
float t_work, t_max; // time of one cycle's work
String[] all_words = null;
boolean b_russian;
float eps_error; // error to stop the iteration
String directory, article_fn;
// NOTE(review): 'e' is not referenced again in this method.
Encodings e = connect_ru.enc;
// Start timestamp (en_US, long date and time format) written into each dump header.
DateFormat formatter = DateFormat.getDateTimeInstance(DateFormat.LONG,
DateFormat.LONG,
new Locale("en","US"));
String today = formatter.format(new Date());
int categories_max_steps = 10; //99;
// Russian
String[] ru_words = {"Сюжет"}; // Контрабас Робот Фуникулёр Контрабас
/*String[] ru_words = {"Робот", "Рычаг", "Фуникулёр",
"Глаз", "Окуляр", "Орбитальная станция", "Плазма", "Фут",
"Свёртка", "Ноты", "Предложение", "Преферанс", "Взрыв",
"Самосознание", "Свобода", "Оптимизм", "Контрабас"};
*/
// English
//String[] en_words = {"Science"}; // Sugar Parallelogram Sycamore
String[] en_words = {"Parallelogram", "Sycamore", "Innuendo", // rare words
"Disappear", "Sugar", "Science", //};// words_Blondel + "parallelogram"
"Astronaut", "Mechanism", "Universal", "Universe" };
// Hard-coded language switch: with 'true' only the Russian branch below is taken.
b_russian = true;
if (b_russian) {
all_words = ru_words;
//directory = "data/synonyms_ru/"; //String filename = "ruwiki_synonyms.txt";
//directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/ru/";
directory = "ru";
session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
} else {
all_words = en_words;
//directory = "data/synonyms_en/"; //String filename = "enwiki_synonyms.txt";
//directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/en/";
directory = "en";
session.Init(connect, session.category_black_list.en, categories_max_steps);
}
session.skipTitlesWithSpaces(false);
session.randomPages(false);
//session.dump = null;
n_synonyms = 100;
eps_error = 0.001f;
// Kleinberg default values
/*root_set_size = 300; increment = 50;*/
// Simple test values
/*root_set_size = 1; increment = 2; */
// Parameter sweep bounds: root_set_size runs 10, 60, ... up to 310 and
// increment runs 10, 30, 50 (subject to the breaks at the end of the loop body).
int root_start, root_end, root_add;
int inc_start, inc_end, inc_add;
root_start = 10;
root_end = 310;
root_add = 50;
inc_start = 10; inc_end=60; inc_add=20;
// Time budget in seconds, compared against t_work at the end of the loop body.
t_max = 10f; // sec // en:43 minutes 48 seconds ru:3 min
//t_max = 80f; // sec
//t_max = 201f; // english 11 min
//t_max = 300f; // en depth-first search: 1 hours 40 min
// en breadth-first search:1 hours 14 min
//t_max = 3000f; // english 4 hours 10 min
//dump.enable_file_dot = false;
//directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/en/";
// Dump files go to test_kleinberg/<directory>/; "empty.txt" is only the initial
// file name, the per-word name is set with SetFilename inside the loop.
String fs = System.getProperty("file.separator");
String sub_dir = "test_kleinberg" + fs + directory + fs;
dump.file.setFileInHomeDir(sub_dir, "empty.txt", "Cp1251",true);
WORDS_CYCLE:
for(int i=0; i<all_words.length; i++) {
//System.out.println ("The word Latin1ToUTF8 '"+Encodings.Latin1ToUTF8(all_words[i])+"' is processing...");
System.out.println ("The word '" + all_words[i] + "' is processing...");
// The clock is started once per word, so t_work below measures the time
// since this word started, not the time of a single parameter combination.
t_start = System.currentTimeMillis();
String article = all_words[i];
//dump.file_dot.SetFilename(article + ".dot");
//dump.file_dot.Open(false, "UTF8");
// The transliterated title is used as the dump file name.
article_fn = StringUtilRegular.encodeRussianToLatinitsa(article, Encodings.enc_java_default, Encodings.enc_int_default);
dump.file.SetFilename(article_fn + ".txt");
dump.file.Open(true, "Cp1251");
dump.file.Print("\n\ntime max:" + t_max +
" start:" + today + " add categories_max_steps to Categorylinks.InBlackList()\n");
dump.file.Flush();
ROOT_CYCLE:
for(root_set_size = root_start;
root_set_size <= root_end;
root_set_size += root_add) {
for(increment = inc_start;
increment <= inc_end;
increment += inc_add) {
//root_set_size= 10; increment = 50; // FIXME temp
// NOTE(review): the file is opened again here with the same name, without an
// intervening close in this method - confirm that Open() tolerates this.
dump.file.SetFilename(article_fn + ".txt");
dump.file.Open(true, "Cp1251");
dump.file.Print("\n\n***\n*\n* New_Iteration *** *** ***\n*\n***\n\n");
dump.file.Flush();
dump.enable_file_dot = false;
// Clear the session before building the base set for this parameter combination.
session.clear();
List<String> rated_synonyms = new ArrayList<String>();
Map<Integer, Article> base_nodes = LinksBaseSet.CreateBaseSet(article, rated_synonyms, session, root_set_size, increment);
dump.file.PrintNL("Total_steps_while_categories_removing:"+
session.category_black_list.getTotalCategoriesPassed());
dump.file.Flush();
List<Article> synonyms = auth.Calculate(base_nodes, eps_error, n_synonyms, session);
t_end = System.currentTimeMillis();
t_work = (t_end - t_start)/1000f; // in sec
// Statistics, synonyms, graph dump and clustering are produced only when
// Calculate returned a non-null list.
if (null != synonyms) {
dump.file.Print("\n\ntime sec:" + t_work +
" iter:" + auth.iter +
" vertices:" + base_nodes.values().size() +
" edges:" + DCEL.CountLinksIn(base_nodes) +
"\nroot_set_size:"+root_set_size+" increment:"+increment +
"\nn_synonyms:"+n_synonyms +
" total_steps_while_categories_removing:"+
session.category_black_list.getTotalCategoriesPassed() + "\n");
dump.file.Flush();
auth.AppendSynonyms(article, synonyms, "|", dump);
dump.file.Print("\n");
dump.file.Flush();
// print best hubs as triangles
dump.Dump(base_nodes, "");
// append dot command to bat file
dump.file_bat.Print( dump.GetStatisticsHashMap(base_nodes) + dump.GetDotCommand("jpeg", true) );
dump.file_sh. Print( dump.GetStatisticsHashMap(base_nodes) + dump.GetDotCommand("jpeg", false) );
dump.file_bat.Flush();
dump.file_sh. Flush();
// cluster synonyms
Map<Integer, Article> map_synonyms = Article.createMapIdToArticleWithoutRedirects((Article[])synonyms.toArray(new Article[0]));
// dot output is switched back on for the cluster dumps below
dump.enable_file_dot = true;
CategorySet.prepareCategories(session, map_synonyms);
int max_cluster_weight = 20;
List<ClusterCategory> clusters = CategorySet.getCategoryClusters (session.category_nodes, map_synonyms, max_cluster_weight);
CategorySet.dumpClusterCategoryArticle(session, map_synonyms, clusters, "02_clusters_max_weight_"+max_cluster_weight+"_root_set_size_"+root_set_size+"_increment_"+increment);
CategorySet.dumpClusterCategorywithListArticles(session, map_synonyms, clusters, "03_list_articles_max_weight_"+max_cluster_weight+"_root_set_size_"+root_set_size+"_increment_"+increment);
}
// Over the time budget: abandon the parameter sweep for this word and
// continue with the next word.
if(t_work > t_max) {
break ROOT_CYCLE;
}
// NOTE(review): this break is unconditional, so when the time budget was not
// exceeded the whole run stops after the first parameter combination and the
// loop increments above are never reached. This looks like a deliberate
// shortcut to keep the test fast - confirm before relying on the sweep.
break WORDS_CYCLE;
}
}
}
}
}