/*
* CategorySetTest.java
* JUnit based test
*
* Created on 24 May 2005
*/
package wikipedia.kleinberg;
import wikipedia.language.Encodings;
import wikipedia.sql.*;
import wikipedia.util.*;
import wikipedia.clustering.*;
import junit.framework.*;
import java.util.*;
public class CategorySetTest extends TestCase {
Article node;
Authorities auth;
//HashMap<Integer, Article> nodes;
Connect connect, connect_ru;
DumpToGraphViz dump;
SessionHolder session;
Map<Integer, Article> articles;
Map<Integer, Category> categories;
//List<ClusterCategory> clusters;
//List<Edge> edges;
ClusterCategory c_all, c_religious, c_science, c_art;
public CategorySetTest(String testName) {
super(testName);
}
protected void setUp() throws Exception {
connect = new Connect();
connect.Open(Connect.WP_HOST, Connect.WP_DB, Connect.WP_USER, Connect.WP_PASS);
connect_ru = new Connect();
connect_ru.Open(Connect.WP_RU_HOST,Connect.WP_RU_DB,Connect.WP_RU_USER,Connect.WP_RU_PASS);
auth = new Authorities();
dump = new DumpToGraphViz();
dump.file_dot.setFileInHomeDir("graphviz", "bat_ruwiki.dot", "Cp866",true);
dump.file_bat.setFileInHomeDir("graphviz", "bat_ruwiki.bat", "Cp866",true);
dump.file_sh.setFileInHomeDir("graphviz", "bat_ruwiki.sh", "Cp1251", true);
node = new Article();
/*
nodes = new HashMap<Integer, Article>();
Article[] source_nodes = new Article[4];
source_nodes[0] = new Article();
source_nodes[1] = new Article();
source_nodes[2] = new Article();
source_nodes[3] = new Article();
String[] titles = {"Настольная игра", "Го (игра)", "Шашки", "Шахматы"};
float[] xx = {12.f, 2.f, 7.f, 7.f};
//
// Article -> Categories
// "Настольная игра"-> Категория: Настольные игры
// "Го (игра)" -> Категории: Википедия:Избранные статьи | Го | Слова японского происхождения
// "Шашки" -> Категории: Незавершённые статьи | Шашки
// "Шахматы" -> Категория: Шахматы
for(int i=0; i<titles.length; i++) {
source_nodes[i].page_title = titles[i];
source_nodes[i].page_id = PageTable.getIDByTitle(connect_ru, titles[i]); // 18991 10484 3321 1121
source_nodes[i].x = xx[i];
}
// put source_nodes to nodes
for(int i=0; i<source_nodes.length; i++) {
nodes.put(source_nodes[i].page_id, source_nodes[i]);
}*/
session = new SessionHolder();
session.initObjects();
session.dump = dump;
CreateCategoryArticleGraph c = new CreateCategoryArticleGraph ();
articles = c.articles;
categories = c.categories;
}
protected void tearDown() throws Exception {
connect.Close();
connect_ru.Close();
}
public static Test suite() {
TestSuite suite = new TestSuite(CategorySetTest.class);
return suite;
}
/** Test filling of category.id_articles[]. See pix at CreateCategoryArticleGraph.java */
public void testFillLinksFromCategoryToArticles() {
System.out.println("testFillLinksFromCategoryToArticles");
CategorySet.fillLinksFromCategoryToArticles(articles, categories);
// category 'all' (see CreateCategoryArticleGraph)
assertTrue(1 == categories.get(1).id_articles.length);
assertTrue(11 == categories.get(1).id_articles[0]);
// category 'science'
assertTrue(3 == categories.get(3).id_articles.length);
Set<Integer> s = new HashSet<Integer>();
s.add(11); s.add(12); s.add(13); // these are science_articles ID
assertTrue(s.contains(categories.get(3).id_articles[0]));
assertTrue(s.contains(categories.get(3).id_articles[1]));
assertTrue(s.contains(categories.get(3).id_articles[2]));
}
// Simple check that clustering algorithm works properly.
// See first graph in CreateCategoryArticleGraph as source graph
public void testGetCategoryClusters () {
System.out.println("testGetCategoryClusters ");
int max_cluster_weight;
session.source_page_title = "4categories";
max_cluster_weight = 7;
List<ClusterCategory> clusters1 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters1, "clusters_3_max_cluster_weight_7");
assertTrue(3 == clusters1.size());
max_cluster_weight = 11;
List<ClusterCategory> clusters2 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters2, "clusters_2_max_cluster_weight_11");
assertTrue(2 == clusters2.size());
max_cluster_weight = 12;
List<ClusterCategory> clusters3 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters3, "clusters_1_max_cluster_weight_12");
assertTrue(1 == clusters3.size());
}
// see second graph in CreateCategoryArticleGraph as source graph (by init2())
public void testGetCategoryClusters2 () {
System.out.println("testGetCategoryClusters2");
int max_cluster_weight;
session.source_page_title = "6categories";
CreateCategoryArticleGraph c = new CreateCategoryArticleGraph ();
c.init2();
articles = c.articles;
categories = c.categories;
max_cluster_weight = 2;
List<ClusterCategory> clusters0 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters0, "clusters_5_max_cluster_weight_2");
assertTrue(5 == clusters0.size());
max_cluster_weight = 4;
List<ClusterCategory> clusters1 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters1, "clusters_3_max_cluster_weight_4");
assertTrue(3 == clusters1.size() || 4 == clusters1.size());
max_cluster_weight = 6;
List<ClusterCategory> clusters2 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters2, "clusters_2_max_cluster_weight_6");
assertTrue(2 == clusters2.size() || 3 == clusters2.size());
max_cluster_weight = 10;
List<ClusterCategory> clusters4 = CategorySet.getCategoryClusters (categories, articles, max_cluster_weight);
//CategorySet.dumpClusterCategoryArticle(session, articles, clusters4, "clusters_1_max_cluster_weight_10");
assertTrue(1 == clusters4.size());
}
/**
* Test of Create method, of class wikipedia.kleinberg.CategorySet.
*/
public void testCreate_ru() {
System.out.println("testCreate_ru");
int root_set_size, increment;
int n_synonyms; //, iter;
float eps_error; // error to stop the iteration
String article, directory;
// root_set_size = 100; increment = 30; // science: time 5 min
// root_set_size = 150; increment = 50; // 6 min time
// root_set_size = 200; increment = 50; // Kleinberg default values; Science:11 min (? 25 min + failed)
root_set_size = 1; increment = 1; // Simple test values
//root_set_size= 10; increment = 50;
int categories_max_steps = 99;
Encodings e = connect_ru.enc;
article = e.EncodeFromJava("Шахматы"); // Контрабас, Плазма, Робот Плазма
session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
//directory = "data/synonyms_ru/";
/*
article = "Science"; // 4 min, if root_set_size = 10; increment = 50;
// 39 minutes or 2 366,359 sec vertices:12407 edges:410156 if root_set_size = 200; increment = 50;
session.Init(connect, session.category_black_list.en, categories_max_steps);
directory = "data/synonyms_en/";
*/
//session.dump = null;
dump.file.setFileInHomeDir("graphviz", article + ".txt", "Cp1251",true);
/*dump.file.SetFilename(Encodings.UTF8ToCp1251(article) + ".txt1");
dump.file.Open(true, "Cp1251");
dump.file.SetFilename(Encodings.FromTo(article,"UTF8","ISO8859_1") + ".txt2");
dump.file.Open(true, "Cp1251");
dump.file.SetFilename(Encodings.FromTo(article,"ISO8859_1","UTF8") + ".txt3"); // error
dump.file.Open(true, "Cp1251");
dump.file.SetFilename(Encodings.FromTo(article,"Cp1251","UTF8") + ".txt4"); // error
dump.file.Open(true, "Cp1251");
dump.file.SetFilename(Encodings.FromTo(article,"UTF8","Cp1251") + ".txt5"); // error
dump.file.Open(true, "Cp1251");
dump.file.SetFilename(Encodings.FromTo(article,"Cp1251","ISO8859_1") + ".txt6"); // error
dump.file.Open(true, "Cp1251");
dump.file.SetFilename(Encodings.FromTo(article,"ISO8859_1","Cp1251") + ".txt7"); // error
dump.file.Open(true, "Cp1251");*/
List<String> rated_synonyms = new ArrayList<String>();
Map<Integer, Article> base_nodes = LinksBaseSet.CreateBaseSet(article, rated_synonyms,
session, root_set_size, increment);
Article s = base_nodes.get(session.source_article_id);
assertTrue(null != s);
CategorySet.prepareCategories(session, base_nodes);
int max_cluster_weight = 20;
List<ClusterCategory> clusters = CategorySet.getCategoryClusters (session.category_nodes, base_nodes, max_cluster_weight);
CategorySet.dumpClusterCategoryArticle(session, base_nodes, clusters, "02_clusters_max_weight_"+max_cluster_weight);
CategorySet.dumpClusterCategorywithListArticles(session, base_nodes, clusters, "03_list_articles_max_weight_"+max_cluster_weight);
/*
n_synonyms = 300;
eps_error = 0.001f;
List<Article> synonyms = auth.Calculate(base_nodes, eps_error, n_synonyms, session);
*/
}
}