/*
* MetricSpearmanTest.java
* JUnit based test
*/
package wikipedia.experiment;
//import wikipedia.experiment.MetricSpearman;
import wikipedia.language.Encodings;
import wikipedia.util.StringUtil;
import wikipedia.util.StringUtilRegular;
import wikipedia.kleinberg.Article;
import wikipedia.kleinberg.SessionHolder;
import wikipedia.kleinberg.DumpToGraphViz;
import wikipedia.kleinberg.Authorities;
import wikipedia.kleinberg.LinksBaseSet;
import wikipedia.sql.Connect;
import wikipedia.kleinberg.DCEL;
import junit.framework.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
//import java.util.HashMap;
import java.text.DateFormat;
import java.util.Locale;
import java.util.Date;
public class MetricSpearmanTest extends TestCase {
//Article node;
//Map<Integer, Article> nodes;
Authorities auth;
Connect connect, connect_ru;
DumpToGraphViz dump;
SessionHolder session;
public MetricSpearmanTest(String testName) {
super(testName);
}
protected void setUp() throws Exception {
connect = new Connect();
//connect.Open("enwiki?useUnicode=true&characterEncoding=UTF-8", "javawiki", "");
//connect.Open("localhost", "enwiki", "javawiki", "");
connect.Open(Connect.WP_HOST, Connect.WP_DB, Connect.WP_USER, Connect.WP_PASS);
connect_ru = new Connect();
//connect_ru.Open("localhost", "ruwiki?useUnicode=false&characterEncoding=ISO8859_1", "javawiki", ""); //Java:MySQL ISO8859_1:latin1
//connect_ru.Open("localhost", "ruwiki?autoReconnect=true&useUnbufferedInput=false&useUnicode=false&characterEncoding=ISO8859_1", "javawiki", "");
connect_ru.Open(Connect.WP_RU_HOST, Connect.WP_RU_DB, Connect.WP_RU_USER, Connect.WP_RU_PASS);
String f = System.getProperty("user.home") + "/.synarcher/test_kleinberg/graphviz/";
auth = new Authorities();
dump = new DumpToGraphViz();
dump.file_dot.SetDir(f);
/*
dump.file_bat.SetDir(f);
dump.file_bat.SetFilename("bat_ruwiki.bat");
dump.file_bat.Open(true, "Cp866");
dump.file_sh.SetDir(f);
dump.file_sh.SetFilename("bat_ruwiki.sh");
dump.file_sh.Open(true, "Cp1251");
*/
session = new SessionHolder();
session.initObjects();
session.dump = dump;
}
protected void tearDown() throws Exception {
connect.Close();
connect_ru.Close();
}
public static Test suite() {
TestSuite suite = new TestSuite(MetricSpearmanTest.class);
return suite;
}
/**
Example of comparing of list of different length:
All words: S1 S2 Общие слова S1* S2* |S2*-S1*|
1. Автожир Автожир Турболёт Автожир 1 3 2
2. Планер Турболёт Фюзеляж Турболёт 2 1 1
3. Турболёт Планер Автожир Планер 3 4 1
4. Фюзеляж Планер 4 (Значение метрики)
5. Космонавт Космонавт
*/
public void testCompare_StringArrays_test1() {
System.out.println("compare_StringArrays_test1");
String[] synonyms1 = {"Автожир", "Турболёт", "Планер"};
String[] synonyms2 = {"Турболёт", "Фюзеляж", "Автожир", "Планер", "Космонавт"};
MetricSpearman metr = new MetricSpearman();
int result = metr.compare(synonyms1, null);
assertEquals(-1, result);
// equal lists
result = metr.compare(synonyms1, synonyms1);
assertEquals(0, result);
result = metr.compare(synonyms1, synonyms2);
assertEquals(4, result);
}
/**
Example of comparing of list of different length:
All words: S1 S2 Общие слова S1* S2* |S2*-S1*|
1. Автожир Турболёт Автожир Автожир 1 3 2
2. Планер Планер Фюзеляж Турболёт 2 1 1
3. Турболёт Турболёт Планер 3 4 1
4. Фюзеляж Космонавт 4 (Значение метрики)
5. Космонавт Планер
*/
public void testCompare_StringArrays_test2() {
System.out.println("compare_StringArrays_test2");
String[] synonyms1 = {"Турболёт", "Планер"};
String[] synonyms2 = {"Автожир", "Фюзеляж", "Турболёт", "Космонавт", "Планер"};
String[] synonyms3 = {"Фюзеляж", "Космонавт"};
MetricSpearman metr = new MetricSpearman();
int result = metr.compare(synonyms1, synonyms2);
assertEquals(5, result);
result = metr.compare(synonyms1, synonyms3);
assertEquals(4, result);
}
public void testCompare_StringArrays_test3() {
System.out.println("compare_StringArrays_test2");
String[] synonyms1 = {"Турболёт", "Планер"};
String[] synonyms2 = {"Планер"};
String[] synonyms3 = {"WordAbsentInList1", "AnotherWordAbsentInList1"};
// assert: distance(synonyms1,synonyms2) < distance(synonyms1,synonyms3)
MetricSpearman metr = new MetricSpearman();
int dist12 = metr.compare(synonyms1, synonyms2);
int dist13 = metr.compare(synonyms1, synonyms3);
assertTrue(dist12 < dist13);
}
public void testCompare_ArticleList() {
System.out.println("compare_ArticleList");
String[] synonyms1 = {"Автожир", "Турболёт", "Планер"};
String[] synonyms2 = {"Турболёт", "Фюзеляж", "Автожир", "Планер", "Космонавт"};
List<Article> list1 = new ArrayList<Article>();
List<Article> list2 = new ArrayList<Article>();
for(String s:synonyms1) {
Article a = new Article();
a.page_title = s;
list1.add(a);
}
for(String s:synonyms2) {
Article a = new Article();
a.page_title = s;
list2.add(a);
}
MetricSpearman metr = new MetricSpearman();
int result = metr.compare(list1, null);
assertEquals(-1, result);
// equal lists
result = metr.compare(list1, list1);
assertEquals(0, result);
result = metr.compare(list1, list2);
assertEquals(4, result);
}
public void testCalcSpearmanFootrule() {
System.out.println("testCalcSpearmanFootrule");
String[] synonyms1 = {"Автожир", "Турболёт", "Планер"};
String[] synonyms2 = {"Космонавт", "Планер", "Автожир", "Турболёт"};
String[] synonyms3 = {"Автожир", "Космонавт", "Турболёт", "Планер"};
String[] synonyms4 = {"Автожир", "Космонавт", "Планер", "Турболёт"};
String[] synonyms_word1 = {"word1"};
String[] synonyms_word2 = {"word2"};
MetricSpearman m = new MetricSpearman();
double dist11 = m.calcSpearmanFootrule(synonyms1, synonyms1);
double dist12 = m.calcSpearmanFootrule(synonyms1, synonyms2);
double dist21 = m.calcSpearmanFootrule(synonyms2, synonyms1);
double dist13 = m.calcSpearmanFootrule(synonyms1, synonyms3);
double dist14 = m.calcSpearmanFootrule(synonyms1, synonyms4);
double eps = 0.01;
assertTrue(m.equals(1, dist11, eps));
assertTrue(m.equals(0, dist12, eps));
assertTrue(m.equals(0, dist21, eps));
assertTrue(m.equals(1, dist13, eps));
assertTrue(m.equals(0.5, dist14, eps));
double dist_word11 = m.calcSpearmanFootrule(synonyms_word1, synonyms_word1);
double dist_word12 = m.calcSpearmanFootrule(synonyms_word1, synonyms_word2);
assertTrue(m.equals(1, dist_word11, eps));
assertTrue(m.equals(0, dist_word12, eps));
}
//String findStringWithPosition (String[] big, String[] small) {
public void testFindStringWithPosition() {
System.out.println("testFindStringWithPosition");
String[] short1 = {"Автожир", "Турболёт"};
String[] long1 = {"Космонавт", "Планер", "Автожир", "Турболёт"};
String should_be = "Автожир2,Турболёт3";
String token = ",";
String res = MetricSpearman.findStringWithPosition(long1, short1, token);
assertEquals(should_be, res);
}
public void testCompareSynonyms() {
int root_set_size, increment, n_synonyms;
String filename;
long t_start, t_end;
float t_work, t_max; // time of one cycle's work
String[] all_words = null;
boolean b_russian;
float eps_error; // error to stop the iteration
String directory;
Encodings e = connect_ru.enc;
//if(true) {return;}
DateFormat formatter = DateFormat.getDateTimeInstance(DateFormat.LONG,
DateFormat.LONG,
new Locale("en","US"));
String today = formatter.format(new Date());
int categories_max_steps = 10; //99;
// Russian
String[] ru_words = {"Самолёт", "Сюжет", "Истина", "Жаргон"}; //
String[][] ru_words_standards = {
//"Самолёт"
{"Планер", "Турболёт", "Автожир", "Экранолёт", "Экраноплан", "Конвертоплан"},
//"Сюжет"
{"Интрига", "Переживание", "Конфликт", "Трагедия", "Коллизия", "Противоречие"},
//"Истина"
{"Факт", "Действительность", "Реальность", "Правда", "Бог", "Знание", "Вера", "Авторитет", "Догмат"},
// "Жаргон"
{"Сленг", "Просторечие", "Матерщина", "Диалект", "Арго", "Эвфемизм"}
};
// English
//String[] en_words = {"Science"}; // Sugar Parallelogram Sycamore
String[] en_words = {"Parallelogram", "Sycamore", "Innuendo", // rare words
"Disappear", "Sugar", "Science", //};// words_Blondel + "parallelogram"
"Astronaut", "Mechanism", "Universal", "Universe" };
b_russian = true;
if (b_russian) {
all_words = ru_words;
//directory = "data/synonyms_ru/"; //String filename = "ruwiki_synonyms.txt";
directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/ru/";
session.Init(connect_ru, session.category_black_list.ru, categories_max_steps);
} else {
all_words = en_words;
//directory = "data/synonyms_en/"; //String filename = "enwiki_synonyms.txt";
directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/en/";
session.Init(connect, session.category_black_list.en, categories_max_steps);
}
session.skipTitlesWithSpaces(false); // 1
//session.skipTitlesWithSpaces(true); // 2
session.dump = null;
n_synonyms = 100;
eps_error = 0.001f;
// Kleinberg default values
/*root_set_size = 300; increment = 50;*/
// Simple test values
/*root_set_size = 1; increment = 2; */
int root_start, root_end, root_add;
int inc_start, inc_end, inc_add;
boolean b_params_experiment = false;
if(b_params_experiment) {
// best parameters experiment
// fast
//root_start = 10; root_end = 20; root_add = 10; inc_start = 10; inc_end=20; inc_add=5;
// long
root_start = 10; root_end = 510; root_add = 50; inc_start = 10; inc_end=60; inc_add=10;
} else {
// skipSpaces experiment
root_start = 200; root_end = 200; root_add = 50; inc_start = 17; inc_end=17; inc_add=20;
}
//t_max = 10f; // sec // en:43 minutes 48 seconds ru:3 min
//t_max = 80f; // sec
//t_max = 201f; // english 11 min
//t_max = 300f; // en depth-first search: 1 hours 40 min
// en breadth-first search:1 hours 14 min
t_max = 3000f; // english 4 hours 10 min
//dump.enable_file_dot = false;
dump.file.SetDir(directory);
WORDS_CYCLE:
//for(int i=0; i<all_words.length; i++) {
for(int i=0; i<0; i++) {
//System.out.println ("The word Latin1ToUTF8 '"+Encodings.Latin1ToUTF8(all_words[i])+"' is processing...");
System.out.println ("The word '" + all_words[i] + "' is processing...");
t_start = System.currentTimeMillis();
String article = all_words[i];
//dump.file_dot.SetFilename(article + ".dot");
//dump.file_dot.Open(false, "UTF8");
String article_fn = StringUtilRegular.encodeRussianToLatinitsa(all_words[i], Encodings.enc_java_default, Encodings.enc_int_default);
dump.file.SetFilename( article_fn + "_metric_spf.txt");
dump.file.Open(true, "Cp1251");
dump.file.Print("\n\ntime max:" + t_max +
" start:" + today + " \n");
if(b_params_experiment) {
dump.file.Print("\n" +
"root_set_size\t" + "increment\t" +
// intersect.length + "\t" + dist_f + "\t" + dist_i + "\t" +
"intersect_words\t" + "Spearman_footrule\t" + "G\t" +
"time(sec)\t" +
"iter\t" +
"vertices\t" +
"edges\t" +
"n_synonyms\t" + "synonyms.size()\t" + "synonyms\t" +
"skipTitlesWithSpaces\t" +
"total_steps_while_categories_removing\n");
}
dump.file.Flush();
ROOT_CYCLE:
for(root_set_size = root_start;
root_set_size <= root_end;
root_set_size += root_add) {
for(increment = inc_start;
increment <= inc_end;
increment += inc_add) {
//root_set_size= 10; increment = 50; // FIXME temp
filename = article + ".txt";
if(!b_params_experiment) {
dump.file.Print("\n\n***\n*\n* New_Iteration *** *** ***\n*\n***\n");
dump.file.Flush();
dump.enable_file_dot = false;
}
session.clear();
List<String> rated_synonyms = new ArrayList<String>();
Map<Integer, Article> base_nodes = LinksBaseSet.CreateBaseSet(article, rated_synonyms,
session, root_set_size, increment);
if(!b_params_experiment) {
dump.file.PrintNL("Total_steps_while_categories_removing:"+
session.category_black_list.getTotalCategoriesPassed());
dump.file.Flush();
}
List<Article> synonyms = auth.Calculate(base_nodes, eps_error, n_synonyms, session);
t_end = System.currentTimeMillis();
t_work = (t_end - t_start)/1000f; // in sec
if (null != synonyms) {
// compare with standard list
String[] synonyms_title = Article.getTitles((Article[])synonyms.toArray(new Article[0]));
int dist_i = MetricSpearman.compare(synonyms_title, ru_words_standards[i]);
double dist_f = MetricSpearman.calcSpearmanFootrule(synonyms_title, ru_words_standards[i]);
String[] intersect = StringUtil.intersect(synonyms_title, ru_words_standards[i]);
if(b_params_experiment) {
dump.file.Print(
root_set_size + "\t" + increment + "\t" +
intersect.length + "\t" + dist_f + "\t" + dist_i + "\t" +
t_work + "\t" +
auth.iter + "\t" +
base_nodes.values().size() + "\t" +
DCEL.CountLinksIn(base_nodes) + "\t" +
n_synonyms + "\t" + synonyms.size() + "\t" +
//StringUtil.join(",",synonyms_title) + "\t" +
MetricSpearman.findStringWithPosition(synonyms_title, ru_words_standards[i], ",") + "\t" +
session.skipTitlesWithSpaces() + "\t" +
session.category_black_list.getTotalCategoriesPassed() + "\n");
} else {
dump.file.Print("\n\ntime sec:" + t_work +
" iter:" + auth.iter +
" vertices:" + base_nodes.values().size() +
" edges:" + DCEL.CountLinksIn(base_nodes) +
"\nroot_set_size:"+root_set_size+" increment:"+increment +
"\n Metric Spearman footrule:" + dist_f +
"\n Metric G:" + dist_i +
"\nn_synonyms:"+n_synonyms +
"\nsynonyms.size():"+synonyms.size() +
"\nStringWithPosition" + MetricSpearman.findStringWithPosition(synonyms_title, ru_words_standards[i], ",") +
"\nskipTitlesWithSpaces:"+session.skipTitlesWithSpaces()+
"\ntotal_steps_while_categories_removing:"+
session.category_black_list.getTotalCategoriesPassed() + "\n");
dump.file.Flush();
auth.AppendSynonyms(article, synonyms, "|", dump);
dump.file.Print("\n");
// add
// intersect.length + "\t" + dist_f + "\t" + dist_i + "\t" +
}
dump.file.Flush();
}
if(t_work > t_max) {
// break ROOT_CYCLE;
}
// break WORDS_CYCLE;
}
}
}
}
}