package wikipedia.experiment;
import junit.framework.*;
//import wikipedia.experiment.MetricSpearman;
//import wikipedia.kleinberg.Article;
//import wikipedia.kleinberg.LinksBaseSet;
import wikipedia.kleinberg.SessionHolder;
import wikipedia.kleinberg.DumpToGraphViz;
import wikipedia.kleinberg.Authorities;
//import wikipedia.kleinberg.DCEL;
import wikipedia.sql.Connect;
import wikipedia.language.Encodings;
import wikipedia.util.StringUtil;
import java.text.DateFormat;
import java.util.Locale;
import java.util.Date;
//import java.util.List;
//import java.util.ArrayList;
//import java.util.Map;
//import java.util.HashMap;
public class ValuerTest extends TestCase {
private static final boolean DEBUG = true;
Authorities auth;
Connect connect, connect_simple, idfsimplewiki_conn, idfenwiki7_conn, idf_conn; //, connect_ru;
DumpToGraphViz dump;
SessionHolder session;
public ValuerTest(String testName) {
super(testName);
}
protected void setUp() throws Exception {
connect = new Connect();
//connect.Open("enwiki?useUnicode=true&characterEncoding=UTF-8", "javawiki", "");
//connect.Open("localhost", "enwiki", "javawiki", "");
//connect.Open(Connect.WP_HOST, Connect.WP_DB, Connect.WP_USER, Connect.WP_PASS);
connect.Open(Connect.WP_HOST, Connect.WP_DB, Connect.WP_USER, Connect.WP_PASS);
connect_simple = new Connect();
connect_simple.Open(Connect.WP_HOST,Connect.WP_SIMPLE_DB, Connect.WP_USER, Connect.WP_PASS);
/*
connect_ru = new Connect();
//connect_ru.Open("localhost", "ruwiki?useUnicode=false&characterEncoding=ISO8859_1", "javawiki", ""); //Java:MySQL ISO8859_1:latin1
//connect_ru.Open("localhost", "ruwiki?autoReconnect=true&useUnbufferedInput=false&useUnicode=false&characterEncoding=ISO8859_1", "javawiki", "");
connect_ru.Open(Connect.WP_RU_HOST, Connect.WP_RU_DB, Connect.WP_RU_USER, Connect.WP_RU_PASS);
*/
idfsimplewiki_conn = new Connect();
idfsimplewiki_conn.Open(Connect.IDF_SIMPLE_HOST, Connect.IDF_SIMPLE_DB, Connect.IDF_SIMPLE_USER, Connect.IDF_SIMPLE_PASS);
idfenwiki7_conn = new Connect();
idfenwiki7_conn.Open(Connect.IDF_EN_HOST, Connect.IDF_EN_DB, Connect.IDF_EN_USER, Connect.IDF_EN_PASS);
String f = System.getProperty("user.home") + "/.synarcher/test_kleinberg/graphviz/";
auth = new Authorities();
dump = new DumpToGraphViz();
dump.file_dot.SetDir(f);
/*
dump.file_bat.SetDir(f);
dump.file_bat.SetFilename("bat_ruwiki.bat");
dump.file_bat.Open(true, "Cp866");
dump.file_sh.SetDir(f);
dump.file_sh.SetFilename("bat_ruwiki.sh");
dump.file_sh.Open(true, "Cp1251");
*/
session = new SessionHolder();
session.initObjects();
session.dump = dump;
}
protected void tearDown() throws Exception {
connect.Close();
connect_simple.Close();
idfsimplewiki_conn.Close();
idfenwiki7_conn.Close();
}
/**
* Test of compareSynonyms method, of class wikipedia.experiment.Valuer.
*/
public void testCompareSynonyms() {
System.out.println("compareSynonyms");
int root_set_size, increment, n_synonyms, categories_max_steps;
String filename;
float eps_error; // error to stop the iteration
String directory;
Encodings e = connect.enc;
DateFormat formatter = DateFormat.getDateTimeInstance(DateFormat.LONG,
DateFormat.LONG,
new Locale("en","US"));
String today = formatter.format(new Date());
boolean b_normal_load = false; // easy or normal (hard) load
boolean b_simple = false; // simple wiki or english wiki
if (b_normal_load) {
// normal test
categories_max_steps = 4; //10; //99;
n_synonyms = 1000;
eps_error = 0.001f;
root_set_size = 200;
increment = 17;
} else {
// easy test
categories_max_steps = 4; //99;
n_synonyms = 10;
eps_error = 0.01f;
root_set_size = 3;
increment = 1;
}
if (b_simple) {
idf_conn = idfsimplewiki_conn;
//directory = "data/synonyms_ru/"; //String filename = "ruwiki_synonyms.txt";
directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/simple/";
session.Init(connect_simple, session.category_black_list.en, categories_max_steps);
} else {
idf_conn = idfenwiki7_conn;
//all_words = en_words;
//directory = "data/synonyms_en/"; //String filename = "enwiki_synonyms.txt";
directory = System.getProperty("user.home") + "/.synarcher/test_kleinberg/en/";
session.Init(connect, session.category_black_list.en, categories_max_steps);
}
session.skipTitlesWithSpaces(false); // 1
//session.skipTitlesWithSpaces(true); // 2
session.dump = null;
dump.enable_file_dot = false;
dump.file.SetDir(directory);
dump.file.SetFilename( "wordsim353_AHITS_metric_spf.txt");
dump.file.Open(true, "Cp1251");
dump.file.Print("\n\ntime start:" + today + " \n");
if(DEBUG) {
dump.file.Print("\n" +
"isct_wo - Number of intersection words, they are synonyms of word1 and word2" + "\n" +
"Spr_ftr - Spearman footrule" + "\n" +
"G - compares two list of words. List (w/o duplicates) can have different length. -1 if String[] is null." + "\n" +
"t sec - time sec" + "\n" +
"syn.len - synonyms.size()" + "\n" +
"skipSpa - skipTitlesWithSpaces" + "\n" +
"stepCat - total_steps_while_categories_removing" + "\n" +
"root - root_set_size" + "\n" +
"incrmnt - increment" + "\n" +
"human - human_wordsim" + "\n" +
"cat_max - categories_max_steps" + "\n");
dump.file.Print("\n" +
"word1\t" + "word2\t" + "human\t" +
// intersect.length + "\t" + dist_f + "\t" + dist_i + "\t" +
"Spr_ftr\t" + "isct_wo\t" + "G\t" +
"t sec\t" +
"cat_max" + "\t" +
"iter\t" +
//"vert-s\t" +
//"edges\t" +
"n_synon\t" + "syn.len\t" +
"skipSpa\t" +
"stepCat\t"+
"root\t" + "incrmnt\t" +
"synonyms\n");
}
dump.file.Flush();
long t_start, t_end;
float t_work;
t_start = System.currentTimeMillis();
int i = 0;
System.out.println ("\nThe words are processing:\n");
WordSim353 wordsim353= new WordSim353();
Valuer.absent_counter = 0;
for(WordSim w:wordsim353.data) {
i++;
String word1 = StringUtil.UpperFirstLetter(w.word1);
String word2 = StringUtil.UpperFirstLetter(w.word2);
//word1 = "Computer";
//word2 = "Keyboard";
//System.out.println ("The word Latin1ToUTF8 '"+Encodings.Latin1ToUTF8(all_words[i])+"' is processing...");
System.out.println (i + ": " + word1 + ", " + word2 + ", categories_max_steps: ");
//for(categories_max_steps = 0; categories_max_steps < 20; categories_max_steps+=2) {
// session.category_black_list.setMaxSteps(categories_max_steps);
// System.out.println ("categories_max_steps " + categories_max_steps);
//float result =
Valuer.compareSynonyms(word1, word2, w.sim, auth, connect, dump, session,
root_set_size, increment, n_synonyms, categories_max_steps, eps_error,
idf_conn);
//}
//if( i > 7)
break;
}
t_end = System.currentTimeMillis();
t_work = (t_end - t_start)/1000f; // in sec
dump.file.Print("\n" + "Total time: " + t_work + "sec.\n");
dump.file.Print("\n" + "Number of absent data items: " + Valuer.absent_counter + ".\n");
dump.file.Flush();
}
}