/* Valuer.java - evaluates and calculates word similarity.
 *
 * Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
 * Distributed under GNU Public License.
 */

package wikipedia.experiment;

//import wikipedia.experiment.MetricSpearman;
import wikipedia.kleinberg.*;
import wikipedia.sql.Connect;
import wikipedia.sql_idf.RelatedPage;
import wikipedia.util.*;

import java.text.*;
import java.util.*;

/**
 * Word-similarity calculator based on "The WordSimilarity-353 Test
 * Collection" and the Spearman footrule metric.
 * It compares the AHITS result with human similarity estimates.
 *
 * In order to calculate similarity:
 * 1. Create the database idfenwiki or idfsimplewiki:
 *    mysql> CREATE DATABASE idfenwiki;
 *    mysql> USE idfenwiki;
 *    mysql> source synarcher/db/idf/idfenwiki7_20070527_normalload_417rp.sql
 *
 *    This database caches results of previous searches by AHITS. If related
 *    pages are found, then they are stored in the table related_page. You can
 *    use the 417 related pages stored in
 *    idfenwiki7_20070527_normalload_417rp.sql, or you can clear the table
 *    mysql> DELETE FROM related_page
 *    and change the AHITS parameters in ValuerTest.testCompareSynonyms().
 *
 * 2. Check that the folder /home/~/.synarcher/test_kleinberg/simple/ or /en exists
 * 3. Open the JUnit test file ValuerTest.testCompareSynonyms
 * 4. Set the boolean vars: b_normal_load, b_simple
 * 5. Run ValuerTest.testCompareSynonyms
 * 6. See the output file in /home/~/.synarcher/test_kleinberg/en/ or /simple/
 * 7. To process all word pairs, comment out the 'break;' statement in the loop.
*/ public class Valuer { private static final boolean DEBUG = true; /** Number of absent data items */ public static int absent_counter; private final static Article[] NULL_ARTICLE_ARRAY = new Article[0]; private static String[] getSynonyms(String word, Authorities auth, SessionHolder session, int root_set_size, int increment, int n_synonyms, int categories_max_steps, float eps_error, Connect idf_conn) { String[] synonyms_title = null; RelatedPage rp = new RelatedPage (); if(rp.isInTable_RelatedPage(idf_conn.conn, word)) { // take result from cache (idf database) synonyms_title = rp.getRelatedTitlesAsArray(); } else { System.out.println("Word " + word + " is absent in cache (idf database), starts AHITS calculation..."); session.clear(); List<String> rated_synonyms = new ArrayList<String>(); Map<Integer, Article> base_nodes = LinksBaseSet.CreateBaseSet(word, rated_synonyms, session, root_set_size, increment); /*if(DEBUG) { dump.file.PrintNL("Total_steps_while_categories_removing:"+ session.category_black_list.getTotalCategoriesPassed()); dump.file.Flush(); }*/ List<Article> synonyms = auth.Calculate(base_nodes, eps_error, n_synonyms, session); if (null != synonyms && 0 < synonyms.size()) { synonyms_title = Article.getTitles((Article[])synonyms. toArray(NULL_ARTICLE_ARRAY)); } rp.add(idf_conn.conn, word, synonyms_title); } return synonyms_title; } /** Calculates relatedness of two words. * Creates two vectors of synonyms using AHITS algorithm, then calculates * intersection by Spearman footrule formula. 
*/ public static float compareSynonyms (String word1,String word2,float human_wordsim, Authorities auth, Connect connect, DumpToGraphViz dump, SessionHolder session, int root_set_size, int increment, int n_synonyms, int categories_max_steps, float eps_error, Connect idf_conn ) { long t_start, t_end; float t_work; t_start = System.currentTimeMillis(); String[] synonyms_title = getSynonyms( word1, auth, session, root_set_size, increment, n_synonyms, categories_max_steps, eps_error, idf_conn); // word 2 String[] synonyms_title2 = getSynonyms( word2, auth, session, root_set_size, increment, n_synonyms, categories_max_steps, eps_error, idf_conn); t_end = System.currentTimeMillis(); t_work = (t_end - t_start)/1000f; // in sec double dist_f = -1; int dist_i = -1; if (null != synonyms_title && 0 < synonyms_title.length && null != synonyms_title2 && 0 < synonyms_title2.length) { // compare with standard list dist_i = MetricSpearman.compare(synonyms_title, synonyms_title2); dist_f = MetricSpearman.calcSpearmanFootrule(synonyms_title, synonyms_title2); String[] intersect = StringUtil.intersect(synonyms_title, synonyms_title2); if(DEBUG) { dump.file.Print( word1 + "\t" + word2 + "\t" + human_wordsim + "\t" + new PrintfFormat("%.3lg").sprintf(dist_f) + "\t" + intersect.length + "\t" + dist_i + "\t" + t_work + "\t" + categories_max_steps + "\t" + auth.iter + "\t" + //base_nodes.values().size() + "\t" + //DCEL.CountLinksIn(base_nodes) + "\t" + n_synonyms + "\t" + synonyms_title.length + "\t" + session.skipTitlesWithSpaces() + "\t" + session.category_black_list.getTotalCategoriesPassed() + "\t" + root_set_size + "\t" + increment + "\t" + //StringUtil.join(",",synonyms_title) + "\t" + MetricSpearman.findStringWithPosition(synonyms_title, synonyms_title2, ",") + "\n"); } /*else { dump.file.Print("\n\ntime sec:" + t_work + " iter:" + auth.iter + " vertices:" + base_nodes.values().size() + " edges:" + DCEL.CountLinksIn(base_nodes) + "\nroot_set_size:"+root_set_size+" 
increment:"+increment + "\n Metric Spearman footrule:" + dist_f + "\n Metric G:" + dist_i + "\nn_synonyms:"+n_synonyms + "\nsynonyms.size():"+synonyms.size() + "\nStringWithPosition" + MetricSpearman.findStringWithPosition(synonyms_title, synonyms_title2, ",") + "\nskipTitlesWithSpaces:"+session.skipTitlesWithSpaces()+ "\ntotal_steps_while_categories_removing:"+ session.category_black_list.getTotalCategoriesPassed() + "\n"); dump.file.Flush(); auth.AppendSynonyms(word1, synonyms, "|", dump); dump.file.Flush(); auth.AppendSynonyms(word2, synonyms2, "|", dump); dump.file.Print("\n"); // add // intersect.length + "\t" + dist_f + "\t" + dist_i + "\t" + }*/ dump.file.Flush(); } else { // AHITS didn't find any synonyms absent_counter ++; if(DEBUG) { dump.file.Print( word1 + "\t" + word2 + "\t" + human_wordsim + "\tabsent\n"); } } return (float)dist_f; } }