/* * SynonymSearcher.java - Interface to kleinberg.jar synonym searcher in * Wikipedia database. * * Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/ * Distributed under GNU Public License. */ package com.touchgraph.wikibrowser; import com.touchgraph.wikibrowser.panel.*; import com.touchgraph.wikibrowser.panel.db.*; import com.touchgraph.wikibrowser.parameter.*; import com.touchgraph.graphlayout.*; import wikipedia.kleinberg.*; import wikipedia.sql.*; import wikipedia.util.*; import wikipedia.clustering.*; import wikipedia.*; import java.awt.Color; import java.io.*; import java.util.*; import java.text.*; /** Interface to kleinberg.jar (algorithms for synonym searcher and Wikipedia * API, Wikipedia in MySQL). */ public class SynonymSearcher { public boolean DEBUG = true; private TGWikiBrowser wb; Authorities auth; public Map<Integer, Article> base_nodes; /** <page_title to page_id of article (nodes above)> */ public Map<String, Integer> m_articles; /** <pate_title of category, page_id of category>, category object is in session.category_nodes */ public Map<String, Integer> m_categories; //Connect connect; public SessionHolder session; DumpToGraphViz dump; /** Synonyms table (results of search) */ public ResultTableModel syn_table; /** Categories table */ public CategoryTableModel cat_table; public static int edge_dist_aritlce_category = 50; public static int edge_dist_category_category = 20; /** Blue */ public static final Color color_node_category = Color.decode("#1D00E2"); /** Black */ public static final Color color_edge_article_category = Color.decode("#000000"); /** Light-brown */ public static final Color color_edge_category_category = Color.decode("#D57413"); /** Dark Magenta #8B008B */ public static final Color color_dark_magenta = Color.decode("#8B008B"); private SynArt syn_art; /*************************************************/ /* Parameters, should be defined by user in GUI */ /* */ /** Article parameters, * see comments in {@link com.touchgraph.wikibrowser.parameter.ArticleParameters} */ protected int root_set_size; protected int increment; protected int n_synonyms; protected float eps_error; protected int categories_max_steps; protected String[] category_blacklist; /** Browser parameters, * see comments in {@link com.touchgraph.wikibrowser.parameter.BrowserParameters} */ protected String lang; protected String db_host, db_name, user, pass; protected long t_start, t_end; protected float t_work, t_max; // time of one cycle's work protected String enc_java = "UTF8"; protected String enc_ui = "Cp1251"; /* */ /* eo Parameters */ /*************************************************/ // todo add browser parameters protected static final boolean show_redirects = true; private final static List<String> NULL_STRING_LIST = new ArrayList<String>(0); private final static String[] NULL_STRING_ARRAY = new String[0]; /** Creates a new instance of SynonymSearcher, GUI is null yet */ public SynonymSearcher(TGWikiBrowser wb) { this.wb = wb; auth = new Authorities(); session = new SessionHolder(); session.initObjects(); dump = new DumpToGraphViz(); //session.connect = connect; session.dump = dump; syn_art = new SynArt(session, wb, this); } /** Set pointers to GUI objects */ void init(TGWikiBrowser t) { syn_table = ((ResultTablePanel)((SynonymPanel)wb.synonymTextPanel).result_table_panel). table; cat_table = ((CategoryPanel) ((SynonymPanel)wb.synonymTextPanel).category_table_panel).table; } /************************************** /** Set and get parameters functions */ public void connectDatabase() { session.connect.Close(); session.connect.Open(db_host, db_name, user, pass); } /** Loads previous browser parameters from /homedir/.wikibrowser.server.props * Prints to GUI fields */ public void getBrowserParameters() { wb.parameters.setSessionHolder(session); wb.parameters.updateEncodingsToSession(); //session.connect.enc.SetEncodingJavaSourceCode(wb.parameters.getEncJava()); lang = wb.parameters.getLangCode(); DBPanel d = (DBPanel)wb.dbTextPanel; enc_java = wb.parameters.getEncJava(); d.enc_java.setText(enc_java); enc_ui = wb.parameters.getEncUI(); d.enc_ui.setText(enc_ui); db_host = wb.parameters.getDBHost(); d.db_host.setText(db_host); db_name = wb.parameters.getDBName(); d.db_name.setText(db_name); user = wb.parameters.getUser(); d.user_tf.setText(user); pass = wb.parameters.getPass(); d.pass_tf.setText(pass); wb.WIKI_URL = wb.parameters.getWikiURL(); d.wiki_url_tf.setText(wb.WIKI_URL); wb.INITIAL_NODE = wb.parameters.getNode(); ((SynonymPanel)wb.synonymTextPanel).syn_word.setText(wb.INITIAL_NODE); } /** Saves browser parameters to /homedir/.wikibrowser.server.props */ public void setBrowserParameters() { wb.parameters.setLanguage(lang); wb.parameters.setEncJava(enc_java ); wb.parameters.setEncUI(enc_ui); wb.parameters.setDBHost(db_host); wb.parameters.setDBName(db_name); wb.parameters.setUser(user); wb.parameters.setPass(pass); wb.parameters.setWikiURL(wb.WIKI_URL); wb.parameters.setNode(wb.INITIAL_NODE); } /** Load previous search parameters and results from ./log_dir/article.ru.params, * Print to GUI fields */ public void getArticleParameters(String article) { if (!article.trim().equals("")) { ArticleParameters ap = new ArticleParameters(wb.parameters.getLogDir(), getLatinitsaFilename(article, lang), session); lang = wb.parameters.getLangCode(); ap.setLang(lang); ParametersPanel p = (ParametersPanel)(((SynonymPanel)wb.synonymTextPanel).params_panel); root_set_size = ap.getRootSetSize(); p.root_size_tf.setValue(root_set_size); increment = ap.getIncrement(); p.inc_tf.setValue(increment); n_synonyms = ap.getNSynonyms(); p.nsyn_tf.setValue(n_synonyms); eps_error = ap.getEpsError(); p.eps_tf.setValue(eps_error); categories_max_steps = ap.getCategoriesMaxSteps(); p.max_steps_tf.setValue(categories_max_steps); category_blacklist = ap.getCategoryBlackList(); p.categories_field.setText( StringUtil.join("|", category_blacklist)); syn_table.createRatedSynonymList(ap.getRatedSynonyms()); syn_table.updateTable(); } } /** Saves successfull search parameters and results to ./log_dir/article.params */ public void setArticleParameters() { String article_fn = wb.INITIAL_NODE; // filename if (!article_fn.trim().equals("") && wb.parameters.isLogEnabled()) { session.connect.enc.SetEncodingJavaSourceCode(wb.parameters.getEncJava()); ArticleParameters ap; ap = new ArticleParameters(wb.parameters.getLogDir(), getLatinitsaFilename(article_fn, lang), session); ap.setLang(lang); ap.setCategoryBlackList(category_blacklist); ap.setRatedSynonyms(syn_table.getRatedSynonymList()); ap.setRootSetSize(root_set_size); ap.setIncrement(increment); ap.setNSynonym(n_synonyms); ap.setEpsError(eps_error); ap.setCategoriesMaxSteps(categories_max_steps); ap.saveParameters(); } } private String getLatinitsaFilename(String article, String language) { String fn = StringUtilRegular.encodeRussianToLatinitsa(article, session.connect.enc.GetJavaEnc(), session.connect.enc.GetInternalEnc()); return fn.concat(".").concat(language); } public void setRootSetSize(int i) { root_set_size = i; } public void setIncrement (int i) { increment = i; } public void setNSynonyms (int i) { n_synonyms = i; } public void setEpsError (float f){ eps_error = f; } public void setCategoriesMaxSteps (int f) { categories_max_steps = f; } public void setBlackListCategory(String[] s){ category_blacklist = s; } /*********** eo parameters functions */ /*************************************/ /** Gets neighbours Wiki articles from the database, draws it. */ public void drawNeighboursFromDB(String article) { try { syn_art.drawNeighboursFromDB(article, increment); } catch (TGException tge) {tge.printStackTrace();} } /** Gets Wiki one node, checks: * it exists in the database, then adds 1) visual node and 2) article by url. * The font "Courier" was changed to "Times" in Node.java to draw Russian letters: * * public static final Font SMALL_TAG_FONT = new Font("Times",Font.PLAIN,9); */ public void getWikiOneNode(String article) { try { syn_art.getWikiOneNode(article); } catch (TGException tge) {tge.printStackTrace();} } /** Hides all nodes and edges */ public void hideAll() { syn_art.hideAll(); } /* public void showNeighboursCategories(String title) { if(DEBUG) System.out.println("SS.showNeighboursCategories, title is " + title); WikiNode n = (WikiNode) wb.tgPanel.findNodeLabelContaining(title); if(n == null || null == m_articles || null == base_nodes) return; // get categories for the article Integer id = m_articles.get(title); if(null != id) { System.out.println("The title is found in m_articles"); Article a = base_nodes.get(id); // retrieve or take retrieved categories of the article a if(null == a.id_categories || 0 == a.id_categories.length) { List<String> titles_level_1_cats = new ArrayList<String>(); String black_cat = session.category_black_list.inBlackList(id, titles_level_1_cats); if(null == titles_level_1_cats || 0 == titles_level_1_cats.size()) return; a.id_categories = Category.getIDByTitle(session.connect, titles_level_1_cats); } for(int id_cat:a.id_categories) { Category c = session.category_nodes.get(id_cat); if(null == c.page_title || 0 == c.page_title.length()) continue; // add category node String title_cat = "C:" + c.page_title; WikiNode r = (WikiNode) wb.completeEltSet.findNode(title_cat); if(r == null) r = wb.addWikiNode(title_cat); r.setBackColor(Color.decode(color_node_category)); // add edge com.touchgraph.graphlayout.Edge e; e = wb.completeEltSet.findEdge(r,n); if(e==null) { e = new WikiEdge(r,n,edge_dist_aritlce_category); e.setColor(Color.decode(color_edge_article_category)); //e.setColor(edgeColors[linenum % edgeColors.length]); wb.completeEltSet.addEdge(e); } r.setVisible(true); e.setVisible(true); } } // else Article.createArticleWithCategories() // todo } */ public void SetNodeAndTextPane(String s) { if (DEBUG) { System.out.println("SetNodeAndTextPane() called."); } WikiNode n = (WikiNode) wb.tgPanel.findNodeLabelContaining(s); if (n!=null) { wb.setLocale(n); wb.tgPanel.setSelect(n); wb.setWikiTextPane(n); } } /** Initializes object "session" with current parameters (depends on language): * connect, cb (category_black_list), categories_max_steps. **/ public void initSession() { List<String> cb = null; if(null != category_blacklist && 0 < category_blacklist.length) { cb = Arrays.asList(category_blacklist); } session.Init(session.connect, cb, categories_max_steps); String database_encoding = "ISO8859_1"; //String enc_java = "UTF8"; //String enc_ui = "Cp1251"; session.connect.enc.SetEncodings(database_encoding, enc_java, enc_ui); if(null != m_articles) m_articles.clear(); if(null != m_categories) m_categories.clear(); session.connect.ReOpenIfInvalid(); wb.completeEltSet.clearAll(); } /** Adds category to the blacklist, saves to word settings file, * prints blacklist categories in parameters panel. */ public void addCategoryToBlackList(String category) { List<String> cb = NULL_STRING_LIST; if(null != category_blacklist && 0 < category_blacklist.length) { cb = Arrays.asList(category_blacklist); } if(!cb.contains(category)) { List<String> cb_new = new ArrayList<String>(); cb_new.addAll(cb); cb_new.add(category); category_blacklist = (String[])cb_new.toArray(NULL_STRING_ARRAY); setArticleParameters(); ParametersPanel p = (ParametersPanel)(((SynonymPanel)wb.synonymTextPanel).params_panel); p.categories_field.setText( StringUtil.join("|", category_blacklist)); } } /** Removes category from the blacklist, saves blacklist to word settings * file, updates blacklist in parameters panel. */ public void removeCategoryFromBlackList(String category) { List<String> cb = NULL_STRING_LIST; if(null != category_blacklist && 0 < category_blacklist.length) { cb = Arrays.asList(category_blacklist); } if(cb.contains(category)) { List<String> cb_new = new ArrayList<String>(); cb_new.addAll(cb); cb_new.remove(category); category_blacklist = (String[])cb_new.toArray(NULL_STRING_ARRAY); setArticleParameters(); ParametersPanel p = (ParametersPanel)(((SynonymPanel)wb.synonymTextPanel).params_panel); p.categories_field.setText( StringUtil.join("|", category_blacklist)); } } /** Setup dump and log of search of synonyms to log directory. * The Session.Dump object updated by BrowserParameters */ public void setupDumpAndLog() { dump.enable_file_dot = false; if(wb.parameters.isLogEnabled()) { dump.file.SetDir(wb.parameters.getLogDir() + System.getProperty("file.separator")); String a, a2, fn, fn2; a = ((SynonymPanel)wb.synonymTextPanel).syn_word.getText(); a2 = session.connect.enc.EncodeFromUser(a); fn = getLatinitsaFilename( a, lang); fn2 = getLatinitsaFilename(a2, lang); dump.file.SetFilename(fn + ".txt"); dump.file.Open(false, "Cp1251"); // append = false session.dump = dump; } else { session.dump = null; } } /** Set type type_n to nodes, which are rated as synonyms by user (checked * in result table). */ public void SetTypeForRatedSynonyms(Map<Integer, Article> nodes, NodeType type_n) { List<String> rated = syn_table.getRatedSynonymList(); if(null == rated) { return; } //System.out.println("rated=" +rated); for(Article a:nodes.values()) { for(String r:rated) { if (r.equalsIgnoreCase(a.page_title)) { a.type = type_n; break; } } } } /** Search synonyms for article in the wikipedia */ public void SearchSynonyms(String article) { wb.completeEltSet.clearAll(); SynonymPanel sp = (SynonymPanel)wb.synonymTextPanel; DateFormat formatter = DateFormat.getDateTimeInstance(DateFormat.LONG, DateFormat.LONG, new Locale("en","US")); String today = formatter.format(new Date()); //String article_in_db = Encodings.FromTo(article, "UTF8", "ISO8859_1"); // ISO8859_1 Cp1251 System.out.println ("The word '" + session.connect.enc.EncodeToUser(article) + "' is processing..."); ResultTablePanel rtp = (ResultTablePanel)sp.result_table_panel; rtp.output.setText("The word '" + session.connect.enc.EncodeToUser(article) + "' is processing..."); CategoryPanel cp = (CategoryPanel)sp.category_table_panel; cp.output.setText(""); t_start = System.currentTimeMillis(); session.clear(); initSession(); setupDumpAndLog(); //Map<Integer, Article> List<String> rated = syn_table.getRatedSynonymList(); base_nodes = LinksBaseSet.CreateBaseSet(article, rated, session, root_set_size, increment); //dump.file.PrintNL("Total_steps_while_categories_removing:"+session.total_categories_passed); //dump.file.Flush(); List<Article> synonyms = auth.Calculate(base_nodes, eps_error, n_synonyms, session); t_end = System.currentTimeMillis(); t_work = (t_end - t_start)/1000f; // in sec if (null != synonyms) { String s = "\n\ntime sec:" + t_work + " iter:" + auth.iter + " vertices:" + base_nodes.values().size() + " edges:" + DCEL.CountLinksIn(base_nodes) + "\nroot_set_size:"+root_set_size+" increment:"+increment + "\nn_synonyms:"+synonyms.size() + " ("+n_synonyms +")" + "\ncategories:"+session.category_nodes.size() + " total_steps_while_categories_removing:"+ session.category_black_list.getTotalCategoriesPassed() + "\n\n" + "Synonyms and related words:\n"; if(null != session.dump) { dump.file.Print(s); dump.file.Flush(); auth.AppendSynonyms(article, synonyms, "|", dump); dump.file.Print("\n\nSynonyms rated by user:"+syn_table.getRatedSynonym("|")); dump.file.Flush(); } syn_table.addUnratedSynonymListByArticles(synonyms); syn_table.updateTable(); CategorySet.fillLinksFromCategoryToArticles(base_nodes, session.category_nodes); cat_table.createCategoriesList(session.category_nodes, session.category_black_list, base_nodes); cat_table.updateTable(); // dump results to result textarea: statistics and result synonyms rtp.output.append(s); rtp.output.append( auth.SynonymsToString(article, synonyms, "|")); if(null != session.dump) { rtp.output.append("\n\nSee more information in the file:\n" + dump.file.GetPath()); } if(session.getIWiki()) { // before drawing nodes for(Article a:base_nodes.values()) a.fillInterWikiTitle (session, session.getIWikiLang(), PageNamespace.MAIN); for(Category c:session.category_nodes.values()) c.fillInterWikiTitle (session, session.getIWikiLang(), PageNamespace.MAIN); } SetTypeForRatedSynonyms(base_nodes, NodeType.RATED_SYNONYMS); syn_art.drawNodesEdges (base_nodes, show_redirects); syn_art.setVisibleEdges(session.source_article_id, base_nodes); SetNodeAndTextPane(article); setArticleParameters(); // creates map in order to "Expand categories" via context menu m_articles = wikipedia.kleinberg.Article.createMapFromTitleToID(base_nodes); m_categories = wikipedia.kleinberg.Article.createMapFromTitleToID(session.category_nodes); /* // print best hubs as triangles dump.Dump(base_nodes, ""); // append dot command to bat file dump.file_bat.Print( dump.GetStatisticsHashMap(base_nodes) + dump.GetDotCommand("jpeg", true) ); dump.file_sh. Print( dump.GetStatisticsHashMap(base_nodes) + dump.GetDotCommand("jpeg", false) ); dump.file_bat.Flush(); dump.file_sh. Flush(); // cluster synonyms Map<Integer, Article> map_synonyms = Article.CreateHashMap((Article[])synonyms.toArray(new Article[0])); dump.enable_file_dot = false; CategorySet.prepareCategories(session, map_synonyms); int max_cluster_weight = 20; List<ClusterCategory> clusters = CategorySet.getCategoryClusters (session.category_nodes, map_synonyms, max_cluster_weight); //CategorySet.dumpClusterCategoryArticle(session, map_synonyms, clusters, "02_clusters_max_weight_"+max_cluster_weight+"_root_set_size_"+root_set_size+"_increment_"+increment); //CategorySet.dumpClusterCategorywithListArticles(session, map_synonyms, clusters, "03_list_articles_max_weight_"+max_cluster_weight+"_root_set_size_"+root_set_size+"_increment_"+increment); */ } else { rtp.output.append("\nNo synonyms were found."); ((SynonymPanel)wb.synonymTextPanel).colorResultTab(); if(wb.parameters.isLogEnabled() && 0 == dump.file.GetFileLength()) { // remove empty file dump.file.delete(); } } } } /* p = session.connect.cur_table.GetIDByTitle(session.connect, Encodings.Latin1ToUTF8(article)); p = session.connect.cur_table.GetIDByTitle(session.connect, Encodings.FromTo(article, "UTF8", "ISO8859_1")); p = session.connect.cur_table.GetIDByTitle(session.connect, Encodings.FromTo(article, "UTF8", "Cp1251")); p = session.connect.cur_table.GetIDByTitle(session.connect, Encodings.FromTo(article, "Cp1251", "UTF8")); p = session.connect.cur_table.GetIDByTitle(session.connect, Encodings.FromTo(article, "Cp1251", "ISO8859_1")); */ /* String s_in = article; String s = ""; s += "Latin1ToUTF8" + wikipedia.util.Encodings.Latin1ToUTF8(s_in); s += "UTF8 to ISO8859_1" + wikipedia.util.Encodings.FromTo(s_in, "UTF8", "ISO8859_1"); s += "UTF8 to Cp1251" + wikipedia.util.Encodings.FromTo(s_in, "UTF8", "Cp1251"); s += "Cp1251 to UTF8" + wikipedia.util.Encodings.FromTo(s_in, "Cp1251", "UTF8"); s += "Cp1251 to ISO8859_1" + wikipedia.util.Encodings.FromTo(s_in, "Cp1251", "ISO8859_1"); */