/* * Authorities.java - To calculate Hubs and Authority pages * * Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/ * Distributed under GNU Public License. */ package wikipedia.kleinberg; import wikipedia.language.Encodings; import wikipedia.sql.*; import wikipedia.util.*; import java.util.*; public class Authorities { public boolean debug_graphviz; // dump graph via graphviz if debug is true private DCEL dcel; Article node; public int iter; // number of passed iterations (updated after Iterate) public Authorities() { debug_graphviz = false; dcel = new DCEL(); node = new Article(); } /** Kleinberg p.10 * Let z denote the vector (1; 1; 1; : : :; 1) * Set x0 := z: * Set y0 := z: * For i = 1; 2; : : :; k * Apply the I operation to (xi−1; yi−1), obtaining new x-weights x0i. * Apply the O operation to (x0i; yi−1), obtaining new y-weights y0i. * Normalize x0i, obtaining xi. * Normalize y0i , obtaining yi. * End * Return number of passed iteration */ public int Iterate(Map<Integer, Article> nodes,float eps_error, SessionHolder session) { int iter; float links_in_number; Iterator<Article> it; Article node = new Article(); float[] total_error; if (null == nodes) return 0; int n_nodes = nodes.size(); //node.NormalizeXY(nodes); iter = 0; do { iter ++; it = nodes.values().iterator(); while (it.hasNext()) { Article n = it.next(); n.CalculateNewX(nodes, n_nodes); n.CalculateNewY(nodes, n_nodes); } node.NormalizeNewXNewY(nodes); // get x and y total change (error) total_error = node.UpdateXY(nodes); if (null != session.dump) { // Print the total error for the current iteration /* session.dump.file.Print("iter:" + iter + " total change x:" + total_error[0] + " y:" + total_error[1] + " x+y:" + (total_error[0] + total_error[1]) + "\n"); session.dump.file.Flush(); */ /* // Print the dot file: old session.dump.file.WriteNew(session.dump.path + i + ".dot", session.dump.Dump(nodes), "UTF8"); // append dot command to bat file String bat_text = "\ndot.exe -Tjpeg " + session.dump.filename_dot +i+ ".dot -v -o " + session.dump.filename_dot +i+ ".jpeg\n"; session.dump.file.Append(session.dump.dir + session.dump.filename_bat, bat_text, "Cp866"); */ } }while (eps_error < total_error[0] + total_error[1]); if (null != session.dump) { String s = StringUtilRegular.encodeRussianToLatinitsa(session.source_page_title, Encodings.enc_java_default, Encodings.enc_int_default); session.dump.DumpDotBat(nodes, s + "_iter.dot"); } return iter; } /** Get all HUBS - nodes which point to the source node. * Sort these nodes by Y value */ public List<Article> getAllHubsSortedByY (Map<Integer, Article> nodes, int source_article_id) { Article n = new Article(); if(null == nodes) return null; List<Article>nodes_pointed = new ArrayList<Article>(); Article source = nodes.get(source_article_id); if (null != source.links_in) { for(int i=0; i<source.links_in.length; i++) { Article add = nodes.get( source.links_in[i] ); nodes_pointed.add(add); } } Collections.sort(nodes_pointed, Article.Y_ORDER); return nodes_pointed; } /** Get <=n synonyms nodes which are referred from hubs (hubs pointed to source node). * Synonyms array should be locally sorted within the hub (i.e. links of one hub should be sorted) */ public List<Article> getAuthoritiesSortedByX (Map<Integer, Article> nodes, List<Article> hubs, int n_synonyms) { Article n = new Article(); if(null == nodes) return null; int i, j; int page_synonyms = 0; Map<Integer, Article> global_hash = new HashMap<Integer, Article>(); List<Article> global_list = new ArrayList<Article>(); HUBS_CYCLE: for(i=0; i<hubs.size(); i++) { Article hub = hubs.get(i); List<Article> local_list = new ArrayList<Article>(); for(j=0; j<hub.links_out.length; j++) { Article candidate = nodes.get(hub.links_out[j]); if (NodeType.ID_SOURCE_ARTICLE != candidate.type && !global_hash.containsKey(candidate.page_id)) { local_list.add(candidate); } } Collections.sort(local_list, Article.X_ORDER); // update global hash, global list for(j=0; j<local_list.size(); j++) { if (page_synonyms++ >= n_synonyms) break HUBS_CYCLE; global_hash.put(local_list.get(j).page_id, local_list.get(j)); global_list.add(local_list.get(j)); //global_list.addAll(local_list); } } return global_list; } public String getTitles(List<Article> nodes, String delimiter) { if (null == nodes || 0 == nodes.size()) return null; String titles = ""; if(1 == nodes.size()) { return nodes.get(0).page_title; } for(int i=0; i<nodes.size()-1; i++) { titles += nodes.get(i).page_title + delimiter; } return titles + nodes.get( nodes.size()-1 ).page_title; } /** Calculate x and y values of nodes. * The question to Kleinberg's algorithm: Could the hub be authoritative page? * I.e. if the word (it is really synonym) was selected as the hub, then could it appear in the list of synonyms? */ public List<Article> Calculate(Map<Integer, Article> nodes, float eps_error, int n_synonyms, SessionHolder session) { if (null == nodes) return null; //link.CountLinks (session.connect, nodes); iter = Iterate(nodes, eps_error, session); // Report the pages with the c largest coordinates in xk as authorities. List<Article> hubs = getAllHubsSortedByY(nodes, session.source_article_id); node.SetType((Article[])hubs.toArray(Article.NULL_ARTICLE_ARRAY), NodeType.HUB); if (null != session.dump) { session.dump.file.Open(true, "Cp1251"); session.dump.file.PrintNL( "\nhubs (sorted by Y) pointed to the source article:\n" + getTitles(hubs, " | ") ); session.dump.file.Flush(); } List<Article> synonyms = getAuthoritiesSortedByX(nodes, hubs, n_synonyms); if (null != session.dump) { session.dump.PrintSynonyms(session, synonyms); } session.category_black_list.fillCategoryNodesIfBlackListEmpty(synonyms); // set type 0 for authorities (best synonyms, max x) in base_nodes Article[] synonyms_array = (Article[])synonyms.toArray(Article.NULL_ARTICLE_ARRAY); node.SetType(synonyms_array, NodeType.AUTHORITY); if (null != session.dump) { session.dump.DumpDotBat(nodes, session.source_page_title + "_synonyms_triangle.dot"); } return synonyms; } /** Get names of synonyms (names or page_title of the best hubs) using Calculate() */ public String[] GetSynonyms(Article[] hubs) { int i, n_synonyms, counter; if (null == hubs) return null; // Check: Does hubs[].page_id contains source_article_id boolean b_contains = false; for (i=0; i<hubs.length; i++) { if (NodeType.ID_SOURCE_ARTICLE == hubs[i].type) { b_contains = true; break; } } String[] synonyms = null; n_synonyms = b_contains ? hubs.length - 1 : hubs.length; if (0 == n_synonyms) return null; synonyms = new String[n_synonyms]; counter = 0; for (i=0; i<hubs.length; i++) { if (NodeType.ID_SOURCE_ARTICLE != hubs[i].type) { synonyms [counter ++] = hubs[i].page_title; } } return synonyms; } /** Concatenate source synonym and list of synonyms (joined by delimiter) */ public String SynonymsToString( String source_synonym, List<Article> synonyms,String delimiter) { //String text = new String(Encodings.Latin1ToUTF8(source_synonym)); String text = source_synonym; if (null != synonyms) { if(0 < synonyms.size()) { text += delimiter; } for(int i=0; i<synonyms.size()-1; i++) { text += synonyms.get(i).page_title + delimiter; //text += Encodings.Latin1ToUTF8(synonyms.get(i).page_title) + delimiter; } if(0 < synonyms.size()) { text += synonyms.get(synonyms.size()-1).page_title; //text += Encodings.Latin1ToUTF8(synonyms.get(synonyms.size()-1).page_title); } } //text += "\n"; return text; } /** Write (Append) list of synonyms for the source synonym (joined by delimiter) to the file */ public void AppendSynonyms( String source_synonym, List<Article> synonyms,String delimiter, DumpToGraphViz dump) { if (null != dump) { dump.file.Print(SynonymsToString(source_synonym, synonyms, delimiter)); dump.file.Flush(); } } } /** get first n largest Nodes via sorting */ /* public Article[] getLargestXNodes( Map<Integer, Article> nodes, int n_largest_nodes) { Article n = new Article(); if(null == nodes) return null; List<Article>sorted_nodes = new ArrayList<Article>(nodes.values()); Collections.sort(sorted_nodes, n.X_ORDER); int n_max = Math.min(n_largest_nodes, sorted_nodes.size()); Article[] result = new Article[n_max]; for(int i=0; i<n_max; i++) { result [i] = sorted_nodes.get(i); } return result; } */