/*
* LinksBaseSet.java - Create Root Set and Base Set of links
*
* Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.kleinberg;
import wikipedia.language.Encodings;
import wikipedia.sql.*;
import wikipedia.util.*;
import wikipedia.data.ArticleIdAndTitle;
import wikipedia.data.Redirect;
import java.util.*;
/*
* Algorithm text
 *
* "Thus, we assemble a root set Rp consisting of t pages that point to p; we grow this into a
* base set Sp as before; and the result is a subgraph Gp in which we can search for hubs and
* authorities." Kleinberg p.15
*
* Algorithm pseudocode
*
* Input: p is page_title
* Goal: find synonyms (related terms) of p
* Actions:
* 1. Get page_id of articles which refer to p. (This is Root Set Rp, but without the p.)
* SELECT l_from FROM links WHERE l_to = page_id of p UNIQUE
* detail: SELECT page_id, page_title FROM page WHERE page_namespace=0 AND page_id IN (SELECT l_from FROM links WHERE l_to=N
* 2. Expanding the root set into a base set Sp.
* 2.1 Base set Sp = Rp + articles which refer to any of Rp
* SELECT l_from FROM links WHERE l_to IN (page_id of Rp) UNIQUE LIMIT t \Rp
* 2.2 Base set Sp += articles which are referred by any of Rp.
* SELECT l_to FROM links WHERE l_from IN (page_id of Rp)
* 2.3 Get (all other) links inside Sp.
* (SELECT l_to FROM links WHERE l_from IN (page_id of Sp\Rp)) \ Rp
* (SELECT l_from FROM links WHERE l_to IN (page_id of Sp\Rp)) \ Rp
*
*/
public class LinksBaseSet {
    public LinksBaseSet() {
    }

    /** Map from an article's title to its {@code ArticleIdAndTitle} object;
     * scratch cache passed to {@code Redirect.getByRedirect} — temporary here. */
    private static Map<String, ArticleIdAndTitle> _mr = new HashMap<String, ArticleIdAndTitle>(0);

    /** Parametrized creation of the base set via parameters t, d.
     *
     * @param page_title    title of the source page p
     * @param synonyms      list of synonyms (for the word page_title) rated by user(s);
     *                      they are merged into the root set if not already present
     * @param session       DB session holder; also receives side effects
     *                      (source_page_title, source_article_id are set here)
     * @param root_set_size t — number of articles in the root set; a negative value
     *                      means no limit. There are different methods of selecting
     *                      the articles for the root set, e.g., randomly.
     * @param increment     d — number of articles which could be added to the base set
     *                      (they refer to one of the pages in the root set)
     * @return map from page_id to Article for the base set (with internal links
     *         resolved), or null when the page is absent, unresolvable, or has
     *         no incoming links
     *
     *<pre> Algorithm of creation of the base set (see comments for the class above):
     *
     * Set Sp := Rp
     * For each page p from Rp
     *     Let G+(p) denote the set of all pages p points to.
     *     Let G-(p) denote the set of all pages pointing to p.
     *     Add all pages in G+(p) to Sp.
     *     If |G-(p)| <= d then
     *         Add all pages in G-(p) to Sp.
     *     Else
     *         Add an arbitrary set of d pages from G-(p) to Sp.
     *     End
     * Return Sp</pre>
     *
     * Speed up the very slow function Links.GetAllLinks
     * (assuming symmetry in links, e.g., a->b => b<-a):
     *
     * (todo: set priority of reciprocal links, e.g., a->b && b->a > a->b || b->a)
     *
     *                      root_nodes1
     *                     /           \
     *  (increment links)  | a1 ->        |   (all links)
     *  base_nodes1 ------------------>| + synonyms   |-------------> base_nodes2
     *                     | LToByLFrom-> |
     *  <-- LFromByLTo <-- (all links)  --> LToByLFrom -->
     *
     * assert /after drawing this picture, I am Picasso/ that I == II, but II should be faster:
     * I.  Links.getAllLinks(session, map_title_article);
     * II. Links.getAllLinksFromNodes(session, from:base_nodes1);
     */
    public static Map<Integer, Article> CreateBaseSet(
                        String page_title, List<String> synonyms,
                        SessionHolder session,
                        int root_set_size, int increment)
    {
        // 1. Resolve the title to a page id; negative id appears to signal
        //    a redirect page — NOTE(review): confirm against PageTable.getIDByTitle.
        int p = PageTable.getIDByTitle(session.connect, page_title);
        if (p < 0) {
            // Follow the redirect to the real target article.
            ArticleIdAndTitle a_to = Redirect.getByRedirect(session, p, page_title, _mr);
            if (null == a_to) {
                return null;
            }
            p = a_to.id;
            page_title = a_to.title;
        }

        if (0 == p) { // page is absent
            return null;
        }

        // m_out - local map<title of article, list of titles links_out> (article -> articles)
        // m_in  - local map<title of article, list of titles links_in>
        // Both are filled as a side effect of the Links.* calls below and are
        // used in getAllLinksFromNodes to find all links between articles.
        Map<String, Set<String>> m_out = new HashMap<String, Set<String>>();
        Map<String, Set<String>> m_in  = new HashMap<String, Set<String>>();

        // Wrap the source page as a one-element Article array.
        Article[] a1 = new Article[1];
        a1[0] = new Article();
        a1[0].page_id = p;
        a1[0].page_title = session.source_page_title = page_title;
        a1[0].type = NodeType.ID_SOURCE_ARTICLE;    // mark the very source article

        session.source_article_id = 0;              // for first calls of getLToByLFrom()
        Article[] root_nodes = Links.getLToByLFrom(session, a1, root_set_size, m_out, m_in);
        session.source_article_id = p;

        // old
        //Article[] root_nodes = Links.GetLFromByLTo(session, p, root_set_size);
        // new todo: select first n links in article

        // Add synonyms (rated by user) to root_nodes.
        // BUGFIX: guard against a null root set here; previously a null
        // root_nodes with non-empty synonyms caused a NullPointerException
        // in the loop below, before the null check further down could run.
        if (null != root_nodes && 0 < synonyms.size()) {
            List<Article> a_rated_synonyms = new ArrayList<Article>();

            Set<String> root_titles = new HashSet<String>(root_nodes.length);
            for (Article a : root_nodes) {
                root_titles.add(a.page_title);
            }

            for (String s : synonyms) {
                if (!root_titles.contains(s)) {
                    Article a = new Article();
                    a.page_title = s;
                    // NOTE(review): may be <= 0 for an absent synonym page — confirm
                    // downstream handling in joinUnique / link expansion.
                    a.page_id = PageTable.getIDByTitle(session.connect, s);
                    a_rated_synonyms.add(a);
                }
            }

            root_nodes = Article.joinUnique(root_nodes,
                    (Article[]) a_rated_synonyms.toArray(Article.NULL_ARTICLE_ARRAY));
        }

        if (null == root_nodes || 0 == root_nodes.length) {
            // Nobody refers to the page p: log a warning to the .bat dump file.
            if (null != session.dump && null != session.dump.file_dot.GetFilename()) {
                String bat_text = "\n:: " + session.dump.file_dot.GetFilename() + ".dot \t Warning: no page refers to this page.\n";
                session.dump.file_bat.Print(bat_text);
                session.dump.file_bat.Flush();
            }
            return null; // nobody refers to the p page
        }

        Article.SetType(root_nodes, NodeType.ROOT);
        root_nodes = Article.joinUnique(root_nodes, a1);

        if (null != session.dump) { session.dump.DumpDotBat(root_nodes, page_title + "1_0_root_nodes.dot"); }

        // 2.1 Pages pointing to the root set (incoming links, limited by increment).
        int n_limit2 = -1; // no limit on the second level
        Article[] base_nodes1 = Links.getLFromByLTo(session, root_nodes, increment, n_limit2, m_out, m_in);
        if (null != session.dump) { session.dump.DumpDotBat(base_nodes1, page_title + "2_1_base_nodes1_GetLFromByLTo.dot"); }

        // 2.2 Pages the root set points to (outgoing links, no limit).
        Article[] base_nodes2 = Links.getLToByLFrom(session, root_nodes, -1, m_out, m_in);
        if (null != session.dump) { session.dump.DumpDotBat(base_nodes2, page_title + "2_2_1_base_nodes2_GetLToByLFrom.dot"); }

        Article[] base_nodes          = Article.joinUnique(base_nodes1, base_nodes2);
        Article[] base_and_root_nodes = Article.joinUnique(base_nodes, root_nodes);
        if (null != session.dump) { session.dump.DumpDotBat(base_and_root_nodes, page_title + "2_2_2_base_and_root_nodes.dot"); }

        Article.SetType(base_and_root_nodes, NodeType.BASE);

        // 2.3 Resolve (all other) links inside the base set.
        Map<Integer, Article> map_id_article    = Article.createMapIdToArticleWithoutRedirects(base_and_root_nodes);
        Map<String, Article>  map_title_article = Article.createMapTitleToArticleWithoutRedirects(base_and_root_nodes);

        for (String s : synonyms) {
            if (map_title_article.containsKey(s)) { // redirects have no Article entry
                map_title_article.get(s).type = NodeType.RATED_SYNONYMS;
            }
        }

        // Set type for the very source article.
        // todo skip
        //map_id_article.get(p).type = NodeType.ID_SOURCE_ARTICLE;
        // NOTE(review): throws NPE (with assertions enabled) if the source
        // article was filtered out of map_id_article — confirm p is always kept.
        assert(map_id_article.get(p).type == NodeType.ID_SOURCE_ARTICLE);

        Links.getAllLinksFromNodes(session, map_title_article, base_nodes1, m_out, m_in);
        //Links.getAllLinks(session, map_title_article);

        if (null != session.dump) {
            String article_fn = StringUtilRegular.encodeRussianToLatinitsa(page_title, Encodings.enc_java_default, Encodings.enc_int_default);
            session.dump.DumpDotBat(map_id_article, article_fn + ".dot");
        }

        // Drop the (possibly retained) link maps explicitly.
        m_out.clear();
        m_in.clear();
        m_in  = null;
        m_out = null;

        return map_id_article;
    }
}