/*  Links.java - SQL operations with wikipedia.links table.
 *
 *  Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
 *  Distributed under GNU Public License.
 */
package wikipedia.sql;

import wikipedia.language.Encodings;
import wikipedia.kleinberg.SessionHolder;
import wikipedia.kleinberg.Article;
import wikipedia.kleinberg.NodeType;
//import wikipedia.kleinberg.Category;
import wikipedia.kleinberg.CategoryBlackList;
import wikipedia.util.*;
import wikipedia.util_rand.*;
import wikipedia.data.ArticleIdAndTitle;
import wikipedia.data.Redirect;
import wikipedia.data.StringMap;
//import org.apache.commons.collections.ArrayStack;
import java.sql.*;
import java.util.Arrays;
import java.util.Map;
//import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
//import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;

/** Works with the SQL table pagelinks.
 * The link or edge with articles (pl_from, pl_namespace.pl_title)
 *
 * ************
 * TABLE pagelinks (
 *   -- Key to the page_id of the page containing the link.
 *   pl_from int(8)
 *
 *   -- Key to page_namespace/page_title of the target page.
 *   -- The target page may or may not exist, and due to renames
 *   -- and deletions may refer to different page records as time
 *   -- goes by.
 *   pl_namespace int NOT NULL default '0',
 *   pl_title varchar(255) binary NOT NULL default '')
 *
 * (see mediawiki/maintenance/FiveUpgrade.inc function upgradeLinks()).
 * ************
 *
 * NOTE(review): this class is a static utility; several methods share the
 * static StringBuffer fields declared further down, so the class is NOT
 * thread-safe — confirm callers are single-threaded.
 */
public class Links {

    /** Utility class: not instantiable. */
    private Links() {}

    // Shared empty-array singletons returned instead of null to spare allocations.
    private final static String[]  NULL_STRING_ARRAY  = new String [0];
    private final static Article[] NULL_ARTICLE_ARRAY = new Article[0];
    private final static Integer[] NULL_INTEGER_ARRAY = new Integer[0];
    //private final static List<Integer> NULL_INTEGER_LIST = new ArrayList<Integer>(0);

    // ********************************************
    // From stuff  ("what links here": walking pagelinks from destination to sources)
    //

    /** Creates Article[], fills ->id and ->title using tables page, pagelinks.
* * @param n_limit limits the number of returned nodes * It is not considered if n_limit <= 0, i.e. it will return all results. * <pre> * Parameters' example: * str_from = "FROM page WHERE page_namespace=0 AND page_id" + str_in; * str_sql_count_size = "SELECT COUNT(page_id) AS size " + str_from; * str_sql = "SELECT page_id, page_title " + str_from;</pre> */ public static Article[] getLinksSQL(SessionHolder session, String str_sql_count_size, String str_sql, int n_limit) { Article[] l_from = null; int size = 0, min; int l_from_counter = 0; try { Statement s = session.connect.conn.createStatement(); s.executeQuery(str_sql_count_size); ResultSet rs = s.getResultSet(); if (rs.next()) { size = rs.getInt("size"); if (0 < size) { if (n_limit <= 0) { min = size; // takes all elements } else { min = Math.min(n_limit, size); } s.executeQuery(str_sql); rs = s.getResultSet(); // gets all id, make permutation, takes first 'min' elements int[] id_all = new int[size]; for (int i=0; rs.next(); i++){ id_all[i] = rs.getInt("pl_from"); } int[] id_res = id_all; if(session.randomPages() && n_limit < size) { id_res = RandShuffle.permuteRandomly(id_all); } l_from = new Article[min]; for(int id:id_res) { String title = PageTable.getTitleByID(session.connect, id); if(null == title || 0 == title.length()) continue; Article a = Article.createArticleWithCategories(session, title, id); if (null != a) l_from [l_from_counter++] = a; if(l_from_counter >= min) break; } } } rs.close(); s.close(); } catch(SQLException ex) { System.err.println("SQLException (Links.java GetLinksSQL()): sql='" + str_sql + "' " + ex.getMessage()); } if(null == l_from || 0 == l_from.length) { l_from = null; return NULL_ARTICLE_ARRAY; } Article[] l_from_result; // cut off last elements of l_from (if they are null) if(l_from.length == l_from_counter) { l_from_result = l_from; } else { l_from_result = new Article[l_from_counter]; System.arraycopy(l_from, 0, l_from_result, 0, l_from_counter); l_from = null; } return 
l_from_result; } // (session, sb_sql_count_size.toString(), sb_sql.toString(), n_limit); /** Creates array of ID, takes it from the table pagelinks. Shuffles it, * if it is user preference. */ public static Integer[] getLinksSQL_AsID(SessionHolder session, String str_sql_count_size, String str_sql, int n_limit) { Integer[] result = NULL_INTEGER_ARRAY; try { Statement s = session.connect.conn.createStatement(); s.executeQuery(str_sql_count_size.toString()); ResultSet rs = s.getResultSet(); if (rs.next()) { int size = rs.getInt("size"); if (0 < size) { int min; if (n_limit <= 0) { min = size; // takes all elements } else { min = Math.min(n_limit, size); } s.executeQuery(str_sql.toString()); rs = s.getResultSet(); // gets all id, make permutation, takes first 'min' elements int[] id_all = new int[size]; for (int i=0; rs.next(); i++){ id_all[i] = rs.getInt("pl_from"); } int[] id_res = id_all; if(session.randomPages() && n_limit < size) { id_res = RandShuffle.permuteRandomly(id_all); } result = new Integer [min]; int counter = 0; for(int id:id_res) { result [counter++] = id; if(counter >= min) break; } } } rs.close(); s.close(); } catch(SQLException ex) { System.err.println("SQLException (Links.java GetLinksSQL()): sql='" + str_sql + "' " + ex.getMessage()); } return result; } /** Gets ids of source pages by id of destination page (What links here). * @param n_limit maximum number of articles id to be returned * @param id_to id of destination page *<pre> * page.page_title === pagelinks.pl_title * 1] SELECT page_title FROM page WHERE page_id=10484 AND page_namespace = 0; * 2] SELECT pl_from FROM pagelinks WHERE pl_title IN (SELECT page_title FROM * page WHERE page_id=10484 AND page_namespace = 0) AND pl_namespace = 0; * * Remark: * It is supposed that articles have unique names, i.e. only first id of 1] will be used in 2]. 
 *</pre>
 *
 * old 1.4
 * Optimization of GetLFromByLToIN:
 * replace IN() by several explicit equations "="
 * ( Time speed up - about three times - for "Parallelogram", "Sycamore",
 *   Test Parameters: root_set_size:10 increment:10 iter:20)
 *
 * @deprecated prints a warning at run time; see the message below for the
 *             suggested replacement.
 */
public static Article[] getLFromByLTo_deprecated(SessionHolder session, int id_to, String title_to, int n_limit,
                                Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    // NOTE(review): str_in and node are unused; kept to avoid touching code
    // in a documentation-only pass.
    String str_in, str_from;
    String str_sql_count_size, str_sql;
    Article[] result = null;
    Article node = new Article();

    System.out.println("Warning: deprecated function getLFromByLTo! Replace it by getIDToByTitleFrom().");
    /*
    // 1]
    String title_to = PageTable.getTitleByID(session.connect, id_to);
    if(null == title_to || 0 == title_to.length()) {
        return NULL_ARTICLE_ARRAY;
    }
    */
    //title_to = Encodings.FromTo(title_to, "UTF8", "ISO8859_1");

    // Convert the title to the database's charset before building the SQL.
    String title_to_db = session.connect.enc.EncodeToDB(title_to);

    // 2] SELECT pl_from FROM pagelinks WHERE pl_title IN
    //    (SELECT page_title FROM page WHERE page_id=10484 AND page_namespace = 0) AND pl_namespace = 0;
    // NOTE(review): title_to_db is concatenated into the SQL without escaping —
    // a title containing an apostrophe will break the query (SQL injection risk);
    // compare getFromByTitleTo() which calls StringUtil.escapeChars().
    str_from = "FROM pagelinks WHERE pl_title='" + title_to_db + "' AND pl_namespace = 0";
    str_sql_count_size = "SELECT COUNT(pl_from) AS size " + str_from;
    str_sql            = "SELECT pl_from "               + str_from;
    result = getLinksSQL(session, str_sql_count_size, str_sql, n_limit);

    // Record the discovered edges (every result article -> title_to).
    for(Article a:result) {
        addTitlesToMaps(a.page_title, title_to, m_out, m_in);
    }

    /* old: mediawiki 1.4
    // 1. Calculate number of links
    // too complex & slow request:
    // "SELECT COUNT(page_id) AS size FROM page " +
    // "WHERE page_namespace=0 AND " +
    // "page_id IN (SELECT l_from FROM links WHERE l_to="+l_to+")");

    // Execute subrequest SQL IN(...)
    int[] i_links_all = GetIntFromLinks(session.connect, "l_from", "WHERE l_to="+l_to);
    if (null == i_links_all)
        return null;
    int[] i_links = session.category_black_list.DeleteUsingBlackList (i_links_all, n_limit);

    for(int i=0; i<i_links.length; i++) {
        str_from = "FROM page WHERE page_namespace=0 AND page_id=" + i_links[i];
        str_sql_count_size = "SELECT COUNT(page_id) AS size " + str_from;
        str_sql = "SELECT page_id, page_title " + str_from;
        Article[] add = GetLinksSQL(session, str_sql_count_size, str_sql, n_limit);
        result = node.JoinUnique(result, add);
    }*/
    return result; //RandShuffle.getRandNodeArray(result, n_limit);
}

// Reusable SQL scratch buffers shared by several methods below.
// WARNING: static mutable state — the class is not thread-safe.
private static StringBuffer sb_sql_count_size = new StringBuffer(350);
private static StringBuffer sb_sql            = new StringBuffer(350);

/** Gets array of ArticleIdAndTitle of pages which point to page with the
 * title 'title_to', number of return array is limited by 'n_limit'.
 *
 * Redirect sources are encoded with a NEGATIVE page_id in the result.
 *
 * @param title_to  title of the destination page
 * @param namespace only links from this namespace are selected
 * @param n_limit   max number of rows; -1 means no LIMIT clause
 */
public static ArticleIdAndTitle[] getFromByTitleTo (SessionHolder session,
                                    String title_to,
                                    PageNamespace namespace,
                                    int n_limit)
{
    // SELECT page_id, page_title, page_is_redirect FROM page,pagelinks
    // WHERE page_id=pl_from AND pl_title='Робот' AND pl_namespace = 0 LIMIT 4;
    sb.setLength(0);
    sb.append("SELECT page_id, page_title, page_is_redirect FROM page,pagelinks WHERE page_id=pl_from AND pl_title='");
    sb.append( session.connect.enc.EncodeToDB(
                StringUtil.spaceToUnderscore(
                    StringUtil.escapeChars(title_to)) ) );
    sb.append( "' AND pl_namespace=");
    sb.append( namespace.toInt() );
    if(-1 != n_limit) {
        sb.append( " LIMIT ");
        sb.append( n_limit );
    }

    List<ArticleIdAndTitle> aid = new ArrayList<ArticleIdAndTitle>();
    //return getLinksSQL_AsID(session, sb_sql_count_size.toString(), sb_sql.toString(), n_limit);
    try {
        Statement s = session.connect.conn.createStatement();
        s.executeQuery(sb.toString());
        ResultSet rs = s.getResultSet();
        //if (rs.next()) {
        Encodings e = session.connect.enc;
        // gets all id, make permutation, takes first 'min' elements
        while (rs.next()){
            int page_id = rs.getInt("page_id");
            // Redirect pages are marked by negating the id.
            if(1 == rs.getInt("page_is_redirect")) {
                page_id = - page_id;
            }
            // Titles are stored as bytes in the DB charset; decode explicitly.
            String db_str = Encodings.bytesTo(rs.getBytes("page_title"), e.GetDBEnc());
            String page_title = e.EncodeFromDB(db_str);
            aid.add(new ArticleIdAndTitle(page_id, page_title));
        }
        //}
        rs.close();
        s.close();
    } catch(SQLException ex) {
        System.err.println("SQLException (Links.getFromByTitleTo() ArticleIdAndTitle[]): sql='" + sb.toString() + "' " + ex.getMessage());
    }
    return aid.toArray(ArticleIdAndTitle.NULL_ARTICLEIDANDTITLE_ARRAY);
}

/** Gets list of identifiers of pages which point to page with the
 * title 'title_to'.
 *
 * @param title_to  title of the destination page (main namespace is hard-coded
 *                  in the generated SQL; the namespace parameter is unused here)
 * @param n_limit   max number of returned ids
 */
public static Integer[] getIDFromByTitleTo_hide (SessionHolder session,
                                    String title_to, // int id_to,
                                    PageNamespace namespace,
                                    int n_limit)
{
    // NOTE(review): 'ids' is never used; kept unchanged in this doc-only pass.
    List<Integer> ids = new ArrayList<Integer>();

    // NOTE(review): unlike getFromByTitleTo(), the title is not escaped here —
    // apostrophes in titles will break the query.
    String title_to_db = session.connect.enc.EncodeToDB(title_to);

    // 2] SELECT pl_from FROM pagelinks WHERE pl_title IN
    //    (SELECT page_title FROM page WHERE page_id=10484 AND page_namespace = 0) AND pl_namespace = 0;
    sb.setLength(0);
    sb.append("FROM pagelinks WHERE pl_title='");
    sb.append( title_to_db );
    sb.append( "' AND pl_namespace = 0");

    sb_sql_count_size.setLength(0);
    sb_sql_count_size.append("SELECT COUNT(pl_from) AS size ");
    sb_sql_count_size.append(sb);

    sb_sql.setLength(0);
    sb_sql.append("SELECT pl_from ");
    sb_sql.append(sb);

    return getLinksSQL_AsID(session, sb_sql_count_size.toString(), sb_sql.toString(), n_limit);
}

/** Gets identifiers by ArticleIdAndTitle[] 'to', fills the map
 * map_id_to__id_from.
 */
public static List<Integer> getLFromIDByLTo_WithBlackList_hide( SessionHolder session,
                                ArticleIdAndTitle[] to,Map<Integer, Article> map_id_article_to,
                                Map<Integer, List<Integer>> map_id_to__id_from,
                                //Map<String,Set<String>> m_out, Map<String,Set<String>> m_in,
                                int increment)
{
    Integer[] id_from__with_blacklist;
    List<Integer> l = new ArrayList<Integer>();
    for(ArticleIdAndTitle aid:to) {
        Article a = map_id_article_to.get(aid.id);
        // The source article is omitted, since GetLToByLFrom() was called for it already (root set contains the source article)
        if(NodeType.ID_SOURCE_ARTICLE != a.type) {
            id_from__with_blacklist = getIDFromByTitleTo_hide (session,
                                a.page_title, //a.page_id,
                                PageNamespace.MAIN,
                                // from_titles,        // skip from_titles, i.e. remove pl_from from the result
                                //m_out, m_in,
                                increment);
            // Remember both the flat list and the per-destination mapping.
            List<Integer> li = Arrays.asList(id_from__with_blacklist);
            l.addAll(li);
            map_id_to__id_from.put(aid.id, li);
        }
    }
    return l;
}

/** Gets ArticleIdAndTitle[] by 'to', fills the map
 * map_id_to__id_from.
 *
 * For every destination article gathers its sources ("what links here"),
 * de-duplicates them by id, and additionally expands redirect sources
 * (negative id) one level: pages linking to the redirect are also included.
 */
public static ArticleIdAndTitle[] getLFromByLTo_WithBlackList( SessionHolder session,
                                ArticleIdAndTitle[] to,//Map<Integer, Article> map_id_article_to,
                                Map<String,Set<String>> m_out, Map<String,Set<String>> m_in,
                                //Map<Integer, List<Integer>> map_id_to__id_from,
                                int increment)
{
    List<ArticleIdAndTitle> result = new ArrayList<ArticleIdAndTitle>();
    Set<Integer> unique_result_id = new HashSet<Integer>();

    for(ArticleIdAndTitle cur_to:to) {
        //Article a = map_id_article_to.get(cur_to.id);
        // The source article is omitted, since GetLToByLFrom() was called for it already (root set contains the source article)
        //if(NodeType.ID_SOURCE_ARTICLE != a.type) {
            ArticleIdAndTitle[] from = getFromByTitleTo (session,
                                cur_to.title, //cur_to.id,
                                PageNamespace.MAIN,
                                // from_titles,        // skip from_titles, i.e. remove pl_from from the result
                                //m_out, m_in,
                                increment);
            for(ArticleIdAndTitle cur_from:from) {
                Links.addTitlesToMaps(cur_from.title, cur_to.title, m_out, m_in);
                if(!unique_result_id.contains(cur_from.id)) {
                    unique_result_id.add (cur_from.id);
                    result           .add (cur_from);
                }

                // redirects - get 'who points to the redirect page'
                // (redirect sources are encoded with a negative id)
                if(cur_from.id < 0) {
                    ArticleIdAndTitle[] from2 = getFromByTitleTo (session,
                                cur_from.title,
                                PageNamespace.MAIN,
                                increment);
                    for(ArticleIdAndTitle c:from2) {
                        // Edges from the redirect's sources go straight to cur_to.
                        Links.addTitlesToMaps(c.title, cur_to.title, m_out, m_in);
                        if(!unique_result_id.contains(c.id)) {
                            unique_result_id.add (c.id);
                            result           .add (c);
                        }
                    }
                }
            }
        //}
    }
    unique_result_id.clear();
    return result.toArray(ArticleIdAndTitle.NULL_ARTICLEIDANDTITLE_ARRAY);
}

/**
 * @param increment - number of articles which could be added to the base set
 * (they refer to one of the pages in the root base)
 *
 * @param n_limit max number of returned articles, negative value means no limit
 */
public static Article[] getLFromByLTo(SessionHolder session, Article[] l_to,
                                int increment, int n_limit,
                                Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    ArticleIdAndTitle[] to = ArticleIdAndTitle.create(l_to);
    // now to[].id may contains id from blacklist, let's delete them
    to = session.category_black_list.DeleteUsingBlackList(session.randomPages(), to, -1);
    if(0 == to.length) {
        return NULL_ARTICLE_ARRAY;
    }
    if(null == to[0].title) {
        System.out.println("Error in Links.getLFromByLTo(): to[0].title is null");
        return NULL_ARTICLE_ARRAY;
    }
    Map<Integer, Article> map_id_article_to = Article.createMapIdToArticleWithoutRedirects (l_to);

    // Map<Integer, List<Integer>> map_id_to__id_from = new HashMap<Integer, List<Integer>> ();
    ArticleIdAndTitle[] from__with_blacklist__redirects;

    /* List<Integer> l = getLFromIDByLTo_WithBlackList(session, to, map_id_article_to,
                                                map_id_to__id_from,
                                                //m_out, m_in,
                                                increment);
    Integer[] id_from__with_blacklist__redirects = (Integer[])l.toArray(NULL_INTEGER_ARRAY);
    from__with_blacklist__redirects = ArticleIdAndTitle.createById(session.connect, id_from__with_blacklist__redirects);
    */
    from__with_blacklist__redirects = getLFromByLTo_WithBlackList(session, to,
                                                // map_id_article_to,
                                                m_out, m_in,
                                                // map_id_to__id_from,
                                                increment);

    // StringMap.fill_m_in_m_out(m_out, m_in,
    //                          from__with_blacklist__redirects, to,
    //                          map_id_to__id_from);

    // Resolve redirects, apply blacklist / skip-lists, build Article objects.
    List<Article> from_articles = createArticlesResolveRedirects(session,
                                    from__with_blacklist__redirects, map_id_article_to,
                                    n_limit, m_out, m_in);
    // l.clear(); l = null;
    // map_id_to__id_from.clear();
    map_id_article_to.clear();
    to = null;
    return (Article[])from_articles.toArray(NULL_ARTICLE_ARRAY);
}

// ********************************************
// To stuff  (walking pagelinks from a source page to its destinations)
//

/** Gets id of page which is referred by page with id_from.
 * It can be used to retrieve id of real page by id of redirect page
 * (a negative id_from is treated as a redirect id and negated first).
 *
 * NOTE(review): the title lookup is always done in PageNamespace.MAIN;
 * the 'namespace' parameter is only used for the final id lookup — confirm
 * this asymmetry is intentional.
 *
 * @return 0 if there is problem
 */
public static int getIdToByIDFrom(SessionHolder session, int id_from, PageNamespace namespace)
{
    int[] i = new int[1];
    i[0] = id_from < 0 ? -id_from : id_from ;
    String[] s = Links.getTitleToByIDFrom(session, i, PageNamespace.MAIN);
    if(null != s && 0 < s.length) {
        return PageTable.getIDByTitleNamespace(session.connect, s[0], namespace);
    }
    return 0;
}

// Scratch buffer for building SQL; shared by several methods (not thread-safe).
private static StringBuffer sb = new StringBuffer(350);
//static int max_titles_len = 0;
//static int max_pl_title_len = 0;

/** Selects one pl_title from pagelinks by pl_from. It is needed for
 * redirect pages, which links only to one page.
 * @return null, if (1) it is absent (e.g.
 it's redirect to category) or
 * (2) title should be skipped
 */
public static String getTitleToOneByIDFrom(SessionHolder session, int pl_from)
{
    if (0==pl_from)
        return null;

    // special treatment of id of redirect page (stored negated)
    if(pl_from < 0)
        pl_from = -pl_from;

    Statement s = null;
    ResultSet rs= null;
    String title = null;

    sb.setLength(0);
    sb.append("SELECT pl_title FROM pagelinks WHERE pl_from=");
    sb.append(pl_from);
    sb.append(" LIMIT 1");
    try {
        s = session.connect.conn.createStatement();
        //str_sql = SELECT pl_title FROM pagelinks WHERE pl_from=52141 LIMIT 1;
        s.executeQuery(sb.toString());
        rs = s.getResultSet();
        if (rs.next()) {
            // Decode the raw bytes with the DB charset, then to the app encoding.
            Encodings e = session.connect.enc;
            String db_str   = Encodings.bytesTo(rs.getBytes("pl_title"), e.GetDBEnc());
            String utf8_str = e.EncodeFromDB(db_str);
            if(!session.skipTitle(utf8_str)) {
                title = utf8_str;
            }
        }
    } catch(SQLException ex) {
        System.err.println("SQLException (Links.java getTitleToOneByIDFrom): sql='" + sb.toString() + "' " + ex.getMessage());
    } finally {
        // Resources are released even on error (swallowing close() failures).
        if (rs != null) {
            try { rs.close(); } catch (SQLException sqlEx) { }
            rs = null;
        }
        if (s != null) {
            try { s.close(); } catch (SQLException sqlEx) { }
            s = null;
        }
    }
    return title;
}

/** Returns titles of destination (to) pages by id of source pages (pl_from), table pagelinks are used.
 * SQL: SELECT pl_title FROM pagelinks WHERE pl_from IN (18991, 22233) AND pl_namespace = 0;
 * @param namespace only pages with this namespace will be selected, value defined in PageTable.NS_MAIN, etc.
 *
 * Return empty array if pl_from={0};
 *
 * SELECT MAX(LENGTH(pl_title)) FROM pagelinks WHERE pl_namespace = 0;
 * ruwiki: 255, real application: 92,52
 *
 * Test size of max_titles_len
 * Robot=11651
 * Russina=8811
 * Todo replace titles ArrayList<String>() by huge static array StringBuffer[][256];
 */
public static String[] getTitleToByIDFrom(SessionHolder session, int[] pl_from, PageNamespace namespace)
{
    if (null==pl_from ||
        (1==pl_from.length && 0==pl_from[0])) {
        return NULL_STRING_ARRAY;
    }

    Statement s = null;
    ResultSet rs= null;
    List<String> titles = new ArrayList<String>();

    sb.setLength(0);
    sb.append("SELECT pl_title FROM pagelinks WHERE pl_from IN (");
    // Prepare SQL IN(...) via pl_from[].page_id
    for (int i=0; i<pl_from.length-1; i++) {
        sb.append(pl_from[i]);
        sb.append(",");
    }
    sb.append(pl_from[ pl_from.length-1 ]);     // skip last comma
    sb.append(") AND pl_namespace=");
    sb.append( namespace.toInt());

    // NOTE(review): size and i are unused; kept unchanged in this doc-only pass.
    int size, i = 0;
    //String str_sql = null;
    try {
        s = session.connect.conn.createStatement();
        //str_sql = "SELECT pl_title FROM pagelinks WHERE " + sb.toString() + " AND pl_namespace="+namespace;
        //System.out.print("GetTitleToByIDFrom sql="+sb.toString());
        s.executeQuery(sb.toString());
        //GetTitleToByIDFromQuery(rs, s, sb);
        //System.out.println(" OK.");
        rs = s.getResultSet();
        while (rs.next()) {
            Encodings e = session.connect.enc;
            String db_str   = Encodings.bytesTo(rs.getBytes("pl_title"), e.GetDBEnc());
            String utf8_str = e.EncodeFromDB(db_str);
            // Session-level skip list filters out unwanted titles.
            if(!session.skipTitle(utf8_str)) {
                titles.add( utf8_str );
                //titles.add(connect.enc.EncodeFromDB(rs.getString("pl_title")));
            }
            /*if(max_pl_title_len < utf8_str.length()) {
                max_pl_title_len = utf8_str.length();
                System.out.println("GetTitleToByIDFrom max_pl_title_len="+max_pl_title_len);
            }*/
        }
        /*if(max_titles_len < titles.size()) {
            max_titles_len = titles.size();
            System.out.println("GetTitleToByIDFrom max_titles_len="+max_titles_len);
        }*/
    } catch(SQLException ex) {
        System.err.println("SQLException (Links.java GetTitleToByIDFrom): sql='" + sb.toString() + "' " + ex.getMessage());
    } finally {
        if (rs != null) {
            try { rs.close(); } catch (SQLException sqlEx) { }
            rs = null;
        }
        if (s != null) {
            try { s.close(); } catch (SQLException sqlEx) { }
            s = null;
        }
    }
    return (String[])titles.toArray(NULL_STRING_ARRAY);
}

/** Single-source variant: returns titles of destination pages for one pl_from
 * and records every (title_from -> title_to) edge into m_out / m_in.
 * Unlike the int[] overload above, this one does NOT apply session.skipTitle().
 *
 * @param skip_titles list of titles to be skipped
 */
public static String[] getTitleToByIDFrom(SessionHolder session,
                                          String title_from,
                                          int pl_from, PageNamespace namespace,
                                          //List<String> skip_titles,
                                          Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    if (0==pl_from) {
        return NULL_STRING_ARRAY;
    }

    Statement s = null;
    ResultSet rs= null;
    List<String> titles = new ArrayList<String>();

    sb.setLength(0);
    sb.append("SELECT pl_title FROM pagelinks WHERE pl_from=");
    sb.append(pl_from < 0 ? -pl_from : pl_from);    // negative id = redirect id
    sb.append(" AND pl_namespace=");
    sb.append( namespace.toInt());

    // NOTE(review): size and i are unused; kept unchanged in this doc-only pass.
    int size, i = 0;
    //String str_sql = null;
    try {
        s = session.connect.conn.createStatement();
        //str_sql = "SELECT pl_title FROM pagelinks WHERE " + sb.toString() + " AND pl_namespace="+namespace;
        //System.out.print("GetTitleToByIDFrom 1 sql="+sb.toString());
        s.executeQuery(sb.toString());
        //GetTitleToByIDFromQuery(rs, s, sb);
        //System.out.println(" OK.");
        rs = s.getResultSet();
        while (rs.next()) {
            Encodings e = session.connect.enc;
            String db_str   = Encodings.bytesTo(rs.getBytes("pl_title"), e.GetDBEnc());
            String utf8_str = e.EncodeFromDB(db_str);
            //if(!session.skipTitle(utf8_str)) {
                //if(!skip_titles.contains(utf8_str)) {
                    titles.add( utf8_str );
                    //titles.add(connect.enc.EncodeFromDB(rs.getString("pl_title")));
                //}
                addTitlesToMaps(title_from, utf8_str, m_out, m_in);
            //}
            /*if(max_pl_title_len < utf8_str.length()) {
                max_pl_title_len = utf8_str.length();
                System.out.println("GetTitleToByIDFrom max_pl_title_len="+max_pl_title_len);
            }*/
        }
        /*if(max_titles_len < titles.size()) {
            max_titles_len = titles.size();
            System.out.println("GetTitleToByIDFrom max_titles_len="+max_titles_len);
        }*/
    } catch(SQLException ex) {
        System.err.println("SQLException (Links.java GetTitleToByIDFrom 1): sql='" + sb.toString() + "' " + ex.getMessage());
    } finally {
        if (rs != null) {
            try { rs.close(); } catch (SQLException sqlEx) { }
            rs = null;
        }
        if (s != null) {
            try { s.close(); } catch (SQLException sqlEx) { }
            s = null;
        }
    }
    return (String[])titles.toArray(NULL_STRING_ARRAY);
}

/** Gets all title_to (with blacklist) by articles from ArticleIdAndTitle array.
 * The source article itself (session.source_article_id) is skipped, since
 * its links were gathered already (the root set contains the source article).
 */
public static List<String> getLToByLFrom_WithBlackList(SessionHolder session,
                                ArticleIdAndTitle[] from,
                                Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    String[] title_to__with_blacklist;
    List<String> l = new ArrayList<String>();
    for(ArticleIdAndTitle aid:from) {
        // The source article is omitted, since GetLToByLFrom() was called for it already (root set contains the source article)
        if(session.source_article_id != aid.id) {
        //if(NodeType.ID_SOURCE_ARTICLE != a.type) {
            title_to__with_blacklist = getTitleToByIDFrom(session,
                                aid.title, aid.id, PageNamespace.MAIN,
                                m_out, m_in);
            l.addAll(Arrays.asList(title_to__with_blacklist));
        }
    }
    return l;
}

// _m_out - local map<title of article, list of titles links_out>
// _m_in  - local map<title of article, list of titles links_in>
//private static Map<String,Set<String>> _m_out = new HashMap<String,Set<String>>();
//private static Map<String,Set<String>> _m_in  = new HashMap<String,Set<String>>();

/** Creates destination articles (fills ->id, ->title, ->links_in,
 * ->links_out, ->redirect) by the source articles id, number of adding
 * articles restricted by n_limit.
 *
 * @param n_limit max number of returned articles, negative value means no limit
 * Todo: select first n links in article (not first n links in table)
 *
 * @param m_out map <title of article, list of titles links_out>
 * @param m_in map <title of article, list of titles links_in>
 *
 * The tables page and pagelinks are used.
 * The scheme: pagelinks.pl_from -> pl_title=page_title -> page.page_id.
 * SQL
 * SELECT pl_title FROM pagelinks WHERE pl_from IN (18991, 22233) AND pl_namespace = 0;
 * out: 17 rows in set (0.24 sec), e.g.: Бархан_(Soundwave), Комикс
 * foreach pl_title:
 *      PageTable p.GetIDByTitle(pl_title);
 *
 * Remark:
 * The article is omitted
 * if (1) article id is absent in the table page,
 * or (2) article's categories are in the blacklist,
 * or (3) article title is in pl_from[].
 * The source article is omitted, since this func was called for it already.
 */
public static Article[] getLToByLFrom(SessionHolder session, Article[] pl_from, int n_limit,
                                      Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    ArticleIdAndTitle[] from = ArticleIdAndTitle.create(pl_from);
    // now from[].id may contains id from blacklist, let's delete them
    from = session.category_black_list.DeleteUsingBlackList(session.randomPages(), from, -1);
    if(0 == from.length) {
        return NULL_ARTICLE_ARRAY;
    }
    if(null == from[0].title) {
        System.out.println("Error in Links.getLToByLFrom(): from[0].title is null");
        return NULL_ARTICLE_ARRAY;
    }
    Map<Integer, Article> map_id_article_from = Article.createMapIdToArticleWithoutRedirects(pl_from);

    //_m_in. clear();
    // _m_out.clear();
    List<String> l = getLToByLFrom_WithBlackList(session, from,
                                    //map_id_article_from,
                                    m_out, m_in);

    ArticleIdAndTitle[] to__with_blacklist__redirects;
    String[] title_to__with_blacklist__redirects__zero_id = (String[])l.toArray(NULL_STRING_ARRAY);
    to__with_blacklist__redirects = ArticleIdAndTitle.createByTitle (session.connect,
                                    m_out, m_in,
                                    title_to__with_blacklist__redirects__zero_id);

    // Resolve redirects, apply blacklist / skip-lists, build Article objects.
    List<Article> dest_articles = createArticlesResolveRedirects (session,
                                    to__with_blacklist__redirects, map_id_article_from,
                                    n_limit, m_out, m_in);
    l.clear();  l = null;
    map_id_article_from.clear();
    from = null;
    return (Article[])dest_articles.toArray(NULL_ARTICLE_ARRAY);
}

/** Replaces redirects by target articles in m_in, m_out; creates articles.
 *
 * Pipeline: join with already-known articles, resolve redirects,
 * drop skipped titles, apply the category blacklist, then create
 * Article objects for the survivors (limited by n_limit).
 */
public static List<Article> createArticlesResolveRedirects( SessionHolder session,
                            ArticleIdAndTitle[] aid__with_blacklist__redirects,
                            Map<Integer, Article> map_id_article_exist,
                            int n_limit,
                            Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    ArticleIdAndTitle[] aid, aid__with_blacklist;
    aid__with_blacklist__redirects = ArticleIdAndTitle.join(aid__with_blacklist__redirects, map_id_article_exist);
    aid__with_blacklist = Redirect.resolveByIdAndTitles(session, m_out, m_in, aid__with_blacklist__redirects);
    aid__with_blacklist = ArticleIdAndTitle.skipTitles(session, aid__with_blacklist);
    StringMap.skipTitles(session, m_out, m_in);
    aid = session.category_black_list.DeleteUsingBlackList(aid__with_blacklist);
    return createArticlesByIdAndTitle(session, aid, map_id_article_exist,
                                        //_m_in,
                                        n_limit, m_out, m_in);
}

/** Creates list of articles for each element in 'aid_array' if it is not
 * presented in 'map_id_article_exist'.
 * Updates .redirects of articles in
 * 'map_id_article_exist' by adding .redirects from 'aid_array'.
 * Result list is limited by 'n_limit' value.
 *
 * @param aid_array array of (id,title) the articles will be created with
 * @param map_id_article_exist map from ID to an existing article,
 *                            they could be updated, but they should not be duplicated
 *                            in the returned list
 *
 * @param n_limit max number of returned articles, negative value means no limit
 * @param m_out map <title of article, list of titles links_out>
 * @param m_in map <title of article, list of titles links_in>
 */
public static List<Article> createArticlesByIdAndTitle( SessionHolder session,
                            ArticleIdAndTitle[] aid_array,
                            Map<Integer, Article> map_id_article_exist,
                            //Map<String,Set<String>> local_m_in, // documentation: Adds local_m_in map to m_in, m_out.
                            int n_limit,
                            Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    // todo sort aid_array: articles with id > 0 move to begin, i.e. redirect to end, in order to skip additional retrieving of redirect pages from DB.
    // ...
    List<Article> articles = new ArrayList<Article>();
    for(ArticleIdAndTitle aid:aid_array) {
        int id = aid.id;
        if(0 != id) {
            Article a;
            if(map_id_article_exist.containsKey(id)) {
                // Known article: only merge the redirect info.
                a = map_id_article_exist.get(id);
                Redirect.addRedirect(a, aid.redirect);
            } else {
                // Negative id means an unresolved redirect reached this point —
                // presumably a bug upstream in the redirect resolution; log it.
                if(id < 0) {
                    System.out.println("Error: Article is created for the redirect aid with id="+id+"; title="+aid.title);
                }
                a = new Article();
                a.page_id    = id;
                a.page_title = aid.title;
                //a.page_title = Encodings.FromTo(s, "ISO8859_1", "UTF8");
                a.redirect   = aid.redirect;
                a.id_categories = CategoryBlackList.getFirstLevelCategoriesID (session, id);
                articles.add(a);

                //                         -->
                //                   / -->
                // m_out (title_from) ---->  \
                //                            \ ----> m_in (title_to)
                //                     -->   /
                //                         -->
                // title_to is known
                /*if(local_m_in.containsKey(to)) {
                    for(String _title_from:local_m_in.get(to)) {
                        addTitlesToMaps(_title_from, to, m_out, m_in);
                    }
                }*/

                // n_limit only counts NEWLY created articles (updates are free).
                if(n_limit>=0 && articles.size() >= n_limit) {
                    break;
                }
            }
        }
    }
    return articles;
}

/** Adds title_from and title_to to maps.
 * Records the edge title_from -> title_to in both adjacency maps.
 */
public static void addTitlesToMaps( String title_from, String title_to,
                                    Map<String,Set<String>> m_out, Map<String,Set<String>> m_in )
{
    if(!m_out.containsKey(title_from))
        m_out.put(title_from, new HashSet<String>());
    m_out.get(title_from).add(title_to);

    if(!m_in.containsKey(title_to))
        m_in.put(title_to, new HashSet<String>());
    m_in.get(title_to).add(title_from);
}

// Scratch one-element id array reused by getAllLinksFromNodes (not thread-safe).
private static final int[] _one = new int[1];

/** Gets all links from nodes_from to vertices in map_title_article by function ..LtoByLFrom()
 *
 * @param m_out map <title of article, list of titles links_out>
 * @param m_in map <title of article, list of titles links_in>
 *
 * @see great superb picture at LinksBaseSet.CreateBaseSet() describing nodes_from
 */
public static void getAllLinksFromNodes(SessionHolder session,
                            Map<String, Article> map_title_article,
                            Article[] nodes_from,
                            Map<String,Set<String>> m_out, Map<String,Set<String>> m_in)
{
    // gets (foreach article) id of destination pages (pl_from, _to(pl_namespace.pl_title))
    for(Article a:nodes_from) {
        int id_from = a.page_id;
        String title_from = a.page_title;
        _one[0] = id_from < 0 ? -id_from : id_from ;    // negative id = redirect id
        String[] titles_to = getTitleToByIDFrom(session, _one, PageNamespace.MAIN);
        if(null != titles_to) {
            for(String t:titles_to) {
                // Only edges whose target is inside the working set are recorded.
                if(map_title_article.containsKey(t)) {
                    addTitlesToMaps(title_from, t, m_out, m_in);
                }
            }
        }
    }

    // copy Lists m_out, m_in to links_out[], links_in[]
    for(String t:m_out.keySet()) {
        if(map_title_article.containsKey(t)) {
            Article a = map_title_article.get(t);
            a.links_out = Article.getIdExistedInMap(m_out.get(t), map_title_article);
        }
    }
    for(String t:m_in.keySet()) {
        if(map_title_article.containsKey(t)) {
            Article a = map_title_article.get(t);
            a.links_in = Article.getIdExistedInMap(m_in.get(t), map_title_article);
        }
    }
}

/**
 * Gets all links which link the articles in the hashmap.
 * Write them to articles.links_in[] and links_out[].
 *
 * old mediawiki 1.4
 * Wrong and slow SQL:
 * SELECT l_to,l_from FROM links WHERE l_to IN (18991) OR l_from IN (18991); 0.9, 0.75 second: 0.3
 *
 * Right and fast SQL:
 * SELECT l_to,l_from FROM links WHERE l_to IN (18991) AND l_from IN (18991); 0.17 second: 0.00
 *
 * SELECT l_to,l_from FROM links WHERE l_to IN (18991); 0.02
 *//*
public static void getAllLinks(SessionHolder session, Map<String, Article> map_title_article) {
    // m_out - local map<id of article, list of links_out>
    // m_in  - local map<id of article, list of links_in>
    Map<String,Set<String>> m_out = new HashMap<String,Set<String>>();
    Map<String,Set<String>> m_in  = new HashMap<String,Set<String>>();

    // gets (foreach article) id of destination pages (pl_from, _to(pl_namespace.pl_title))
    for(Article a_from:map_title_article.values()) {
        int id_from = a_from.page_id;
        String title_from = a_from.page_title;
        int[] i = new int[1];
        i[0] = id_from < 0 ? -id_from : id_from ;
        String[] titles_to = getTitleToByIDFrom(session, i, PageNamespace.MAIN);
        i = null;
        if(null != titles_to) {
            for(String t:titles_to) {
                if(map_title_article.containsKey(t)) {
                    addTitlesToMaps(title_from, t, m_out, m_in);
                }
            }
        }
    }

    // copy Lists m_out, m_in to links_out[], links_in[]
    for(String t:m_out.keySet()) {
        Set<String> ss = m_out.get(t);
        Article a = map_title_article.get(t);
        a.links_out = new int[ss.size()];
        int i=0;
        for(String s:ss) {
            a.links_out[i++] = map_title_article.get(s).page_id;
        }
    }
    for(String t:m_in.keySet()) {
        Set<String> ss = m_in.get(t);
        Article a = map_title_article.get(t);
        a.links_in = new int[ss.size()];
        int i=0;
        for(String s:ss) {
            a.links_in[i++] = map_title_article.get(s).page_id;
        }
    }
    m_out.clear();
    m_in.clear();
    m_in  = null;
    m_out = null;
}*/
}