/*
* PageTableAll.java - worker with all pages in the WP table 'page'.
*
* Copyright (c) 2005-2008 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU General Public License.
*/
package wikidf;
import wikipedia.language.LanguageType;
import wikipedia.language.Encodings;
import wikipedia.sql.*;
import java.sql.*;
import gate.*;
import gate.util.*;
/** Worker with all pages in the WP table 'page'.
*/
public class PageTableAll {
private static final boolean DEBUG = true;
/** pages which caused crash of program (Russian Wikipedia) - for fast debug */
private static final String[] debug_pages = {"Борланд,_Вес", "Atom", "BSD_DPL", "Sitemaps",
"WML", "XML", "XPath", "Апостроф_(диакритический_знак)", "Восход_и_заход_Солнца",
"Знак_ударения", "Бангладеш", "Административные_единицы_Китая_до_уезда", "Аум_Синрикё"};
/** Selects all pages (not categories, not redirects), stores to the IDF db.
* SQL:
* SELECT page_title FROM page WHERE page_namespace=0 AND page_is_redirect=0;
*
*/
public static void parseAllPages(Connect wp_conn,
LanguageType wiki_lang, boolean b_remove_not_expand_iwiki,
Connect idf_conn,Corpus corpus,StandAloneRussianPOSTagger prs,
int doc_freq_max)
throws GateException
{
Statement s = null;
ResultSet rs= null;
long t_start;
float t_work;
int n_total = Statistics.CountArticlesNonRedirects(wp_conn);
System.out.println("Total pages: " + n_total);
t_start = System.currentTimeMillis();
try {
s = wp_conn.conn.createStatement ();
StringBuffer sb = new StringBuffer();
sb.append("SELECT page_title FROM page WHERE page_namespace=0 AND page_is_redirect=0");
s.executeQuery(sb.toString()); //GetTitleByIDQuery(rs, s, sb);
rs = s.getResultSet ();
int n_cur = 0;
while (rs.next ())
{
Encodings e = wp_conn.enc;
//title = Encodings.bytesTo(rs.getBytes("page_title"), e.GetDBEnc());
String db_str = Encodings.bytesTo(rs.getBytes("page_title"), e.GetDBEnc());
String page_title = e.EncodeFromDB(db_str);
//title = Encodings.bytesTo(rs.getBytes("page_title"), enc.GetUser()); // ISO8859_1 UTF8
//title = Encodings.bytesTo(rs.getBytes("page_title"), "ISO8859_1"); //
// test problem pages:
/*if (n_cur < debug_pages.length)
page_title = wp_conn.enc.EncodeFromJava(debug_pages[n_cur]); //"Борланд,_Вес"
else
break;*/
//page_title = wp_conn.enc.EncodeFromJava("MTR"); // Sanskrit
if(DEBUG && 0 == ++n_cur % 100) { // % 100
//if(n_cur<10900)
// continue;
long t_cur, t_remain;
t_cur = System.currentTimeMillis() - t_start;
t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur)));
// where time for 1 page = t_cur / n_cur
// in min, since /(60*1000)
t_cur = (long)(t_cur/(60f*1000f));
//t_cur = t_cur/(60f*1000f));
System.out.println(n_cur + ": " + page_title +
", duration: " + t_cur + // t_cur/(60f*1000f) +
" min, remain: " + t_remain +
" min");
}
Keeper.parseFromWP(
wp_conn, page_title,
wiki_lang, b_remove_not_expand_iwiki,
idf_conn, corpus, prs,
doc_freq_max);
}
} catch(SQLException ex) {
System.err.println("SQLException (parseAllPages.java PageTableAll()): " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
long t_end;
t_end = System.currentTimeMillis();
t_work = (t_end - t_start)/1000f; // in sec
System.out.println("\n\nTime sec:" + t_work +
"\nTotal pages: " + n_total);
}
}