/*
* PageTableBase.java - SQL operations with the table page in wikipedia
*
* Copyright (c) 2005-2009 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU Public License.
*/
package wikipedia.sql;
import wikipedia.language.Encodings;
import wikipedia.util.*;
import java.sql.*;
/** The operations with the page table in MySQL wikipedia */
public class PageTableBase {
public PageTableBase() {
}
private static final PageTableData page_data = new PageTableData();
private static class PageTableData {
public int page_id;
public PageNamespace page_namespace; //public byte page_namespace;
public String page_title;
public boolean page_is_redirect;
public PageTableData () {
init();
}
public void init() {
page_id = 0;
page_namespace = null;
page_title = null;
page_is_redirect = false;
}
public void cleanup() {
page_namespace = null;
page_title = null;
}
}
private final static String NULL_STRING = new String("");
/** Gets the article text by article title.
* SQL:
* SELECT old_text FROM page,text WHERE page.page_title='article_title' AND page_namespace=0 AND page.page_latest=text.old_id;
*
SELECT old_text FROM page,text WHERE page.page_title=CONVERT('Żagań' USING utf8) AND page_namespace=0 AND page.page_latest=text.old_id limit 1;
+
SELECT old_text FROM page,text WHERE CONVERT(page.page_title USING utf8)='Żagań' AND page_namespace=0 AND page.page_latest=text.old_id limit 1;
+
SELECT old_text FROM page,text WHERE page.page_title=CONVERT('Żagań' USING latin1) AND page_namespace=0 AND page.page_latest=text.old_id limit 1;
+
* COLLATE utf8_general_ci
SELECT old_text FROM page,text WHERE page.page_title='Żagań' AND page_namespace=0 AND page.page_latest=text.old_id COLLATE latin1_bin;
SELECT * FROM page WHERE page.page_title='Żagań';
SELECT * FROM page WHERE page.page_title='Momotarō';
* Problem:
* SQLException in PageTableBase.getArticleText():: sql='SELECT old_text FROM page,text
* WHERE page.page_title='Å»agaÅ' AND page_namespace=0 AND page.page_latest=text.old_id'
* Illegal mix of collations (latin1_bin,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation '='
*/
public static String getArticleText(Connect connect, String page_title)
{
String old_text = "";
String str_sql = null;
if(null==connect || null==connect.conn)
return old_text;
try {
Statement s = connect.conn.createStatement ();
//page_title = convertToSafeStringEncodeToDB (connect, page_title);
page_title = convertToSafeStringEncodeToDBWunderscore (connect, page_title);
// Get text
//page_title = Encodings.FromTo(page_title, "UTF8", "ISO8859_1");
//page_title = Encodings.FromTo(page_title, "UTF8", "ISO8859_1");
//page_title = Encodings.FromTo(page_title, "Cp1251","ISO8859_1");
str_sql = "SELECT old_text FROM page,text WHERE page.page_title='"+page_title+"' AND page_namespace=0 AND page.page_latest=text.old_id";
ResultSet rs = s.executeQuery (str_sql);
// Get title
//s.executeQuery ("SELECT page_title FROM page WHERE page_id = 10332"); // Out: Глаз
//s.executeQuery ("SELECT page_title FROM page WHERE page_id = 1180710"); // Out: Gratitude
if (rs.next ())
{
// Get title
//title = bytesToUTF8(rs.getBytes("page_title"));
//page_text = "";
// Get text
// page_text = rs.getString("old_text"); // may be work - unreadable chars
// page_text = Latin1ToUTF8(rs.getString("old_text")); //error: ??? - question marks
old_text = Encodings.bytesToUTF8(rs.getBytes("old_text"));
}
rs.close ();
s.close ();
} catch(SQLException ex) {
System.err.println("SQLException in PageTableBase.getArticleText():: sql='" + str_sql +
"' " + ex.getMessage());
}
return old_text;
}
public static String convertToSafeStringEncodeToDB(Connect connect, String s) {
if(null == s || s.length() == 0)
return "";
String safe_title = StringUtil.spaceToUnderscore(
StringUtil.escapeChars(s));
return connect.enc.EncodeToDB(safe_title);
}
/** Converts to safe DB string, but without replacement of a space by an underscore symbol. */
public static String convertToSafeStringEncodeToDBWunderscore(Connect connect, String s) {
String safe_title = StringUtil.escapeChars(s);
return connect.enc.EncodeToDB(safe_title);
}
/** Finds the first position of wildcard characters in the string starting
* from the "from_index" character.
* Wildcards are the asterisk character ("*") and the question mark (?).
*
* If no such value exists, then -1 is returned.
*/
public static int getFirstPositionOfWildcardCharacter(String s, int from_index) {
int n_question = s.indexOf("?", from_index);
int n_asterisk = s.indexOf("*", from_index);
boolean b_question = -1 != n_question;
boolean b_asterisk = -1 != n_asterisk;
if(!b_question && !b_asterisk) {
return -1;
}
if(b_question && b_asterisk) {
return Math.min(n_question, n_asterisk);
}
if(b_question)
return n_question;
return n_asterisk;
}
/** Substitutes the asterisk character ('*') and the question mark ('?')
* by database wildcard characters (% and _).
*
* @param ch wildcard character '*' or '?'
* @param b_double_question if true then double "??" for 'LIKE'
* of non ASCII characters (e.g. cyrillics)
*/
private static String convertWildCharacter(char ch,boolean b_double_question) {
if('*' == ch) {
return "%";
} else {
if('?' == ch) {
if(b_double_question) {
return "__";
} else
return "_";
} else {
//System.out.println("Error in PageTableBase.convertWildCharacter(), character '"+ch+"' is not wildcard.");
return new StringBuffer(ch).toString();
}
}
}
/** Substitutes the asterisk character ("*") and the question mark (?)
* by database wildcard characters (% and _).
*
* Converts to safe database string (except "*" and "?"),
* but without replacement of a space by an underscore symbol.
*
* If the text does not contain wildcard characters then text .= '%'.
*/
public static String convertWildcardToDatabaseChars (Connect connect, String s){
if (null == s || 0 == s.length()) {
//System.out.println("Error in PageTableBase.convertWildcardToDatabaseChars(), argument is null.");
//return NULL_STRING;
return "%";
}
String safe_title = StringUtil.escapeChars(s);
String encoded_title = connect.enc.EncodeToDB(safe_title);
int from_index = 0;
int n_question_or_asterisk = getFirstPositionOfWildcardCharacter(s, from_index);
if(-1 == n_question_or_asterisk)
return encoded_title.concat("%");
// ASCII texts are happy with '?', but cyrillics require '??'
boolean b_double_question = encoded_title.length() > s.length();
StringBuffer sb = new StringBuffer();
while(-1 != n_question_or_asterisk) {
String s_to_convert = s.substring(from_index, n_question_or_asterisk);
safe_title = StringUtil.escapeChars(s_to_convert);
sb.append( connect.enc.EncodeToDB(safe_title) );
sb.append( convertWildCharacter(s.charAt(n_question_or_asterisk), b_double_question) );
from_index = n_question_or_asterisk + 1;
n_question_or_asterisk = getFirstPositionOfWildcardCharacter(s, from_index);
}
int len = s.length();
if(from_index != len) {
String s_to_convert = s.substring(from_index, len);
safe_title = StringUtil.escapeChars(s_to_convert);
sb.append( connect.enc.EncodeToDB(safe_title) );
} //else {
// last symbol is wildcard ('*' or '?')
//sb.append( convertWildCharacter(s.charAt(len - 1)) );
//}
return sb.toString();
}
/** Converts 'source' string into safe (for storing in a database) string,
* substitutes the asterisk character ('*') and the question mark ('?')
* by database wildcard characters (% and _).
*/
public static String convertToSafeWithWildCard(Connect connect, String source)
{
String s = source;
if(connect.isMySQL()) {
s = PageTableBase.convertWildcardToDatabaseChars(connect, source);
//s = PageTableBase.convertToSafeStringEncodeToDBWunderscore(connect, prefix);
} else {
// SQLite
if(-1 == PageTableBase.getFirstPositionOfWildcardCharacter(source, 0))
s = s.concat("%");
else
s = s.replace('*', '%').replace('?', '_');
}
return s;
}
private static StringBuffer sb = new StringBuffer(255);
/** Gets id of articles via Title:Namespace.
* Namespace could be MAIN (article), CATEGORY, ... .
* @return -id if article is the redirect page.
* @return 0 if article id is absent in the table page, or interwiki.
*/
public static int getIDByTitleNamespace(Connect connect, String page_title, PageNamespace namespace)
{
int page_id = 0;
if(null==connect || null==connect.conn)
return page_id;
if(StringUtil.isInterWiki(page_title)) {
//System.out.println("isInterWiki = "+page_title);
return 0;
}
sb.setLength(0);
try {
Statement s = connect.conn.createStatement ();
String safe_title = convertToSafeStringEncodeToDB (connect, page_title);
// Get ID
//str_sql = "SELECT page_id FROM page WHERE page_namespace=" + namespace + " AND page_title='"+safe_title+"'";
//str_sql = "SELECT page_id, page_is_redirect FROM page WHERE page_namespace=" + namespace + " AND page_title='"+safe_title+"'";
sb.append("SELECT page_id, page_is_redirect FROM page WHERE page_namespace=");
sb.append(namespace.toInt());
sb.append(" AND page_title='");
sb.append( safe_title);
sb.append( "'");
ResultSet rs = s.executeQuery (sb.toString());
if (rs.next ())
{
// Get title and redirectness
page_id = rs.getInt("page_id");
if(1 == rs.getInt("page_is_redirect"))
page_id = - page_id;
}
rs.close ();
s.close ();
} catch(SQLException ex) {
System.err.println("SQLException (PageTableBase.java GetIDByTitleNamespace): sql='" + sb.toString() + "' " + ex.getMessage());
}
return page_id;
}
/** Gets page's id by page's title (article, non-category).
* @return 0 if article id is absent in the table page.
* @return -id if article is the redirect page.
*/
public static int getIDByTitle(Connect connect, String page_title)
//public static int GetIDByTitle(Connect connect, byte[] page_title)
{
return getIDByTitleNamespace(connect, page_title, PageNamespace.MAIN);
}
//public static int GetCategoryIDByTitle(Connect connect, byte[] cur_title)
public static int getCategoryIDByTitle(Connect connect, String page_title)
{
return getIDByTitleNamespace(connect, page_title, PageNamespace.CATEGORY);
}
/** Gets row from the table 'page' by identifier.
* @return ??? null if article's title is absent in the table page.
*
* Attention: the value of returned object PageTableData will be changed
* during the next function call.
*/
public static PageTableData getPageTableDataByID(Connect connect, int id) {
Statement s = null;
ResultSet rs= null;
// special treatment of id of redirect page
if(id < 0)
id = -id;
if(null==connect || null==connect.conn)
return null;
page_data.init();
page_data.page_id = id;
try {
s = connect.conn.createStatement ();
sb.setLength(0);
sb.append("SELECT page_namespace, page_title, page_is_redirect FROM page WHERE page_id=");
sb.append(id);
rs = s.executeQuery(sb.toString()); //GetTitleByIDQuery(rs, s, sb);
if (rs.next ())
{
Encodings e = connect.enc;
//title = Encodings.bytesTo(rs.getBytes("page_title"), e.GetDBEnc());
String db_str = Encodings.bytesTo(rs.getBytes("page_title"), e.GetDBEnc());
page_data.page_title = e.EncodeFromDB(db_str);
//title = Encodings.bytesTo(rs.getBytes("page_title"), enc.GetUser()); // ISO8859_1 UTF8
//title = Encodings.bytesTo(rs.getBytes("page_title"), "ISO8859_1"); //
page_data.page_namespace = PageNamespace.get(rs.getInt("page_namespace"));
page_data.page_is_redirect = 1 == rs.getInt("page_is_redirect");
//System.out.println("id="+id+"; title="+page_data.page_title+
// "; ns="+page_data.page_namespace.toInt()+
// "; is_redirect="+page_data.page_is_redirect);
}
} catch(SQLException ex) {
System.err.println("SQLException (PageTableBase.java getTitleByID()): " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return page_data;
}
/** Gets page's or category title by identifier.
* @return null if article's title is absent in the table page.
*/
public static String getTitleByID(Connect connect, int id) {
PageTableData d = getPageTableDataByID(connect, id);
if(null == d)
return "";
String s = d.page_title;
d.cleanup();
return s;
}
/** Gets page's title by identifier, the page is not redirect.
* @return null if article's title is absent in the table 'page' or it is
* a redirect page.
*/
public static String getArticleTitleNotRedirectByID (Connect connect, int id) {
PageTableData d = getPageTableDataByID(connect, id);
if(null == d)
return null;
if(0 != d.page_id && !d.page_is_redirect &&
d.page_namespace == PageNamespace.MAIN) {
String s = d.page_title;
d.cleanup();
return s;
} else {
d.cleanup();
return null;
}
}
/** Gets page's title by identifier, the page is not redirect.
* @return null if article's title is absent in the table 'page' or it is
* a redirect page.
*/
public static String getCategoryTitleByID (Connect connect, int id) {
PageTableData d = getPageTableDataByID(connect, id);
if(null == d)
return null;
if(0 != d.page_id &&
d.page_namespace == PageNamespace.CATEGORY)
{
String s = d.page_title;
d.cleanup();
return s;
} else {
d.cleanup();
return null;
}
}
/*
public static void GetTitleByIDQuery(ResultSet rs, Statement s,StringBuffer sb) {
try {
s.executeQuery(sb.toString());
} catch(SQLException ex) {
System.err.println("SQLException (PageTableBase.java GetTitleByID()): " + ex.getMessage());
}
}*/
/** Gets type of the page.
* @return Returns namespace MAIN for the article, CATEGORY for the category,
* else null (old -1).
*/
public static PageNamespace getNamespaceByID (Connect c, int id) {
PageNamespace ns = null; //byte ns = -1;
Statement s = null;
ResultSet rs= null;
try {
s = c.conn.createStatement ();
rs = s.executeQuery ("SELECT page_namespace FROM page WHERE page_id=" + id);
if (rs.next ())
{
int i_ns = rs.getInt("page_namespace");
if (PageNamespace.isValid(i_ns))
ns = PageNamespace.get(i_ns);
}
} catch(SQLException ex) {
System.err.println("SQLException (PageTableBase.java GetNamespaceByID()): " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return ns;
}
}