/* TPage.java - SQL operations with the table 'page' in Wiktionary parsed database. * * Copyright (c) 2009-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wikokit.base.wikt.sql; import wikokit.base.wikipedia.sql.Connect; import wikokit.base.wikipedia.sql.PageTableBase; import wikokit.base.wikipedia.language.Encodings; //import wikt.api.WTMeaning; //import wikt.word.*; //import wikipedia.util.StringUtil; import java.util.List; import java.util.ArrayList; import java.sql.*; /** An operations with the table 'page' in MySQL wiktionary_parsed database. */ public class TPage { /** Unique page identifier. */ private int id; /** Title of the wiki page, word. */ private String page_title; /** Size of the page in words. */ private int word_count; /** Size of the page as a number of wikified words at the page * (number of out-links). */ private int wiki_link_count; /** true, if the page_title exists in Wiktionary * false, if the page_title exists only as a [[|wikified word]] */ private boolean is_in_wiktionary; /** Hard redirect defined by #REDIRECT * @see TLangPOS.redirect_type and .lemma - a soft redirect. */ private boolean is_redirect; /** Redirected page, i.e. target or destination page. * It is null for usual entries. * * Hard redirect defined by #REDIRECT", * @see TLangPOS.redirect_type and .lemma - a soft redirect. */ private String redirect_target; /** Array of language-POS with this page_title */ private TLangPOS[] lang_pos; private final static TPage[] NULL_TPAGE_ARRAY = new TPage[0]; private final static TLangPOS[] NULL_TLANGPOS_ARRAY = new TLangPOS[0]; public TPage(int _id,String _page_title,int _word_count,int _wiki_link_count, boolean _is_in_wiktionary, String _redirect_target) { id = _id; page_title = _page_title; word_count = _word_count; wiki_link_count = _wiki_link_count; is_in_wiktionary = _is_in_wiktionary; is_redirect = null != _redirect_target; redirect_target = _redirect_target; lang_pos = NULL_TLANGPOS_ARRAY; } /*public void init() { id = 0; page_title = ""; word_count = 0; wiki_link_count = 0; is_in_wiktionary = false; }*/ @Override public String toString() { return "id=" + id + "; page_title=" + page_title; } /** Gets unique ID from database */ public int getID() { return id; } /** Gets title of the wiki page, word. */ public String getPageTitle() { return page_title; } /** Gets number of words, size of the page in words. */ public int getWordCount() { return word_count; } /** Gets number of out-links, size of the page as a number of wikified words. */ public int getWikiLinkCount() { return wiki_link_count; } /** Returns true, if the page_title exists in Wiktionary. */ public boolean isInWiktionary() { return is_in_wiktionary; } /** Returns true, if the page_title is a #REDIRECT in Wiktionary. * @see TLangPOS.redirect_type and .lemma - a soft redirect. */ public boolean isRedirect() { return is_redirect; } /** Gets a redirected page, i.e. target or destination page. * It is null for usual entries. */ public String getRedirect() { return redirect_target; } /** Sets array: language and part of speech. */ public void setLangPOS(TLangPOS[] _lang_pos) { lang_pos = _lang_pos; } /** Gets ID of a record or inserts record (if it is absent) * into the table 'page'.<br><br> * * @param page_title title of wiki page * @param word_count size of the page in words * @param wiki_link_count number of wikified words at the page * @param is_in_wiktionary true, if the page_title exists in Wiktionary * @param redirect_target redirected (target, destination) page, * it is null for usual entries */ public static TPage getOrInsert (Connect connect,String _page_title, int _word_count,int _wiki_link_count, boolean _is_in_wiktionary,String _redirect_target) { TPage p = TPage.get(connect, _page_title); if(null == p) p = TPage.insert(connect, _page_title, _word_count, _wiki_link_count, _is_in_wiktionary, _redirect_target); else { if( p.is_in_wiktionary != _is_in_wiktionary) { TPage.setIsInWiktionary(connect, _page_title, _is_in_wiktionary); p.is_in_wiktionary = _is_in_wiktionary; } } return p; } /** Inserts record into the table 'page'.<br><br> * INSERT INTO page (page_title,word_count,wiki_link_count,is_in_wiktionary) VALUES ("apple",1,2,TRUE); * * or with redirect: * INSERT INTO page (page_title,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target) VALUES ("apple",1,2,TRUE,TRUE,"test_neletnwi"); * @param page_title title of wiki page * @param word_count size of the page in words * @param wiki_link_count number of wikified words at the page * @param is_in_wiktionary true, if the page_title exists in Wiktionary * @param redirect_target redirected (target, destination) page, * it is null for usual entries */ public static TPage insert (Connect connect,String page_title,int word_count,int wiki_link_count, boolean is_in_wiktionary,String redirect_target) { StringBuilder str_sql = new StringBuilder(); TPage page = null; boolean is_redirect = null != redirect_target && redirect_target.length() > 0; try { Statement s = connect.conn.createStatement (); try { str_sql.append("INSERT INTO page (page_title,word_count,wiki_link_count,is_in_wiktionary"); if(is_redirect) str_sql.append(",is_redirect,redirect_target"); str_sql.append(") VALUES (\""); String safe_title = PageTableBase.convertToSafeStringEncodeToDBWunderscore(connect, page_title); str_sql.append(safe_title); str_sql.append("\","); str_sql.append(word_count); str_sql.append(","); str_sql.append(wiki_link_count); str_sql.append(","); str_sql.append(is_in_wiktionary); if(is_redirect) {// ,TRUE,"test_neletnwi" str_sql.append(",TRUE,\""); str_sql.append(PageTableBase.convertToSafeStringEncodeToDBWunderscore(connect, redirect_target)); str_sql.append("\""); } str_sql.append(")"); if(s.executeUpdate (str_sql.toString()) > 0) { ResultSet rs = connect.conn.prepareStatement( "SELECT LAST_INSERT_ID() AS id" ).executeQuery(); try { if (rs.next ()) { page = new TPage(rs.getInt("id"), page_title, word_count, wiki_link_count, is_in_wiktionary, redirect_target); //System.out.println("TPage insert()):: id=" + rs.getInt("id") + // "; page_title='" + page_title + "'"); } } finally { rs.close(); } } } finally { s.close(); } }catch(SQLException ex) { System.out.println("SQLException (TPage.insert()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } return page; } /** Update the field 'is_in_wiktionary' in the table 'page', * record is identiied by 'page_title'.<br><br> * UPDATE page SET is_in_wiktionary=1 WHERE page_title="centi-"; * * @param page_title unique title of an wiki page * @param is_in_wiktionary true, if the page_title exists in Wiktionary */ public static void setIsInWiktionary (Connect connect,String page_title, boolean is_in_wiktionary) { StringBuilder str_sql = new StringBuilder(); try { Statement s = connect.conn.createStatement (); try { if(is_in_wiktionary) str_sql.append("UPDATE page SET is_in_wiktionary=1"); else str_sql.append("UPDATE page SET is_in_wiktionary=0"); str_sql.append(" WHERE page_title=\""); String safe_title = PageTableBase.convertToSafeStringEncodeToDBWunderscore(connect, page_title); str_sql.append(safe_title); str_sql.append("\""); s.executeUpdate (str_sql.toString()); } finally { s.close(); } }catch(SQLException ex) { System.out.println("SQLException (wikt_parsed TPage.setIsInWiktionary()):: page_title='"+page_title+"'; sql='" + str_sql.toString() + "' " + ex.getMessage()); } } /** Selects row from the table 'page' by the page_title. * (1) Without conversion into the database encoding for SQLite. * (2) With conversion for MySQL. * * SELECT id,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE page_title="apple"; * * @param page_title title of Wiktionary article * @return null if page_title is absent */ public static TPage get (Connect connect,String page_title) { boolean b_page_title_safe_convertion = connect.isMySQL(); return get(connect, page_title, b_page_title_safe_convertion); } /** Selects row from the table 'page' by the page_title. * * SELECT id,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE page_title="apple"; * * @param page_title title of Wiktionary article * @return null if page_title is absent */ private static TPage get (Connect connect,String page_title, boolean b_page_title_safe_convertion) { StringBuilder str_sql = new StringBuilder(); TPage tp = null; try { Statement s = connect.conn.createStatement (); try { str_sql.append("SELECT id,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE page_title=\""); if( b_page_title_safe_convertion ) { String safe_title = PageTableBase.convertToSafeStringEncodeToDBWunderscore(connect, page_title); str_sql.append(safe_title); } else { str_sql.append(page_title); } str_sql.append("\""); ResultSet rs = s.executeQuery (str_sql.toString()); try { if (rs.next ()) { int id = rs.getInt("id"); int word_count = rs.getInt("word_count"); int wiki_link_count = rs.getInt("wiki_link_count"); //boolean is_in_wiktionary = rs.getBoolean("is_in_wiktionary"); boolean is_in_wiktionary = 0 != rs.getInt("is_in_wiktionary"); boolean is_redirect = 0 != rs.getInt("is_redirect"); String redirect_target = is_redirect ? Encodings.bytesToUTF8(rs.getBytes("redirect_target")) : null; tp = new TPage(id, page_title, word_count, wiki_link_count, is_in_wiktionary, redirect_target); } } finally { rs.close(); } } finally { s.close(); } } catch(SQLException ex) { System.out.println("SQLException (TPage.get()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } return tp; } /** Selects row from the table 'page' by the page ID. * * SELECT page_title,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE id=1; * * @param id ID of Wiktionary article's title in the table 'page' * @return null if page_title is absent */ public static TPage getByID (Connect connect,int id) { StringBuilder str_sql = new StringBuilder(); TPage tp = null; try { Statement s = connect.conn.createStatement (); try { str_sql.append("SELECT page_title,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE id="); str_sql.append(id); ResultSet rs = s.executeQuery (str_sql.toString()); try { if (rs.next ()) { String page_title = Encodings.bytesToUTF8(rs.getBytes("page_title")); int word_count = rs.getInt("word_count"); int wiki_link_count = rs.getInt("wiki_link_count"); boolean is_in_wiktionary = 0 != rs.getInt("is_in_wiktionary"); boolean is_redirect = 0 != rs.getInt("is_redirect"); String redirect_target = is_redirect ? Encodings.bytesToUTF8(rs.getBytes("redirect_target")) : null; tp = new TPage(id, page_title, word_count, wiki_link_count, is_in_wiktionary, redirect_target); } } finally { rs.close(); } } finally { s.close(); } } catch(SQLException ex) { System.out.println("SQLException (TPage.getByID()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } return tp; } /** Selects row from the table 'page', WHERE page_title starts from 'prefix', * result list is constrained by 'limit'. * * skip #REDIRECT * SELECT id,page_title,word_count,wiki_link_count,is_in_wiktionary FROM page WHERE page_title LIKE 'zzz%' AND is_redirect is NULL LIMIT 1; * * any entries, with #REDIRECT too * SELECT id,page_title,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE page_title LIKE 'S%' LIMIT 1; * * skip empty articles, i.e. is_in_wiktionary==FALSE * SELECT id,page_title,word_count,wiki_link_count,is_in_wiktionary FROM page WHERE page_title LIKE 'zzz%' AND is_in_wiktionary=1 LIMIT 1; * * @param limit constraint of the number of rows returned, * if it has a negative value then the constraint is omitted * @param prefix the begining of the page_titles * @param b_skip_redirects return articles without redirects if true * @param b_meaning return articles with definitions * @param b_sem_rel return articles with semantic relations * @param str_source_lang pages filtering for words with these language * codes, e.g. "ru en fr" * @return null if page_title is absent */ public static TPage[] getByPrefix ( Connect connect,String prefix, int limit, boolean b_skip_redirects, TLang source_lang[], // String str_source_lang, boolean b_meaning, boolean b_sem_rel ) { boolean b_trans = true; /** target (translation) language which filters the words */ // todo: TLang[] trans_lang; // TLang source_lang[] = new TLang[1]; // source_lang[0] = TLang.get(LanguageType.en); /** Language codes for words filtering, e.g. "ru en fr" */ //var lang_source_value: String = bind lang_source_Text.rawText; // var source_lang : TLang[]; //TLang source_lang[] = TLang.parseLangCode(str_source_lang); TLang trans_lang[] = new TLang[0]; //trans_lang[0] = TLang.get(LanguageType.fi); // todo: as func parameters ... StringBuilder str_sql = new StringBuilder(); List<TPage> tp_list = null; if(0==limit) return NULL_TPAGE_ARRAY; int limit_with_reserve = limit; if(b_meaning) limit_with_reserve += 42; // since some words without meaning will be skipped if(b_sem_rel) limit_with_reserve += 512; // since some words without relations will be skipped if(source_lang.length > 0) limit_with_reserve += 555; if(trans_lang.length > 0) limit_with_reserve += 55555; try { Statement s = connect.conn.createStatement (); try { String safe_prefix = PageTableBase.convertToSafeWithWildCard(connect, prefix); str_sql.append("SELECT id,page_title,word_count,wiki_link_count,is_in_wiktionary,is_redirect,redirect_target FROM page WHERE page_title LIKE \""); str_sql.append(safe_prefix); //str_sql.append("%\""); str_sql.append("\""); if(b_skip_redirects) str_sql.append(" AND is_redirect is NULL"); // temp: skip empty articles //str_sql.append(" AND is_in_wiktionary=1"); if(limit > 0) { str_sql.append(" LIMIT "); str_sql.append(limit_with_reserve); } //System.out.print("safe_prefix=" + safe_prefix); ResultSet rs = s.executeQuery (str_sql.toString()); try { while (rs.next () && (limit < 0 || null == tp_list || tp_list.size() < limit)) { int id = rs.getInt("id"); int word_count = rs.getInt("word_count"); int wiki_link_count = rs.getInt("wiki_link_count"); boolean is_in_wiktionary = rs.getBoolean("is_in_wiktionary"); String page_title = Encodings.bytesToUTF8(rs.getBytes("page_title")); boolean is_redirect = 0 != rs.getInt("is_redirect"); String redirect_target = is_redirect ? Encodings.bytesToUTF8(rs.getBytes("redirect_target")) : null; if (b_skip_redirects) assert(null == redirect_target); TPage tp = new TPage(id, page_title, word_count, wiki_link_count, is_in_wiktionary, redirect_target); tp.lang_pos = TLangPOS.getRecursive(connect, tp); boolean b_add = true; if(b_meaning) b_add = b_add && tp.hasDefinition(); if(b_sem_rel) b_add = b_add && tp.hasSemanticRelation(); if(source_lang.length > 0) b_add = b_add && tp.hasLanguage(source_lang); if(trans_lang.length > 0) b_add = b_add && tp.hasTranslation(trans_lang); if(b_add) { if(null == tp_list) tp_list = new ArrayList<TPage>(); tp_list.add(tp); } // System.out.println(" title=" + page_title); // "; redirect_target=" + redirect_target + // "; id=" + id + // "; is_redirect=" + is_redirect + // " (TPage.getByPrefix)"); } } finally { rs.close(); } } finally { s.close(); } } catch(SQLException ex) { System.out.println("SQLException (TPage.get()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } if(null == tp_list) return NULL_TPAGE_ARRAY; return ((TPage[])tp_list.toArray(NULL_TPAGE_ARRAY)); } /** Deletes row from the table 'page' by the page_title. * * DELETE FROM page WHERE page_title="apple"; * * @param page_title title of Wiktionary article */ public static void delete (Connect connect,String page_title) { StringBuilder str_sql = new StringBuilder(); try { Statement s = connect.conn.createStatement (); try { String safe_title = PageTableBase.convertToSafeStringEncodeToDBWunderscore(connect, page_title); str_sql.append("DELETE FROM page WHERE page_title=\""); str_sql.append(safe_title); str_sql.append("\""); s.execute (str_sql.toString()); //System.out.println("TPage delete()):: page_title='" + page_title + "'"); } finally { s.close(); } } catch(SQLException ex) { System.out.println("SQLException (wikt_parsed TPage.java delete()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } } /** Checks whether the article 'page_title' has any definitions. * The field 'lang_pos' is scanned here. */ public boolean hasDefinition() { if(null == lang_pos) return false; for(TLangPOS lp : lang_pos) { if(lp.getMeaning().length > 0) return true; } return false; } /** Checks whether the article 'page_title' has at least one synonym, * antonym, etc. The fields 'lang_pos', 'lang_pos.meaning' and * 'lang_pos.meaning.relation' are scanned here. */ public boolean hasSemanticRelation() { if(null == lang_pos) return false; for(TLangPOS lp : lang_pos) { TMeaning[] mm = lp.getMeaning(); for(TMeaning m : mm) { if(m.getRelation().size() > 0) return true; } } return false; } /** Checks whether the article 'page_title' has at least one translatio * into the destination languages from the array 'trans_lang'. * The fields 'lang_pos', 'lang_pos.translation' are scanned here. */ public boolean hasTranslation(TLang trans_lang[]) { if(null == lang_pos) return false; for(TLangPOS lp : lang_pos) { TMeaning[] mm = lp.getMeaning(); for(TMeaning m : mm) { if(m.hasTranslation(trans_lang)) return true; } } return false; } /** Checks whether the article 'page_title' has at least one wordform * in the language from the array 'source_lang'. * The language-POS of this page_title is scanned here. */ public boolean hasLanguage(TLang source_lang[]) { if(null == lang_pos) return false; for(TLangPOS lp : lang_pos) { TLang lang = lp.getLang(); for(TLang source : source_lang) if(lang == source) return true; } return false; } }