/* PageTableAll.java - parses all pages in the Wiktionary table 'page' (source database). * * Copyright (c) 2005-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wiktparsed.mean_semrel.parser; //import wikt.parser.*; import wikokit.base.wikt.sql.TLangPOS; import wikokit.base.wikt.sql.TMeaning; import wikokit.base.wikt.sql.TRelation; import wikokit.base.wikt.sql.TPage; import wikokit.base.wikipedia.sql.Statistics; import wikokit.base.wikipedia.sql.UtilSQL; import wikokit.base.wikipedia.sql.Connect; import wikokit.base.wikipedia.language.LanguageType; import wiktparsed.mean_semrel.parser.sql.*; import java.sql.*; import java.util.HashMap; import java.util.Map; import wikokit.base.wikt.api.WTMeaning; import wikokit.base.wikt.constant.Relation; /** Worker with all pages in the WP table 'page'. */ public class PageWithSemrel { private static final boolean DEBUG = false; /** Inner class which contains list of synonyms, antonyms, ... joined by * some delimiter. */ private static class Semrel { /** Relations stored to the table mean_semrel_XX. */ private final Relation[] ar_relations = { Relation.synonymy, Relation.antonymy, Relation.hypernymy, Relation.hyponymy, Relation.holonymy, Relation.meronymy, Relation.troponymy, Relation.coordinate_term }; private static Map<Relation, StringBuffer> m_relations; Semrel() { m_relations = new HashMap<Relation, StringBuffer>(); for(Relation r : ar_relations) { m_relations.put(r, new StringBuffer()); } } public void init() { for(Relation r : ar_relations) { m_relations.get(r).setLength(0); } } /** Gets maps: relation (e.g. synonymy) to a list of synonyms, antonyms, ... */ public Map<Relation, StringBuffer> getMapRelationToText() { return m_relations; } /** Adds (joins) synonym word (from TRelation) to synonyms, * antonym word to antonyms, etc; for example, synonyms += delimiter + word; */ private void add(TRelation tr, String delimiter) { String word = tr.getWikiText().getText(); String delimiter_word = delimiter + word; Relation r = tr.getRelationType(); if(!m_relations.containsKey(r)) return; // e.g. skip "See also" if( 0 == m_relations.get(r).length()) { m_relations.get(r).append( word); } else { m_relations.get(r).append(delimiter_word); } } /** Check whether exist any relations. */ private boolean hasRelation() { for(Relation r : ar_relations) { if (m_relations.get(r).length() > 0) return true; } return false; } } /** Selects only pages with non-empty meaning (definition) * and semantic relations from Wiktionary parsed database, * stores to the wikt_mean_semrel database. * * @param native_lang native language in the Wiktionary, * e.g. Russian language in Russian Wiktionary, * @param n_start_from number of first Wiktionary entry to be parsed * @param delimiter symbol between words in the table fields "synonyms", "antonyms", etc. * @param min_meaning threshold - minimum number of records in mean_semrel_XX, * the lesser tables (mean_semrel_XX) and records (lang.XX) will be deleted */ public static void parse( // LanguageType native_lang, Connect wikt_parsed_conn, Connect mean_semrel_conn, int n_start_from, String delimiter, int min_meaning) { long t_start; float t_work; int n_total = Statistics.Count(wikt_parsed_conn, "lang_pos"); System.out.println("Total lang_pos: " + n_total); t_start = System.currentTimeMillis(); if(0 == n_start_from) // create wikt_mean_semrel SemrelParser.clearDatabase(wikt_parsed_conn, mean_semrel_conn); else SemrelParser.initWithoutClearDatabase(wikt_parsed_conn, mean_semrel_conn); try { Statement s = wikt_parsed_conn.conn.createStatement (); try { if(DEBUG) { s.executeQuery ("SELECT id FROM lang_pos LIMIT 1000000"); } else { s.executeQuery ("SELECT id FROM lang_pos"); } ResultSet rs = s.getResultSet (); try { Semrel semrel = new Semrel(); int n_cur = 0; while (rs.next ()) { //if (n_cur >= 1) // break; n_cur ++; if(n_start_from >= 0 && n_start_from > n_cur) continue; // skip first [0, n_start_from] records int id = rs.getInt("id"); TLangPOS lang_pos_not_recursive = TLangPOS.getByID (wikt_parsed_conn, id);// fields are not filled recursively if(null == lang_pos_not_recursive) continue; LanguageType xx_lang = lang_pos_not_recursive.getLang().getLanguage(); TPage tpage = lang_pos_not_recursive.getPage(); String page_title = tpage.getPageTitle(); int n_meaning = WTMeaning.countMeanings(wikt_parsed_conn, lang_pos_not_recursive); if(0 == n_meaning) continue; //POS p = lang_pos_not_recursive.getPOS().getPOS(); TMeaning[] mm = TMeaning.get(wikt_parsed_conn, lang_pos_not_recursive); for(TMeaning m : mm) { String meaning_text = m.getWikiTextString(); if(0 == meaning_text.length()) continue; TRelation[] rels = TRelation.get(wikt_parsed_conn, m); if(0 == rels.length) continue; semrel.init(); for(TRelation r : rels) semrel.add(r, delimiter); if(!semrel.hasRelation()) continue; // save to database relations and meaning text MSRMeanSemrelXX.insert ( xx_lang, mean_semrel_conn, page_title, meaning_text, semrel.getMapRelationToText(), rels.length); } if(0 == n_cur % 10000) { // % 100 1000 //if(n_cur<10900) // continue; long t_cur, t_remain; t_cur = System.currentTimeMillis() - t_start; t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur))); // where time for 1 page = t_cur / n_cur // in min, since /(60*1000) t_cur = (long)(t_cur/(60f*1000f)); //t_cur = t_cur/(60f*1000f)); System.out.println(n_cur + ": " + page_title + ", duration: " + t_cur + // t_cur/(60f*1000f) + " min, remain: " + t_remain + " min"); } } } finally { rs.close(); } } finally { s.close(); } } catch(SQLException ex) { System.out.println("SQLException (PageWithSemrel.parse()): " + ex.getMessage()); } // post-processing 1 MSRLang.calcMeanSemrelStatistics(mean_semrel_conn); // post-processing 2 // delete mean_semrel_XX, if it is empty table, i.e count(*) < 10 MSRLang.deleteEmptyRecordsAndTables(mean_semrel_conn, min_meaning); UtilSQL.dropTable(mean_semrel_conn, "mean_semrel_letter_ru"); long t_end; t_end = System.currentTimeMillis(); t_work = (t_end - t_start)/1000f; // in sec System.out.println("\n\nTime sec:" + t_work + "\nTotal pages: " + n_total); } }