/* TranslationTableAll.java - translations' statistics in the database of the parsed Wiktionary. * * Copyright (c) 2009-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wikt.stat; import wikokit.base.wikt.sql.TLang; import wikokit.base.wikt.sql.TLangPOS; import wikokit.base.wikt.sql.TPOS; import wikokit.base.wikt.sql.TPage; import wikokit.base.wikt.sql.TTranslation; import wikokit.base.wikt.sql.TTranslationEntry; import wikokit.base.wikipedia.sql.Statistics; import wikokit.base.wikipedia.sql.Connect; import wikt.stat.printer.CommonPrinter; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikipedia.language.Encodings; import java.sql.*; import java.util.Map; import java.util.HashMap; /** Translations' statistics in the database of the parsed Wiktionary. */ public class TranslationTableAll { private static final boolean DEBUG = false; // Native language is a language of Wiktionary edition, e.g. Russian in Russian Wiktionary. // 1. number of words in native language which have at least one translation into any language // todo // 2. number of words (in native) with translations into foreign (for each foreign language) // todo // 3. number of meanings of words (in native language) with translations into foreign (for each foreign language) // TranslationBox, see countTranslationPerLanguage() // 4. number of total translations for each meaning (of word in native) into any language // TranslationBox and Translation entry // 5. number of word/phrase pairs, e.g. word in native -> in foreign (and for each language) // tanslation_entry table // 6. average number of translation languages (for words which have at least one translation). /** Counts number of translations of native word's meaning into each * foreign language by selecting all records from the table 'translation' * from the database of the parsed Wiktionary.<br><br> * SELECT * FROM translation; * * @param connect connection to the database of the parsed Wiktionary * @return map from the language into a number of translation boxes * which contain synonyms, antonyms, etc. in English (etc.) */ public static Map<LanguageType, Integer> countTranslationPerLanguage (Connect wikt_parsed_conn) { // translation -> lang -> count Statement s = null; ResultSet rs= null; long t_start; int n_unknown_lang_pos = 0; // translations into unknown languages int n_total = Statistics.Count(wikt_parsed_conn, "translation"); //System.out.println("Total translation boxes (translated meanings of words): " + n_total); t_start = System.currentTimeMillis(); Map<LanguageType, Integer> m_lang_n = new HashMap<LanguageType, Integer>(); LanguageType native_lang = wikt_parsed_conn.getNativeLanguage(); try { s = wikt_parsed_conn.conn.createStatement (); StringBuilder str_sql = new StringBuilder(); // SELECT id,lang_pos_id,meaning_summary,meaning_id FROM translation str_sql.append("SELECT id,lang_pos_id,meaning_summary FROM translation"); s.executeQuery (str_sql.toString()); rs = s.getResultSet (); int n_cur = 0; while (rs.next ()) { n_cur ++; int id = rs.getInt("id"); TLangPOS lang_pos = TLangPOS.getByID(wikt_parsed_conn, rs.getInt("lang_pos_id")); String meaning_summary = Encodings.bytesToUTF8(rs.getBytes("meaning_summary")); TLang tlang = lang_pos.getLang(); LanguageType lt = tlang.getLanguage(); // see: Wiktionary:About Translingual if(null != tlang && native_lang != lt && LanguageType.mul != lt) { System.out.print("Error (TranslationTableAll.countTranslationPerLanguage()): There is a translation box from a foreign language, code=" + tlang.getLanguage().getCode()); TPage p = lang_pos.getPage(); if(null != p) System.out.println(", page_title=" + p.getPageTitle()); } if(null != lang_pos) { TTranslation trans = new TTranslation(id, lang_pos, meaning_summary, null); // meaning = null TTranslationEntry[] t_entries = TTranslationEntry.getByTranslation(wikt_parsed_conn, trans); for(TTranslationEntry entry : t_entries) { LanguageType lang = entry.getLang().getLanguage(); if(m_lang_n.containsKey(lang) ) { int n = m_lang_n.get(lang); m_lang_n.put(lang, n + 1); } else m_lang_n.put(lang, 1); } if(DEBUG && 0 == n_cur % 1000) { // % 100 //if(n_cur > 333) // break; long t_cur, t_remain; t_cur = System.currentTimeMillis() - t_start; t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur))); // where time for 1 page = t_cur / n_cur // in min, since /(60*1000) t_cur = (long)(t_cur/(60f*1000f)); //t_cur = t_cur/(60f*1000f)); TPage tpage = lang_pos.getPage(); if(null != tpage) { System.out.println(n_cur + ": " + tpage.getPageTitle() + ", duration: " + t_cur + // t_cur/(60f*1000f) + " min, remain: " + t_remain + " min"); } } } else n_unknown_lang_pos ++; } } catch(SQLException ex) { System.out.println("SQLException (TranslationTableAll.countTranslationPerLanguage()): " + ex.getMessage()); } finally { if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; } if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; } } //long t_end; //float t_work; //t_end = System.currentTimeMillis(); //t_work = (t_end - t_start)/1000f; // in sec System.out.println(//"\nTime sec:" + t_work + "\nTotal translation boxes (translated meanings of words): " + n_total + "\n\nUnknown<ref>'''Unknown''' - words which have translations but have unknown language code and POS</ref>: " + n_unknown_lang_pos); return m_lang_n; } public static void main(String[] args) { // Connect to wikt_parsed database Connect wikt_parsed_conn = new Connect(); // Russian // LanguageType native_lang = LanguageType.ru; // wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru); // English LanguageType native_lang = LanguageType.en; wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, LanguageType.en); TLang.createFastMaps(wikt_parsed_conn); TPOS.createFastMaps(wikt_parsed_conn); //TRelationType.createFastMaps(wikt_parsed_conn); String db_name = wikt_parsed_conn.getDBName(); System.out.println("\n== Statistics of translations in the Wiktionary parsed database =="); CommonPrinter.printHeader (db_name); Map<LanguageType, Integer> m = TranslationTableAll.countTranslationPerLanguage(wikt_parsed_conn); wikt_parsed_conn.Close(); System.out.println(); int total_trans = CommonPrinter.printSomethingPerLanguage(native_lang, m); System.out.println("Total translations: " + total_trans); System.out.println("\nThere are translations into " + m.size() + " languages."); CommonPrinter.printFooter(); } }