/* QuoteYears.java - statistics of quotes' years
 * in the database of the parsed Wiktionary.
 *
 * Copyright (c) 2012 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
 * Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
 */

package wikt.stat;

import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.sql.Connect;
import wikokit.base.wikipedia.sql.Statistics;
import wikokit.base.wikt.sql.*;
import wikokit.base.wikt.sql.quote.TQuotRef;
import wikokit.base.wikt.sql.quote.TQuotYear;
import wikt.stat.printer.CommonPrinter;

/** Statistics of quotes' years in the database of the parsed Wiktionary.
 *
 * The program reads every record of the table 'quote', counts how many
 * quotations fall into each year and prints the result as a wiki table.
 */
public class QuoteYears {

    /** When true, only the first 7000 quotes are read and progress
     * (and per-quote year) information is printed. */
    private static final boolean DEBUG = false;

    /** Maximum number of example words stored for each year. */
    private static final int MAX_EXAMPLE_WORDS = 3;

    /** First year of the table for Russian Wiktionary.
     * See: SELECT `from`,`to` FROM quot_year ORDER BY `from` LIMIT 17;
     *
     * The value was previously 1076 (the original note refers to the word
     * "voin" in the Izbornik of Sviatoslav).
     */
    private static final int MIN_YEAR_RU = 0;

    /** Upper bound (exclusive) of the table of years.
     * The original note says the intended value is "current year + 1"
     * (e.g. 2015); NOTE(review): 2617 was presumably chosen to accommodate
     * erroneous years in the data - confirm.
     */
    private static final int MAX_YEAR = 2617;

    /** Inner class which stores the number of quotes written in one year
     * and several example words (page titles) with such quotes.
     */
    private static class OneYearQuote {

        /** Example of several words, which have quotes, which were written in this year.
         * It is null until the first word is added. */
        private List<String> example_words;

        /** Number of quotes written in this year. */
        public int counter;

        OneYearQuote() {
            example_words = null;
            counter = 0;
        }

        /** Stores the page title as an example for this year,
         * at most MAX_EXAMPLE_WORDS examples are kept, the rest are ignored.
         * The field 'counter' is not changed by this method.
         */
        public void add(String page_table) {

            if(null == example_words)
                example_words = new ArrayList<String>(MAX_EXAMPLE_WORDS);

            if(example_words.size() < MAX_EXAMPLE_WORDS)
                example_words.add(page_table);
        }

        /** Gets concatenation of example_words joined by separator.
         *
         * @param separator text inserted between neighbouring words,
         *                  null is treated as an empty separator
         * @return joined words, or empty string if there are no examples
         */
        public String getConcatWords(String separator) {

            if(null == example_words)
                return "";

            StringBuilder result = new StringBuilder();
            boolean b_first = true;
            for(String w : example_words) {
                if(b_first)
                    b_first = false;
                else if(null != separator)
                    result.append(separator);
                result.append(w);
            }
            return result.toString();
        }
    }

    /** Counts number of quotes for each year
     * by selecting all records from the table 'quote' from the database
     * of the parsed Wiktionary, and prints summary statistics.<br><br>
     * SELECT id, meaning_id, lang_id, ref_id FROM quote;
     *
     * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
     * @param min_year first year of the table (inclusive)
     * @param max_year last year of the table (exclusive)
     * @param native_lang native language of this Wiktionary, quotes with years
     *                    in entries of this language are counted separately
     * @return array with (max_year - min_year) elements, the element with
     *         index (year - min_year) describes quotes of this year
     */
    public static OneYearQuote[] countYears (Connect wikt_parsed_conn,
                                             int min_year, int max_year,
                                             LanguageType native_lang)
    {
        OneYearQuote[] years_all_lang = new OneYearQuote[max_year - min_year];
        for (int year = min_year; year < max_year; year ++)
            years_all_lang [year - min_year] = new OneYearQuote();

        Statement s = null;
        ResultSet rs = null;

        int n_unknown_lang_pos = 0; // quotes of meanings without known language and POS
        int n_total = Statistics.Count(wikt_parsed_conn, "quote");

        int n_total_with_years = 0;
        int n_one_year = 0;
        int n_range = 0;
        int n_quot_years_native_lang = 0; // number of quotations with years in native language entries

        long t_start = System.currentTimeMillis();
        try {
            s = wikt_parsed_conn.conn.createStatement ();

            String str_sql = DEBUG
                    ? "SELECT id, meaning_id, lang_id, ref_id FROM quote LIMIT 7000"
                    : "SELECT id, meaning_id, lang_id, ref_id FROM quote";
            rs = s.executeQuery (str_sql);

            int n_cur = 0;
            while (rs.next ())
            {
                n_cur ++;
                int id = rs.getInt("id");
                TMeaning m = TMeaning.getByID(wikt_parsed_conn, rs.getInt("meaning_id"));
                TLang tlang = TLang.getTLangFast( rs.getInt("lang_id"));

                // ref_id equal to 0 is treated as "no reference"
                int ref_id = rs.getInt("ref_id");
                TQuotRef quot_ref = (0 == ref_id) ? null : TQuotRef.getByID(wikt_parsed_conn, ref_id);

                if(null == m) {
                    System.out.println("Warning (QuoteYears.countYears()): there is quote with id=" +id+ " with NULL meaning_id!");
                    continue;
                }

                TLangPOS lang_pos = m.getLangPOS(wikt_parsed_conn);
                if(null == lang_pos) {
                    n_unknown_lang_pos ++;
                    continue;
                }

                TPage tpage = lang_pos.getPage();
                String page_title = tpage.getPageTitle();

                TQuotYear tquot_year = (null == quot_ref) ? null : quot_ref.getYear();
                if(null != tquot_year) {
                    n_total_with_years ++;

                    // defensive: skip the language test if lang_id was not resolved
                    if(null != tlang && tlang.getLanguage() == native_lang)
                        n_quot_years_native_lang ++;

                    int _from = tquot_year.getFrom();
                    int _to   = tquot_year.getTo();
                    if(_from == _to)
                        n_one_year ++;
                    else
                        n_range ++;

                    if(DEBUG)
                        System.out.println(" _from = " + _from + "; _to = " + _to + "; min_year = " + min_year);

                    registerYears(years_all_lang, min_year, max_year, _from, _to, page_title);
                }

                if(DEBUG && 0 == n_cur % 1000)
                    printProgress(n_cur, n_total, t_start, page_title);
            }
        } catch(SQLException ex) {
            System.out.println("SQLException (QuoteYears.countYears()): " + ex.getMessage());
        } finally {
            // best-effort cleanup: a failure to close must not hide the collected statistics
            if (rs != null) { try { rs.close(); } catch (SQLException ignored) { } }
            if (s  != null) { try { s.close();  } catch (SQLException ignored) { } }
        }

        // percent with two digits after the decimal point;
        // 0/0 gives NaN, which Math.round() converts to 0
        double quot_years_native_lang_percent =
                ((double) (Math.round(n_quot_years_native_lang * 10000f / n_total_with_years ))) / 100;

        System.out.println(
            "\nTotal quotes: " + n_total +
            "\n\nUnique ranges of years (number of records in the table ''quot_year''): " + Statistics.Count(wikt_parsed_conn, "quot_year") +
            "\n\nTotal quotes with years: " + n_total_with_years +
            "\n\nThere are "+ n_one_year +" one year quotations, e.g. 1986 year. " +
            "\n\nThere are "+ n_range +" quotations with ranges of years, e.g. 1986-1989 years. " +
            "\n\nThere are "+ n_quot_years_native_lang +" ("+ quot_years_native_lang_percent +" %) quotations with years for entries in native language ("+native_lang.getName()+"). " +
            "\n\nUnknown<ref>'''Unknown''' - words which have quotes but have unknown language code and POS</ref>: " + n_unknown_lang_pos);

        return years_all_lang;
    }

    /** Increments counters of all years in the range [_from, _to] and stores
     * page_title as an example for the year _from.
     *
     * Years outside of [min_year, max_year) can not be stored in the table:
     * the quote is skipped if _from is out of range, the range is truncated
     * if only _to is too big; a message is printed in both cases.
     */
    private static void registerYears (OneYearQuote[] years_all_lang,
                                       int min_year, int max_year,
                                       int _from, int _to, String page_title)
    {
        if (_from < min_year) {
            System.out.println("Error: _from < min_year in page_title="+ page_title +": _from = " + _from + "; _to = " + _to + "; min_year = " + min_year);
            return;
        }
        if (_from >= max_year) {
            System.out.println("Error: _from >= max_year in page_title="+ page_title +": _from = " + _from + "; _to = " + _to + "; max_year = " + max_year);
            return;
        }

        int last_year = _to;
        if (last_year >= max_year) {
            System.out.println("Warning: _to >= max_year in page_title="+ page_title +": _from = " + _from + "; _to = " + _to + "; max_year = " + max_year + ", the range is truncated");
            last_year = max_year - 1;
        }

        for (int year = _from; year <= last_year; year ++)
            years_all_lang [year - min_year].counter ++;

        years_all_lang [_from - min_year].add(page_title);
    }

    /** Prints (in debug mode) the number of treated quotes, elapsed time
     * and estimation of the remaining time, in minutes.
     */
    private static void printProgress (int n_cur, int n_total, long t_start, String page_title)
    {
        long t_cur = System.currentTimeMillis() - t_start;

        // time for 1 quote = t_cur / n_cur; in min, since /(60*1000)
        long t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur)));
        t_cur = (long)(t_cur/(60f*1000f));

        System.out.println(n_cur + ": " + page_title +
            ", duration: "  + t_cur +
            " min, remain: " + t_remain +
            " min");
    }

    /** Prints statistics about quote years in the Wiktionary as a wiki table
     * with the columns: year, number of quotes, example words.
     *
     * @param years_all_lang table of years created by countYears()
     * @param min_year first year of the table (inclusive)
     * @param max_year last year of the table (exclusive)
     */
    private static void printQuoteYears (OneYearQuote[] years_all_lang,
                                         int min_year, int max_year)
    {
        // print header line
        System.out.println("\n=== Quote years ===");

        System.out.println("{| class=\"sortable prettytable\" style=\"text-align: center;\"");
        System.out.print(" ! Year || Number of quotes || Examples ");

        // print values, one table row per year
        for (int year = min_year; year < max_year; year ++) {
            OneYearQuote y = years_all_lang [year - min_year];

            System.out.print("\n|-\n| " + year + " || " + y.counter + " || " );

            if(null != y.example_words) {
                // examples are printed as wiki links separated by comma
                StringBuilder s = new StringBuilder();
                boolean b_first = true;
                for(String w : y.example_words) {
                    if(b_first)
                        b_first = false;
                    else
                        s.append(", ");
                    s.append("[[").append(w).append("]]");
                }
                System.out.print(s.toString());
            }
        }
        System.out.println("\n|}");
    }

    // TODO + skip strange dates: SKIP if (to - from) > 50 years;

    /** Connects to the parsed Russian Wiktionary database, counts quotes
     * per year and prints the report to the standard output.
     */
    public static void main(String[] args) {

        // Connect to wikt_parsed database
        Connect wikt_parsed_conn = new Connect();

        // Russian
        LanguageType native_lang = LanguageType.ru;
        wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru);
        int min_year = MIN_YEAR_RU;

        // English
        //LanguageType native_lang = LanguageType.en;
        //wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, LanguageType.en);
        // min_year = MIN_YEAR_EN; todo

        TLang.createFastMaps(wikt_parsed_conn);
        TPOS.createFastMaps(wikt_parsed_conn);

        String db_name = wikt_parsed_conn.getDBName();
        System.out.println("\n== Statistics of years in quotes in the Wiktionary parsed database ==");
        System.out.println("\n''Last updated: summer 2014.''");
        CommonPrinter.printHeader (db_name);

        OneYearQuote[] years_all_lang = QuoteYears.countYears(wikt_parsed_conn, min_year, MAX_YEAR, native_lang);
        wikt_parsed_conn.Close();

        QuoteYears.printQuoteYears(years_all_lang, min_year, MAX_YEAR);

        CommonPrinter.printFooter();
    }
}