/* POSAndPolysemyTableAll.java - Parts of speech statistics and data about
* polysemy in the database of the parsed Wiktionary.
*
* Copyright (c) 2011-2012 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikt.stat;
import wikokit.base.wikt.sql.TLang;
import wikokit.base.wikt.sql.TLangPOS;
import wikokit.base.wikt.sql.TPOS;
import wikokit.base.wikt.sql.TPage;
import wikt.stat.printer.POSAndPolysemyPrinter;
import wikt.stat.printer.CommonPrinter;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.sql.Connect;
import wikokit.base.wikipedia.sql.Statistics;
import wikokit.base.wikt.constant.POS;
import java.sql.*;
import java.util.HashMap;
import java.util.Map;
import wikokit.base.wikt.api.WTMeaning;
/** Parts of speech statistics and data about
* polysemy in the database of the parsed Wiktionary.
*
* @see <a href="http://wordnet.princeton.edu/wordnet/man/wnstats.7WN.html">WordNet wnstats (for inspiration)</a>
*/
public class POSAndPolysemyTableAll {
/** Switches on trace output for Russian entries and makes countPOS stop
 * after about 2000 records (see the checks of DEBUG in countPOS). */
private static final boolean DEBUG = false;
/** Size of the histograms below: an entry with max_meanings or more
 * meanings/definitions is not counted in any histogram. */
private static final int max_meanings = 110;
/** Histogram limit passed to the printer in main().
 * NOTE(review): the values 70 and 10 were presumably both tried here - confirm. */
private static final int max_meanings_to_print = 10; // 70 10
/** Histogram over all languages: [i] = number of lang_pos entries with i meanings,
 * so [0] is the number of entries without definitions. Filled by countPOS. */
private static final int[] mean_histogram = new int[max_meanings];
/** The same histogram, kept separately for each language. Filled by countPOS. */
private static final Map<LanguageType, Integer[]> m_lang_histogram = new HashMap<LanguageType, Integer[]>();
/** Number of entries with unknown part of speech for each language. Filled by countPOS. */
private static final Map<LanguageType, Integer> m_lang_unknown_pos = new HashMap<LanguageType, Integer>();
/** Inner POSStat class for each POS: counters of words and senses, plus the
 * three entries with the largest number of meanings seen so far. */
public static class POSStat {

    POSStat() {
        uniques_strings = 0;
        word_sense_pairs = 0;
        monosemous = 0;
        polysemous_words = 0;
        polysemous_senses = 0;
        max_senses1 = 0; max_senses2 = 0; max_senses3 = 0;
        page_title1 = ""; page_title2 = ""; page_title3 = "";
    }

    /** Total number of LangPOS for this POS (?sum of all languages). */
    private int uniques_strings;

    /** Total number of all meanings of LangPOS for this POS (?sum of all languages). */
    private int word_sense_pairs;

    /** Number of monosemous words and senses for this POS (i.e. LangPOS with one meaning). */
    private int monosemous;

    /** Number of polysemous words (i.e. LangPOS has more than one meaning) for this POS. */
    private int polysemous_words;

    /** Number of polysemous senses (i.e += number of meanings of LangPOS) for this POS. */
    private int polysemous_senses;

    /** The three largest numbers of senses for this POS, in descending order
     * (they belong to page_title1, page_title2, page_title3). */
    private int max_senses1;
    private int max_senses2;
    private int max_senses3;

    /** Words (up to three) with the maximum number of senses with this POS;
     * an empty string means that the rank is not occupied yet. */
    private String page_title1;
    private String page_title2;
    private String page_title3;

    public int getNumberOfUniquesStrings() {
        return uniques_strings;
    }

    public int getNumberOfWordSensePairs() {
        return word_sense_pairs;
    }

    public int getMonosemous() {
        return monosemous;
    }

    public int getPolysemousWords() {
        return polysemous_words;
    }

    public int getPolysemousSenses() {
        return polysemous_senses;
    }

    /** Average number of senses per entry, monosemous entries included.
     * @return the average, or -1 if nothing was added yet
     */
    public float calcAveragePolysemyIncludingMonosemousWords() {
        if(0 == uniques_strings)
            return -1f;
        return (float)word_sense_pairs / (float)uniques_strings;
    }

    /** Average number of senses per polysemous entry.
     * @return the average, or -1 if there are no polysemous entries
     */
    public float calcAveragePolysemyExcludingMonosemousWords() {
        if(0 == polysemous_words)
            return -1f;
        return (float)(word_sense_pairs - monosemous) / (float)polysemous_words;
    }

    /** Maximum number of senses (meanings, definitions) for this POS. */
    public int getMaxSenses() {
        return max_senses1;
    }

    /** Words (up to three) with the maximum number of senses for this POS,
     * wikified and followed by the number of senses,
     * e.g. "[[set]] 30, [[run]] 25, [[go]] 20".
     * @return the comma separated list, or an empty string if there are no words
     */
    public String getWikifiedWordWithMaxSenses() {
        StringBuilder sb = new StringBuilder();
        appendEntry(sb, page_title1, max_senses1);
        appendEntry(sb, page_title2, max_senses2);
        appendEntry(sb, page_title3, max_senses3);
        return sb.toString();
    }

    /** Appends "[[title]] n_senses" to sb (with the separator ", " if sb is not empty);
     * an empty or null title marks an unoccupied rank and is skipped. */
    private static void appendEntry(StringBuilder sb, String title, int n_senses) {
        if(null == title || title.length() == 0)
            return;
        if(sb.length() > 0)
            sb.append(", ");
        sb.append("[[").append(title).append("]] ").append(n_senses);
    }

    /** Increment statistics data.
     *
     * Every value of n_meaning other than 1 is counted as polysemous,
     * so the caller is expected to pass only n_meaning >= 1.
     *
     * @param n_meaning number of meanings for this POS
     * @param current_page_title title of the page (entry) which has these meanings
     */
    public void addPOS(int n_meaning, String current_page_title) {

        uniques_strings += 1;
        word_sense_pairs += n_meaning;

        if(1 == n_meaning) {
            monosemous += 1;
        } else {
            polysemous_words += 1;
            polysemous_senses += n_meaning;
        }

        // Keep the top three in descending order: a new entry pushes the lower
        // ranks down instead of overwriting (and losing) the entry it replaces.
        if(n_meaning > max_senses1) {
            max_senses3 = max_senses2;  page_title3 = page_title2;
            max_senses2 = max_senses1;  page_title2 = page_title1;
            max_senses1 = n_meaning;    page_title1 = current_page_title;
        } else if(n_meaning > max_senses2) {
            max_senses3 = max_senses2;  page_title3 = page_title2;
            max_senses2 = n_meaning;    page_title2 = current_page_title;
        } else if(n_meaning > max_senses3) {
            max_senses3 = n_meaning;    page_title3 = current_page_title;
        }
    }
}
// todo: list of the words with the maximum number of meanings,
// or the maximum number of types of semantic relations.
//private static final List<TLangPOS> words_with_many_meanings = new ArrayList<TLangPOS>();

/** Number of meanings for each POS, sum by all languages,
 * e.g. noun = English nouns + Russian nouns + ..., etc.
 * Filled by countPOS. */
private static final Map<POS,POSStat> m_pos_sum_all_lang = new HashMap<POS,POSStat>(POS.size());
/** Counts parts of speech and meanings, one step for each record of the table lang_pos.
 * <br><br>
 *
 * SELECT id FROM lang_pos;
 *
 * Side effects: updates the static tables mean_histogram, m_lang_histogram,
 * m_lang_unknown_pos and m_pos_sum_all_lang, and prints progress and summary
 * lines to System.out.
 *
 * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
 * @param native_lang is not used by the current code (only the commented-out
 *          draft of the "rich words" list below refers to it)
 *
 * @return statistics for each language and each part of speech; entries with
 *          unknown POS or without meanings are not included. If an SQLException
 *          occurs, then the data gathered so far are returned.
 */
public static Map<LanguageType, Map<POS,POSStat>> countPOS (Connect wikt_parsed_conn,
                                                            LanguageType native_lang) {
    // lang_pos -> meaning -> count

    Statement s = null;
    ResultSet rs= null;

    int n_unknown_pos = 0;      // number of entries with unknown POS (with or without meanings)
    int n_langpos = 0;          // number of all loaded lang_pos entries, entries with empty definitions included
    int n_nonempty_meaning = 0; // total number of words (unique noun, verb, etc.) with nonempty definitions

    int n_total = Statistics.Count(wikt_parsed_conn, "lang_pos");
    long t_start = System.currentTimeMillis();

    Map<LanguageType, Map<POS,POSStat>> m_lang_pos_pos_stat = new HashMap<LanguageType, Map<POS,POSStat>>();

    try {
        s = wikt_parsed_conn.conn.createStatement ();
        rs = s.executeQuery ("SELECT id FROM lang_pos");
        int n_cur = 0;
        while (rs.next ())
        {
            n_cur ++;
            int id = rs.getInt("id");
            TLangPOS lang_pos_not_recursive = TLangPOS.getByID (wikt_parsed_conn, id);// fields are not filled recursively
            if(null == lang_pos_not_recursive)
                continue;

            LanguageType lang = lang_pos_not_recursive.getLang().getLanguage();
            n_langpos ++;

            TPage tpage = lang_pos_not_recursive.getPage();
            String page_title = tpage.getPageTitle();

            int n_meaning = WTMeaning.countMeanings(wikt_parsed_conn, lang_pos_not_recursive);

            if(DEBUG && lang == LanguageType.ru) {
                System.out.print("\n" + tpage.getPageTitle() + ", meanings:" + n_meaning);
            }

            // histograms: the total one and the one of this language
            if(n_meaning < max_meanings) {
                mean_histogram [n_meaning] ++;

                Integer[] h = m_lang_histogram.get(lang);
                if(null == h) {
                    h = new Integer[max_meanings];
                    for(int i=0;i<max_meanings;i++)
                        h[i] = 0;
                    m_lang_histogram.put(lang, h);
                }
                h[n_meaning] ++;
            }

            if(n_meaning > 0)
                n_nonempty_meaning ++;

            POS p = lang_pos_not_recursive.getPOS().getPOS();
            if(POS.unknown == p) {
                n_unknown_pos ++;

                if(DEBUG && lang == LanguageType.ru)
                    System.out.print(", pos:" + p.toString());

                Integer n_unknown_lang = m_lang_unknown_pos.get(lang);
                m_lang_unknown_pos.put(lang, null == n_unknown_lang ? 1 : 1 + n_unknown_lang);

            } else if(n_meaning > 0) {

                { // all languages statistics
                    POSStat ps = m_pos_sum_all_lang.get(p);
                    if(null == ps) {
                        ps = new POSStat();
                        m_pos_sum_all_lang.put(p, ps);
                    }
                    ps.addPOS(n_meaning, page_title);
                }

                { // POS statistics of this language
                    Map<POS,POSStat> m_pos_stat = m_lang_pos_pos_stat.get(lang);
                    if(null == m_pos_stat) {
                        m_pos_stat = new HashMap<POS,POSStat>();
                        m_lang_pos_pos_stat.put(lang, m_pos_stat);
                    }
                    POSStat ps2 = m_pos_stat.get(p);
                    if(null == ps2) {
                        ps2 = new POSStat();
                        m_pos_stat.put(p, ps2);
                    }
                    ps2.addPOS(n_meaning, page_title);
                }
            }

            /* list of rich words... todo
            boolean b_added = false;
            if((native_lang == lang && n_meaning >= threshold_meanings_native) ||
               (native_lang != lang && n_meaning >= threshold_meanings_foreign))
            {
                b_added = true;
                words_with_many_meanings.add(lang_pos_not_recursive);// List of the words with the maximum number of semantic relations.
            }*/

            // progress report after each 1000 records
            if(0 == n_cur % 1000) {   // % 100
                if(DEBUG && n_cur > 1999)
                    break;

                long    t_cur, t_remain;

                t_cur  = System.currentTimeMillis() - t_start;
                // remaining minutes = records left * (elapsed milliseconds per record) / 60000
                t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur)));
                t_cur = (long)(t_cur/(60f*1000f));

                System.out.println(n_cur + ": " +
                    ", duration: "  + t_cur +   // t_cur/(60f*1000f) +
                    " min, remain: " + t_remain +
                    " min");
            }
        }
    } catch(SQLException ex) {
        System.out.println("SQLException (POSAndPolysemyTableAll.countPOS()): " + ex.getMessage());
    } finally {
        // closing is best effort: there is nothing useful to do if it fails
        if (rs != null) {   try { rs.close(); } catch (SQLException ignored) { }  rs = null; }
        if (s != null)  {   try { s.close();  } catch (SQLException ignored) { }  s = null;  }
    }

    // NOTE(review): the counter printed first includes the entries without meanings too,
    // though the message says "with meanings"; the text is kept since it is part of the generated page.
    System.out.println("\nNumber of words (with meanings) with unknown POS: " + n_unknown_pos);
    System.out.println("\nThe total of all unique noun, verb, etc. (+ with empty definitions): " + n_langpos);
    System.out.println("\nNumber of empty definitions: " + mean_histogram [0]);
    System.out.println("\nNumber of words (unique noun, verb, etc.) with nonempty definitions: " + n_nonempty_meaning);
    System.out.println("\nNumber of records in the table lang_pos: " + n_total);

    return m_lang_pos_pos_stat;
}
/** Generates the wiki page with statistics about parts of speech and polysemy:
 * connects to the database of the parsed English (or Russian) Wiktionary,
 * gathers the data by countPOS() and prints the tables to System.out.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {

    // switch between the English (true) and the Russian (false) Wiktionary
    boolean b_english = true;

    // Connect to wikt_parsed database
    Connect wikt_parsed_conn = new Connect();
    LanguageType native_lang;
    if(b_english) {
        native_lang = LanguageType.en;
        wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, LanguageType.en);
    } else {
        native_lang = LanguageType.ru;
        wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru);
    }

    TLang.createFastMaps(wikt_parsed_conn);
    TPOS.createFastMaps(wikt_parsed_conn);
    //initLangEntries();

    // header and introduction of the page
    CommonPrinter.printHeader (wikt_parsed_conn.getDBName());

    System.out.println("This page outlines:");
    System.out.println("* Number of meanings.");
    System.out.println("* Number of empty definitions for each language.");
    System.out.println("* Number of entries for each part of speech (POS).");
    //System.out.println("\nNumber of entries for each part of speech (POS). Number of meanings.");

    System.out.println("\nSee about Part of Speech (POS) headers:");
    System.out.println("* [http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained/POS_headers#Standard_non-POS_level_3_headers Wiktionary:Entry layout explained/POS headers]");
    System.out.println("* [http://ru.wiktionary.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D1%81%D0%BB%D0%BE%D0%B2%D0%B0%D1%80%D1%8C:%D0%A7%D0%B0%D1%81%D1%82%D0%B8_%D1%80%D0%B5%D1%87%D0%B8 Приложение:Части речи]");
    System.out.println("* [http://ru.wiktionary.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%A8%D0%B0%D0%B1%D0%BB%D0%BE%D0%BD%D1%8B_%D1%81%D0%BB%D0%BE%D0%B2%D0%BE%D0%B8%D0%B7%D0%BC%D0%B5%D0%BD%D0%B5%D0%BD%D0%B8%D0%B9 Категория:Шаблоны словоизменений]");

    // section: histogram of meanings
    System.out.println("\n= Meanings =");
    Map<LanguageType, Map<POS,POSStat>> m_lang_pos =
            POSAndPolysemyTableAll.countPOS(wikt_parsed_conn, native_lang);

    // todo print number of rows (i.e. number of languages (words) with definitions)
    // ...

    POSAndPolysemyPrinter.printHistogramPerlanguage(mean_histogram, max_meanings_to_print,
                                                    m_lang_histogram);

    // section: parts of speech, all languages together (with templates and short names)
    System.out.println("\n= Part of speech =");
    System.out.println("\n== Total (all entries) ==");
    POSAndPolysemyPrinter.printPOSSensesAndPolysemy(native_lang, m_pos_sum_all_lang, true);

    // sections of separate languages: section_headers[i] is the title of section_langs[i]
    String[] section_headers;
    LanguageType[] section_langs;
    if(b_english) {
        // English order
        section_headers = new String[] {
            "\n== English entries ==",
            "\n== Russian entries ==",
            "\n== Finnish entries ==",
            "\n== Ukrainian entries ==",
            "\n== French entries ==",
            "\n== German entries ==",
            "\n== Serbian entries ==",
            "\n== Tatar entries ==",
            "\n== Esperanto entries ==",
            "\n== Latin entries ==",
            "\n== Italian entries ==",
            "\n== Swedish entries ==",
            "\n== Spanish entries ==",
            "\n== Mandarin entries =="
        };
        section_langs = new LanguageType[] {
            LanguageType.en,
            LanguageType.ru,
            LanguageType.fi,
            LanguageType.uk,
            LanguageType.fr,
            LanguageType.de,
            LanguageType.sr,
            LanguageType.tt,
            LanguageType.eo,
            LanguageType.la,
            LanguageType.it,
            LanguageType.sv,
            LanguageType.es,
            LanguageType.cmn
        };
    } else {
        // Russian order
        section_headers = new String[] {
            "\n== Russian entries ==",
            "\n== Ukrainian entries ==",
            "\n== English entries ==",
            "\n== French entries ==",
            "\n== German entries ==",
            "\n== Serbian entries ==",
            "\n== Tatar entries ==",
            "\n== Belarusian entries ==",
            "\n== Esperanto entries ==",
            "\n== Bashkir entries =="
        };
        section_langs = new LanguageType[] {
            LanguageType.ru,
            LanguageType.uk,
            LanguageType.en,
            LanguageType.fr,
            LanguageType.de,
            LanguageType.sr,
            LanguageType.tt,
            LanguageType.be,
            LanguageType.eo,
            LanguageType.ba
        };
    }

    // the tables of separate languages are printed without templates and short names
    for(int i = 0; i < section_headers.length; i++) {
        System.out.println(section_headers[i]);
        POSAndPolysemyPrinter.printPOS(native_lang, section_langs[i], m_lang_pos, m_lang_unknown_pos, false);
    }

    CommonPrinter.printFooter();
    wikt_parsed_conn.Close();
}
}