/* POSAndPolysemyPrinter.java - Print (in wiki format)
* parts of speech statistics and data about polysemy
* in the database of the parsed Wiktionary.
*
* Copyright (c) 2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikt.stat.printer;
import wikokit.base.wikt.constant.POS;
import wikt.stat.POSAndPolysemyTableAll.POSStat;
import java.util.Map;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.util.PrintfFormat;
/** Prints (in wiki table format) statistics of parts of speech and polysemy
 * of the parsed Wiktionary.
 *
 * The result could be inserted into the Wiktionary page.
 * All methods write to {@code System.out}.
 *
 * @see <a href="http://ru.wiktionary.org/wiki/User:AKA_MBG/Статистика:POS">Example of a generated statistics page</a>
 * @see <a href="http://wordnet.princeton.edu/wordnet/man/wnstats.7WN.html">WordNet statistics (wnstats)</a>
 */
public class POSAndPolysemyPrinter {

    /** Opening line shared by every wiki table printed by this class. */
    private static final String WIKI_TABLE_START =
            "{| class=\"sortable prettytable\" style=\"text-align: center;\"";

    /** Prints a wiki table with one row per part of speech (POS), containing
     * (1) statistics about parts of speech: unique strings, word-sense pairs,
     *     maximum number of senses and the entry having it;
     * (2) names of POS: the value of {@code POS.toString()}, the name in the
     *     native language (column skipped if the native language is English),
     *     and, optionally, the short name and templates.
     *
     * Nothing is printed if {@code m_lang_pos} is null.
     *
     * @param native_lang native language of the Wiktionary, must not be null
     * @param m_lang_pos statistics for each part of speech (POS -> statistics)
     * @param print_templates_and_short_names if true, then the columns
     *              "Short name" and "Templates" are added to the table
     */
    public static void printPOSWordsAndSensesTable (
                        LanguageType native_lang,
                        Map<POS,POSStat> m_lang_pos,
                        boolean print_templates_and_short_names)
    {
        if(null == m_lang_pos)
            return;

        // the column with native POS names is skipped for English
        boolean print_native_pos = LanguageType.en != native_lang;

        // print header line
        System.out.println(WIKI_TABLE_START);
        System.out.print("! Unique Strings || Total Word-Sense Pairs || POS");

        if(print_native_pos)
            System.out.print(" || POS in " + native_lang.getName());

        if(print_templates_and_short_names)
            System.out.print(" || Short name || Templates");

        System.out.print(" || Max Senses || Entry");

        // print values, one table row per POS
        for(Map.Entry<POS,POSStat> entry : m_lang_pos.entrySet()) {
            POS pos = entry.getKey();
            POSStat pos_stat = entry.getValue();

            System.out.print("\n|-\n| " +
                    pos_stat.getNumberOfUniquesStrings() + " || " +
                    pos_stat.getNumberOfWordSensePairs() + " || " +
                    pos.toString());

            if(print_native_pos)
                System.out.print(" || " + pos.toString(native_lang));

            if(print_templates_and_short_names)
                System.out.print(" || " + pos.getShortName(native_lang) +
                                 " || " + pos.getTemplates(", ", native_lang));

            System.out.print(" || " + pos_stat.getMaxSenses() +
                             " || " + pos_stat.getWikifiedWordWithMaxSenses());
        }
        System.out.println("\n|}");
    }

    /** Prints a wiki table with polysemy data, one row per part of speech:
     * monosemous words, polysemous words and senses, and the average polysemy
     * (including and excluding monosemous words).
     *
     * Nothing is printed if {@code m_lang_pos} is null.
     *
     * @param native_lang native language of the Wiktionary, must not be null;
     *              the column with native POS names is skipped for English
     * @param m_lang_pos statistics for each part of speech (POS -> statistics)
     */
    public static void printPOSPolysemyTable (
                        LanguageType native_lang,
                        Map<POS,POSStat> m_lang_pos)
    {
        // same guard as in printPOSWordsAndSensesTable()
        if(null == m_lang_pos)
            return;

        boolean print_native_pos = LanguageType.en != native_lang;

        // print header line
        System.out.println(WIKI_TABLE_START);
        System.out.print("! POS");

        if(print_native_pos)
            System.out.print(" || POS in " + native_lang.getName());

        System.out.print(" || Monosemous Words and Senses || Polysemous Words || Polysemous Senses");
        System.out.print(" || Average Polysemy Including Monosemous Words || Average Polysemy Excluding Monosemous Words");

        // print values, one table row per POS
        for(Map.Entry<POS,POSStat> entry : m_lang_pos.entrySet()) {
            POS pos = entry.getKey();
            POSStat pos_stat = entry.getValue();

            System.out.print("\n|-\n| " + pos.toString());

            if(print_native_pos)
                System.out.print(" || " + pos.toString(native_lang));

            System.out.print( " || " + pos_stat.getMonosemous() +
                              " || " + pos_stat.getPolysemousWords() +
                              " || " + pos_stat.getPolysemousSenses() +
                              " || " + new PrintfFormat("%.2lg").sprintf(pos_stat.calcAveragePolysemyIncludingMonosemousWords()) +
                              " || " + new PrintfFormat("%.2lg").sprintf(pos_stat.calcAveragePolysemyExcludingMonosemousWords()));
        }
        System.out.println("\n|}");
    }

    /** Prints POS statistics of one language: the number of words with
     * unknown POS (if {@code m_lang_unknown_pos} is given) and the tables
     * printed by {@link #printPOSSensesAndPolysemy}.
     *
     * Nothing is printed if {@code map_lang_pos_all} is null or if it has
     * no data for {@code current_lang}.
     *
     * @param native_lang native language of the Wiktionary
     * @param current_lang language which statistics should be printed
     * @param map_lang_pos_all POS statistics for each language
     *              (lang -> POS -> statistics)
     * @param m_lang_unknown_pos number of unknown POS for each language,
     *              can be null
     * @param print_templates_and_short_names if true, then the columns
     *              "Short name" and "Templates" are added to the first table
     */
    public static void printPOS (
                        LanguageType native_lang,
                        LanguageType current_lang,
                        Map<LanguageType, Map<POS,POSStat>> map_lang_pos_all,
                        Map<LanguageType, Integer> m_lang_unknown_pos,
                        boolean print_templates_and_short_names)
    {
        if(null == map_lang_pos_all)
            return;

        Map<POS,POSStat> m_lang_pos = map_lang_pos_all.get(current_lang);
        if(null == m_lang_pos)
            return;

        if(null != m_lang_unknown_pos)
            System.out.println("\nNumber of words with unknown POS: " + m_lang_unknown_pos.get(current_lang));

        printPOSSensesAndPolysemy(native_lang, m_lang_pos, print_templates_and_short_names);
    }

    /** Prints two sections (each with a subheader and the number of rows):
     * the table "Number of words and senses" and the table
     * "Polysemy information".
     *
     * The header line of the whole block should be printed by the caller
     * before this function. Nothing is printed if {@code m_lang_pos} is null.
     *
     * @param native_lang native language of the Wiktionary
     * @param m_lang_pos statistics for each part of speech (POS -> statistics)
     * @param print_templates_and_short_names if true, then the columns
     *              "Short name" and "Templates" are added to the first table
     */
    public static void printPOSSensesAndPolysemy (
                        LanguageType native_lang,
                        Map<POS,POSStat> m_lang_pos,
                        boolean print_templates_and_short_names)
    {
        if(null == m_lang_pos)
            return;

        // both tables have one row per POS
        int rows = m_lang_pos.size();

        System.out.println("\n=== Number of words and senses ===");
        System.out.println("\nRows in the table: " + rows + "\n");
        printPOSWordsAndSensesTable(native_lang,
                                    m_lang_pos,
                                    print_templates_and_short_names);

        System.out.println("\n=== Polysemy information ===");
        System.out.println("\nRows in the table: " + rows + "\n");
        printPOSPolysemyTable ( native_lang,
                                m_lang_pos);
    }

    // Disabled constant, kept for reference: maximum "number of relations"
    // to be printed in the table "Number of words per number of relations", see
    // http://en.wiktionary.org/wiki/User:AKA_MBG/Statistics:Semantic_relations#Number_of_words_per_number_of_relations
    //static final Integer max_relations_to_print = 50;

    /** Prints statistics-histogram about number of meanings in Wiktionary.
     *
     * The table has columns 0..max, where max is the index of the last
     * non-zero value in total_histogram[0..max_values_to_print-1], and the
     * column "Total" (sum of the printed columns). The first row contains
     * total values, it is followed by one row per language.
     *
     * Nothing is printed if {@code total_histogram} is null, or if there are
     * no non-zero values in total_histogram[1..max_values_to_print-1].
     * Values which are absent in a language histogram (null array, too short
     * array or null element) are printed as 0.
     *
     * @param total_histogram with total number of meanings for all languages
     * @param max_values_to_print values histogram[0..max_values_to_print-1] will be printed
     * @param m_lang_histogram number of meanings for each language,
     *              if it is null then only the total row is printed
     */
    public static void printHistogramPerlanguage (int[] total_histogram, int max_values_to_print,
                        Map<LanguageType, Integer[]> m_lang_histogram) {

        if(null == total_histogram)
            return;

        // maximum number of meanings [0..max] to be printed in the table
        // max := first non-zero value in total_histogram[] from the end of array.
        // max stays 0 if nothing is found, so that an all-zero histogram
        // never leads to reading total_histogram[limit] (out of the window).
        int limit = Math.min(total_histogram.length, max_values_to_print);
        int max = 0;
        for(int i=limit-1; i>0; i--) {
            if(0 != total_histogram[i]) {
                max = i;
                break;
            }
        }
        if (0 == max)
            return;

        // print description of the table
        System.out.println("\n=== Number of words having different number of meanings / definitions ===\n");

        System.out.println("Table description:");
        System.out.println("* column 0 - number of words with empty definitions (total and for each language)");
        System.out.println("* column 1 - number of monosemous words (total and for each language)");
        System.out.println("* column 2 - number of words with two meanings, etc.");
        System.out.println("* last column (\"Total\") - total number of words for this language.");

        System.out.println("\nOnly the first " + max + " meanings (columns) are presented in the table.");

        // print header line
        System.out.println(WIKI_TABLE_START);
        System.out.print("! || Number of meanings: ");
        for(int i=0; i<=max; i++)
            System.out.print("||" + i);
        System.out.print("||Total");

        // Total (all languages)
        System.out.print("\n|-");
        System.out.print("\n! code || Total (all languages) :");
        int cur_total = 0;
        for(int i=0; i<=max; i++) {
            System.out.print("||" + total_histogram[i]);
            cur_total += total_histogram[i];
        }
        System.out.print("||" + cur_total);

        // one row per language
        if(null != m_lang_histogram) {
            for(Map.Entry<LanguageType, Integer[]> entry : m_lang_histogram.entrySet()) {
                LanguageType lang = entry.getKey();
                Integer[] h = entry.getValue();

                // the line break after the language name is part of the existing output
                System.out.println("\n|-\n! " + lang.getCode() + " || " + lang.getName());

                cur_total = 0;
                for(int i=0; i<=max; i++) {
                    int count = getCount(h, i);
                    System.out.print("||" + count);
                    cur_total += count;
                }
                System.out.print("||" + cur_total);
            }
        }
        System.out.println("\n|}");
    }

    /** Gets histogram[index], or 0 if the array is null, too short,
     * or the element is null.
     */
    private static int getCount(Integer[] histogram, int index) {
        if(null == histogram || index >= histogram.length || null == histogram[index])
            return 0;
        return histogram[index];
    }
}