/* WikiPrinterStat.java - Printer (in wiki format) of statistics * of the newly created (parsed) database of the Wiktionary. * * Copyright (c) 2009-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wikt.stat.printer; //import wikipedia.sql.Connect; import wikokit.base.wikt.sql.TLangPOS; import wikokit.base.wikt.constant.Relation; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikipedia.sql.Connect; import wikokit.base.wikt.api.WTRelation; import java.util.Map; import java.util.List; /** Statistics of the database of the parsed Wiktionary. * * The result could be inserted into the Wiktionary page. * @see http://ru.wiktionary.org/wiki/User:AKA MBG/Статистика:Семантические_отношения * @see todo */ public class CommonPrinter { public static void printHeader (String db_name) { System.out.println("\nThe parsed database name: " + db_name +"<ref>" + "This (or more recent) database would be available at the project site " + "[http://code.google.com/p/wikokit wikokit], see Download section at page [http://whinger.krc.karelia.ru/soft/wikokit/index.html whinger.krc.karelia.ru]." + "</ref>"); } public static void printHeaderXML (String db_name) { System.out.println("<!-- The parsed database name: " + db_name +". -->"); System.out.println("<!-- This (or more recent) database would be available at the project site (http://code.google.com/p/wikokit), see Download section at page (http://whinger.krc.karelia.ru/soft/wikokit/index.html). -->"); } public static void printFooter () { System.out.println("\n== References ==\n<references />\n"); } /** Prints statistics about relations in Wiktionary. * * @param m_lang_rel_n map of maps with number of synonyms, antonyms, etc. * in English, Russian etc. (lang -> relations -> count) * * @param m_lang_entries_number number of (Language & POS level) entries per language */ public static void printRelationsPerLanguage ( LanguageType native_lang, Map<LanguageType, Map<Relation,Integer>> m_lang_rel_n, Map<LanguageType,Integer> m_lang_entries_number) { // print header line System.out.println("=== Number of relations per language ==="); System.out.println("\n'''Number of entries''' is a number of (Language & POS level) entries per language. E.g. the Wiktionary article \"[[:en:rook|rook]]\" contains three English and two Dutch entries of Part Of Speech level (Etymologies)."); System.out.println("\n'''Total''' is a total number of relations, i.e. synonyms + antonyms + etc...\n"); System.out.println("{| class=\"sortable prettytable\" style=\"text-align: center;\""); System.out.println("! Language name"); System.out.println("! Code"); // Code - enwiki, Template - ruwiki if(LanguageType.en != native_lang) System.out.println("! in " + native_lang.getName()); System.out.println("! Number of entries"); //Collection<Relation> all_rel = Relation.getAllRelations(); Relation[] all_rel = { Relation.synonymy, Relation.antonymy, Relation.hypernymy, Relation.hyponymy, Relation.holonymy, Relation.meronymy}; System.out.println("! total"); // " Number of semantic relations" for(Relation r : all_rel) { System.out.println("! " + r.toString()); } // print values for(LanguageType lang : m_lang_rel_n.keySet()) { /*if(!m_lang_rel_n.containsKey(lang)) System.out.println(lang.toString() + " : 0"); else {*/ //System.out.print("|| " + lang.getName() + " || " + lang.getCode()); System.out.println("|-"); System.out.print("|" + lang.getName() + "||" + lang.getCode()); if(LanguageType.en != native_lang) { String local_name = ""; if (lang.hasTranslation(native_lang)) local_name = lang.translateTo(native_lang); System.out.print("||" + local_name); } System.out.print("||" + m_lang_entries_number.get(lang)); //|- //! Abaza Map<Relation,Integer> rel_n = m_lang_rel_n.get(lang); int total = 0; // number of relations for one language: synonyms + antonyms + ... for(Relation r : all_rel) total += (rel_n.containsKey(r) ? rel_n.get(r) : 0); System.out.print("||" + total); for(Relation r : all_rel) { int n = rel_n.containsKey(r) ? rel_n.get(r) : 0; System.out.print("||" + n); total += n; } System.out.println(); } System.out.println("|}"); } /** Prints statistics about number of words per number of relation types in Wiktionary. * * @param ... */ public static void printRelationsTypeHistogram ( int[] rel_type_histogram, Map<Relation,Integer>[] m_relation_type_number ) { System.out.println("=== Number of words per number of relation types ===\n"); System.out.println("Number of words which have the following number of types of semantic relations. E.g.:"); System.out.println("\n1 | number of words (one language, one part of speech) which have only Synonyms, or only Antonyms, etc."); System.out.println("\n2 | number of words with two types of relation, e.g. Synonymy and Antonymy, or Synonymy and Hypernymy, etc.\n"); System.out.println("{| class=\"sortable prettytable\" style=\"text-align: center;\""); System.out.println("! Number of relation types"); System.out.println("! Number of words"); //Collection<Relation> all_rel = Relation.getAllRelations(); Relation[] all_rel = { Relation.synonymy, Relation.antonymy, Relation.hypernymy, Relation.hyponymy, Relation.holonymy, Relation.meronymy, Relation.troponymy, Relation.coordinate_term, Relation.otherwise_related }; System.out.println("! total"); // " Number of semantic relations" for(Relation r : all_rel) System.out.println("! " + r.toString()); for(int i=1; i < rel_type_histogram.length; i++) { System.out.println("|-"); System.out.print("|" + i + "||" + rel_type_histogram[i]); //|- //! Abaza Map<Relation,Integer> rel_n = m_relation_type_number[i]; if(null == rel_n) { System.out.print("||0"); for(Relation r : all_rel) System.out.print("||0"); } else { int total = 0; // number of relations for one language: synonyms + antonyms + ... for(Relation r : all_rel) total += (rel_n.containsKey(r) ? rel_n.get(r) : 0); System.out.print("||" + total); for(Relation r : all_rel) { int n = rel_n.containsKey(r) ? rel_n.get(r) : 0; System.out.print("||" + n); total += n; } } System.out.println(); } System.out.println("\n|}"); } /** Maximum "number of relations" will be printed in the table: * (2) Number of words per number of relations * @see http://en.wiktionary.org/wiki/User:AKA_MBG/Statistics:Semantic_relations#Number_of_words_per_number_of_relations */ //static final Integer max_relations_to_print = 50; /** Prints statistics-histogram about number of relations in Wiktionary. * * @param max_values_to_print values rel_histogram[0..max_values_to_print-1] will be printed * @param histogram with number of semantic relations, i.e. * [0] = number of words (one language, one part of speech) without any semantic relations, * [1] = number of words with one relation, etc. */ public static void printRelationHistogram (int[] rel_histogram, int max_values_to_print) { // print header line System.out.println("\n=== Number of words per number of relations ===\n"); System.out.println("Number of words which have the following number of semantic relations. E.g.:"); System.out.println("\n0 | number of words (one language, one part of speech) without any semantic relations"); System.out.println("\n1 | number of words with one relation, e.g. one synonym or one antonym, etc.\n"); System.out.println("\nOnly the first " + max_values_to_print + " rows are presented in the table."); System.out.println("{| class=\"sortable prettytable\" style=\"text-align: center;\""); System.out.println("! Number of relations"); System.out.println("! Number of words"); int max = Math.min(rel_histogram.length, max_values_to_print); for(int i=0; i<max; i++) { int n_rel = rel_histogram[i]; if(0 == n_rel) continue; //System.out.print("|| " + lang.getName() + " || " + lang.getCode()); System.out.println("|-"); System.out.println("|" + i + "||" + n_rel); } System.out.println("|}"); } /** Prints the words with the maximum number of semantic relations. * * @param words_rich_in_relations list of the words with the maximum number * of semantic relations * @param threshold_relations the minimum number of relations of these words */ public static void printWordsWithManyRelations ( LanguageType native_lang, Connect wikt_parsed_conn, List<TLangPOS> words_rich_in_relations, int threshold_relations_foreign, int threshold_relations_native, int threshold_type_relations) { // print header line System.out.println("\n=== List of words with many semantic relations ===\n"); System.out.println("There are " + words_rich_in_relations.size() + " words which have >= " + threshold_relations_native + " ("+ native_lang.getName() +"), " + " >= " + threshold_relations_foreign + " (other languages) " + " semantically related words or >= " + threshold_type_relations + " types of semantic relations."); System.out.println("\n{| class=\"sortable prettytable\" style=\"text-align: center;\""); System.out.println("! Word"); System.out.println("! Number<br>of<br>relations"); System.out.println("! Types<br>of<br>semantic<br>relations"); System.out.println("! Number<br>of<br>meanings"); System.out.println("! POS"); System.out.println("! Language name"); for(TLangPOS lang_pos : words_rich_in_relations) { String page_title = lang_pos.getPage().getPageTitle(); String pos = lang_pos.getPOS().getPOS().toString(); String lang_code = lang_pos.getLang().getLanguage().getCode(); String lang_name = lang_pos.getLang().getLanguage().getName(); int n_meaning = lang_pos.countMeanings(); int n_relation = WTRelation.getNumberByPageLang(wikt_parsed_conn, lang_pos); int n_types_relation = lang_pos.countRelationTypes(); // wikitext has problems with a symbol '/', so print [[:/]] instead of a subpage link: [[/]] if(page_title.equalsIgnoreCase("/")) page_title = ":/"; System.out.println("|-"); System.out.println("|[[" + page_title + "]]||" + n_relation + "||" + n_types_relation + "||" + n_meaning + "||" + pos + "||" + lang_name); } System.out.println("|}"); } /** Prints statistics about (translations, or quotes, ...) per language * in Wiktionary. * * @param m_lang_n map of maps with number of translations into * English, Russian etc. (lang -> count) */ public static int printSomethingPerLanguage ( LanguageType native_lang, Map<LanguageType, Integer> m_lang_n) { int total = 0; // total number of translations. or quotations, or... // print header line //System.out.println("\n=== Quote languages ===\n\n"); System.out.println("{| class=\"sortable prettytable\""); System.out.println("! Language name"); System.out.println("! Number"); if(LanguageType.en == native_lang) { System.out.println("! Language code"); } else { // e.g. Russian System.out.println("! in " + native_lang.getName()); // let's print translations of the language for non-English Wiktionaries System.out.println("! Template"); } // print values for(LanguageType lang : m_lang_n.keySet()) { System.out.println("|-"); int n = m_lang_n.get(lang); //System.out.print("|-\n! " + lang.getName() + " || " + lang.getCode() + "\n|| " + n); System.out.print("|" + lang.getName() + "||" + n); //System.out.print("|| " + lang.getCode() + " || " + lang.getName()); // System.out.println(" || " + n + " ||"); if(LanguageType.en == native_lang) { System.out.print("||" + lang.getCode()); } else { String local_name = ""; if (lang.hasTranslation(native_lang)) local_name = lang.translateTo(native_lang); System.out.print("||" + local_name); System.out.print("||{{" + lang.getCode() + "}}"); } System.out.println(); total += n; } System.out.println("|}"); //System.out.println( "Total translations: " + total); return total; } }