/* RelationTableAll.java - relations' statistics in the database of the parsed Wiktionary.
*
* Copyright (c) 2005-2013 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikt.stat;
import wikokit.base.wikt.sql.TLang;
import wikokit.base.wikt.sql.TLangPOS;
import wikokit.base.wikt.sql.TMeaning;
import wikokit.base.wikt.sql.TPOS;
import wikokit.base.wikt.sql.TRelationType;
import wikokit.base.wikt.sql.TPage;
import wikokit.base.wikipedia.sql.Statistics;
import wikokit.base.wikipedia.sql.Connect;
import wikt.stat.printer.CommonPrinter;
import wikokit.base.wikt.constant.Relation;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikt.api.WTRelation;
import java.sql.*;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import wikokit.base.wikt.constant.POS;
/** Relations' statistics in the database of the parsed Wiktionary.
 *
 * All accumulators of this class are static and are never reset, so
 * calling the counting methods more than once in the same JVM adds
 * the new results to the previous ones.
 */
public class RelationTableAll {

    private static final boolean DEBUG = false;

    /** Number of milliseconds in one minute (used by the progress reports). */
    private static final float MS_PER_MINUTE = 60f*1000f;

    /** Let's constrain the maximum number of semantic relation for one word
     * in one language */
    private static final int max_relation = 200;
    private static final int[] rel_histogram = new int[max_relation];

    /** Let's constrain the maximum number of types of semantic relation
     * for one word in one language */
    private static final int max_type_relation = 10;
    private static final int[] rel_type_histogram = new int[max_type_relation];

    /** List of the words with the maximum number of semantic relations,
     * or the maximum number of types of semantic relations. */
    private static final List<TLangPOS> words_rich_in_relations = new ArrayList<TLangPOS>();

    /** Number of semantic relations per type,
     * i.e. number of synonyms for words, number of antonyms, etc.
     * The array index is the number of relation types of a word. **/
    @SuppressWarnings("unchecked") // generic array creation is impossible in Java, the array holds only Map<Relation,Integer>
    private static final Map<Relation,Integer>[] m_relation_type_number = new HashMap[max_type_relation];

    /** Number of words (pairs: language & part of speech)
     * with semantic relations */
    private static int lang_pos_with_relations;

    /** Number of (Language & POS level) entries per language. **/
    private static final Map<LanguageType,Integer> m_lang_entries_number = new HashMap<LanguageType,Integer>();

    /** Counts number of semantic relations for each language
     * by selecting all relations from the database of the parsed Wiktionary.<br><br>
     * SELECT FROM relation;
     *
     * Relations which cannot be attributed to a language (the meaning,
     * the relation type, the lang_pos or the language is unknown) are counted
     * as "Unknown" and are printed at the end.
     *
     * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
     * @return map of maps with number of synonyms (etc.) in English (etc.)
     */
    public static Map<LanguageType, Map<Relation,Integer>> countRelationsPerLanguage (
                                                            Connect wikt_parsed_conn) {
        // lang -> relations -> count
        Statement s = null;
        ResultSet rs = null;
        int n_unknown_lang_pos = 0; // relations which belong to words with unknown language and POS

        int n_total = Statistics.Count(wikt_parsed_conn, "relation");
        System.out.println("Total relations: " + n_total);
        long t_start = System.currentTimeMillis();

        Map<LanguageType, Map<Relation,Integer>> m_lang_rel_n = new HashMap<LanguageType, Map<Relation,Integer>>();
        try {
            s = wikt_parsed_conn.conn.createStatement ();
            rs = s.executeQuery ("SELECT id,meaning_id,wiki_text_id,relation_type_id FROM relation");
            int n_cur = 0;
            while (rs.next ())
            {
                n_cur ++;
                TMeaning m = TMeaning.getByID(wikt_parsed_conn, rs.getInt("meaning_id"));
                TRelationType tr = TRelationType.getRelationFast( rs.getInt("relation_type_id"));

                // resolve the chain: meaning -> lang_pos -> language, stop at the first unknown link
                TLangPOS lang_pos = (null != m && null != tr) ? m.getLangPOS(wikt_parsed_conn) : null;
                TLang tlang = (null != lang_pos) ? lang_pos.getLang() : null; // TPOS tpos = lang_pos.getPOS(); // future statistics

                if(null == tlang) {
                    n_unknown_lang_pos ++;
                } else {
                    Relation r = tr.getRelation();
                    assert(null != r);
                    incRelation(m_lang_rel_n, tlang.getLanguage(), r);
                }

                if(0 == n_cur % 1000) {   // % 100
                    if(DEBUG && n_cur > 333)
                        break;
                    TPage tpage = (null != lang_pos) ? lang_pos.getPage() : null;
                    printProgress(n_cur, n_total, t_start,
                                  (null != tpage) ? tpage.getPageTitle() : "");
                }
            }
        } catch(SQLException ex) {
            System.out.println("SQLException (RelationTableAll.countRelationsPerLanguage()): " + ex.getMessage());
        } finally {
            closeQuietly(rs, s);
        }

        float t_work = (System.currentTimeMillis() - t_start)/1000f; // in sec
        System.out.println("\nTime sec:" + t_work +
                "\nTotal relations: " + n_total +
                "\n\nUnknown<ref>'''Unknown''' - relations which belong to words with unknown language and POS</ref>: " +
                n_unknown_lang_pos);
        return m_lang_rel_n;
    }

    /** Adds one relation 'r' to the counters of the language 'lang',
     * the inner map is created when the language is met for the first time. */
    private static void incRelation (Map<LanguageType, Map<Relation,Integer>> m_lang_rel_n,
                                     LanguageType lang, Relation r) {
        Map<Relation,Integer> rel_n = m_lang_rel_n.get(lang);
        if(null == rel_n) {
            rel_n = new HashMap<Relation,Integer>();
            m_lang_rel_n.put(lang, rel_n);
        }
        Integer n = rel_n.get(r);
        rel_n.put(r, (null == n) ? 1 : n + 1);
    }

    /** Prints the progress line: the number of processed rows, the title of
     * the current page, the elapsed time and the remaining time in minutes.
     * The remaining time is a linear extrapolation of the time spent
     * for the first n_cur rows.
     *
     * @param n_cur      number of rows processed so far (greater than zero)
     * @param n_total    total number of rows to process
     * @param t_start    start time, milliseconds (System.currentTimeMillis)
     * @param page_title title of the current page, or empty string if unknown
     */
    private static void printProgress (int n_cur, int n_total, long t_start, String page_title) {
        long t_cur = System.currentTimeMillis() - t_start;
        // where time for 1 row = t_cur / n_cur; in min, since /(60*1000)
        long t_remain = (long)((n_total - n_cur) * t_cur/(MS_PER_MINUTE*(float)(n_cur)));
        t_cur = (long)(t_cur/MS_PER_MINUTE);
        System.out.println(n_cur + ": " + page_title +
                ", duration: "  + t_cur +
                " min, remain: " + t_remain +
                " min");
    }

    /** Closes the result set and the statement (each may be null),
     * failures of close() are ignored since nothing can be done with them here. */
    private static void closeQuietly (ResultSet rs, Statement s) {
        if (rs != null) { try { rs.close(); } catch (SQLException ignored) { } }
        if (s  != null) { try {  s.close(); } catch (SQLException ignored) { } }
    }

    /** Initialize (set to zero) number of entries for each language. */
    public static void initLangEntries () {
        Set<LanguageType> languages = TLang.getAllLanguages().keySet();
        for(LanguageType lang : languages)
            m_lang_entries_number.put(lang, 0);
    }

    /** Increments number of entries for the given language 'lt'.
     * A language which was not registered by initLangEntries() is counted
     * starting from zero; the null language is ignored.
     *
     * @param lt language of the entry, may be null
     */
    public static void incLangEntry (LanguageType lt) {
        if(null == lt)
            return;
        Integer old = m_lang_entries_number.get(lt);
        m_lang_entries_number.put(lt, (null == old) ? 1 : old + 1);
    }

    /** Counts number of semantic relations for each number of relations per
     * word. Fills
     * (1) 'rel_histogram' - number of words per number of semantic relations;
     * (2) 'rel_type_histogram' - number of words per number of types of
     * semantic relations;
     * (3) 'words_rich_in_relations' - list of the words with the maximum number
     * of semantic relations, or the maximum number of types of semantic relations;
     * (4) 'm_relation_type_number'
     * (5) 'lang_pos_with_relations' number of lang_pos with semantic relations
     * (6) 'm_lang_entries_number' number of lang_pos entries per language
     * .<br><br>
     *
     * SELECT id FROM lang_pos;
     *
     * The lang_pos records without language or without page are skipped.
     *
     * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
     *
     * @param native_lang native language of the Wiktionary edition, it selects
     *                  which of the two thresholds is applied to the word
     *
     * @param threshold_relations_foreign number (or more) of relations
     *                  the foreign word have to have in order to be included
     *                  into the list RelationTableAll.words_rich_in_relations,
     *                  it used since native words usually push out
     *                  foreign words from the list, so
     *                  threshold_relations_foreign << threshold_relations_native
     *
     * @param threshold_relations_native the same constraint for the words
     *                  in native language
     *
     * @param threshold_type_relations number (or more) of types
     *                  of the semantic relations the word have
     *                  to have in order to be included into the
     *                  list RelationTableAll.words_rich_in_relations
     */
    public static void countRelationsHistogram (Connect wikt_parsed_conn,
                            LanguageType native_lang,
                            int threshold_relations_foreign, int threshold_relations_native,
                            int threshold_type_relations) {
        // lang_pos -> meaning -> relations -> count
        Statement s = null;
        ResultSet rs = null;
        int n_unknown_pos__in_rich_words = 0; // number of words (with many relations) with unknown POS

        int n_total = Statistics.Count(wikt_parsed_conn, "lang_pos");
        long t_start = System.currentTimeMillis();
        try {
            s = wikt_parsed_conn.conn.createStatement ();
            rs = s.executeQuery ("SELECT id FROM lang_pos");
            int n_cur = 0;
            while (rs.next ())
            {
                n_cur ++;
                int id = rs.getInt("id");
                TLangPOS lang_pos_not_recursive = TLangPOS.getByID (wikt_parsed_conn, id);// fields are not filled recursively
                if(null == lang_pos_not_recursive)
                    continue;

                TLang tlang = lang_pos_not_recursive.getLang();
                TPage tpage = lang_pos_not_recursive.getPage();
                if(null == tlang || null == tpage)
                    continue;   // the entry cannot be attributed to a language or to a page

                LanguageType lang = tlang.getLanguage();
                incLangEntry(lang);
                String page_title = tpage.getPageTitle();
                int n_relation = WTRelation.getNumberByPageLang(wikt_parsed_conn, lang_pos_not_recursive);

                // NOTE(review): the words with exactly one relation (i.e. one type
                // of relations) are not added to 'rel_type_histogram' and
                // 'm_relation_type_number' - confirm that this is intended.
                if(n_relation > 1) {
                    // there is a reason to calculate: number of types of semantic relations
                    TLangPOS lang_pos = findRecursiveLangPOS(wikt_parsed_conn, tpage, id);
                    if(null == lang_pos) {
                        System.out.println("Error (RelationTableAll.countRelationsHistogram()): lang_pos with id=" +
                                id + " is not found for the word=" + page_title);
                    } else {
                        if(POS.unknown == lang_pos.getPOS().getPOS())
                            n_unknown_pos__in_rich_words ++;

                        int threshold = (native_lang == lang) ? threshold_relations_native
                                                              : threshold_relations_foreign;
                        boolean b_added = n_relation >= threshold;
                        if(b_added)
                            words_rich_in_relations.add(lang_pos);// List of the words with the maximum number of semantic relations.

                        int n_type_relation = lang_pos.countRelationTypes();
                        if(n_type_relation >= threshold_type_relations && !b_added)
                            words_rich_in_relations.add(lang_pos);

                        if(n_type_relation < max_type_relation) {
                            rel_type_histogram [n_type_relation] ++;
                            if(null == m_relation_type_number [n_type_relation])
                                m_relation_type_number [n_type_relation] = new HashMap<Relation,Integer> ();
                            lang_pos.addNumberOfRelationPerType(m_relation_type_number [n_type_relation]);
                        } else
                            System.out.println("Error (RelationTableAll.countRelationsHistogram()): n_types_relation=" +
                                    n_type_relation + " >= max_types_relation for the word=" + page_title);
                    }
                }

                if(n_relation < max_relation)
                    rel_histogram [n_relation] ++;
                else
                    System.out.println("Error (RelationTableAll.countRelationsHistogram()): n_relation=" +
                            n_relation + " >= max_relation for the word=" + page_title);

                if(n_relation > 0)
                    lang_pos_with_relations ++;

                if(0 == n_cur % 1000) {   // % 100
                    if(DEBUG && n_cur > 333)
                        break;
                    printProgress(n_cur, n_total, t_start, page_title);
                }
            }
        } catch(SQLException ex) {
            System.out.println("SQLException (RelationTableAll.countRelationsHistogram()): " + ex.getMessage());
        } finally {
            closeQuietly(rs, s);
        }
        System.out.println("\nNumber of words (with many relations) with unknown POS: " + n_unknown_pos__in_rich_words);
    }

    /** Gets lang_pos with filled fields (in order to get number of meanings,
     * relations, etc.) by selecting all lang_pos of the page 'tpage'
     * and searching the one with the given 'id'.
     *
     * @return null if there is no lang_pos with this id at the page
     */
    private static TLangPOS findRecursiveLangPOS (Connect wikt_parsed_conn, TPage tpage, int id) {
        TLangPOS[] lang_pos_array = TLangPOS.getRecursive(wikt_parsed_conn, tpage);
        if(null == lang_pos_array)
            return null;
        for(TLangPOS tlp : lang_pos_array) {
            if(tlp.getID() == id)
                return tlp;
        }
        return null;
    }

    /** Calculates and prints the statistics of semantic relations
     * of the parsed English Wiktionary database.
     *
     * @param args is not used
     */
    public static void main(String[] args) {

        // Connect to wikt_parsed database
        Connect wikt_parsed_conn = new Connect();
        int threshold_relations_foreign, threshold_relations_native,
            threshold_type_relations;

        /*// Russian
        LanguageType native_lang = LanguageType.ru;
        threshold_relations_native = 42; // 40
        threshold_relations_foreign = 12; // 10; threshold_relations = 14;
        threshold_type_relations = 5;
        if(DEBUG) threshold_relations_native = 3;
        wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru);
        */

        // English
        LanguageType native_lang = LanguageType.en;
        threshold_relations_native = 40; // for English words, more strict rules, too much rich words
        threshold_relations_foreign = 21; // 10 21
        threshold_type_relations = 4; // 3 4
        wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, LanguageType.en);

        try {   // the connection have to be closed even if the statistics failed
            TLang.createFastMaps(wikt_parsed_conn);
            TPOS.createFastMaps(wikt_parsed_conn);
            TRelationType.createFastMaps(wikt_parsed_conn);
            RelationTableAll.initLangEntries();

            String db_name = wikt_parsed_conn.getDBName();
            System.out.println("\n== Statistics of semantic relations in the Wiktionary parsed database ==");
            CommonPrinter.printHeader (db_name);

            Map<LanguageType, Map<Relation,Integer>> m = RelationTableAll.countRelationsPerLanguage(wikt_parsed_conn);

            // fills rel_histogram, rel_type_histogram
            RelationTableAll.countRelationsHistogram(wikt_parsed_conn, native_lang,
                            threshold_relations_foreign, threshold_relations_native,
                            threshold_type_relations);

            System.out.println("\nWords (pairs: language & part of speech) with semantic relations: " + lang_pos_with_relations);
            System.out.println("\nLanguages with semantic relations: " + m.size());
            System.out.println();

            //WTStatisticsGoogleWiki.printRelationsPerLanguage(m);
            CommonPrinter.printRelationsPerLanguage(native_lang, m, m_lang_entries_number);

            // Maximum "number of relations" will be printed in the table:
            // (2) Number of words per number of relations
            // see http://en.wiktionary.org/wiki/User:AKA_MBG/Statistics:Semantic_relations#Number_of_words_per_number_of_relations
            int max_relations_to_print = 50;
            CommonPrinter.printRelationHistogram(rel_histogram, max_relations_to_print);
            CommonPrinter.printRelationsTypeHistogram (rel_type_histogram, m_relation_type_number);
            CommonPrinter.printWordsWithManyRelations(native_lang, wikt_parsed_conn,
                            words_rich_in_relations,
                            threshold_relations_foreign, threshold_relations_native,
                            threshold_type_relations);
            CommonPrinter.printFooter();
        } finally {
            wikt_parsed_conn.Close();
        }
    }
}