/* QuoteTableAll.java - quotes' statistics in the database of the parsed Wiktionary.
*
* Copyright (c) 2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikt.stat;
import wikokit.base.wikt.sql.quote.TQuotRef;
import wikokit.base.wikt.sql.quote.TQuotSource;
import wikokit.base.wikt.sql.quote.TQuotAuthor;
import wikokit.base.wikt.sql.TLang;
import wikokit.base.wikt.sql.TMeaning;
import wikokit.base.wikt.sql.TLangPOS;
import wikokit.base.wikt.sql.TPOS;
import wikokit.base.wikt.sql.TPage;
import wikokit.base.wikipedia.sql.Statistics;
import wikokit.base.wikipedia.sql.Connect;
import wikt.stat.printer.CommonPrinter;
import uk.ac.shef.wit.simmetrics.similaritymetrics.*;
import wikokit.base.wikipedia.language.LanguageType;
//import wikipedia.language.Encodings;
import java.sql.*;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
/** Quotes' statistics in the database of the parsed Wiktionary.
*/
public class QuoteTableAll {
/** Debug switch: when true, countQuotes() reads only a limited number of rows
 * and prints progress messages. */
private static final boolean DEBUG = false;

/** Number of quotes per language. */
private static final Map<LanguageType, Integer> m_lang_n = new HashMap<LanguageType, Integer>();

/** Number of quotes for each source: (source name -> example words and counter). */
private static final Map<String, ObjectWithWords> m_source_n = new HashMap<String, ObjectWithWords>();

/** Number of quotes for each author: (author name -> example words and counter). */
private static final Map<String, ObjectWithWords> m_author_n = new HashMap<String, ObjectWithWords>();

// Disabled: the linear list of authors (the same as m_author_n.keys()),
// it was required to store the order of adding of authors (for sorting and clustering).
//private static List<String> l_author_n = new ArrayList<String>();

/** Clusters of similar author names: (author name -> name of the cluster it belongs to).
 * Only names which were put into some cluster are stored here. */
private static final Map<String, String> author_to_cluster = new HashMap<String, String>();

/** Maximum number of example words (page titles) collected for an already known object. */
private static final int MAX_EXAMPLE_WORDS = 3;

/** String metric used to compare names; larger values are treated as "nearer". */
private static final AbstractStringMetric metric = new JaroWinkler();

/** Two names are put into one cluster if their similarity is strictly greater than this value. */
private static final float CLUSTER_THRESHOLD = 0.87F;
/** Result of a nearest-neighbour search among strings: the best matching
 * string together with its score relative to the word which was searched for.
 *
 * Despite the field name 'dist', the code of this file stores a similarity
 * score here (the maximum value returned by the string metric), i.e. larger
 * means nearer.
 */
private static class NearestWord {

    /** Score of 'nearest_name' relative to the searched word. */
    float dist;

    /** String which is nearest to the searched word. */
    String nearest_name;

    /** General purpose marker for users of this object; false after construction. */
    boolean checked;

    NearestWord(float score, String name) {
        this.dist = score;
        this.nearest_name = name;
        this.checked = false;
    }
}
/** Inner class which contains an object with a (small, example) list of words using this object.
 * An object is a source, or an author, or...
 */
private static class ObjectWithWords {

    ObjectWithWords(String _object_name, String _object_wikilink) {
        object_name = _object_name;
        object_wikilink = _object_wikilink;
        example_words = new ArrayList<String>();
        counter = 0;
        nearest_word = null;
    }

    /** Object's name, e.g. source of the quote, or author name, etc. */
    public String object_name;

    /** Object's second name, e.g. quot_ref.title_wikilink, or quot_author.wikilink */
    public String object_wikilink;

    /** Example of several entries (page titles) which refer to this object. */
    public List<String> example_words;

    /** Counter of using this Source, or Author, or Title in Wiktionary entries. */
    public int counter;

    /** The most similar other object name found when this object was added;
     * null if the object was added without clustering (see add()). */
    NearestWord nearest_word;

    /** Finds the word from the set 'words' which is nearest to 'source_word'.
     *
     * Remark: the metric returns a similarity, so the nearest word is the one
     * with the maximum value; this maximum is stored in NearestWord.dist.
     *
     * @param source_word word to compare with
     * @param words candidate words
     * @return the nearest word and its similarity; (0, "") with a warning
     *         if source_word or words are null or empty
     */
    private static NearestWord calcDistance(String source_word, Set<String> words)
    {
        float max_dist = 0;
        String nearest_word = "";

        if(null == source_word || 0 == source_word.length() || null == words || words.isEmpty()) {
            System.out.println("Warning (QuoteTableAll.ObjectWithWords.calcDistance()): source_word or words are empty!");
            return new NearestWord (max_dist, nearest_word);
        }

        boolean b_first = true;
        for(String w : words) {
            float dist = metric.getSimilarity(source_word, w);

            if( b_first ) {
                // the first candidate is always accepted, even with zero similarity
                b_first = false;
                max_dist = dist;
                nearest_word = w;
            } else if(dist > max_dist) {
                max_dist = dist;
                nearest_word = w;
                if(DEBUG && max_dist > CLUSTER_THRESHOLD)
                    System.out.println("(QuoteTableAll.ObjectWithWords.calcDistance()): max dist("+source_word+
                            ", "+ w +")= "+max_dist);
            }
        }
        return new NearestWord (max_dist, nearest_word);
    }

    /** Registers one more usage of the quote object (source, or author...) in the map m.
     *
     * A new object is stored with counter 1 and the page title as its first example.
     * For a known object the counter is incremented and the page title is added
     * to the examples while there is space (< MAX_EXAMPLE_WORDS).
     *
     * If word_to_cluster is not null, then a new object is compared with all
     * objects already stored in m; if the similarity to the nearest one exceeds
     * CLUSTER_THRESHOLD, both names are assigned to the same cluster.
     *
     * @param page_title title of the entry which contains the quote
     * @param _object_name name of the object (key of the map m); null or empty names are skipped with a warning
     * @param _object_wikilink second name of the object, e.g. wikilink
     * @param m map from object name to collected statistics (updated)
     * @param word_to_cluster map from object name to cluster name (updated), or null to skip clustering
     */
    private static void add(String page_title,
                            String _object_name, String _object_wikilink,
                            Map<String, ObjectWithWords> m,
                            Map<String, String> word_to_cluster)
    {
        if(null == _object_name || 0 == _object_name.length()) {
            System.out.println("Warning (QuoteTableAll.ObjectWithWords.add()): page=" +page_title+ " with empty _object_name!");
            return;
        }

        ObjectWithWords s_w = m.get(_object_name);
        if(null != s_w) {
            // known object: count it and remember one more example, if there is space
            s_w.counter += 1;
            if(s_w.example_words.size() < MAX_EXAMPLE_WORDS
                    && !s_w.example_words.contains(page_title))
                s_w.example_words.add(page_title);
            return;
        }

        // new object; the constructor has already created the empty list of examples
        s_w = new ObjectWithWords(_object_name, _object_wikilink);
        s_w.counter = 1;
        s_w.example_words.add(page_title);

        if(null != word_to_cluster) {
            // The very first object has nothing to be compared with: this is
            // the normal case, not a reason for a warning from calcDistance().
            NearestWord nw = m.isEmpty()
                    ? new NearestWord (0f, "")
                    : calcDistance( _object_name, m.keySet());
            s_w.nearest_word = nw;

            if(nw.dist > CLUSTER_THRESHOLD) {
                String a = _object_name;
                String b = nw.nearest_name;

                // join an existing cluster of 'a' or 'b', else 'a' starts a new cluster
                String cluster_name;
                if(word_to_cluster.containsKey(a)) {
                    cluster_name = word_to_cluster.get(a);
                } else if(word_to_cluster.containsKey(b)) {
                    cluster_name = word_to_cluster.get(b);
                } else {
                    cluster_name = a;
                }
                word_to_cluster.put(a, cluster_name);
                word_to_cluster.put(b, cluster_name);
            }
        }
        m.put(_object_name, s_w);
    }
} // eo class ObjectWithWords
/** Inverts the map (word -> cluster) into the map (cluster -> list of its words).
 *
 * @param word_to_cluster map from a word to the name of its cluster
 * @return map from a cluster name to all words assigned to this cluster
 */
private static Map<String, List<String>> collectWordsToCluster(Map<String, String> word_to_cluster)
{
    Map<String, List<String>> result = new HashMap<String, List<String>>();

    for(Map.Entry<String, String> entry : word_to_cluster.entrySet()) {
        String cluster_name = entry.getValue();

        List<String> members = result.get(cluster_name);
        if(null == members) {
            // first word of this cluster
            members = new ArrayList<String>();
            result.put(cluster_name, members);
        }
        members.add(entry.getKey());
    }
    return result;
}
/** Counts number of quotes, authors, sources,...
 * by selecting all records from the table 'quote' from the database of the parsed Wiktionary.<br><br>
 * SELECT id, meaning_id, lang_id, ref_id FROM quote;
 * <p>
 * Side effects: fills the static maps m_lang_n, m_source_n, m_author_n and
 * author_to_cluster, and prints the summary to the standard output.
 * An SQLException is reported to the standard output and terminates the
 * reading; the data collected so far are returned.
 *
 * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
 * @return map from the language to the number of quotes in this language
 */
public static Map<LanguageType, Integer> countQuotes (Connect wikt_parsed_conn) {

    Statement s = null;
    ResultSet rs= null;

    int n_unknown_lang_pos = 0; // quotes of meanings without (language, POS) entry
    int n_total_with_authors = 0;
    int n_total_with_sources = 0;

    int n_total = Statistics.Count(wikt_parsed_conn, "quote");
    long t_start = System.currentTimeMillis();

    try {
        s = wikt_parsed_conn.conn.createStatement ();

        String str_sql = "SELECT id, meaning_id, lang_id, ref_id FROM quote";
        if(DEBUG)
            str_sql += " LIMIT 7000"; // only a part of the table in the debug mode

        rs = s.executeQuery (str_sql);
        int n_cur = 0;
        while (rs.next ())
        {
            n_cur ++;
            int id = rs.getInt("id");
            TMeaning m = TMeaning.getByID(wikt_parsed_conn, rs.getInt("meaning_id"));
            int lang_id = rs.getInt("lang_id");
            TLang tlang = TLang.getTLangFast( lang_id );
            int ref_id = rs.getInt("ref_id");
            TQuotRef quot_ref = (0 == ref_id) ? null : TQuotRef.getByID(wikt_parsed_conn, ref_id);

            // NOTE(review): presumably getTLangFast() returns null for an id
            // missing in the fast maps - confirm; skip such a quote instead of
            // failing with NullPointerException.
            if(null == tlang) {
                System.out.println("Warning (QuoteTableAll.countQuotes()): there is quote with id=" +id+ " with unknown lang_id=" +lang_id+ "!");
                continue;
            }

            // the quote is counted for its language even if its meaning is unknown
            LanguageType lang = tlang.getLanguage();
            Integer n = m_lang_n.get(lang);
            m_lang_n.put(lang, (null == n) ? 1 : n + 1);

            if(null == m) {
                System.out.println("Warning (QuoteTableAll.countQuotes()): there is quote with id=" +id+ " with NULL meaning_id!");
                continue;
            }

            TLangPOS lang_pos = m.getLangPOS(wikt_parsed_conn);
            if(null == lang_pos) {
                n_unknown_lang_pos ++;
                continue;
            }

            TPage tpage = lang_pos.getPage();
            if(null == tpage) { // the page title is required below
                System.out.println("Warning (QuoteTableAll.countQuotes()): there is quote with id=" +id+ " with NULL page!");
                continue;
            }
            String page_title = tpage.getPageTitle();

            if(null != quot_ref) {
                TQuotSource tquot_source = quot_ref.getSource();
                if(null != tquot_source) {
                    n_total_with_sources ++;
                    // sources are only counted, without clustering
                    ObjectWithWords.add(page_title, tquot_source.getText(), "", m_source_n, null);
                }

                TQuotAuthor tquot_author = quot_ref.getAuthor();
                if(null != tquot_author) {
                    n_total_with_authors ++;
                    // authors are counted and similar names are clustered
                    ObjectWithWords.add(page_title, tquot_author.getName(), tquot_author.getWikilink(),
                                        m_author_n, author_to_cluster);
                }
            }

            if(DEBUG && 0 == n_cur % 1000) {
                long t_cur, t_remain;

                t_cur = System.currentTimeMillis() - t_start;
                // time for 1 quote = t_cur / n_cur; in min, since /(60*1000)
                t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur)));
                t_cur = (long)(t_cur/(60f*1000f));

                System.out.println(n_cur + ": " + page_title +
                    ", duration: "  + t_cur +
                    " min, remain: " + t_remain +
                    " min");
            }
        }
    } catch(SQLException ex) {
        System.out.println("SQLException (QuoteTableAll.countQuotes()): " + ex.getMessage());
    } finally {
        // best-effort cleanup: a failure of close() is deliberately ignored
        if (rs != null) { try { rs.close(); } catch (SQLException ignored) { } }
        if (s != null)  { try { s.close();  } catch (SQLException ignored) { } }
    }

    System.out.println(
        "\nTotal quotes: " + n_total +
        "\n\nTotal quotes with sources: " + n_total_with_sources +
        "\n\nThere are "+ m_source_n.size() +" unique sources " +
        "\n\nTotal quotes with authors: " + n_total_with_authors +
        "\n\nThere are "+ m_author_n.size() +" unique author names " +
        "\n\nThere are quotes in " + m_lang_n.size() + " languages." +
        "\n\nUnknown<ref>'''Unknown''' - words which have quotes but have unknown language code and POS</ref>: "
        + n_unknown_lang_pos);

    return m_lang_n;
}
/** Prints statistics about quote sources in Wiktionary as a sortable wiki table:
 * one row per source with the number of quotes and the example entries.
 *
 * The examples are formatted by joinWikiWords(), i.e. in the same way as in
 * the table of authors (comma separated, without a trailing separator).
 *
 * @param m_source_n number of quotes for each source: (source name -> example words and counter)
 */
private static void printQuoteSource (
                    Map<String, ObjectWithWords> m_source_n)
{
    // print header line
    System.out.println("\n=== Quote sources ===");
    System.out.println("{| class=\"sortable prettytable\" style=\"text-align: center;\"");
    System.out.print(" ! Source name || Number of quotes || Examples ");

    // print values
    for(Map.Entry<String, ObjectWithWords> entry : m_source_n.entrySet()) {
        ObjectWithWords s_w = entry.getValue();

        System.out.print("\n|-\n| " + entry.getKey() +
                " || " + s_w.counter + " || " );
        System.out.print(joinWikiWords( s_w.example_words ));
    }
    System.out.println("\n|}");
}
/** Formats the words as wiki links joined by ", " into one string:
 * "[[word 1]], [[word 2]], ... [[word N]]"; an empty list gives an empty string.
 *
 * @param words words to be joined
 * @return wiki links separated by comma and space
 */
private static String joinWikiWords (List<String> words)
{
    StringBuilder joined = new StringBuilder();
    String separator = ""; // nothing before the first link

    for(String word : words) {
        joined.append(separator).append("[[").append(word).append("]]");
        separator = ", ";
    }
    return joined.toString();
}
/** Prints statistics about quote authors in Wiktionary as a sortable wiki table.
 * The names are split into clusters: at first the clusters of similar names
 * are printed, then the remaining names which are not in any cluster.
 *
 * The method does not modify its arguments, so it can be called repeatedly
 * with the same result.
 *
 * @param m_author_n number of quotes for each author: (author name -> wikilink, example words and counter)
 * @param cluster_to_words map from a cluster name to the author names of this cluster
 */
private static void printQuoteAuthor (
                    Map<String, ObjectWithWords> m_author_n,
                    Map<String, List<String>> cluster_to_words)
{
    System.out.println("\n=== Quote authors ===");
    System.out.println("{| class=\"sortable prettytable\" style=\"text-align: center;\"");
    System.out.print(" ! Author name || Author wikilink || Number of quotes || Examples ");

    // names which are already printed as members of some cluster
    Set<String> printed = new java.util.HashSet<String>();

    // 1) print clusters of words
    int cluster_counter = 1;
    for(List<String> names : cluster_to_words.values()) {
        System.out.print("\n|-\n| colspan=\"4\" style=\"text-align: center;\" | Cluster " + (cluster_counter ++));

        for(String _name : names) {
            printed.add(_name);
            printAuthorRow(_name, m_author_n.get(_name));
        }
    }

    // 2) print remaining words (not in any cluster)
    System.out.print("\n|-\n| colspan=\"4\" style=\"text-align: center;\" | Not in clusters ");
    for(Map.Entry<String, ObjectWithWords> entry : m_author_n.entrySet()) {
        if(printed.add(entry.getKey())) // true only for a name which is not printed yet
            printAuthorRow(entry.getKey(), entry.getValue());
    }
    System.out.println("\n|}");
}

/** Prints one row of the table of authors: name, wikilink, number of quotes, examples.
 * Nothing is printed if there are no data for this name (s_w is null).
 */
private static void printAuthorRow (String _name, ObjectWithWords s_w)
{
    if(null == s_w)
        return;

    System.out.print("\n|-\n| " + _name +
            " || " + s_w.object_wikilink +
            " || " + s_w.counter + " || " );
    System.out.print(joinWikiWords( s_w.example_words ));
}
/** Connects to the database of the parsed Wiktionary, collects the statistics
 * of quotes and prints it (as wiki text) to the standard output.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {

    // Connect to wikt_parsed database
    Connect wikt_parsed_conn = new Connect();

    // Russian
    LanguageType native_lang = LanguageType.ru;
    wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru);

    // English
    //LanguageType native_lang = LanguageType.en;
    //wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, LanguageType.en);

    Map<LanguageType, Integer> m;
    try {
        TLang.createFastMaps(wikt_parsed_conn);
        TPOS.createFastMaps(wikt_parsed_conn);

        String db_name = wikt_parsed_conn.getDBName();
        System.out.println("\n== Statistics of quotes in the Wiktionary parsed database ==");
        System.out.println("\n''Last updated: summer 2014.''");
        CommonPrinter.printHeader (db_name);

        m = QuoteTableAll.countQuotes(wikt_parsed_conn);
    } finally {
        // the connection is closed even if the collecting of statistics fails
        wikt_parsed_conn.Close();
    }

    // the database is not needed any more, the rest works with the collected maps
    Map<String, List<String>> cluster_to_authors = collectWordsToCluster(author_to_cluster);

    System.out.println();
    CommonPrinter.printSomethingPerLanguage(native_lang, m);

    QuoteTableAll.printQuoteSource(m_source_n);
    QuoteTableAll.printQuoteAuthor(m_author_n, cluster_to_authors);

    CommonPrinter.printFooter();
}
}