/* DefQuoteSynExporter.java - exports definition, quotations and synonyms
* from the database of the parsed Wiktionary in YARN format.
*
* Copyright (c) 2013-2014 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wiktparsed.yarn;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import net.htmlparser.jericho.Source;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.sql.Connect;
import wikokit.base.wikipedia.sql.PageTableBase;
import wikokit.base.wikipedia.sql.Statistics;
import wikokit.base.wikt.api.WTMeaning;
import wikokit.base.wikt.constant.Label;
import wikokit.base.wikt.constant.LabelCategoryLocal;
import wikokit.base.wikt.constant.POS;
import wikokit.base.wikt.constant.Relation;
import wikokit.base.wikt.multi.en.name.LabelEn;
import wikokit.base.wikt.multi.ru.name.LabelCategoryRu;
import wikokit.base.wikt.multi.ru.name.LabelRu;
import wikokit.base.wikt.sql.TLang;
import wikokit.base.wikt.sql.TLangPOS;
import wikokit.base.wikt.sql.TMeaning;
import wikokit.base.wikt.sql.TPOS;
import wikokit.base.wikt.sql.TPage;
import wikokit.base.wikt.sql.TRelation;
import wikokit.base.wikt.sql.TRelationType;
import wikokit.base.wikt.sql.label.TLabel;
import wikokit.base.wikt.sql.label.TLabelCategory;
import wikokit.base.wikt.sql.label.TLabelMeaning;
import wikokit.base.wikt.sql.quote.TQuotRef;
import wikokit.base.wikt.sql.quote.TQuote;
import wikt.stat.printer.CommonPrinter;
/** Exporter of Wiktionary definitions, quotations and synonyms to the YARN XML format.
 *
 * @see <a href="https://github.com/xoposhiy/yarn/commit/65411750ee8f867c79cdd77bcbaf8024df2c9d63">YARN format</a>
 */
public class DefQuoteSynExporter {
private static final boolean DEBUG = false;
//private static final FileWriter file;
/** map for the first part of YARN file: lexicon. Map from word to "nID" */
private static final Map<String, Integer> m_noun_word_to_id = new HashMap<String, Integer>();
/** Looks up the ID under which a word has already been written to the exported lexicon.
 *
 * Only nouns are tracked: for any other part of speech the map is not consulted.
 *
 * @param pos part of speech of the word
 * @param word word to look up
 * @param _m_noun_word_to_id map from an exported noun to its numeric ID
 * @return the ID stored for the noun, or -1 if the word is not a noun
 *         or is absent from the map, i.e. it was not exported yet
 */
public static int getWordEntryID (POS pos, String word, Map<String, Integer> _m_noun_word_to_id) {
    final boolean known_noun = POS.noun == pos && _m_noun_word_to_id.containsKey(word);
    if(!known_noun)
        return -1;
    return _m_noun_word_to_id.get(word);
}
/** Maps a part of speech to the one-letter prefix used in YARN identifiers.
 *
 * @param pos part of speech
 * @return "n" for a noun, "v" for a verb, "a" for an adjective,
 *         and an empty string for any other part of speech
 */
public static String getPOSOneLetterPrefix(POS pos) {
    if(POS.noun == pos)
        return "n";
    if(POS.verb == pos)
        return "v";
    if(POS.adjective == pos)
        return "a";
    return "";
}
/** Gets the XML chunk (wordEntry element) describing one word of the lexicon.
 * Returns an empty string if this word was added to the lexicon already.
 *
 * A noun that is not in the map yet is registered in it with the given ID;
 * other parts of speech are not tracked by the map.
 *
 * @param wikt_parsed_conn connection to the database of the parsed Wiktionary,
 *                         used to check whether the word has its own entry
 * @param pos part of speech of the word, it defines the ID prefix (n, v or a)
 * @param word_id numeric part of the ID of the new word entry
 * @param word the word itself
 * @param source_url_word Wiktionary entry which is the information source;
 *                        it is replaced by the word when the word has its own entry
 * @param native_lang main language of Wiktionary, its code is used in the author and url
 * @param _m_noun_word_to_id map from an exported noun to its numeric ID, updated by this call
 * @return XML text of the word entry, or an empty string for a duplicate
 */
public static String getWordEntryXMLWithoutDuplicates (Connect wikt_parsed_conn,
                        POS pos, int word_id, String word, String source_url_word,
                        LanguageType native_lang, Map<String, Integer> _m_noun_word_to_id)
{
    // getWordEntryID() reports an absent word by -1, so every other value means "already exported"
    if(-1 != getWordEntryID (pos, word, _m_noun_word_to_id))
        return ""; // this word was added already

    String pos_prefix = getPOSOneLetterPrefix(pos);
    if(POS.noun == pos)
        _m_noun_word_to_id.put(word, word_id);

    // if there is an entry for the word (e.g. a synonym), then the url points to this entry:
    // <url>http://ru.wiktionary.org/wiki/word</url>
    if(null != TPage.get (wikt_parsed_conn, word))
        source_url_word = word;

    String code = native_lang.getCode();

    StringBuilder sb = new StringBuilder();
    sb.append(" <wordEntry id=\"").append(pos_prefix).append(word_id).append("\""); // id="n123"
    sb.append(" author=\"").append(code).append(".wiktionary\">\n"); // author="ru.wiktionary" >
    // page titles may contain markup characters (e.g. '&'), so they are escaped to keep the XML well-formed
    sb.append(" <word>").append(HTMLEscape(word)).append("</word>\n");
    sb.append(" <url>http://").append(code).append(".wiktionary.org/wiki/").append(HTMLEscape(source_url_word)).append("</url>\n");
    sb.append(" </wordEntry>\n");
    return sb.toString();
}
/** Strips HTML markup from the text with the help of the Jericho TextExtractor.
 *
 * @param text HTML text
 * @return textual content produced by the extractor
 * @see http://jericho.htmlparser.net/docs/javadoc/net/htmlparser/jericho/TextExtractor.html
 */
public static String HTMLToText (String text)
{
    return new Source(text).getTextExtractor().toString();
}
public static String HTMLEscape (String text)
{
return text.replace("<", "<").replace(">", ">").
replace("&", "&").replace("\"", """);
}
/** Gets bibliographic information about quote sentence in the form:
 * Author, 'Title' // Publisher, Years, Source
 *
 * Example: "В. В. Крестовский, 'Петербургские трущобы', 1867 г., НКРЯ"
 *
 * Every part is escaped by HTMLEscape(), since the result is written into an XML attribute.
 * Empty parts are omitted together with their separators.
 *
 * @param _quote quotation which reference should be described
 * @return null if the quotation has no reference object; otherwise the reference text,
 *         which is an empty string when author, title, publisher, years and source are all empty
 **/
private static String getReference (TQuote _quote)
{
    TQuotRef quot_ref = _quote.getReference();
    if(null == quot_ref)
        return null;

    String author_name = HTMLEscape(quot_ref.getAuthorName());
    String title       = HTMLEscape(quot_ref.getTitle());
    String publisher   = HTMLEscape(quot_ref.getPublisherName());
    String years_range = HTMLEscape(quot_ref.getYearsRange());
    String source      = HTMLEscape(quot_ref.getSourceName());

    boolean has_author    = author_name.length() > 0;
    boolean has_title     = title.length() > 0;
    boolean has_publisher = publisher.length() > 0;
    boolean has_years     = years_range.length() > 0;
    boolean has_source    = source.length() > 0;

    // Author, 'Title' // Publisher, Years, Source
    StringBuilder sb = new StringBuilder();

    // 1. author, followed by a comma if anything at all comes after it
    //    (the publisher counts too, otherwise the author and publisher would be glued together)
    if(has_author) {
        sb.append(author_name);
        if(has_title || has_publisher || has_years || has_source)
            sb.append(", ");
    }

    // 2. 'title' // publisher
    if(has_title) {
        sb.append("'").append(title).append("'");
        if(has_publisher)
            sb.append(" // ");
    }
    if(has_publisher)
        sb.append(publisher);

    // 3. (title or publisher), (years or source)
    if((has_title || has_publisher) && (has_years || has_source))
        sb.append(", ");

    // 4. years, source
    if(has_years) {
        sb.append(years_range);
        if(has_source)
            sb.append(", ");
    }
    if(has_source)
        sb.append(source);

    return sb.toString();
}
/** Gets the beginning of the XML synset entry: the opening synsetEntry tag and
 * the complete word element of the headword with its labels (mark, mark_desc)
 * and quotations (sample). The synsetEntry element is not closed here.
 *
 * Example of a quotation:
 * <sample source="В. В. Крестовский, 'Петербургские трущобы', 1867 г., НКРЯ">Мечут же карты, ...</sample>
 *
 * @param pos part of speech of the headword, it defines the prefix of the word reference
 * @param synset_id numeric part of the synset ID, e.g. 1 gives id="sn1"
 * @param word headword of the synset
 * @param _m_noun_word_to_id map from an exported noun to its numeric ID
 * @param labels labels of the meaning, each one yields a mark and a mark_desc element
 * @param quotes quotations of the meaning, each one yields a sample element
 * @return XML text of the beginning of the synset entry
 */
public static String getSynsetEntryBegin (POS pos, int synset_id, String word,
                        Map<String, Integer> _m_noun_word_to_id,
                        Label[] labels, TQuote[] quotes)
{
    String pos_prefix = getPOSOneLetterPrefix(pos);
    int word_id = getWordEntryID (pos, word, _m_noun_word_to_id);

    StringBuilder sb = new StringBuilder();
    sb.append(" <synsetEntry id=\"sn").append(synset_id).append("\">\n"); // id="sn1"

    if(DEBUG) // comment: <!-- word -->
        sb.append(" <word ref=\"").append(pos_prefix).append(word_id).append("\"> <!-- " + word + " -->\n");
    else
        sb.append(" <word ref=\"").append(pos_prefix).append(word_id).append("\">\n");

    for(Label la : labels ) {
        sb.append(" <mark>").append(la.getShortName()).append("</mark>\n");
        sb.append(" <mark_desc>").append(la.getName()).append("</mark_desc>\n");
    }

    for(TQuote q : quotes ) {
        // the extracted plain text is escaped before it becomes XML element content
        String text = HTMLEscape( HTMLToText (q.getTextWithoutWikification()) );

        // a quotation without reference gives null, which would be appended as the text "null"
        String ref = getReference (q);
        if(null == ref)
            ref = "";

        sb.append(" <sample source=\"").append(ref).append("\">").append(text).append("</sample>\n");
    }
    sb.append(" </word>\n");
    return sb.toString();
}
/** Gets the XML reference (empty word element) to a synonym which is already in the lexicon.
 * In DEBUG mode the synonym itself is added as an XML comment.
 *
 * @param pos part of speech of the synonym, it defines the prefix of the reference
 * @param word synonym
 * @param _m_noun_word_to_id map from an exported noun to its numeric ID
 * @return XML text of the word reference
 */
public static String getSynonymWordRef (POS pos, String word, Map<String, Integer> _m_noun_word_to_id)
{
    int id = getWordEntryID (pos, word, _m_noun_word_to_id);
    assert( id > 0 ); // the word should have been added to the lexicon at the previous step

    StringBuilder xml = new StringBuilder(" <word ref=\"");
    xml.append(getPOSOneLetterPrefix(pos)).append(id);

    if(DEBUG)
        xml.append("\"/> <!-- ").append(word).append(" -->\n");
    else
        xml.append("\"/>\n");

    return xml.toString();
}
/** Gets word (synset) definition in XML format.
 * Example:
 * <definition url="http://ru.wiktionary.org/wiki/престидижитатор" source="ru.wiktionary">фокусник, отличающийся ловкостью рук; манипулятор</definition>
 *
 * The template {{-}} in the definition is replaced by " - ", then the definition
 * and the entry title are escaped by HTMLEscape().
 *
 * @param source_url_word title of the Wiktionary entry which is the source of the definition
 * @param definition text of the definition
 * @param native_lang main language of Wiktionary, its code is used in the url and source
 * @return XML text of the definition element
 */
public static String getDefinition (String source_url_word, String definition, LanguageType native_lang)
{
    String code = native_lang.getCode();

    StringBuilder sb = new StringBuilder();
    // the entry title goes into an attribute, so it has to be escaped like the definition text
    sb.append(" <definition url=\"http://").append(code).append(".wiktionary.org/wiki/").
       append( HTMLEscape(source_url_word) ).
       append("\" source=\"").append(code).append(".wiktionary\">");
    sb.append( HTMLEscape(definition.replace("{{-}}", " - ")) );
    sb.append("</definition>\n");
    return sb.toString();
}
/** Prints words, definitions, quotations and synonyms for the selected parts of speech
 * in Wiktionary: the words section (lexicon) and the synsets section of the YARN file
 * are collected in memory and written to the standard output at the end.
 * Progress and debug messages are written to the standard error stream,
 * so that they do not get into the exported XML.<br><br>
 *
 * Every record of the table lang_pos is visited (SELECT id FROM lang_pos). A record is
 * exported only if its language is Russian, it has meanings and its part of speech
 * belongs to exported_pos. A meaning yields a synset only if its definition is
 * not empty and it has at least one semantic relation; only synonyms are listed in the synset.
 *
 * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
 * @param native_lang main language of Wiktionary, its code is used in the generated URLs
 * @param exported_pos parts of speech to be exported
 */
public static void printYARN (Connect wikt_parsed_conn,
                              LanguageType native_lang, Set<POS> exported_pos) {
    // lang_pos -> meaning (definition)
    // meaning -> relation (synonym)
    // meaning -> quote
    Statement s = null;
    ResultSet rs= null;
    long t_start;

    StringBuilder sb_words = new StringBuilder();
    StringBuilder sb_synsets = new StringBuilder();
    sb_words.append (" <words>\n");
    sb_synsets.append(" <synsets>\n");

    /** Current incremental ID of word entry (YARN file first part - lexicon) */
    int current_word_id = 0;
    int current_synset_id = 0;

    int n_total = Statistics.Count(wikt_parsed_conn, "lang_pos");
    t_start = System.currentTimeMillis();
    try {
        s = wikt_parsed_conn.conn.createStatement ();
        s.executeQuery ("SELECT id FROM lang_pos");
        rs = s.getResultSet ();
        int n_cur = 0;
        while (rs.next ())
        {
            n_cur ++;
            int id = rs.getInt("id");
            TLangPOS lang_pos_not_recursive = TLangPOS.getByID (wikt_parsed_conn, id);// fields are not filled recursively
            if(null == lang_pos_not_recursive)
                continue;

            LanguageType lang = lang_pos_not_recursive.getLang().getLanguage();
            if(lang != LanguageType.ru) // this is not our language :)
                continue;

            TPage tpage = lang_pos_not_recursive.getPage();
            String page_title = tpage.getPageTitle();

            int n_meaning = WTMeaning.countMeanings(wikt_parsed_conn, lang_pos_not_recursive);
            if(0 == n_meaning)
                continue; // only meanings with nonempty definitions

            POS p = lang_pos_not_recursive.getPOS().getPOS();
            if(!exported_pos.contains(p)) // only our POS should be exported :)
                continue;

            // the ID is consumed even if the word turns out to be a duplicate (empty XML chunk)
            current_word_id ++;
            String xml_word = getWordEntryXMLWithoutDuplicates (wikt_parsed_conn,
                    p, current_word_id, page_title, page_title, native_lang, m_noun_word_to_id);
            sb_words.append( xml_word );

            if(DEBUG)
                System.err.print("\n" + page_title + ", meanings:" + n_meaning);

            TMeaning[] mm = TMeaning.get(wikt_parsed_conn, lang_pos_not_recursive);
            for(TMeaning m : mm) {
                String meaning_text = m.getWikiTextString();
                if(0 == meaning_text.length())
                    continue;

                if(DEBUG)
                    System.err.print("\n def: " + meaning_text);

                TQuote[] quotes = TQuote.get (wikt_parsed_conn, m);
                Label[] labels = TLabelMeaning.get(wikt_parsed_conn, m);

                current_synset_id ++;
                StringBuilder xml_synset = new StringBuilder( DefQuoteSynExporter.
                        getSynsetEntryBegin (p, current_synset_id, page_title, m_noun_word_to_id, labels, quotes));

                // a meaning without relations is not exported; its synset ID has been
                // consumed already, so the IDs of the exported synsets may have gaps
                TRelation[] rels = TRelation.get(wikt_parsed_conn, m);
                if(0 == rels.length)
                    continue;

                for(TRelation tr : rels)
                {
                    Relation r = tr.getRelationType();
                    if(Relation.synonymy != r)
                        continue;

                    String word = tr.getWikiText().getText(); // synonym
                    if(0 == word.compareToIgnoreCase(" ")) // " " instead of synonym :(
                        continue;

                    // if this synonym is absent in the dictionary, it should be added
                    if(-1 == getWordEntryID (p, word, m_noun_word_to_id)) {
                        current_word_id ++;
                        xml_word = getWordEntryXMLWithoutDuplicates (wikt_parsed_conn, p, current_word_id, word, page_title, native_lang, m_noun_word_to_id);
                        sb_words.append( xml_word );
                    }
                    xml_synset.append( getSynonymWordRef (p, word, m_noun_word_to_id) );

                    if(DEBUG)
                        System.err.print("\n syn: " + word);
                }
                sb_synsets.append( xml_synset );

                String def = getDefinition (page_title, meaning_text, native_lang);
                sb_synsets.append( def );
                sb_synsets.append(" </synsetEntry>\n");
            }

            if(0 == n_cur % 1000) { // progress report after every 1000 records
                if(DEBUG && n_cur > 1999)
                    break;

                long t_cur, t_remain;
                t_cur = System.currentTimeMillis() - t_start;
                // estimated remaining time in minutes, based on the average time per record
                t_remain = (long)((n_total - n_cur) * t_cur/(60f*1000f*(float)(n_cur)));
                t_cur = (long)(t_cur/(60f*1000f));

                // progress goes to stderr, stdout is reserved for the XML document
                System.err.println(n_cur + ": duration: " + t_cur +
                        " min, remain: " + t_remain +
                        " min");
            }
        } // eo while
    } catch(SQLException ex) {
        System.err.println("SQLException (DefQuoteSynExporter.printYARN()): " + ex.getMessage());
    } finally {
        // failures of close() are ignored: nothing else can be done with these resources here
        if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
        if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
    }

    System.out.println("\n");
    sb_words.append(" </words>\n");
    System.out.println(sb_words.toString());

    sb_synsets.append(" </synsets>\n");
    System.out.println(sb_synsets.toString());
}
/** Exports nouns of the Russian Wiktionary in YARN format to the standard output.
 *
 * Opens the connection to the parsed Russian Wiktionary database, fills the lookup maps
 * needed by the export, prints the XML header and the yarn element, closes the connection.
 *
 * @param args command line arguments, not used
 */
public static void main(String[] args) {
    // parts of speech which will be exported: nouns only
    Set<POS> pos_to_export = new HashSet<POS>();
    pos_to_export.add(POS.noun);

    // Russian Wiktionary is the source of data
    LanguageType native_lang = LanguageType.ru;

    // connection to the wikt_parsed database
    Connect conn = new Connect();
    conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru);

    // lookup maps; the order of these calls is kept as is
    TLang.createFastMaps(conn);
    TPOS.createFastMaps(conn);
    TRelationType.createFastMaps(conn);

    // the unused variables below only touch a static field of a class,
    // in order to initialize the maps of that class before createFastMaps() is called
    LabelCategoryLocal touch_label_category_ru = LabelCategoryRu.computing; // maps in LabelCategoryRu class
    TLabelCategory.createFastMaps(conn);

    Label touch_label_en = LabelEn.Acadia; // maps in LabelEn class
    Label touch_label_ru = LabelRu.Yoruba; // maps in LabelRu class
    TLabel.createFastMaps(conn, native_lang);

    // the XML document itself
    System.out.println("<?xml version=\"1.0\" encoding=\"utf-8\" ?>");
    CommonPrinter.printHeaderXML (conn.getDBName());
    System.out.println("<yarn>");
    DefQuoteSynExporter.printYARN (conn, native_lang, pos_to_export);
    System.out.println("</yarn>");

    conn.Close();
}
}