/* DefQuoteSynExporterWordlist.java - exports definition, quotations and synonyms
* from the database of the parsed Wiktionary in YARN format.
*
* Copyright (c) 2013 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wiktparsed.yarn;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.sql.Connect;
import wikokit.base.wikt.api.WTMeaning;
import wikokit.base.wikt.constant.Label;
import wikokit.base.wikt.constant.LabelCategoryLocal;
import wikokit.base.wikt.constant.POS;
import wikokit.base.wikt.constant.Relation;
import wikokit.base.wikt.multi.en.name.LabelEn;
import wikokit.base.wikt.multi.ru.name.LabelCategoryRu;
import wikokit.base.wikt.multi.ru.name.LabelRu;
import wikokit.base.wikt.sql.TLang;
import wikokit.base.wikt.sql.TLangPOS;
import wikokit.base.wikt.sql.TMeaning;
import wikokit.base.wikt.sql.TPOS;
import wikokit.base.wikt.sql.TPage;
import wikokit.base.wikt.sql.TRelation;
import wikokit.base.wikt.sql.TRelationType;
import wikokit.base.wikt.sql.label.TLabel;
import wikokit.base.wikt.sql.label.TLabelCategory;
import wikokit.base.wikt.sql.label.TLabelMeaning;
import wikokit.base.wikt.sql.quote.TQuote;
import wikt.stat.printer.CommonPrinter;
/** YARN format exporter driven by a list of words.
 *
 * For every word of the list the exporter looks up the page in the database
 * of the parsed Wiktionary and prints its definitions, quotations and synonyms.
 * The original author describes the word list as 22K of words from NKRYA.
 *
 * @see <a href="https://github.com/xoposhiy/yarn/commit/65411750ee8f867c79cdd77bcbaf8024df2c9d63">YARN format</a>
 */
public class DefQuoteSynExporterWordlist {

    private static final boolean DEBUG = false;

    /** Word list file used by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_WORDLIST_FILE = "c:/w/bin/yarn/s_list_utf8.txt";

    /** The progress line is printed once per this number of processed words. */
    private static final int PROGRESS_STEP = 1000;

    /** Prints words, definitions, quotations and synonyms for the given words
     * and parts of speech ("poses") in Wiktionary.
     *
     * Everything is written to the standard output:
     * <ul>
     * <li>a line "* [[title]]" for each word which has no page in the database;</li>
     * <li>a progress line after every {@value #PROGRESS_STEP} processed words;</li>
     * <li>at the end, the accumulated &lt;words&gt; section followed by the &lt;synsets&gt; section.</li>
     * </ul>
     *
     * Only entries whose language is {@code LanguageType.ru} are exported.
     * Meanings with an empty definition or without any semantic relation are skipped.
     *
     * @param wikt_parsed_conn connection to the database of the parsed Wiktionary
     * @param native_lang      language passed through to the XML helpers of DefQuoteSynExporter
     * @param exported_pos     parts of speech to be exported, entries with other POS are skipped
     * @param export_words     page titles to be exported; {@code null} is treated as an empty list
     */
    public static void printYARNwithWordlist (Connect wikt_parsed_conn,
                                              LanguageType native_lang,
                                              Set<POS> exported_pos, List<String> export_words) {
        // lang_pos -> meaning (definition)
        // meaning -> relation (synonym)
        // meaning -> quote

        List<String> words = (null == export_words) ? Collections.<String>emptyList() : export_words;

        // map for the first part of YARN file: lexicon. Map from word to "nID".
        // It is created per call, because the word IDs below restart on every call,
        // so a map kept from a previous call would refer to stale IDs.
        Map<String, Integer> word_to_id = new HashMap<String, Integer>();

        StringBuilder sb_words = new StringBuilder();
        StringBuilder sb_synsets = new StringBuilder();
        sb_words.append (" <words>\n");
        sb_synsets.append(" <synsets>\n");

        // Current incremental ID of word entry (YARN file first part - lexicon)
        int current_word_id = 0;
        int current_synset_id = 0;

        int n_total = words.size();
        long t_start = System.currentTimeMillis();
        int n_cur = 0;

        for(String page_title : words) {
            n_cur ++;

            TPage tpage = TPage.get(wikt_parsed_conn, page_title);
            if(null == tpage) {
                System.out.print("* [[" + page_title + "]]\n"); // see http://ru.wiktionary.org/w/index.php?title=Участник:AKA_MBG/Todo
            } else {
                for(TLangPOS lang_pos_not_recursive : TLangPOS.get (wikt_parsed_conn, tpage)) {
                    if(null == lang_pos_not_recursive)
                        continue;

                    LanguageType lang = lang_pos_not_recursive.getLang().getLanguage();
                    if(lang != LanguageType.ru) // this is our language :)
                        continue;

                    // the POS filter goes first, it does not need the database connection
                    POS p = lang_pos_not_recursive.getPOS().getPOS();
                    if(!exported_pos.contains(p)) // this is our POS :) it should be exported
                        continue;

                    int n_meaning = WTMeaning.countMeanings(wikt_parsed_conn, lang_pos_not_recursive);
                    if(0 == n_meaning)
                        continue; // only meanings with nonempty definitions

                    current_word_id ++;
                    String xml_word = DefQuoteSynExporter.getWordEntryXMLWithoutDuplicates (wikt_parsed_conn,
                            p, current_word_id, page_title, page_title, native_lang, word_to_id);
                    sb_words.append( xml_word );

                    if(DEBUG)
                        System.out.print("\n" + page_title + ", meanings:" + n_meaning);

                    for(TMeaning m : TMeaning.get(wikt_parsed_conn, lang_pos_not_recursive)) {
                        String meaning_text = m.getWikiTextString();
                        if(0 == meaning_text.length())
                            continue;

                        if(DEBUG)
                            System.out.print("\n def: " + meaning_text);

                        // Meanings without relations are not exported. The check is done
                        // before a synset ID is taken and before quotes and labels are
                        // requested, so a skipped meaning costs neither an ID nor queries.
                        TRelation[] rels = TRelation.get(wikt_parsed_conn, m);
                        if(0 == rels.length)
                            continue;

                        TQuote[] quotes = TQuote.get (wikt_parsed_conn, m);
                        Label[] labels = TLabelMeaning.get(wikt_parsed_conn, m);

                        current_synset_id ++;
                        StringBuilder xml_synset = new StringBuilder( DefQuoteSynExporter.
                                getSynsetEntryBegin (p, current_synset_id, page_title, word_to_id, labels, quotes));

                        for(TRelation tr : rels) {
                            if(Relation.synonymy != tr.getRelationType())
                                continue;

                            String word = tr.getWikiText().getText(); // synonym
                            if(word.trim().isEmpty()) // "" or " " instead of synonym :(
                                continue;

                            // if this synonym is absent in the dictionary, it should be added
                            if(-1 == DefQuoteSynExporter.getWordEntryID (p, word, word_to_id)) {
                                current_word_id ++;
                                xml_word = DefQuoteSynExporter.getWordEntryXMLWithoutDuplicates (wikt_parsed_conn,
                                        p, current_word_id, word, page_title, native_lang, word_to_id);
                                sb_words.append( xml_word );
                            }
                            xml_synset.append( DefQuoteSynExporter.getSynonymWordRef (p, word, word_to_id) );

                            if(DEBUG)
                                System.out.print("\n syn: " + word);
                        }

                        sb_synsets.append( xml_synset );
                        sb_synsets.append( DefQuoteSynExporter.getDefinition (page_title, meaning_text, native_lang) );
                        sb_synsets.append(" </synsetEntry>\n");
                    }
                } // eo for(TLangPOS
            }

            // The progress is reported per word (not per lang_pos entry), so it is printed
            // exactly once for every PROGRESS_STEP-th word, even if the word was skipped.
            if(0 == n_cur % PROGRESS_STEP) {
                if(DEBUG && n_cur > 333)
                    break;
                printProgress (n_cur, n_total, page_title, t_start);
            }
        } // eo for(export_words

        System.out.println("\n");
        sb_words.append(" </words>\n");
        System.out.println(sb_words.toString());

        sb_synsets.append(" </synsets>\n");
        System.out.println(sb_synsets.toString());
    }

    /** Prints one progress line: number of processed words, the current word,
     * elapsed time and the estimated remaining time (both in whole minutes).
     *
     * @param n_cur      number of words processed so far, should be greater than zero
     * @param n_total    total number of words to be processed
     * @param page_title the word which has just been processed
     * @param t_start    start time of the export, as returned by System.currentTimeMillis()
     */
    private static void printProgress (int n_cur, int n_total, String page_title, long t_start) {
        long elapsed_ms = System.currentTimeMillis() - t_start;

        // remaining time = (number of remaining words) * (average time per processed word)
        long t_remain = (long)((n_total - n_cur) * elapsed_ms/(60f*1000f*(float)(n_cur)));
        long t_cur = (long)(elapsed_ms/(60f*1000f));

        System.out.println(n_cur + ": " + page_title +
                ", duration: " + t_cur +
                " min, remain: " + t_remain +
                " min");
    }

    /** Reads all lines of the text file in UTF-8 encoding.
     *
     * An I/O failure is logged with the SEVERE level and is not propagated.
     *
     * @param file_name path to the file
     * @return lines of the file, or an empty list if the file cannot be read (never {@code null})
     */
    public static List<String> readLines (String file_name) {
        try {
            return Files.readAllLines(Paths.get( file_name ), StandardCharsets.UTF_8);
        } catch (IOException ex) {
            Logger.getLogger(DefQuoteSynExporterWordlist.class.getName()).log(Level.SEVERE,
                    "Cannot read the word list file: " + file_name, ex);
            return Collections.<String>emptyList();
        }
    }

    /** Exports Russian nouns of the word list from the Russian Wiktionary parsed database
     * and prints the YARN XML document to the standard output.
     *
     * @param args optional: args[0] is the path to the UTF-8 word list (one word per line);
     *             if it is absent, then the default path is used
     */
    public static void main(String[] args) {
        String wordlist_file = (null != args && args.length > 0) ? args[0] : DEFAULT_WORDLIST_FILE;

        // set of parts of speech to be exported
        Set<POS> exported_pos = new HashSet<POS>();
        exported_pos.add(POS.noun);

        // Connect to wikt_parsed database
        Connect wikt_parsed_conn = new Connect();

        // Russian
        LanguageType native_lang = LanguageType.ru;
        wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, LanguageType.ru);

        // the connection is closed even if the export fails with an exception
        try {
            TLang.createFastMaps(wikt_parsed_conn);
            TPOS.createFastMaps(wikt_parsed_conn);
            TRelationType.createFastMaps(wikt_parsed_conn);

            LabelCategoryLocal temp0 = LabelCategoryRu.computing; // let's initialize maps in LabelCategoryRu class
            TLabelCategory.createFastMaps(wikt_parsed_conn);

            Label temp1 = LabelEn.Acadia; // let's initialize maps in LabelEn class
            Label temp2 = LabelRu.Yoruba; // ... in LabelRu class
            TLabel.createFastMaps(wikt_parsed_conn, native_lang);

            System.out.println("<?xml version=\"1.0\" encoding=\"utf-8\" ?>");
            CommonPrinter.printHeaderXML (wikt_parsed_conn.getDBName());
            System.out.println("<yarn>");

            List<String> export_words = readLines (wordlist_file);
            printYARNwithWordlist (wikt_parsed_conn, native_lang, exported_pos, export_words);

            System.out.println("</yarn>");
        } finally {
            wikt_parsed_conn.Close();
        }
    }
}