/* Main.java - main file for Wiktionary parsing.
*
* Copyright (c) 2008-2013 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikt.parser;
import wikokit.base.wikipedia.sql.Connect;
import wikokit.base.wikipedia.language.LanguageType;
/** Wiktionary parser: creates a MySQL database (similar to WordNet)
* from a Wiktionary MySQL dump file.
*
* The wikt_parsed database must be created in advance,
* see http://code.google.com/p/wikokit/wiki/File_wikt_parsed_empty_sql
*
* The source database, filled with data from the Wiktionary dump,
* must also exist before the parser is run.
*/
public class Main {

    /** Entry point: parses the pages of a Wiktionary database and stores
     * the result in the corresponding wikt_parsed database.
     *
     * Run from the command line with exactly two arguments:
     * <P><B>
     * java -jar "./wikt_parser/dist/wikt_parser.jar" ru 0
     * </B><BR>
     *
     * On any invalid input (wrong number of arguments, unknown or unsupported
     * language code, non-integer n_start_from) a message is printed and the
     * method returns without opening any database connection.
     *
     * @param args command-line arguments: args[0] is the language code of the
     *             Wiktionary database to be parsed (only "ru" and "en" have
     *             connection settings here), args[1] is n_start_from, the
     *             number of records in the database to start from
     */
    public static void main(String[] args) {

        // Connection to the source Wiktionary database
        Connect wikt_conn = new Connect();

        // Connection to the target wikt_parsed database
        Connect wikt_parsed_conn = new Connect();

        if (args.length != 2) {
            System.out.println("Wiktionary parser.\n" +
                "Usage:\n run_wikt_parser.bat language_code n_start_from\n" +
                "Arguments:\n" +
                " language_code - language code of MySQL Wiktionary database to be parsed\n" +
                " n_start_from - number of records in database to start from\n" +
                "Examples: run_wikt_parser.bat en 0\n"
            );
            return;
        }

        String s = args[0];
        if (!LanguageType.has(s)) {
            System.out.println("Error. Unknown language code '" + s + "'. Stop.");
            return;
        }

        System.out.println("Wiktionary parser will create wikt_parsed database.\n");
        LanguageType wikt_lang = LanguageType.get(s);
        System.out.println("OK. language code is '" + s + "'");

        // Report a malformed number the same way as the other input errors
        // instead of terminating with an uncaught NumberFormatException.
        int n_start_from;
        try {
            n_start_from = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.out.println("Error. n_start_from should be an integer, but it is '" + args[1] + "'. Stop.");
            return;
        }
        System.out.println("OK. n_start_from=" + n_start_from);

        if (LanguageType.ru == wikt_lang) {
            // Russian Wiktionary
            wikt_conn.Open       (Connect.RUWIKT_HOST, Connect.RUWIKT_DB,        Connect.RUWIKT_USER, Connect.RUWIKT_PASS, wikt_lang);
            wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, wikt_lang);
        } else if (LanguageType.en == wikt_lang) {
            // English Wiktionary
            wikt_conn.Open       (Connect.ENWIKT_HOST, Connect.ENWIKT_DB,        Connect.ENWIKT_USER, Connect.ENWIKT_PASS, wikt_lang);
            wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, wikt_lang);
        } else {
            System.out.println("This language code ('" + s + "') is not supported yet. Stop.");
            return;
        }

        // NOTE(review): this instance is not used by the code below; it is kept
        // only because the WiktParser constructor is not visible here and may
        // have side effects. Remove it once that is confirmed not to be the case.
        WiktParser w = new WiktParser();

        // Close both connections even if parsing fails, and close the second
        // one even if closing the first one fails.
        try {
            PageTableAll.parseAllPages(wikt_lang, wikt_conn, wikt_parsed_conn, n_start_from);
        } finally {
            try {
                wikt_conn.Close();
            } finally {
                wikt_parsed_conn.Close();
            }
        }
    }
}