/* Main.java - main file for Wiktionary (Meaning + Semantic relations) parsing. * * Copyright (c) 2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wiktparsed.mean_semrel.parser; //import wikt.parser.*; import wikokit.base.wikipedia.sql.Connect; import wikokit.base.wikipedia.language.LanguageType; /** Parser (second generation parser) * 1. takes Wiktionary parsed database (wikt_parsed) * 2. extracts meaning with related semantic relations * 3. saves to the wikt_mean_semrel database. * * The wikt_parsed database should be created in advance, * see http://code.google.com/p/wikokit/wiki/File_mean_semrel_empty_sql * (See comments in WiktParser.java) * * The Wiktionary parsed database (filled by data from the Wiktionary) should be created before. */ public class Main { /** * @param args the command line arguments todo * * Run from the command-line, with a list of arguments: * <P><B> * java -jar "./wikt_parser/dist/wikt_parser.jar" * </B><BR> */ public static void main(String[] args) { // Connect to wikt_parsed database Connect wikt_parsed_conn = new Connect(); // Connect to mean_semrel database Connect mean_semrel_conn = new Connect(); if(args.length != 4) { System.out.println("Wiktionary parser.\n" + "Usage:\n run_wikt_mean_semrel_parser.bat language_code n_start_from\n" + "Arguments:\n" + " language_code - language code of MySQL Wiktionary database to be parsed\n" + " n_start_from - number of records in database to start from\n" + " delimiter - symbol between words in the table fields \"synonyms\", \"antonyms\", etc.\n" + " min_meaning - threshold, i.e. minimum number of records in mean_semrel_XX,\n" + " the lesser tables (mean_semrel_XX) and records (lang.XX) will be deleted,\n" + "Examples: run_wikt_mean_semrel_parser.bat en 0 \"|\" 10\n" ); return; } String s = args[0]; if(!LanguageType.has(s)) { System.out.println("Error. Unknown language code '" + s + "'. Stop."); return; } System.out.println("The wikt_mean_semrel database will be created from wikt_parsed database.\n"); LanguageType wiki_lang = LanguageType.get(s); System.out.println("OK. language code is '" + s + "'"); int n_start_from = Integer.parseInt(args[1]); System.out.println("OK. n_start_from=" + n_start_from); String delimiter = args[2];// e.g. "|" - symbol between words in the table fields "synonyms", "antonyms", etc. System.out.println("OK. delimiter is '" + delimiter + "'"); // threshold, e.g. 10 - minimum number of records in mean_semrel_XX, // the lesser tables (mean_semrel_XX) and records (lang.XX) will be deleted int min_meaning = Integer.parseInt(args[3]); System.out.println("OK. min_meaning=" + min_meaning); // Russian if(LanguageType.ru == wiki_lang) { // wikt_conn.Open (Connect.RUWIKT_HOST, Connect.RUWIKT_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, wiki_lang); // wikt_parsed_conn.Open(Connect.RUWIKT_HOST, Connect.RUWIKT_PARSED_DB, Connect.RUWIKT_USER, Connect.RUWIKT_PASS, wiki_lang); } else { // English if(LanguageType.en == wiki_lang) { wikt_parsed_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_PARSED_DB, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, wiki_lang); mean_semrel_conn.Open(Connect.ENWIKT_HOST, Connect.ENWIKT_MEAN_SEMREL, Connect.ENWIKT_USER, Connect.ENWIKT_PASS, wiki_lang); } else { System.out.println("This language code ('" + s + "') is not supported yet. Stop."); return; } } //String category_name = "Викисловарь:Избранные статьи"; SemrelParser p = new SemrelParser(); // p.runSubCategories(wiki_lang, wikt_conn, wikt_parsed_conn, category_name); PageWithSemrel.parse(//wiki_lang, wikt_parsed_conn, mean_semrel_conn, n_start_from, delimiter, min_meaning); wikt_parsed_conn.Close(); mean_semrel_conn.Close(); } }