/* WiktParser.java - second main file for Wiktionary parsing. * * Copyright (c) 2008-2013 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wikt.parser; import java.util.*; import wikokit.base.wikt.word.WordBase; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikipedia.category.CategoryHyponyms; import wikokit.base.wikt.mrd.Keeper; import wikokit.base.wikt.sql.index.IndexForeign; import wikokit.base.wikipedia.sql.PageTableBase; import wikokit.base.wikipedia.sql.Connect; import wikokit.base.wikipedia.sql.UtilSQL; import wikokit.base.wikt.constant.Label; import wikokit.base.wikt.constant.LabelCategory; import wikokit.base.wikt.multi.en.name.LabelEn; import wikokit.base.wikt.multi.ru.name.LabelRu; import wikokit.base.wikt.constant.LabelCategoryLocal; import wikokit.base.wikt.multi.ru.name.LabelCategoryRu; import wikokit.base.wikt.sql.TLang; import wikokit.base.wikt.sql.TPOS; import wikokit.base.wikt.sql.TRelationType; import wikokit.base.wikt.sql.label.TLabel; import wikokit.base.wikt.sql.label.TLabelCategory; /** Top level functions for Wiktionary parsing. */ public class WiktParser { private static final boolean DEBUG = false; /* Clear the export of MySQL Workbench Visual %s/`mydb`.//g del "CREATE SCHEMA..." add first two lines: SET @saved_cs_client = @@character_set_client; SET character_set_client = utf8; add last line: SET character_set_client = @saved_cs_client; mysql> use ruwikt20080620_parsed or use enwikt20080525_parsed source ./wikt_parser/doc/wikt_parsed_empty.sql * * ruwikt20080620_parsed */ public static void clearDatabase (Connect wikt_parsed_conn, LanguageType native_lang) { TLang.recreateTable(wikt_parsed_conn); TLang.createFastMaps(wikt_parsed_conn); TPOS.recreateTable(wikt_parsed_conn); TPOS.createFastMaps(wikt_parsed_conn); TRelationType.recreateTable(wikt_parsed_conn); TRelationType.createFastMaps(wikt_parsed_conn); TLabelCategory.recreateTable(wikt_parsed_conn); TLabelCategory.createFastMaps(wikt_parsed_conn); Label temp1 = LabelEn.Acadia; // let's initialize maps in LabelEn class Label temp2 = LabelRu.Yoruba; // ... in LabelRu class TLabel.recreateTable(wikt_parsed_conn, TLabelCategory.getMapCategory2ID(), native_lang); TLabel.createFastMaps(wikt_parsed_conn, native_lang); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "inflection"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "lang_pos"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "meaning"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "page"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "page_inflection"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "relation"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "wiki_text"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "wiki_text_words"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "translation"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "translation_entry"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "label_meaning"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "label_relation"); UtilSQL.deleteAllRecordsResetAutoIncrement(wikt_parsed_conn, "index_native"); IndexForeign.generateTables(wikt_parsed_conn, native_lang); } public static void initWithoutClearDatabase (Connect wikt_parsed_conn, LanguageType native_lang) { TLang.createFastMaps(wikt_parsed_conn); TPOS.createFastMaps(wikt_parsed_conn); TRelationType.createFastMaps(wikt_parsed_conn); // Russian Wiktionary if(LanguageType.ru == native_lang) { LabelCategoryLocal temp0 = LabelCategoryRu.computing; // let's initialize maps in LabelCategoryRu class } else { // English Wiktionary if(LanguageType.en == native_lang) { LabelCategory temp0 = LabelCategory.computing; // let's initialize maps in LabelCategory class } } TLabelCategory.createFastMaps(wikt_parsed_conn); Label temp1 = LabelEn.Acadia; // let's initialize maps in LabelEn class Label temp2 = LabelRu.Yoruba; // ... in LabelRu class TLabel.createFastMaps(wikt_parsed_conn, native_lang); } /** Parses the set of Wiktionary pages, * stores result to wikt_parsed database. * * @param native_lang native language in the Wiktionary, * e.g. Russian language in Russian Wiktionary, * it defines parsed wiki language, * it is needed, e.g., * in order to recognize categories for the selected language, * e.g. English (Category) or Esperanto (Kategorio).<br> * * @param wikt_conn connection to Wiktionary database * @param wikt_parsed_conn connection to database that contains results * of parsing * * @param category_name articles of this category and subcategories * are parsed * * * ????????? * DECLARE cur1 CURSOR FOR SELECT page_namespace, page_title, page_is_redirect FROM page WHERE page_id=5865; * OPEN cur1; * FETCH cur1 INTO var1, var2, var3 * CLOSE cur1; */ public static void runSubCategories( LanguageType native_lang, Connect wikt_conn, Connect wikt_parsed_conn, String category_name ) // w.runSubCategories(native_lang, wikt_conn, wikt_parsed_conn); { long t_start, t_end; float t_work; t_start = System.currentTimeMillis(); clearDatabase(wikt_parsed_conn, native_lang); // 1. get wiki-text from MySQL database // variant A. Get all articles // todo // variant B. Get all articles which belongs to the category or its // subcategories. Skip redirects. Disambig? // (1. Finds all, return. 2. The iterator returns the next // article which is not parsed (it's absent in idf database.) //int max_docs = 9000; int cur_doc = 0; System.out.println("Parsing of documents:"); //String[] pt3 = {"яблоко", "ангел", "самолёт", "order", "lead"}; // Category:Main page - failed - too much articles // "Literature" 812 docs - OK // "Folklore" 29 docs // "American_poets" 9 docs - OK List<String> pt = CategoryHyponyms.getArticlesOfSubCategories(wikt_conn, category_name); //"Яблоки" System.out.println("Total documents: " + pt.size()); for(String page_title:pt) { cur_doc ++; //page_title = "ангел"; // ангел самолёт коса яблоко //page_title = "апподжиатура"; // Bolesław_Prus car //if(++ cur_doc > max_docs) { //if(++ cur_doc > 100) // break; //page_title = pt3[cur_doc]; // "Will_o'_the_wisp"; // "Momotarō"; // id=68417 if(DEBUG) { System.out.println(" "+cur_doc+": "+page_title + " "); } parseWiktionaryEntry(native_lang, wikt_conn, wikt_parsed_conn, page_title); } t_end = System.currentTimeMillis(); t_work = (t_end - t_start)/1000f; // in sec System.out.println("\n\nTime sec:" + t_work + "\ndocuments: " + pt.size()); } /** Parses one article. * * @param native_lang native language in the Wiktionary, * e.g. Russian language in Russian Wiktionary * @param wikt_conn * @param wikt_parsed_conn * @param page_title */ public static void parseWiktionaryEntry( LanguageType native_lang, Connect wikt_conn, Connect wikt_parsed_conn, String page_title ) { // gets Wiktionary article text StringBuffer str = new StringBuffer( //StringUtil.escapeCharDollar( PageTableBase.getArticleText(wikt_conn, page_title)); if(0 == str.length()) { //System.out.println("Error in WiktParser.parseWiktionaryEntry(): The article with the title '"+ // page_title + "' has no text in Wiktionary."); return; } // converts "text_with_underscore" into the "text without underscore" page_title = page_title.replace("_", " "); // parses wiki text 'str', stores to the object 'word' WordBase word = new WordBase(page_title, native_lang, str); if(word.hasOnlyTemplatesWithoutDefinitions()) return; if(word.isEmpty()) { System.out.println("Warning in WiktParser.parseWiktionaryEntry(): The article with the title '"+ page_title + "' after convert wiki to text: has no text."); return; } // store results to tables: pos_term, meaning, synonyms... Keeper.storeToDB(wikt_parsed_conn, word, native_lang); str.setLength(0); str = null; } }