/* WikIDF.java - main file for WP parsing. * * Copyright (c) 2005-2008 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under GNU General Public License. */ package wikidf; //import wikidf.db.Term; import wikipedia.sql.Connect; import wikipedia.category.CategoryHyponyms; import wikipedia.language.LanguageType; //import wikipedia.util.StringUtil; import java.util.*; import java.io.*; import gate.*; import gate.util.*; /** * * This class calculates Inverse Document Frequency (IDF) for wiki-texts. * Wiki is stored in MySQL database. The result are stored in another MySQL * database (titled idfenwiki, or idfsimplewiki, or idfruwiki). * use idfsimplewiki or use idfruwiki source ./wikidf/doc/idfwiki_empty.sql DELETE FROM page; DELETE FROM related_page; DELETE FROM term; DELETE FROM term_page; 1) LEMMA -> PAGE SELECT * FROM term WHERE lemma="НИЙЯ" LIMIT 0,5 2) page_title -> page_id -> list of lemma SELECT * FROM page WHERE page_title="Через_тернии_к_звёздам_(фильм)" LIMIT 0,5 SELECT lemma FROM term, term_page WHERE page_id=193323 AND term.term_id=term_page.term_id */ public class WikIDF { private static final boolean DEBUG = true; Connect connect_simple; //connect_ru; /** Calculated TF-IDF for set of Wikipedia documents, * stores to wiki idf database. * * @param dict_lang defines languages of dictionary for lemmatizing * (at LemServer in RuPOSTagger), e.g. English, German or Russian.<br> * * @param wiki_lang defines parsed wiki language, it is needed to remove * category for the selected language, e.g. English (Category) or Esperanto * (Kategorio).<br> * * @param b_remove_not_expand_iwiki if true then it removes interwiki, * e.g. "[[et:Talvepalee]] text" -> " text"; else it expands interwiki by * removing interwiki brackets and language code, * e.g. "[[et:Talvepalee]] text" -> "Talvepalee text".<br> * * @param doc_freq_max the limit for the table term_page. If number * of documents which contain a lemma (e.g. lemma "website") > doc_freq_max, * then * (1) table term_page.(term_id and page_id) contains only ID of first * doc_freq_max documents which contain the term; * (2) but term.doc_freq can be > doc_freq_max * * DECLARE cur1 CURSOR FOR SELECT page_namespace, page_title, page_is_redirect FROM page WHERE page_id=5865; * OPEN cur1; * FETCH cur1 INTO var1, var2, var3 * CLOSE cur1; */ public void runSubCategories(DictLanguage dict_lang, LanguageType wiki_lang, boolean b_remove_not_expand_iwiki, int doc_freq_max) throws GateException, IOException { //String[] texts = new String[2]; //texts[0] = "file:/mnt/win_e/all/projects/java/aot/gate/russian/embedRPOST/data/en/signatures_en.txt"; // long //texts[1] = "file:/mnt/win_e/projects/java/aot/rupostagger/data/ru/ABS_zmldks_short.txt"; // short //texts[0] = "file:/mnt/win_e/projects/java/aot/rupostagger/data/en/Winter_Palace.txt"; // simplewiki //args = texts; long t_start, t_end; float t_work; t_start = System.currentTimeMillis(); // initialise the GATE library Out.prln("Initialising GATE..."); Gate.init(); // Load ANNIE plugin File gateHome = Gate.getGateHome(); File pluginsHome = new File(gateHome, "plugins"); Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "ANNIE").toURI().toURL()); Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "RussianPOSTagger").toURI().toURL()); Out.prln("...GATE initialised"); // initialise ANNIE (this may take several minutes) StandAloneRussianPOSTagger prs = new StandAloneRussianPOSTagger(); prs.initPRs(dict_lang); // Connect to Wikipedia database connect_simple = new Connect(); connect_simple.Open(Connect.WP_HOST,Connect.WP_SIMPLE_DB, Connect.WP_USER, Connect.WP_PASS); // Connect to wiki IDF database Connect idf_conn = new Connect(); //idf_conn.Open(IDF_HOST, IDF_DB, IDF_USER, IDF_PASS); idf_conn.Open(Connect.IDF_SIMPLE_HOST, Connect.IDF_SIMPLE_DB, Connect.IDF_SIMPLE_USER, Connect.IDF_SIMPLE_PASS); // create a GATE corpus Corpus corpus = (Corpus) Factory.createResource("gate.corpora.CorpusImpl"); // 1. get wiki-text from MySQL database // variant A. Get all articles // todo // variant B. Get all articles which belongs to the category or its // subcategories. Skip redirects. Disambig? // (1. Finds all, return. 2. The iterator returns the next // article which is not parsed (it's absent in idf database.) //int max_docs = 9000; int cur_doc = 0; Out.prln("Parsing of documents:"); //String[] pt3 = {"Saadi", "Omar_Khayyám", "Amir_Khosrow"}; // Category:Main page - failed - too much articles // "Literature" 812 docs - OK // "Folklore" 29 docs // "American_poets" 9 docs - OK List<String> pt = CategoryHyponyms.getArticlesOfSubCategories(connect_simple, "American_poets"); Out.prln("Total documents: " + pt.size()); for(String page_title:pt) { //page_title = "List_of_Buffy_the_Vampire_Slayer_episodes"; //page_title = "Bolesław_Prus"; //if(++ cur_doc > max_docs) { //if(++ cur_doc > 1) { // break; //} //page_title = pt3[cur_doc]; // "Will_o'_the_wisp"; // "Momotarō"; // id=68417 if(DEBUG) { Out.prln(""); Out.pr(" "+cur_doc+": "+page_title + " "); } Keeper.parseFromWP( connect_simple, page_title, wiki_lang, b_remove_not_expand_iwiki, idf_conn, corpus, prs, doc_freq_max); } prs.deletePRs(); idf_conn.Close(); connect_simple.Close(); t_end = System.currentTimeMillis(); t_work = (t_end - t_start)/1000f; // in sec System.out.println("\n\nTime sec:" + t_work + "\ndocuments: " + pt.size()); } /** Parses all pages in Wikipedia. */ public void runAll(DictLanguage dict_lang, LanguageType wiki_lang, boolean b_remove_not_expand_iwiki, int doc_freq_max, Connect connect_wp, Connect idf_conn) throws GateException, IOException { // initialise the GATE library Out.prln("Initialising GATE..."); Gate.init(); // Load ANNIE plugin File gateHome = Gate.getGateHome(); File pluginsHome = new File(gateHome, "plugins"); Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "ANNIE").toURI().toURL()); Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "RussianPOSTagger").toURI().toURL()); Out.prln("...GATE initialised"); // initialise ANNIE (this may take several minutes) StandAloneRussianPOSTagger prs = new StandAloneRussianPOSTagger(); prs.initPRs(dict_lang); // create a GATE corpus Corpus corpus = (Corpus) Factory.createResource("gate.corpora.CorpusImpl"); // 1. get wiki-text from MySQL database // variant A. Get all articles //int max_docs = 9000; Out.prln("Parsing of documents:"); PageTableAll.parseAllPages( connect_wp, wiki_lang, b_remove_not_expand_iwiki, idf_conn, corpus, prs, doc_freq_max); prs.deletePRs(); } /** * Run from the command-line, with a list of arguments: * <P><B> * java -Dgate.home=/opt/GATE-4.0 -Dgate.plugins.home=/opt/GATE-4.0/plugin -jar "/mnt/win_e/projects/java/aot/rupostagger/wikidf/dist/wikidf.jar" * </B><BR> * */ public static void main(String args[]) throws GateException, IOException { DictLanguage dict_lang; LanguageType wiki_lang; // Connect to Wikipedia database Connect connect_wp = new Connect(); // Connect to wiki IDF database Connect idf_conn = new Connect(); /* // simple dict_lang = DictLanguage.get("ENGLISH"); wiki_lang = LanguageType.simple; connect_wp.Open(Connect.WP_HOST,Connect.WP_SIMPLE_DB, Connect.WP_USER, Connect.WP_PASS); idf_conn.Open(IDF_SIMPLE_HOST, IDF_SIMPLE_DB, IDF_SIMPLE_USER, IDF_SIMPLE_PASS); */ // russian dict_lang = DictLanguage.get("RUSSIAN"); wiki_lang = LanguageType.ru; connect_wp.Open(Connect.WP_RU_HOST,Connect.WP_RU_DB, Connect.WP_USER, Connect.WP_PASS); idf_conn.Open(Connect.IDF_RU_HOST, Connect.IDF_RU_DB, Connect.IDF_RU_USER, Connect.IDF_RU_PASS); boolean b_remove_not_expand_iwiki = true; int doc_freq_max = 1000; // 100 WikIDF w = new WikIDF(); //w.runSubCategories(dict_lang, wiki_lang, b_remove_not_expand_iwiki, doc_freq_max); w.runAll(dict_lang, wiki_lang, b_remove_not_expand_iwiki, doc_freq_max, connect_wp, idf_conn); idf_conn.Close(); connect_wp.Close(); } } // class StandAloneAnnie