/* MSRLang.java - SQL operations with the table 'lang' in the database * (wikt_mean_semrel) wich contains only word's meanings and semantic relations. * It is simplified Wiktionary parsed database based on wikt_parsed database. * * Copyright (c) 2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wiktparsed.mean_semrel.parser.sql; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikipedia.util.StringUtil; import wikokit.base.wikipedia.sql.Connect; import wikokit.base.wikipedia.sql.UtilSQL; import wikokit.base.wikipedia.sql.Statistics; import java.sql.*; import java.util.Map; import java.util.LinkedHashMap; import java.util.List; import java.util.ArrayList; import java.util.Collections; /** Table wikt_mean_semrel.lang contains list of languages: name and ID. */ public class MSRLang { /** Unique page identifier. */ private int id; /** Languages of wiki: code and name, e.g. 'ru' and 'Russian'. */ private LanguageType lang; /** Number of meanings (with semantic relations) of words of this language, * in the table mean_semrel_XX, where XX is a language code. */ private int n_meaning; // SELECT COUNT(*) FROM mean_semrel_en; /** Number of semantic relations (for this language) in the table mean_semrel_XX, * where XX is a language code. */ private int n_sem_rel; /** Map from id to language. It is created from data in the table `lang`, * which is created from data in LanguageType.java.*/ private static Map<Integer, MSRLang> id2lang; /** Map from language to id.*/ private static Map<LanguageType, Integer> lang2id; private final static MSRLang[] NULL_MSRLANG_ARRAY = new MSRLang[0]; public MSRLang(int _id,LanguageType _lang,int _n_meaning,int _n_sem_rel) { id = _id; lang = _lang; n_meaning = _n_meaning; n_sem_rel= _n_sem_rel; } /** Gets unique ID of the language. */ public int getID() { return id; } /** Gets language. */ public LanguageType getLanguage() { return lang; } /** Gets number of meanings (with semantic relations) of words * in this language. <br><br> * SELECT COUNT(*) FROM mean_semrel_en; */ public int getNumberMeanings() { return n_meaning; } /** Gets number of semantic relations in this language. */ public int getNumberSemanticRelations() { return n_sem_rel; } /** Gets language ID from the table 'lang'.<br><br> * * REM: the function 'createFastMaps()' should be run at least once, * before this function execution. */ public static int getIDFast(LanguageType lt) { if(null == lang2id || 0 == lang2id.size()) { System.err.println("Error (wikt_mean_semrel MSRLang.getIDFast()):: What about calling 'createFastMaps()' before?"); return -1; } if(null == lt) { System.err.println("Error (wikt_mean_semrel MSRLang.getIDFast()):: argument LanguageType is null"); return -1; } Integer result = lang2id.get(lt); if(null == result) { System.out.println("Warning (wikt_mean_semrel MSRLang.getIDFast()):: map lang2id don't have this id. Are you adding new lang codes in time of parsing?"); return -1; } return result; } /** Gets language by ID from the table 'lang'.<br><br> * * REM: the functions createFastMaps() should be run at least once, * before this function execution. */ public static MSRLang getMSRLangFast(int id) { if(null == id2lang) { System.err.println("Error (wikt_mean_semrel MSRLang.getMSRLangFast()):: What about calling 'createFastMaps()' before?"); return null; } if(id <= 0) { System.err.println("Error (wikt_mean_semrel MSRLang.getMSRLangFast()):: argument id <=0, id = "+id); return null; } return id2lang.get(id); } /** Gets language MSRLang by LanguageType from the table 'lang'.<br><br> * * REM: the functions createFastMaps() should be run at least once, * before this function execution. */ public static MSRLang get(LanguageType lt) { return getMSRLangFast(getIDFast(lt)); } /** Gets the map from language to ID (ID in the table 'lang'). * * REM: the functions createFastMaps() should be run at least once, * before this function execution. */ public static Map<LanguageType, Integer> getAllLanguages() { return lang2id; } /** Read all records from the table 'lang', * fills the internal map from a table ID to a language. * * REM: during a creation of Wiktionary parsed database * the functions recreateTable() should be called (before this function). */ public static void createFastMaps(Connect connect) { System.out.println("Loading table `lang`..."); MSRLang[] langs = getAllLang(connect); int size = langs.length; if(langs.length != LanguageType.size()) { System.out.println("Warning (wikt_mean_semrel MSRLang.createFastMaps()):: LanguageType.size (" + LanguageType.size() + ") is not equal to size of table 'lang'("+ size +"). Is the database outdated?"); } if(null != id2lang && id2lang.size() > 0) id2lang.clear(); if(null != lang2id && lang2id.size() > 0) lang2id.clear(); id2lang = new LinkedHashMap<Integer, MSRLang>(size); lang2id = new LinkedHashMap<LanguageType, Integer>(size); for(MSRLang t : langs) { id2lang.put(t.getID(), t); lang2id.put(t.getLanguage(), t.getID()); } } /** Gets all records from the table 'lang'. */ private static MSRLang[] getAllLang(Connect connect) { int size = Statistics.Count(connect, "lang"); if(0==size) { System.err.println("Error (wikt_mean_semrel MSRLang.getAllLang()):: The table `lang` is empty!"); return NULL_MSRLANG_ARRAY; } List<MSRLang>lang_list = new ArrayList<MSRLang>(size); Map<String, LanguageType> ll = LanguageType.getAllLanguages(); for(LanguageType l : ll.values()) { MSRLang t = get(connect, l); if(null != t) lang_list.add(t); } return( (MSRLang[])lang_list.toArray(NULL_MSRLANG_ARRAY) ); } /** Deletes all records from the table 'lang', * loads language code and name from LanguageType.java, * sorts by language code, * fills the table. */ public static void recreateTable(Connect connect) { //Map<Integer, LanguageType> id2lang = null; //= new HashMap<Integer, LanguageType>(); //Map<String, Integer> lang_code2id = null; //= new HashMap<String, Integer>(); System.out.println("Recreating the table `lang`..."); Map<Integer, LanguageType> id2lang_local = fillLocalMaps(); UtilSQL.deleteAllRecordsResetAutoIncrement(connect, "lang"); fillDB(connect, id2lang_local); { int db_current_size = wikokit.base.wikipedia.sql.Statistics.Count(connect, "lang"); assert(db_current_size == LanguageType.size()); // 356 languages } } /** Load data from a LanguageType class, sorts, * and fills local maps 'id2lang' and 'lang_code2id'. */ private static Map<Integer, LanguageType> fillLocalMaps() { //Map<String, Integer> lang_code2id int size = LanguageType.size(); Map<String, LanguageType> code2lang = LanguageType.getAllLanguages(); List<String>list_code = new ArrayList<String>(size); for(String s : code2lang.keySet()) { list_code.add(s); } Collections.sort(list_code); // OK, we have list of language codes. Sorted list. // list_code Map<Integer, LanguageType> id2lang_local = new LinkedHashMap<Integer, LanguageType>(size); //lang_code2id = new LinkedHashMap<String, Integer>(size); for(int id=0; id<size; id++) { String code = list_code.get(id); LanguageType lang = code2lang.get(code); id2lang_local.put(id, lang); } return id2lang_local; } /** Fills database table 'lang' by data from LanguageType class. */ private static void fillDB(Connect connect, Map<Integer, LanguageType> id2lang) { for(int id : id2lang.keySet()) { LanguageType lang = id2lang.get(id); /*if(lang.equals(LanguageType.ru)) { int z = 0; }*/ insert (connect, lang.getCode(), lang.getName(), 0, 0); //insert (connect, lang.getCode(), lang.getCode()); // insert (connect, lang.getCode(), connect.enc.EncodeFromJava(lang.getName())); } } /** Calculates (1) number of meanings with semantic relations, * (2) total number of semantic relations in the table mean_semrel_XX, * stores statistics into fields: * (1) lang.n_meaning, (2) lang.n_sem_rel for each language. <br><br> * * (1) SELECT COUNT(*) FROM mean_semrel_os; * * REM: this func should be called after the a creation of Wiktionary * wikt_mean_semrel database, and the tables mean_semrel_XXs should be * filled with data. */ public static void calcMeanSemrelStatistics(Connect connect) { System.out.println("Fill table `lang` by statistics from mean_semrel_XX tables..."); for(LanguageType lt : lang2id.keySet()) { String table_name = "mean_semrel_" + lt.getCode(); // SELECT COUNT(*) FROM mean_semrel_os; - number of meanings with semantic relations int n_meaning = Statistics.Count(connect, table_name); // SELECT SUM(n_sem_rel) FROM mean_semrel_os; // total number of semantic relations for this language int n_sem_rel = Statistics.Sum(connect, table_name, "n_sem_rel"); update(connect, lt, n_meaning, n_sem_rel); } } /** Deletes * (1) records XX in the table lang, * (2) tables mean_semrel_XX, if XX language is absent in Wiktionary, * i.e. if number of records in the table mean_semrel_XX < threshold, * that is lang.XX.n_meaning < threshold (= min_meaning). <br><br> * * (1) SELECT COUNT(*) FROM mean_semrel_os; * * @param min_meaning threshold - minimum number of records in mean_semrel_XX, * the lesser tables (mean_semrel_XX) and records (lang.XX) will be deleted * * REM: this func should be called after the a creation of Wiktionary * wikt_mean_semrel database, and the tables mean_semrel_XXs should be * filled with data. */ public static void deleteEmptyRecordsAndTables(Connect connect, int min_meaning) { System.out.println("\nDeleting (1) records XX in the table lang, (2) tables mean_semrel_XX, " + "if number of records in the table mean_semrel_XX < " + min_meaning + ". Deleted languages: language (code) number of meanings"); //MSRLang.createFastMaps(connect); MSRLang[] ar_lang = MSRLang.getAllLang(connect); for(MSRLang m_lang : ar_lang) { if (m_lang.n_meaning < min_meaning) { LanguageType lt = m_lang.lang; System.out.println(lt.getName() + " (" + lt.getCode() + ") " + m_lang.n_meaning); // delete record XX in the table lang MSRLang.delete(connect, lt); // delete table mean_semrel_XX String table_name = "mean_semrel_" + lt.getCode(); UtilSQL.dropTable(connect, table_name); } } } /** Inserts record into the table 'lang'. * * INSERT INTO lang (code,name,n_meaning,n_sem_rel) VALUES ("ru","Russian", 12, 13); * * @param code two (or more) letter language code, e.g. 'en', 'ru' * @param name language name, e.g. 'English', 'Russian' */ public static void insert (Connect connect,String code,String name, int n_meaning,int n_sem_rel) { StringBuilder str_sql = new StringBuilder(); try { Statement s = connect.conn.createStatement (); try { str_sql.append("INSERT INTO lang (code,name,n_meaning,n_sem_rel) VALUES (\""); str_sql.append(code); str_sql.append("\",\""); String safe_title = StringUtil.spaceToUnderscore( StringUtil.escapeChars(name)); str_sql.append(safe_title); str_sql.append("\","); str_sql.append(n_meaning); str_sql.append(","); str_sql.append(n_sem_rel); str_sql.append(")"); s.executeUpdate (str_sql.toString()); } finally { s.close(); } }catch(SQLException ex) { System.err.println("SQLException (TLang.insert()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } } /** Updates values (n_meaning, n_sem_rel) in the table 'lang'. <br><br> * * UPDATE lang SET n_meaning=11, n_sem_rel=13 WHERE code="en"; * * @param lang language, the corresponded record in the table to be updated */ public static void update (Connect connect,LanguageType lang, int n_meaning,int n_sem_rel) { StringBuilder str_sql = new StringBuilder(); try { Statement s = connect.conn.createStatement (); try { // UPDATE lang SET n_meaning=11, n_sem_rel=13 WHERE code="en" str_sql.append("UPDATE lang SET n_meaning="); str_sql.append(n_meaning); str_sql.append(", n_sem_rel="); str_sql.append(n_sem_rel); str_sql.append(" WHERE code=\""); str_sql.append(lang.getCode()); str_sql.append("\""); s.executeUpdate (str_sql.toString()); } finally { s.close(); } }catch(SQLException ex) { System.err.println("SQLException (MSRLang.update()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } } /** Selects row from the table 'lang' by a language code. * * SELECT _id,name,n_meaning,n_sem_rel FROM lang WHERE code="ru"; * * @param lt.lang_code language code * @return null if the language code is absent in the table 'lang' */ public static MSRLang get (Connect connect,LanguageType lt) { StringBuilder str_sql = new StringBuilder(); MSRLang tp = null; if(null == lt) return null; String lang_code = lt.getCode(); try { Statement s = connect.conn.createStatement (); try { str_sql.append("SELECT _id,name,n_meaning,n_sem_rel FROM lang WHERE code=\""); str_sql.append(lang_code); str_sql.append("\""); ResultSet rs = s.executeQuery (str_sql.toString()); try { if (rs.next ()) { //String name = StringUtil.underscoreToSpace(rs.getString("name")); tp = new MSRLang( rs.getInt("_id"), lt, rs.getInt("n_meaning"), rs.getInt("n_sem_rel")); /*if(!lt.getName().equalsIgnoreCase(name)) { // cause: field lang.name is NOT unique, only .code is unique System.err.println("Warning: (wikt_mean_semrel TLang.java get()):: Table 'lang' has unknown language name =" + name + " (language code = " + lt.getCode() + ")"); }*/ } else { System.err.println("Error: (wikt_mean_semrel.MSRLang.get()):: The language code '" + lang_code + "' is absent in the table 'lang'."); } } finally { rs.close(); } } finally { s.close(); } } catch(SQLException ex) { System.err.println("SQLException (MSRLang.get()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } return tp; } /** Deletes row from the table 'lang' by the language code.<br><br> * * DELETE FROM lang WHERE code="ru"; * * @param lt language to be deleted */ public static void delete (Connect connect,LanguageType lt) { StringBuilder str_sql = new StringBuilder(); if(null == lt) return; String lang_code = lt.getCode(); try { Statement s = connect.conn.createStatement (); try { str_sql.append("DELETE FROM lang WHERE code=\""); str_sql.append(lang_code); str_sql.append("\""); s.execute (str_sql.toString()); } finally { s.close(); } } catch(SQLException ex) { System.err.println("SQLException (MSRLang.delete()):: sql='" + str_sql.toString() + "' " + ex.getMessage()); } } }