/* TLang.java - SQL operations with the table 'page' in Wiktionary parsed database.
*
* Copyright (c) 2009-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikokit.base.wikt.sql;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.util.StringUtil;
import wikokit.base.wikipedia.sql.Connect;
import wikokit.base.wikipedia.sql.UtilSQL;
import wikokit.base.wikipedia.sql.Statistics;
import wikokit.base.wikt.sql.index.IndexForeign;
import wikokit.base.wikt.sql.index.IndexNative;
import java.sql.*;
import java.util.Map;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
/** Table lang contains list of languages: name and ID. */
public class TLang {
/** Unique language identifier. */
private int id;
/** Languages of wiki: code and name, e.g. 'ru' and 'Russian'. */
private LanguageType lang;
/** Number of foreign parts of speech (POS) in the table index_XX,
* which have its own articles in Wiktionary,
* where XX is a language code. */
private int n_foreign_POS;
// SELECT COUNT(*) FROM index_en WHERE native_page_title is NULL;
/** Number of translation pairs in the table index_XX,
* where XX is a language code. */
private int n_translations;
// SELECT COUNT(*) FROM index_en WHERE native_page_title is not NULL;
/** Map from id to language. It is created from data in the table `lang`,
* which is created from data in LanguageType.java.*/
private static Map<Integer, TLang> id2lang;
/** Map from language to id.*/
private static Map<LanguageType, Integer> lang2id;
private final static TLang[] NULL_TLANG_ARRAY = new TLang[0];
public TLang(int _id,LanguageType _lang,int _n_foreign_POS,int _n_translations) {
id = _id;
lang = _lang;
n_foreign_POS = _n_foreign_POS;
n_translations= _n_translations;
}
/** Gets unique ID of the language. */
public int getID() {
return id;
}
/** Gets language. */
public LanguageType getLanguage() {
return lang;
}
/** Gets number of parts of speech (POS) in this language. <br><br>
* SELECT COUNT(*) FROM index_en WHERE native_page_title is NULL;
*/
public int getNumberPOS() {
return n_foreign_POS;
}
/** Gets number of translation into this language. <br><br>
* SELECT COUNT(*) FROM index_en WHERE native_page_title is not NULL;
*/
public int getNumberTranslations() {
return n_translations;
}
/** Gets language ID from the table 'lang'.<br><br>
*
* REM: the function 'createFastMaps()' should be run at least once,
* before this function execution.
*/
public static int getIDFast(LanguageType lt) {
if(null == lang2id || 0 == lang2id.size()) {
System.out.println("Error (wikt_parsed TLang.getIDFast()):: What about calling 'createFastMaps()' before?");
return -1;
}
if(null == lt) {
System.out.println("Error (wikt_parsed TLang.getIDFast()):: argument LanguageType is null");
return -1;
}
Integer result = lang2id.get(lt);
if(null == result) {
System.out.println("Warning (wikt_parsed TLang.getIDFast()):: map lang2id don't have this id. Are you adding new lang codes in time of parsing?");
return -1;
}
return result;
}
/** Gets language by ID from the table 'lang'.<br><br>
*
* REM: the functions createFastMaps() should be run at least once,
* before this function execution.
*/
public static TLang getTLangFast(int id) {
if(null == id2lang) {
System.out.println("Error (wikt_parsed TLang.getTLangFast()):: What about calling 'createFastMaps()' before?");
return null;
}
if(id <= 0) {
System.out.println("Error (wikt_parsed TLang.getTLangFast()):: argument id <=0, id = "+id);
return null;
}
return id2lang.get(id);
}
/** Gets language TLang by LanguageType from the table 'lang'.<br><br>
*
* REM: the functions createFastMaps() should be run at least once,
* before this function execution.
*/
public static TLang get(LanguageType lt) {
return getTLangFast(getIDFast(lt));
}
/** Gets the map from language to ID (ID in the table 'lang').
*
* REM: the functions createFastMaps() should be run at least once,
* before this function execution.
*/
public static Map<LanguageType, Integer> getAllLanguages() {
return lang2id;
}
/** Gets the map from language ID (ID in the table 'lang') to language.
*
* REM: the functions createFastMaps() should be run at least once,
* before this function execution.
*/
public static Map<Integer, TLang> getAllTLang() {
return id2lang;
}
/** Parses and extracts language codes from the string 'lang_codes'.
*
* @return empty array if language codes were not extracted
*/
public static TLang[] parseLangCode(String lang_codes) {
String s = lang_codes.trim();
if(0 == s.length())
return NULL_TLANG_ARRAY;
List<TLang>tlang_list = new ArrayList<TLang>();
String[] words = s.split("\\s");
for(String w : words) {
if(LanguageType.has(w))
tlang_list.add( TLang.get( LanguageType.get(w) ));
}
if(tlang_list.isEmpty())
return NULL_TLANG_ARRAY;
return( (TLang[])tlang_list.toArray(NULL_TLANG_ARRAY) );
}
/** Compares language codes extracted from the string 'str_lang2' and
* array of language codes 'tlang1'. */
public static boolean isEquals(TLang[] tlang1, String str_lang2) {
TLang tlang2[] = parseLangCode(str_lang2);
if(tlang1.length != tlang2.length)
return false;
if(tlang1.length == 0)
return true;
for(TLang l1 : tlang1) {
boolean bfound = false;
for(TLang l2 : tlang2) {
if(l1 == l2) {
bfound = true;
break;
}
}
if(!bfound)
return false;
}
return true;
}
/** Read all records from the table 'lang',
* fills the internal map from a table ID to a language.
*
* REM: during a creation of Wiktionary parsed database
* the functions recreateTable() should be called (before this function).
*/
public static void createFastMaps(Connect connect) {
System.out.println("Loading table `lang`...");
TLang[] tlangs = getAllTLang(connect);
int size = tlangs.length;
if(tlangs.length != LanguageType.size()) {
System.out.println("Warning (wikt_parsed TLang.java createFastMaps()):: LanguageType.size (" + LanguageType.size()
+ ") is not equal to size of table 'lang'("+ size +"). Is the database outdated?");
}
if(null != id2lang && id2lang.size() > 0)
id2lang.clear();
if(null != lang2id && lang2id.size() > 0)
lang2id.clear();
id2lang = new LinkedHashMap<Integer, TLang>(size);
lang2id = new LinkedHashMap<LanguageType, Integer>(size);
for(TLang t : tlangs) {
id2lang.put(t.getID(), t);
lang2id.put(t.getLanguage(), t.getID());
}
}
/** Gets all records from the table 'lang'.
*/
private static TLang[] getAllTLang(Connect connect) {
int size = Statistics.Count(connect, "lang");
if(0==size) {
System.out.println("Error (wikt_parsed TLang.java getAllTLang()):: The table `lang` is empty!");
return NULL_TLANG_ARRAY;
}
List<TLang>tlang_list = new ArrayList<>(size);
Map<String, LanguageType> ll = LanguageType.getAllLanguages();
for(LanguageType l : ll.values()) {
TLang t = get(connect, l);
if(null != t)
tlang_list.add(t);
}
return( (TLang[])tlang_list.toArray(NULL_TLANG_ARRAY) );
}
/** Deletes all records from the table 'lang',
* loads language code and name from LanguageType.java,
* sorts by language code,
* fills the table.
*/
public static void recreateTable(Connect connect) {
//Map<Integer, LanguageType> id2lang = null; //= new HashMap<Integer, LanguageType>();
//Map<String, Integer> lang_code2id = null; //= new HashMap<String, Integer>();
System.out.println("Recreating the table `lang`...");
Map<Integer, LanguageType> id2lang = fillLocalMaps();
UtilSQL.deleteAllRecordsResetAutoIncrement(connect, "lang");
fillDB(connect, id2lang);
{
int db_current_size = wikokit.base.wikipedia.sql.Statistics.Count(connect, "lang");
assert(db_current_size == LanguageType.size()); // 356 languages
}
}
/** Load data from a LanguageType class, sorts,
* and fills local maps 'id2lang' and 'lang_code2id'. */
private static Map<Integer, LanguageType> fillLocalMaps() { //Map<String, Integer> lang_code2id
int size = LanguageType.size();
Map<String, LanguageType> code2lang = LanguageType.getAllLanguages();
List<String>list_code = new ArrayList<>(size);
for(String s : code2lang.keySet()) {
list_code.add(s);
}
Collections.sort(list_code);
// OK, we have list of language codes. Sorted list.
// list_code
Map<Integer, LanguageType> id2lang = new LinkedHashMap<>(size); //lang_code2id = new LinkedHashMap<String, Integer>(size);
for(int id=0; id<size; id++) {
String code = list_code.get(id);
LanguageType lang = code2lang.get(code);
id2lang.put(id, lang); //lang_code2id.put(lang.getCode(), id);
}
return id2lang;
}
/** Fills database table 'lang' by data from LanguageType class. */
private static void fillDB(Connect connect, Map<Integer, LanguageType> id2lang) {
for(int id : id2lang.keySet()) {
LanguageType lang = id2lang.get(id);
/*if(lang.equals(LanguageType.ru)) {
int z = 0;
}*/
insert (connect, lang.getCode(), lang.getName(), 0, 0); //insert (connect, lang.getCode(), lang.getCode());
// insert (connect, lang.getCode(), connect.enc.EncodeFromJava(lang.getName()));
}
}
/** Calculates (1) number of foreign parts of speech (POS) and translation pairs
* in the table index_XX, stores statistics into fields:
* (1) n_foreign_POS, (2) n_translations for each language (except native). <br><br>
*
* For native language calculates only (1) n_foreign_POS by data from
* the table 'index_native'. (In really it's a number of native POS.)
*
* REM: this func should be called after the a creation of Wiktionary
* parsed database, and the tables should be filled with data.
*
* @param native_lang native language in the Wiktionary,
* e.g. Russian language in Russian Wiktionary
*/
public static void calcIndexStatistics(Connect connect,
LanguageType native_lang) {
System.out.println("Fill table `lang` by statistics from index_XX tables...");
// foreign languages statistics
for(LanguageType lt : lang2id.keySet()) {
if(native_lang != lt) {
int n_foreign_POS = IndexForeign.countNumberOfForeignPOS(connect, lt);
int n_translations = IndexForeign.countTranslations(connect, lt);
update(connect, lt, n_foreign_POS, n_translations);
}
}
// For native language calculates only (1) n_foreign_POS by data from
// the table 'index_native'. (In really it's a number of native POS.)
int n_native_POS = IndexNative.countNumberPOSWithDefinition(connect);
update(connect, native_lang, n_native_POS, 0);
}
/** Inserts record into the table 'lang'.
*
* INSERT INTO lang (code,name,n_foreign_POS,n_translations) VALUES ("ru","Russian", 12, 13);
*
* @param code two (or more) letter language code, e.g. 'en', 'ru'
* @param name language name, e.g. 'English', 'Russian'
*/
public static void insert (Connect connect,String code,String name,
int n_foreign_POS,int n_translations) {
StringBuilder str_sql = new StringBuilder();
try
{
Statement s = connect.conn.createStatement ();
try {
str_sql.append("INSERT INTO lang (code,name,n_foreign_POS,n_translations) VALUES (\"");
str_sql.append(code);
str_sql.append("\",\"");
String safe_title = StringUtil.spaceToUnderscore(
StringUtil.escapeChars(name));
str_sql.append(safe_title);
str_sql.append("\",");
str_sql.append(n_foreign_POS);
str_sql.append(",");
str_sql.append(n_translations);
str_sql.append(")");
s.executeUpdate (str_sql.toString());
} finally {
s.close();
}
}catch(SQLException ex) {
System.out.println("SQLException (TLang.insert()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
}
}
/** Updates values (n_foreign_POS, n_translations) in the table 'lang'. <br><br>
*
* UPDATE lang SET n_foreign_POS=11, n_translations=13 WHERE code="en";
*
* @param lang language, the corresponded record in the table to be updated
*/
public static void update (Connect connect,LanguageType lang,
int n_foreign_POS,int n_translations) {
StringBuilder str_sql = new StringBuilder();
try
{
Statement s = connect.conn.createStatement ();
try {
// UPDATE lang SET n_foreign_POS=11, n_translations=13 WHERE code="en"
str_sql.append("UPDATE lang SET n_foreign_POS=");
str_sql.append(n_foreign_POS);
str_sql.append(", n_translations=");
str_sql.append(n_translations);
str_sql.append(" WHERE code=\"");
str_sql.append(lang.getCode());
str_sql.append("\"");
s.executeUpdate (str_sql.toString());
} finally {
s.close();
}
}catch(SQLException ex) {
System.out.println("SQLException (TLang.update()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
}
}
/** Selects row from the table 'lang' by a language code.
*
* SELECT id,name,n_foreign_POS,n_translations FROM lang WHERE code="ru";
*
* @param lang_code title of Wiktionary article
* @return null if a language code is absent in the table 'lang'
*/
public static TLang get (Connect connect,LanguageType lt) {
StringBuilder str_sql = new StringBuilder();
TLang tp = null;
if(null == lt) return null;
String lang_code = lt.getCode();
try {
Statement s = connect.conn.createStatement ();
try {
str_sql.append("SELECT id,name,n_foreign_POS,n_translations FROM lang WHERE code=\"");
str_sql.append(lang_code);
str_sql.append("\"");
ResultSet rs = s.executeQuery (str_sql.toString());
try {
if (rs.next ())
{
//String name = StringUtil.underscoreToSpace(rs.getString("name"));
tp = new TLang( rs.getInt("id"), lt,
rs.getInt("n_foreign_POS"),
rs.getInt("n_translations"));
/*if(!lt.getName().equalsIgnoreCase(name)) { // cause: field lang.name is NOT unique, only .code is unique
System.out.println("Warning: (wikt_parsed TLang.java get()):: Table 'lang' has unknown language name =" + name +
" (language code = " + lt.getCode() + ")");
}*/
} else {
System.out.println("Error: (wikt_parsed TLang.java get()):: The language code '" + lang_code + "' is absent in the table 'lang'.");
}
} finally {
rs.close();
}
} finally {
s.close();
}
} catch(SQLException ex) {
System.out.println("SQLException (TLang.get()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
}
return tp;
}
/** Deletes row from the table 'lang' by the language code.
*
* DELETE FROM lang WHERE code="ru";
*
* @param lt language to be deleted
*/
public static void delete (Connect connect,LanguageType lt) {
StringBuilder str_sql = new StringBuilder();
if(null == lt) return;
String lang_code = lt.getCode();
try {
Statement s = connect.conn.createStatement ();
try {
str_sql.append("DELETE FROM lang WHERE code=\"");
str_sql.append(lang_code);
str_sql.append("\"");
s.execute (str_sql.toString());
} finally {
s.close();
}
} catch(SQLException ex) {
System.out.println("SQLException (TLang.delete()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
}
}
}