/* WLanguage.java - corresponds to a language level of Wiktionary word.
*
* Copyright (c) 2008-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikokit.base.wikt.word;
//import wikt.constant.POS;
import wikokit.base.wikt.util.LangText;
//import wikt.util.POSText;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikt.multi.ru.WLanguageRu;
import wikokit.base.wikt.multi.en.WLanguageEn;
/** Language lets you know the language of the word in question. It is almost
 * always in a level two heading. E.g. ==English== or {{-ru-}}
 *
 * Exception: ==Translingual==
 *
 * @see <a href="http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained">Wiktionary:Entry layout explained</a>
 * @see <a href="http://ru.wiktionary.org/wiki/Викисловарь:Правила оформления статей">Викисловарь:Правила оформления статей</a>
 */
public class WLanguage {

    /** Shared empty result, returned when an entry has no language sections. */
    private static final WLanguage[] NULL_WLANGUAGE_ARRAY = new WLanguage[0];

    /** Language of the word. */
    private LanguageType lang;

    /** Parts of speech of the word in this language; null after {@link #free()}. */
    private WPOS[] wpos;

    /** Gets language. */
    public LanguageType getLanguage() {
        return lang;
    }

    /** Gets all parts of speech for this word.
     *
     * @return the internal array (not a copy); null once {@link #free()} was called
     */
    public WPOS[] getAllPOS() {
        return wpos;
    }

    /** Frees memory recursively: releases every part of speech and drops
     * the reference to the array. Safe to call more than once.
     */
    public void free ()
    {
        if (null == wpos) {
            return;
        }
        for (WPOS p : wpos) {
            if (null != p) {
                p.free();
            }
        }
        wpos = null;
    }

    /** Parses text, creates and fills array of homonym (WLanguage) for each language.
     *
     * @param wikt_lang language of Wiktionary
     * @param page_title word which is described in this article 'text'
     * @param text wiki text of the article to be parsed
     * @return one WLanguage per distinct language found in the text,
     *         or an empty array if there are no language sections
     */
    public static WLanguage[] parse (
            LanguageType wikt_lang,
            String page_title,
            StringBuffer text)
    {
        // = Level I. Language =
        LangText[] lang_sections = splitToLanguageSections(wikt_lang, page_title, text);
        if (0 == lang_sections.length) {
            return NULL_WLANGUAGE_ARRAY;
        }

        WLanguage[] wl = new WLanguage[lang_sections.length];
        for (int i = 0; i < lang_sections.length; i++) {
            wl[i] = new WLanguage();
            wl[i].lang = lang_sections[i].getLanguage();
            wl[i].wpos = WPOS.parse(wikt_lang, page_title, lang_sections[i]);
        }
        return reduceNonUniqueLanguages(page_title, wl);
    }

    /** Reduces number of languages, removes any non unique languages.
     * E.g. "kom" and "koi" refer to "kv" language code, see LanguageType.java
     * So if the entry contains the description of two words: "kom" and "koi",
     * then only the first one is kept ("kom"), the second ("koi") is rejected.
     *
     * Side effect: the rejected (non unique) elements of 'source' are freed
     * and set to NULL.
     *
     * @param page_title word which is described in this article 'text' (unused here)
     * @param source entry text parsed and stored into the objects
     * @return 'source' itself if all languages are unique, otherwise a new
     *         shorter array with the first occurrence of each language,
     *         in the original order
     */
    private static WLanguage[] reduceNonUniqueLanguages (
            String page_title, WLanguage[] source)
    {
        // 1. reject every later element which repeats the language of an earlier one
        int duplication = 0;
        for (int i = 0; i < source.length; i++) {
            if (null == source[i]) {
                continue;   // already rejected as a duplicate of an earlier element
            }
            for (int j = i + 1; j < source.length; j++) {
                if (null != source[j] && source[i].lang == source[j].lang) {
                    source[j].free();
                    source[j] = null;
                    duplication++;
                }
            }
        }
        if (0 == duplication) {
            return source;
        }

        // 2. copy without duplication, i.e. skip empty (null) elements of array
        assert(source.length - duplication > 0);
        WLanguage[] dest = new WLanguage[source.length - duplication];
        int dest_i = 0;
        for (WLanguage w : source) {
            if (null != w) {
                dest[dest_i++] = w;
            }
        }
        return dest;
    }

    /** Splits the article text into language sections, delegating to the
     * parser of the given Wiktionary edition (only Russian and English
     * are supported).
     *
     * @param wikt_lang language of Wiktionary
     * @param page_title word which is described in this article text
     * @param text wiki text of the article
     * @return language sections produced by the edition-specific parser
     * @throws NullPointerException if wikt_lang is null or is not a supported
     *         Wiktionary language (the exception type is kept for
     *         compatibility with existing callers)
     */
    public static LangText[] splitToLanguageSections (
            LanguageType wikt_lang,
            String page_title,
            StringBuffer text)
    {
        if (wikt_lang == LanguageType.ru) {
            return WLanguageRu.splitToLanguageSections(page_title, text);
        }
        if (wikt_lang == LanguageType.en) {
            return WLanguageEn.splitToLanguageSections(page_title, text);
        }
        // todo: other Wiktionary editions, e.g. "simple"

        String message = (null == wikt_lang)
                ? "Null LanguageType"
                : "Unsupported Wiktionary language: " + wikt_lang;
        throw new NullPointerException(message);
    }

    /** True if the meaning section contains only templates (e.g. {{form of|}}
     * or {{plural of|}}), i.e. there are no real definitions,
     * there are only references to main (normal) forms of the word.
     *
     * The check is implemented only for English Wiktionary; for any other
     * Wiktionary language the result is false.
     *
     * @param wikt_lang language of Wiktionary
     * @param lang parsed entry stored into the array of objects WLanguage
     */
    public static boolean hasOnlyTemplatesWithoutDefinitions (
            LanguageType wikt_lang, WLanguage[] lang)
    {
        return wikt_lang == LanguageType.en
            && hasOnlyTemplatesWithoutDefinitions(lang);
    }

    /** True if the meaning section contains only templates,
     * e.g. {{form_of|}}, or {{plural of|}},
     * i.e. there are only references to main (normal) forms of the word,
     * and there are no real definitions.
     *
     * @param lang parsed entry stored into the array of objects WLanguage
     * @return true if there is at least one meaning and every meaning is
     *         a form-of template; false otherwise (including no meanings at all)
     */
    private static boolean hasOnlyTemplatesWithoutDefinitions (
            WLanguage[] lang)
    {
        boolean at_least_one_template = false;

        for (int i = 0; i < lang.length; i++) {
            assert(null != lang[i]);

            WPOS[] wpos = lang[i].getAllPOS();
            if (null == wpos) {
                continue;   // parts of speech were already released by free()
            }
            for (int j = 0; j < wpos.length; j++) {
                assert(null != wpos[j]);

                WMeaning[] wm = wpos[j].getAllMeanings();
                for (WMeaning m : wm) {
                    if (!m.isFormOfInflection()) {
                        return false;   // a real definition was found
                    }
                    at_least_one_template = true;
                }
            }
        }
        return at_least_one_template;
    }
}