/* WTranslationEntryEn.java - corresponds to a line in Translations of a word
* in English Wiktionary.
*
* Copyright (c) 2010 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU General Public License.
*/
package wikokit.base.wikt.multi.en;
import wikokit.base.wikt.word.WTranslationEntry;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikt.util.WikiText;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
/** One line in the Translation section, i.e. a translation to one language,
 * e.g. "* Latvian: {{t|lv|oranžs}}".
 *
 * The class only contains static helpers; the entry point is
 * {@link #parse(String, String)}.
 */
public class WTranslationEntryEn {

    /** Chop the beginning of the line: "*: " or "* ", e.g.:
     * "* German: {{t|de|Orange|f}}" ->
     * "German: {{t|de|Orange|f}}"
     */
    private final static Pattern ptrn_begin_asterisk = Pattern.compile(
            "^(\\*\\:?\\s*)");

    /** Wikified language name: "[[Alabama]]" -> "Alabama". */
    private final static Pattern ptrn_wikified_lang_name = Pattern.compile(
            "^\\[\\[(.+?)\\]\\]\\s*");

    /** Extract from text {{t,t+,t-,trad etc.|...}}.
     * Group 1 is the template content without the surrounding braces;
     * any template whose name starts with "t" matches here, the exact name
     * is verified later by {@link #isValidTemplateT(String)}.
     */
    private final static Pattern ptrn_t_template = Pattern.compile(
            "\\{\\{(t[^}]*?)\\}\\}");   // RE: (\Q{{t\E[^}]*?\}\})

    /** Structure for storing identified language and translation words. */
    private static class LangAndTrans { // source: "* French: {{t|fr|orange|f}}"
        LanguageType lang;              // lang = fr, it could be null
        String trans;                   // remain = {{t|fr|orange|f}}
    }

    /** Parses one entry (one line) of a translation box,
     * extracts a language and a list of translations
     * from template (wikified words) for this language,
     * creates and fills WTranslationEntry.
     *
     * The language of the result is taken from the language code inside the
     * {{t|code|word}} templates, not from the language name before the colon.
     * Templates with an unknown language code or an empty word are skipped.
     * If the templates of one line refer to different languages, then
     * the whole line is rejected.
     *
     * @param page_title    word which is described in this article
     * @param text          one line of the translation box, it could be null
     * @return WTranslationEntry or null if the translation language or
     *         translation text are absent.
     *
     * @see <a href="http://en.wiktionary.org/wiki/Template_talk:t#Documentation">Template:t documentation</a>
     */
    public static WTranslationEntry parse(
                    String page_title,
                    String text)
    {
        // absent text means absent translations (previously: NullPointerException)
        if(null == text)
            return null;

        // split "*[[Datiwuy]]: {{t|duj|buurnba}}" into "Datiwuy" and remain
        //       "* French: {{t|fr|orange|f}}" into "French" and remain.
        LangAndTrans lang_trans = splitToLanguageAndTranslations(text);
        if(null == lang_trans)
            return null;

        LanguageType lang = null;   // language of the last accepted template
        List<String> translations = new ArrayList<String>();

        // extract from text {{t,t+,t-,trad etc.|...}}:
        Matcher m = ptrn_t_template.matcher(lang_trans.trans);
        while(m.find()) {

            // {{t|language_code|word|etc...}}
            // {{0|1            |2   |3.....}}
            String[] t_params = m.group(1).split("\\|");
            if(t_params.length < 3 || !isValidTemplateT(t_params[0]))
                continue;

            String lang_code = t_params[1];
            String translated_word = t_params[2];
            if(translated_word.length() == 0)   // does exist any translation
                continue;

            if(!LanguageType.has(lang_code)) {
                // concise logging: only one message for one unknown language code
                if(!LanguageType.hasUnknownLangCode(lang_code)) {
                    LanguageType.addUnknownLangCode(lang_code);
                    System.out.println("Warning in WTranslationEntryEn.parse(): The article '"+
                            page_title + "' has translation into unknown language with code: " + lang_code + ".");
                }
                // a very long "code" is reported on every occurrence
                if(lang_code.length() > 10)
                    System.out.println("Error in WTranslationEntryEn.parse(): The article '"+
                            page_title + "' has too long unknown language code: " + lang_code + ".");
                continue;
            }

            LanguageType prev_lang = lang;
            lang = LanguageType.get(lang_code);
            if(prev_lang != null && prev_lang != lang) {
                // previous and next languages should be the same at one line...
                System.out.println("Warning in WTranslationEntryEn.parse(): The article '"+
                        page_title + "' has translation into different languages at one line. Language codes: " + prev_lang + " and " + lang_code + ".");
                return null;
            }

            translations.add(translated_word);

            // todo
            // 1. extract all info from template
            // 2. add fields gender, number, sc (script template), tr (transliteration),
            //    alt (alternate form of the word) to the table translation_entry
        }

        if(translations.size() == 0)
            return null;

        // translation wikified text
        WikiText[] wt = WikiText.createWithoutParsing(page_title, translations);
        if(0 == wt.length)
            return null;

        return new WTranslationEntry(lang, wt);
    }

    /** Splits one entry (one line of a translation box) into language and
     * remain text (translation words).
     *
     * result->LanguageType could be null (the language name is unknown).
     *
     * @param text  one line of the translation box, it could be null
     * @return LanguageType and translation text.
     *         It could be null if there is no colon delimiter
     *         or there is no text after the colon.
     */
    private static LangAndTrans splitToLanguageAndTranslations(String text)
    {
        if(null == text)
            return null;

        // 1. chop the beginning of the line: "*: " or "* "
        //    (replaceFirst returns the text unchanged if there is no match)
        text = ptrn_begin_asterisk.matcher(text).replaceFirst("");

        int pos_colon = text.indexOf(':');
        if(-1 == pos_colon)
            return null;

        // does exist any translation after ":"
        String trans = text.substring(pos_colon + 1).trim();
        if(trans.length() == 0)
            return null;

        // 2. gets language name, wikified ("[[Alabama]]") or plain ("French")
        String lang_text = text.substring(0, pos_colon);
        String lang_name;
        Matcher m = ptrn_wikified_lang_name.matcher(lang_text);
        if(m.find())
            lang_name = m.group(1);
        else
            lang_name = lang_text.trim();

        // 3. create result structure
        LangAndTrans lat = new LangAndTrans();
        lat.trans = trans;
        if(LanguageType.hasEnglishName(lang_name))
            lat.lang = LanguageType.getByEnglishName(lang_name);
        else
            lat.lang = null;

        return lat;
    }

    /** Returns true if this is one of templates:
     * {{t}}, {{t+}}, {{t-}}, {{trad}}, or {{trad-}}.
     * The comparison ignores case; null is never valid.
     */
    private static boolean isValidTemplateT (String template_name)
    {
        if(null == template_name)
            return false;

        return template_name.equalsIgnoreCase("t")    ||
               template_name.equalsIgnoreCase("t+")   ||
               template_name.equalsIgnoreCase("t-")   ||
               template_name.equalsIgnoreCase("trad") ||
               template_name.equalsIgnoreCase("trad-");
    }
}