/* WTranslationRu.java - corresponds to a Translations level of a word in
* Russian Wiktionary.
*
* Copyright (c) 2009 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU General Public License.
*/
package wikokit.base.wikt.multi.ru;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.util.StringUtilRegular;
import wikokit.base.wikt.util.POSText;
import wikokit.base.wikt.word.WTranslation;
import wikokit.base.wikt.word.WTranslationEntry;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
/** Translations of a word in Russian Wiktionary.
 *
 * Parses the section "=== Перевод ===" of an article, which consists of
 * one or more translation box templates:
 * <pre>
 * {{перев-блок|header
 * |en=[[word]]
 * |de=[[Wort]]
 * }}
 * </pre>
 *
 * @see http://ru.wiktionary.org/wiki/Викисловарь:Правила оформления статей#Перевод
 */
public class WTranslationRu {

    private final static WTranslation[] NULL_WTRANSLATION_ARRAY = new WTranslation[0];
    private final static WTranslationEntry[] NULL_WTRANSLATIONENTRY_ARRAY = new WTranslationEntry[0];

    /** Opening of the translation box template. */
    private final static String TRANSLATION_BOX_START = "{{перев-блок";

    /** Line which closes the translation box template: a new line and "}}". */
    private final static String TRANSLATION_BOX_END = "\n}}";

    /** Gets position after === Перевод === */
    private final static Pattern ptrn_translation_3th_level = Pattern.compile(
            "===?\\s*Перевод\\s*===?\\s*\\n");

    /** Gets a header of translation box template "{{перев-блок|header"
     * or "{{перев-блок|header|"
     * or "{{перев-блок" - header is absent
     * or "{{перев-блок||" - header is absent.
     * The match ends after the first pipe of the next line, i.e. just before
     * the first language code ("en=").
     */
    private final static Pattern ptrn_translation_box_header = Pattern.compile(
            "\\Q{{перев-блок\\E\\|?(.*?)\\|?\\n\\|");

    /** Chops close brackets "}}" (and trailing whitespace) at the end of the text. */
    private final static Pattern ptrn_double_close_curly_brackets = Pattern.compile(
            "\\n?\\Q}}\\E[\\n\\s]*$");

    /** Parses text (related to the POS), creates and fills array of
     * translations (WTranslation), one element per translation box.
     *
     * @param wikt_lang     language of Wiktionary
     * @param lang_section  language of this section of an article
     * @param page_title    word which is described in this article
     * @param pt            POSText defines POS stored in pt.text
     * @return array of translation boxes; an empty array (never null) if
     *         there is no translation section, no translation box in it,
     *         or none of the boxes contains a translation
     */
    public static WTranslation[] parse (
                    LanguageType wikt_lang,
                    LanguageType lang_section,
                    String page_title,
                    POSText pt)
    {
        // === Level III. Translation ===
        if(null == pt) {
            return NULL_WTRANSLATION_ARRAY;
        }
        StringBuffer text_source_sb = pt.getText();
        if(null == text_source_sb || 0 == text_source_sb.length()) {
            return NULL_WTRANSLATION_ARRAY;
        }
        String text_source = text_source_sb.toString();

        // 1. gets position in text after === Перевод ===
        Matcher m = ptrn_translation_3th_level.matcher(text_source);
        if(!m.find()) { // there is no translation section!
            if(lang_section == LanguageType.ru)
                System.out.println("Warning in WTranslationRu.parse(): The Russian word '"+
                        page_title + "' has no section === Перевод ===.");
            return NULL_WTRANSLATION_ARRAY;
        }

        // one more check that there is any translation
        if(!text_source.contains(TRANSLATION_BOX_START)) {
            printNoTranslationBoxWarning(page_title);
            return NULL_WTRANSLATION_ARRAY;
        }

        // 2. gets text of the section, i.e. till the next header
        //    (== See also or Bibliography ==)
        String text = StringUtilRegular.getTextTillFirstHeaderPosition(m.end(), text_source);
        if(0 == text.length()) {
            return NULL_WTRANSLATION_ARRAY;
        }

        // 3. finds the first translation box within the section itself.
        // The section may start with some other text, and the box found above
        // may belong to another part of the article, so the box should be
        // looked for explicitly instead of assuming that it starts at offset 0.
        int box_start = text.indexOf(TRANSLATION_BOX_START);
        if(-1 == box_start) {
            printNoTranslationBoxWarning(page_title);
            return NULL_WTRANSLATION_ARRAY;
        }

        List<WTranslation> wt_list = new ArrayList<WTranslation>();
        while(-1 != box_start) {
            // the box lasts till the next box, the last box lasts till its closing brackets
            int next_box_start = text.indexOf(TRANSLATION_BOX_START, box_start + 1);
            int box_end = (-1 != next_box_start) ? next_box_start
                                                 : getEndOfLastBox(text, box_start);
            String trans_block = text.substring(box_start, box_end);

            // 4. extracts lang code "|en=", e.g. wikified translation [[angel]]
            // returns WTranslation or null if the translation text block was not found.
            WTranslation wt = WTranslation.parseOneTranslationBox(wikt_lang, page_title, trans_block);
            if(null != wt) {
                wt_list.add(wt);
            }
            box_start = next_box_start;
        }

        if(!atLeastOneTranslationExists(wt_list))
            return NULL_WTRANSLATION_ARRAY;

        if(wt_list.size() > 1 && !allTranslationsHaveHeader(wt_list)) {
            System.out.println("Warning in WTranslationRu.parse(): The article '"+
                    page_title + "' has several translation boxes, but not all of them have headers.");
        }
        return wt_list.toArray(NULL_WTRANSLATION_ARRAY);
    }

    /** Prints warning: the translation section exists, but it has no translation box. */
    private static void printNoTranslationBoxWarning(String page_title)
    {
        System.out.println("Warning in WTranslationRu.parse(): " + "The Russian word '" + page_title +
                "' has section === Perevod === but there is no any translation box \"{{перев-блок\"."
                .replace("Perevod", "Перевод"));
    }

    /** Gets the end position (exclusive) of the last translation box, i.e.
     * the position after the first line with closing brackets "}}" which
     * follows the box opening, or the end of the text if the box is not closed.
     *
     * @param text       text of the translation section
     * @param box_start  position of the opening of the last box in the text
     */
    private static int getEndOfLastBox(String text, int box_start)
    {
        int close_pos = text.indexOf(TRANSLATION_BOX_END,
                                     box_start + TRANSLATION_BOX_START.length());
        if(-1 == close_pos)
            return text.length();
        return close_pos + TRANSLATION_BOX_END.length();
    }

    /** Checks whether all the translation boxes have headers.
     * A box with null or empty header is treated as a box without header.
     *
     * @return true if every box has a non-empty header (also for an empty list)
     */
    public static boolean allTranslationsHaveHeader(List<WTranslation> wt_list)
    {
        for(WTranslation wt : wt_list) {
            if (null == wt.getHeader() || 0 == wt.getHeader().length()) {
                return false;
            }
        }
        return true;
    }

    /** Returns true, if there is at least one translation entry
     * in any of the translation boxes.
     */
    public static boolean atLeastOneTranslationExists(List<WTranslation> wt_list)
    {
        for(WTranslation wt : wt_list) {
            if (wt.getTranslationsNumber() > 0) {
                return true;
            }
        }
        return false;
    }

    /** Parses one translation box, i.e. extracts languages and a list of
     * translations (wikified words) for each language,
     * creates and fills WTranslation.
     *
     * @param wikt_lang     language of Wiktionary
     * @param page_title    word which is described in this article
     * @param text          translation box text
     * @return WTranslation (with empty header and without entries, if nothing
     *         was recognized), or null if the text is null
     */
    public static WTranslation parseOneTranslationBox(LanguageType wikt_lang,
                    String page_title,
                    String text)
    {
        if(null == text) {
            return null;
        }

        // 1. extracts header (meaning summary, first line in translation box)
        String meaning_summary = "";
        String text_wo_header = text;
        Matcher m = ptrn_translation_box_header.matcher(text);
        if(m.find()) { // there is a header
            meaning_summary = m.group(1);
            text_wo_header = text.substring(m.end()); // text without header
        }

        // 2. chops close brackets "}}": text without header and without brackets
        String t = ptrn_double_close_curly_brackets.matcher(text_wo_header).replaceFirst("");

        // 3. parses each line "|lang_code=translations" (the header match has
        //    already consumed the pipe of the first line)
        List<WTranslationEntry> wte_list = new ArrayList<WTranslationEntry>();
        for(String s : t.split("\n\\|")) { // for each language (for each line)
            WTranslationEntry wte = WTranslationEntry.parse(wikt_lang, page_title, s);
            if(null != wte) {
                wte_list.add(wte);
            }
        }

        return new WTranslation(
                meaning_summary,
                wte_list.toArray(NULL_WTRANSLATIONENTRY_ARRAY));
    }
}