/* WMeaningRu.java - corresponds to a Meaning (definition + quotations)
* level of a word in Russian Wiktionary.
*
* Copyright (c) 2008 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikokit.base.wikt.multi.ru;
import wikokit.base.wikt.multi.ru.name.LabelRu;
import wikokit.base.wikt.multi.en.name.LabelEn;
import wikokit.base.wikt.word.WMeaning;
import wikokit.base.wikt.word.WQuote;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikt.util.POSText;
import wikokit.base.wikt.util.Definition;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
import wikokit.base.wikt.constant.Label;
import wikokit.base.wikt.util.LabelsText;
/** Meaning consists of <PRE>
* # Definition (preceded by "#", which causes automatic numbering).
* and Quotations. </PRE>
*/
public class WMeaningRu {

    /** Shared empty result, returned whenever no meaning could be extracted. */
    private final static WMeaning[] NO_MEANINGS = new WMeaning[0];

    /** Matches the header line "=== Значение ===" / "==== Значение ====" ("Значение" is
     * Russian for "Meaning"); the match ends right after the newline that closes the header,
     * i.e. at the first character of the section body.
     */
    private final static Pattern MEANING_HEADER = Pattern.compile(
            "====?\\s*Значение\\s*====?\\s*\\n");

    /** Placeholders that carry no content and are deleted from a definition line before it is
     * parsed: empty quotation templates ("пример" = "example", "пример перевод" = "example
     * translation") and an empty wiki link. They are removed in exactly this order.
     */
    private final static String[] EMPTY_MARKUP = {
        "{{пример|}}",
        "{{пример}}",
        "{{пример перевод|}}",
        "[[]]"
    };

    /** Parses the text related to one part of speech and builds the array of meanings
     * (WMeaning) listed in its "Значение" section. The Wiktionary language passed on to
     * {@code WMeaning.parseOneDefinition} is always {@code LanguageType.ru}.
     *
     * The first line after the header is always examined; every following line is examined
     * only while it starts with "#". A line starting with "{{прото|" is skipped.
     *
     * @param page_title word which is described in this article
     * @param lang_section language of this section of an article
     * @param pt POSText which holds the text to be parsed (pt.getText())
     * @return array of parsed meanings; an empty array if the text is null or empty,
     *         if there is no "Значение" header, if fewer than three characters follow
     *         the header, if the next header ("===") follows it immediately,
     *         or if no line yields a meaning
     */
    public static WMeaning[] parse (
            String page_title,
            LanguageType lang_section,
            POSText pt)
    {
        StringBuffer buffer = pt.getText();
        if (buffer == null || buffer.length() == 0) {
            return NO_MEANINGS;
        }

        // Work on an immutable snapshot of the section text.
        String body = buffer.toString();

        // 1. locate the position right after the "Значение" header
        Matcher header = MEANING_HEADER.matcher(body);
        if (!header.find()) {
            return NO_MEANINGS;     // there is no definition section
        }

        int total = body.length();
        int line_start = header.end();

        // 2. the definition section is empty: either (almost) nothing is left,
        //    or the next section header follows immediately
        if (total < line_start + 3 || body.startsWith("===", line_start)) {
            return NO_MEANINGS;
        }

        LanguageType wikt_lang = LanguageType.ru;
        List<WMeaning> meanings = new ArrayList<WMeaning>();

        boolean next_is_definition;
        do {
            // 3. cut out the current line (up to the next newline or the end of the text)
            int line_end = body.indexOf('\n', line_start);
            if (line_end < 0) {
                line_end = total;
            }
            String line = body.substring(line_start, line_end);

            // skip the one-line template {{прото|common meaning}}
            if (!line.startsWith("{{прото|")) {
                // 4. extract labels, definition and quotations;
                //    null means that the line holds no usable definition
                WMeaning meaning = WMeaning.parseOneDefinition(
                        wikt_lang, page_title, lang_section, line);
                if (meaning != null) {
                    meanings.add(meaning);
                }
            }

            // go on only while the following line starts with '#'
            next_is_definition = line_end < total - 1
                              && body.charAt(line_end + 1) == '#';
            line_start = line_end + 1;
        } while (next_is_definition);

        if (meanings.isEmpty()) {
            return NO_MEANINGS;
        }
        return meanings.toArray(NO_MEANINGS);
    }

    /** Parses one definition line, i.e. extracts labels, the definition text and
     * quotations, then creates and fills a meaning (WMeaning).
     *
     * @param page_title word which is described in this article
     * @param lang_section language of this section of an article
     *                     (not used by this method itself)
     * @param line definition line
     * @return WMeaning, or null if the line contains "{{Нужен перевод}}"
     *         ("translation needed"), if nothing is left after the empty placeholders
     *         and the number sign are stripped, if the remaining text starts with
     *         "{{морфема" ("morpheme", not parsed yet), or if the labels could not
     *         be extracted
     */
    public static WMeaning parseOneDefinition(
            String page_title,
            LanguageType lang_section,
            String line)
    {
        if (line.contains("{{Нужен перевод}}")) {
            return null;
        }

        // 1. drop empty quotations and empty links, then the leading number sign
        //    (presumably "#" - see Definition.stripNumberSign)
        String cleaned = line;
        for (String placeholder : EMPTY_MARKUP) {
            cleaned = cleaned.replace(placeholder, "");
        }
        cleaned = Definition.stripNumberSign(page_title, cleaned);

        // nothing left, or a morpheme template: skip now, todo (parse) in future
        if (cleaned.length() == 0 || cleaned.startsWith("{{морфема")) {
            return null;
        }

        // 2. split off the context labels
        LabelsText label_text = LabelRu.extractLabelsTrimText(cleaned);
        if (label_text == null) {
            return null;
        }
        String unlabeled = label_text.getText();

        // 3. the definition is the text before the first quotation template
        String wiki_definition = WQuoteRu.getDefinitionBeforeFirstQuote(page_title, unlabeled);

        // 4. extract quotations
        WQuote[] quote = WQuoteRu.getQuotes(page_title, unlabeled);

        return new WMeaning(page_title, label_text.getLabels(), wiki_definition, quote, false);
    }
}