/* WPOSRu.java - corresponds to a POS level of Russian Wiktionary word.
*
* Copyright (c) 2008-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
*/
package wikokit.base.wikt.multi.ru;
import wikokit.base.wikt.util.POSText;
import wikokit.base.wikt.util.LangText;
//import wikt.constant.POSType;
import wikokit.base.wikt.constant.POS;
//import wikt.multi.ru.POSTypeRu;
import wikokit.base.wikipedia.util.StringUtilRegular;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.text.WikiParser;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
/** Splits text to fragments related to different parts of speech (POS).
* POS is basically a level 2 header in Russian Wiktionary, e.g. for "roast":
* ==roast I==
* ...
* ==roast II==
*
* (and a level 3 in English Wiktionary: ===Verb===)
*
* @see http://ru.wiktionary.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D1%81%D0%BB%D0%BE%D0%B2%D0%B0%D1%80%D1%8C:%D0%A7%D0%B0%D1%81%D1%82%D0%B8_%D1%80%D0%B5%D1%87%D0%B8
* http://ru.wiktionary.org/wiki/Викисловарь:Части речи
*
* @see http://ru.wiktionary.org/wiki/Викисловарь:Правила оформления статей
*/
public class WPOSRu {
private final static POSText[] NULL_POS_TEXT_ARRAY = new POSText[0];
/** start of the POS block,
* current: ==word I==
* old: == Существительное I ==
* it can absent...
*/
private final static Pattern ptrn_2nd_level = Pattern.compile( // Vim: ^==\s*\([^=]\+\)\s*==\s*\Z
//"(?m)^\\s*==");
"(?m)^==\\s*([^=]+?)\\s*==\\s*");
// "\\A==\\s*([^=]+)\\s*==\\s*\\Z");
/** start of the POS block,
* {{заголовок|be|add=I}}
* it can absent...
*/
private final static Pattern ptrn_title_add_template = Pattern.compile(
// Vim: ^{{заголовок|\([^|]\+\)|add=[^}]\+}}\Z
// ^{{заголовок|\([^|]\+\)|add=[^}]\+}}\Z
//"(?m)^\\s*==");
//"(?m)^==\\s*([^=]+?)\\s*==\\s*");
//"(?m)^\\Q{{заголовок|\\E([^|]+?)\\Q|add=\\E([^}]+?)\\}\\}\\s*\\Z"); // -
//"(?m)^\\Q{{заголовок|\\E([^|]+?)\\Q|add=\\E([^}]{2,9})\\s*\\Z"); // -
"(?m)^\\Q{{заголовок|\\E([^|]+?)\\Q|add=\\E([^}]{1,4})\\s*"); // ? 1-4 = len(I,..,VIII,..)
//"(?m)^\\Q{{заголовок|\\E([^|]+?)\\Q|add=\\E([^}]{2,9})"); // +
// "\\A==\\s*([^=]+)\\s*==\\s*\\Z");
// "\\{\\{-([-_a-zA-Z]{2,9})-(?:\\}\\}|\\|.*?\\}\\})|\\Q{{заголовок|\\E([-_a-zA-Z]{2,9})(?:\\}\\}|\\|add=\\}\\})"
/** start of the POS block,
* {{заголовок|add=I}}
* it can absent...
*/
private final static Pattern ptrn_title_add_template_without_lang = Pattern.compile(
"(?m)^\\Q{{заголовок|add=\\E([^}]{1,4})\\s*"); // ? 1-4 = len(I,..,VIII,..)
/** Gets first two letter after ==Морфологические и синтаксические свойства==
* e.g. "{{" or "Су"ществительное, or "Гл"агол...
*/
private final static Pattern ptrn_morpho_then_2letters = Pattern.compile(
"===\\s*Морфологические и синтаксические свойства\\s*===\\s*\\n\\s*(..)");
//"===\\s*Морфологические и синтаксические свойства\\s*==="); +
//"\\A===\\s*Морфологические и синтаксические свойства\\s*===\\s*\\n\\s*(..)");
/** Gets true, if str is known header, e.g. "References",
* but it's not a part of speech name, e.g. "Verb".
*/
public static boolean isSecondLevelHeaderWordNotPOS (String str)
{
if(str.equalsIgnoreCase("Ссылки"))
return true;
return false;
}
/** page_title - word which are described in this article 'text'
* @param lt .text will be parsed and splitted,
* .lang is not using now, may be in future...
*
* 1) Split the following text to "lead I" and "leat II"
* 2) Extracts part of speech "гл" from "lead II"
* <PRE>
* == lead I ==
* English text1
* == lead II==
* ===Морфологические и синтаксические свойства==="
* {{гл en reg|lead}}";</PRE>
*
* todo isPOSHeader() (remove acce'nt -> accent) or guessPOS
*/
public static POSText[] splitToPOSSections (
String page_title,
LangText lt)
{
String pos_title = "";
if(null == lt.text || 0 == lt.text.length()) {
return NULL_POS_TEXT_ARRAY;
}
Matcher m = ptrn_2nd_level.matcher(lt.text.toString());
boolean b_next = m.find();
if(b_next && m.groupCount() > 0 && isSecondLevelHeaderWordNotPOS(m.group(1)))
b_next = false; // it's usual header, e.g. "Links", not a == Verb I ==
if(!b_next) { // check: "{{заголовок|sq|add=I}}")
POSText[] pp = splitToPOSWithTitleAddParameter(page_title, lt);
if(pp.length > 0)
return pp;
// there is only one ==Second level header== in this language in this word
POSText[] pos_section_alone = new POSText[1];
pos_section_alone[0] = guessPOS(lt.text);
return pos_section_alone;
}
// there are more than one POS in this language in this word
List<POSText> pos_sections = new ArrayList<POSText>(); // result will be stored to
StringBuffer current_pos_section = new StringBuffer();
int start, end; // "<start> == Verb I == ... <end> == Verb II ==" position of POS block in the lt.text
start = 0;
pos_title = WikiParser.removeAcuteAccent(new StringBuffer(m.group(1)), LanguageType.ru).toString();
b_next = m.find();
if(b_next)
end = m.start();
else {
end = 0; // there is only one POS block, e.g. ==Verb I==, it is a little strange ...
System.out.println("Warning: there is only one POS block, e.g. ==Verb I== for the word '" + page_title + "' with language code '" + lt.getLanguage().toString() + "' in WPOSRu.splitToPOSSections()");
}
while(b_next) {
current_pos_section.append(lt.text.substring(start, end));
POS p = guessPOSWith2ndLevelHeader(page_title, pos_title, current_pos_section);
if(null != p) { // OK. It's POS header, though it's possible that p=unknown :(
POSText pt = new POSText(p, current_pos_section.toString());
current_pos_section.setLength(0);
pos_sections.add(pt);
} else {
// null, if this is another 2nd level header, e.g. Bibliography or References
current_pos_section.append(""); // +??? this Bibliography text
// todo ...
}
// variant I: \1==page_title+"I", "II", ... "VIII"
// variant II: \1==Verb|Noun|... (In Russian)
pos_title = WikiParser.removeAcuteAccent(new StringBuffer(m.group(1)), LanguageType.ru).toString();
b_next = m.find();
if(b_next) {
start = end;
end = m.start();
}
}
current_pos_section.append(lt.text.substring(end)); // last POS section
POS p = guessPOSWith2ndLevelHeader(page_title, pos_title, current_pos_section);
if(null != p) { // OK. It's last POS header, though it's possible that p=unknown :(
POSText pt = new POSText(p, current_pos_section.toString());
current_pos_section.setLength(0);
pos_sections.add(pt);
}
return (POSText[])pos_sections.toArray(NULL_POS_TEXT_ARRAY);
}
/** Checks whether the language code XX is equals to lt.lang.
*
* @param page_title word which are described in this article 'text'
* @param lt .text will be parsed and splitted,
* .lang is not using now, may be in future...
* @param add_lang_code language code XX in {{заголовок|XX|add=..}}
*/
private static boolean isValidLanguageCode (
String page_title,
LangText lt,
String add_lang_code)
{
// template {{заголовок|add=II}} can be without language code
if(add_lang_code.startsWith("add="))
return true;
if(null == add_lang_code
|| add_lang_code.length() < 2
|| !LanguageType.has(add_lang_code)) {
// i.e. skip the whole block POS if the first lang code is unknown
if (null == add_lang_code)
System.out.println("Error: null language code in {{заголовок|lang_code|add=..}} for the word '" + page_title + "' in WPOSRu.splitToPOSWithTitleAddParameter()");
else
System.out.println("Error: unknown language code '" + add_lang_code + "' in {{заголовок|lang_code|add=..}} for the word '" + page_title + "' in WPOSRu.splitToPOSWithTitleAddParameter()");
return false;
}
LanguageType add_lang_type = LanguageType.get(add_lang_code);
if(add_lang_type != lt.getLanguage()) {
System.out.println("Error: language code '" + add_lang_code + "' != '"+ lt.getLanguage().toString() +"' (in {{заголовок|YY|add=..}} and {{-XX-}}) for the word '" + page_title + "' in WPOSRu.splitToPOSWithTitleAddParameter()");
return false;
}
return true;
}
/** Splits to blocks of text which describe different part of speech.
*
* page_title - word which are described in this article 'text'
* @param lt .text will be parsed and splitted,
* .lang is not using now, may be in future...
*
* 1) Splits the following text to "заголовок|...|add=I" and "заголовок|...|add=II"
* 2) Extracts part of speech (e.g. "сущ" i.e. "noun")
* <PRE>
* {{заголовок|be|add=I}}
* === Морфологические и синтаксические свойства ===
* {{сущ be m|слоги={{по-слогам|шах}}|}}
*
* {{заголовок|be|add=II}}
* === Морфологические и синтаксические свойства ===
* {{сущ be m|слоги={{по-слогам|}}|}}</PRE>
*/
private static POSText[] splitToPOSWithTitleAddParameter (
String page_title,
LangText lt)
{
Matcher m;
String lt_text = lt.text.toString();
boolean lang_code_presented;
if(lt_text.contains("{{заголовок|add=")) {
m = ptrn_title_add_template_without_lang.matcher( lt_text );
lang_code_presented = false;
} else {
m = ptrn_title_add_template.matcher( lt_text );
lang_code_presented = true;
}
boolean b_next = m.find();
if(!b_next) // there is no POS delimiter "{{заголовок|...|add=I}}"
return NULL_POS_TEXT_ARRAY;
List<POSText> pos_sections = new ArrayList<POSText>(); // result will be stored to
StringBuffer current_pos_section = new StringBuffer();
int start, end; // "<start> {{заголовок|...|add=I}} ...
// <end> {{заголовок|...|add=II}}" position of POS block in the lt.text
if(lang_code_presented && !isValidLanguageCode(page_title, lt, m.group(1)))
return NULL_POS_TEXT_ARRAY;
start = 0;
b_next = m.find();
if(b_next)
end = m.start();
else {
end = 0; // there is only one POS block, it is a little strange ...
System.out.println("Warning: there is only one POS block, e.g. {{заголовок|...|add=I}} for the word '" + page_title + "' with language code '" + lt.getLanguage().toString() + "' in WPOSRu.splitToPOSSections()");
}
while(b_next) {
current_pos_section.append(lt.text.substring(start, end));
POSText pt = guessPOS (current_pos_section);
if(null != pt.getPOSType()) { // OK. It's POS header, though it's possible that p=unknown :(
pos_sections.add(pt);
current_pos_section.setLength(0);
} else {
// null, if this is another 2nd level header, e.g. Bibliography or References
current_pos_section.append(""); // +??? this Bibliography text
// todo ...
}
// variant I: \1==page_title+"I", "II", ... "VIII"
// variant II: \1==Verb|Noun|... (In Russian)
//pos_title = WikiParser.removeAcuteAccent(new StringBuffer(m.group(1)), LanguageType.ru).toString();
b_next = m.find();
if(b_next) {
start = end;
end = m.start();
}
}
current_pos_section.append(lt.text.substring(end)); // last POS section
POSText pt = guessPOS (current_pos_section);
if(null != pt.getPOSType()) { // OK. It's last POS header, though it's possible that p=unknown :(
current_pos_section.setLength(0);
pos_sections.add(pt);
}
return (POSText[])pos_sections.toArray(NULL_POS_TEXT_ARRAY);
}
/** The POS should be extracted from the texts, e.g.<PRE>
* noun:
* ===Морфологические и синтаксические свойства===
* {{сущ en|слоги=lead|lead|leads}}
*
* verb:
* ===Морфологические и синтаксические свойства===
* {{гл ru 4b-ся
* {{гл ru 8b/b^
* {{гл ru 5c'^-т
*
* adjective:
* ===Морфологические и синтаксические свойства===
* {{прил ru 1*a
*
* adverb:
* ===Морфологические и синтаксические свойства===
*
* {{adv ru|слоги={{по-слогам|ра|но|ва́|то}}|или=предикатив|или-кат=предикативы|}}
*
* {{adv-ru|
* Наречие, неизменяемое.
*
* Old formatting
*
* ===Морфологические и синтаксические свойства===
* {{СущМужНеодуш1c(1)
* {{СущЖенНеодуш8a
* Существительное, ...
*
* {{прил ia}}
*
* {{парадигма-рус // old formatting (>500, < 1000 pages)
* |шаблон=Гл11b/c
* {{Гл1a</PRE>
*/
public static POSText guessPOS (StringBuffer text)
{
POS p_type = POS.unknown;
if(null == text || 0 == text.length()) {
return new POSText(p_type, "");
}
Matcher m = ptrn_morpho_then_2letters.matcher(text.toString());
boolean b = m.find();
if(b) {
String two_letters = m.group(1);
if(two_letters.equalsIgnoreCase("{{")) {
// if \1=="{{" then get first letters till space
// substring started after the symbol "{{"
//String pos_name = StringUtilRegular.getLettersTillSpace(text.substring(m.end())).toLowerCase();
String pos_name = StringUtilRegular.getLettersTillSpaceHyphenOrPipe(text.substring(m.end())).toLowerCase();
if(POSTemplateRu.has(pos_name)) {
p_type = checkIfSuchPOSExist(pos_name);
} /*else {
// old template of POS with hyphen, e.g. "{{adv-ru|}} instead of {{adv ru|}}, or Мс-п6b
pos_name = StringUtilRegular.getLettersTillHyphen(text.substring(m.end())).toLowerCase();
if(POSTypeRu.has(pos_name)) {
p_type = POSTypeRu.get(pos_name);
}
}*/
} else {
// if first line after "===Морфологические и синтаксические свойства===" do not start from "{{" then search first "{{"
int double_braces_pos = text.indexOf("{{", m.end());
if(-1 != double_braces_pos && double_braces_pos+5 < text.length()) {
String pos_name = StringUtilRegular.getLettersTillSpaceHyphenOrPipe(text.substring(double_braces_pos+2)).toLowerCase();
if(POSTemplateRu.has(pos_name)) {
p_type = checkIfSuchPOSExist(pos_name);
}
}
}
} else {
if(isPhrasePOS(text))
p_type = POS.phrase;
}
return new POSText(p_type, new StringBuffer(text));
}
/** Checks whether the text describes a phrase. It is true if the text
* contains something like:
* <PRE>
* === Тип и синтаксические свойства сочетания ===
* {{phrase|
* |тип=фразема
* }}
* </PRE>
* @param text
* @return
*/
private static boolean isPhrasePOS (StringBuffer text)
{
return text.toString().contains("{{phrase");
}
public static POS checkIfSuchPOSExist(String pos_name) {
for (int idx = 0; idx < pos_name.length(); idx++)
if (" |}-".indexOf(pos_name.charAt(idx)) >= 0) {
pos_name = pos_name.substring(0, idx);
break;
}
if (POSTemplateRu.has(pos_name))
return POSTemplateRu.get(pos_name);
// POS may exist in the pos_name as a substring. isPOSin() - checks wheather POS is in the pos_name
else
return POSTemplateRu.isPOSIn(pos_name.toLowerCase());
}
/** The POS should be extracted from the text.
* @return POS, e.g. POS.verb for "== Verb =="
* @return POS.unknown if text contains "== Word I ==" or "== Word II =="
* (without additional POS data, e.g. "verb").
* @return else null
*
* Example of text.<PRE>
* noun:
* == bar II ==
* ===Морфологические и синтаксические свойства===
* {{сущ en|nom-sg=bar|слоги=bar}}
*
* adjective:
* == round I ==
* ===Морфологические и синтаксические свойства===
* {{прил en|round|слоги=round}}
*
* adverb (old style for "fast"):
* ==Наречие==
* {{нар en|fast}}
*
* adverb (very old style for DE "fast")
* <b>fast</b>
* Наречие
* ==Произношение==
* {{transcription|fɑst}}
* ==Значение==
* [[почти]]
* </PRE>
*
* @param page_title word, name of the article, e.g. "lead"
* @param pos_title extracted 2nd level title, e.g. "lead I", "lead II", or "Adverb" (old style)
*/
public static POS guessPOSWith2ndLevelHeader (String page_title,String pos_title, StringBuffer text)
{
POSText pt = guessPOS (text);
if(POS.unknown != pt.getPOSType() || null == text || 0 == text.length()) {
return pt.getPOSType();
}
// compare pos_title with POSType
pos_title = pos_title.toLowerCase();
if( POSTemplateRu.has(pos_title)) {
return POSTemplateRu.get(pos_title);
}
// get first word without number, e.g. ==Verb I== -> "Verb"
String pos_name = StringUtilRegular.getLettersTillSpace(pos_title);
if( POSTemplateRu.has(pos_name)) {
return POSTemplateRu.get(pos_name);
}
if(page_title.equalsIgnoreCase(pos_name)) { // It's POS because, e.g. "round I" == "round" + "I", but it's unknown POS
return POS.unknown;
}
// takes one by one second-level-headers (first word, e.g. Noun for "Noun I", compare with POS
/*Matcher m = ptrn_2nd_level.matcher(text.toString());
while(m.find()) {
String s = m.group(1); // header 2nd level
if( POSTypeRu.has(s)) {
return POSTypeRu.get(s);
}
}*/
return null;
}
}