/* WLanguageRu.java - corresponds to a language level of Russian Wiktionary word. * * Copyright (c) 2008 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under GNU General Public License. */ package wikokit.base.wikt.multi.ru; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikt.util.LangText; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.util.List; import java.util.ArrayList; /** Language lets you know the language of the word in question. It is almost * always in a level two heading. ==English==, {{-ru-}}, {{заголовок|ru|..}, * or {{-de-|schwalbe}}. * * @see http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained * and http://ru.wiktionary.org/wiki/Викисловарь:Правила оформления статей * * http://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны:Языки * http://en.wiktionary.org/wiki/Category:Language_templates * * todo: create LanguageTypeRu with list of all languages in Russian * ... */ public class WLanguageRu { // lang code length // lang-chu-ru not used now! // slovio-la 9 maximum length // slovio-c 8 // zh-tw 5 // ain 3 // en 2 private final static Pattern ptrn_add = Pattern.compile( "\\|add=(.*?)(?:\\Z|\\|)"); //"\\|add=(.*?)"); /** start of the language block, e.g. {{-ru-}}, {{-en-}}, {{-de-}}, etc. */ private final static Pattern ptrn_lang = Pattern.compile( //"\\{\\{-([-_a-zA-Z]{2,9})-\\}\\}|\\Q{{заголовок|\\E([-_a-zA-Z]{2,9})(?:\\}\\}|\\|add=\\}\\})"); //"\\{\\{-([-_a-zA-Z]{2,9})-(?:\\}\\}|\\|.*?\\}\\})|\\Q{{заголовок|\\E([-_a-zA-Z]{2,9})(?:\\}\\}|\\|add=\\}\\})"); //"\\{\\{-([-_a-zA-Z]{2,9})-(?:\\}\\}|\\|.*?\\}\\})|\\Q{{заголовок|\\E([-_a-zA-Z]{2,9})(?:\\}\\}|\\|add=\\}\\})"); "(\\{\\{)-([-_a-zA-Z]{2,9})-(?:\\}\\}|\\|.*?\\}\\})|(\\Q{{заголовок|\\E)(.*?)\\}\\}"); // (\{\{)-([-_a-zA-Z]{2,9})-(?:\}\}|\|.*?\}\})|(\Q{{заголовок|\E)(.*?)\}\} // Yes, this is language delimiter: // {{-en-}} // group1={{ group2=en // {{-de-|schwalbe}} // group1={{ group2=de // {{заголовок|ka|add=}} // group1={{заголовок| group2=ka|add= // {{заголовок|ka}} // group1={{заголовок| group2=ka // // {{заголовок|de|add=|aare}} // group1={{заголовок| group2=de|add=|aare // // No, this is not a laguage, but a POS delimiter: // {{заголовок|sq|add=I}} // group1={{заголовок| group2=sq|add=I // // old: // vim: {{-\([-_a-zA-Z]\{2,9\}\)-[|}][^}]*}}\?\|{{заголовок|\([-_a-zA-Z]\{2,9\}\)[|}][^}]*}}\? //part1:{{-\([-_a-zA-Z]\{2,9\}\)-[|}][^}]*}}\? part2: {{заголовок|\([-_a-zA-Z]\{2,9\}\)[|}][^}]*}}\? //java: \\{\\{-([-_a-zA-Z]{2,9})-(?:\\}\\}|\\|.*?\\}\\}) java: \\Q{{заголовок|\\E([-_a-zA-Z]{2,9})[|}] //private final static StringBuffer NULL_STRINGBUFFER = new StringBuffer(""); private final static LangText[] NULL_LANG_TEXT_ARRAY = new LangText[0]; //private final static List<LangText> NULL_LANG_TEXT_LIST = new ArrayList<LangText>(0); /** Gets language type (code) information from a Wiktionary article header * and from the result of search by regular expression stored in a matcher m. */ public static LanguageType getLanguageType(Matcher m,String page_title) { LanguageType lang_type = null; String lang_code = ""; String group1 = m.group(1); String group2 = m.group(2); String group3 = m.group(3); String group4 = m.group(4); boolean zagolovok = false; // {{заголовок| // let's skip "unknown language code" for "{{заголовок|" if(null == group1 && null == group3) return null; if(null != group1 && group1.equalsIgnoreCase("{{")) lang_code = group2; else { if(null != group3 && group3.equalsIgnoreCase("{{заголовок|")) { zagolovok = true; int pipe_index = group4.indexOf('|'); if(-1 == pipe_index) { lang_code = group4; // {{заголовок|ka}} } else { String text_till_first_pipe = group4.substring(0, pipe_index); if(-1 == group4.indexOf("add=")) // {{заголовок|de|aare}} exists? lang_code = text_till_first_pipe; else { Matcher m_add = ptrn_add.matcher(group4.toString()); if(m_add.find() && m_add.group(1).length() == 0) lang_code = text_till_first_pipe; // {{заголовок|de|add=|aare}} } // {{заголовок|ka|add=}} } } } if(lang_code.length() == 0) { if(!zagolovok) System.out.println("Warning in WLanguageRu.getLanguageType(): empty language code for the word '" + page_title + "' in WLanguageRu.getLanguageType()"); return null; } //String lang_code = m.group(1); /*if((lang_code.length() < 2) && 2 == m.groupCount()) { lang_code = m.group(2); }*/ if (!LanguageType.has(lang_code)) { // i.e. skip the whole article if the first lang code is unknown if(!zagolovok) System.out.println("Warning in WLanguageRu.getLanguageType(): unknown language code '" + lang_code + "' for the word '" + page_title + "' in WLanguageRu.getLanguageType()"); } else lang_type = LanguageType.get(lang_code); return lang_type; } /** Splits an article text into language sections. * * @param page_title word which are described in this article 'text'*/ public static LangText[] splitToLanguageSections ( String page_title, StringBuffer text) { if(null == text || 0 == text.length()) { return NULL_LANG_TEXT_ARRAY; } List<LangText> lang_sections = new ArrayList<LangText>(); // result will be stored to Matcher m = ptrn_lang.matcher(text.toString()); int i = 0; boolean b_next = m.find(); boolean b_at_least_one_lang = b_next; // at least one language section was recognized boolean b_known_lang = true; if(b_next) { LanguageType lang_type = getLanguageType(m, page_title); b_known_lang = null != lang_type; if(b_known_lang) { LangText lt = new LangText(lang_type); m.appendReplacement(lt.text, ""); // "First {{-ru-}}" (add the text before the first lang code) lang_sections.add(lt); b_next = m.find(); while(b_next) { if(b_known_lang) { i++; // text belongs to previous lang code: m.appendReplacement(lang_sections.get(i-1).text, ""); // i.e. {{-prev lang code-}} current text {{-current lang code } else { m.appendReplacement(new StringBuffer(), ""); // {{-unknown-}} just reset the text within the unknown lang {{-known-}} } lang_type = getLanguageType(m, page_title); b_known_lang = null != lang_type; b_next = m.find(); if (b_known_lang) { lt = new LangText(lang_type); //m.appendReplacement(lang_sections.get(i-1).text, ""); // text belongs to previous lang code: lang_sections.add(lt); // i.e. {{-prev lang code-}} current text {{-current lang code-}} if(!b_next) { m.appendTail(lang_sections.get(i).text); } } } } } if(0==i && b_known_lang && b_at_least_one_lang) { m.appendTail(lang_sections.get(i).text); } if(b_known_lang && 0 == lang_sections.size()) { // System.out.println("Warning: Ok. I guess that this is an article about Russian word, without language code. Word = '" + page_title + "'; in WLanguageRu.splitToLanguageSections()"); LangText lt = new LangText(LanguageType.ru); lt.text = text; lang_sections.add(lt); } return (LangText[])lang_sections.toArray(NULL_LANG_TEXT_ARRAY); } }