/* WLanguageRu.java - corresponds to a language level of Russian Wiktionary word. * * Copyright (c) 2008-2010 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under GNU General Public License. */ package wikokit.base.wikt.multi.en; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikt.util.LangText; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.util.List; import java.util.ArrayList; /** Language lets you know the language of the word in question. It is almost * always in a level two heading. ==English==, or {{-de-|schwalbe}}. * * @see http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained * @see http://en.wiktionary.org/wiki/Wiktionary:Language_considerations * @see http://en.wiktionary.org/wiki/Wiktionary:Language names * * @see http://en.wiktionary.org/wiki/Category:Language_templates */ public class WLanguageEn { /** ==English== or ==Russian==, etc. */ private final static Pattern ptrn_2nd_level = Pattern.compile( // Vim: ^==\s*\([^=]\+\)\s*==\s*\Z //"(?m)^\\s*=="); "(?m)^==\\s*([^=]+?)\\s*==\\s*"); // "\\A==\\s*([^=]+)\\s*==\\s*\\Z"); private final static LangText[] NULL_LANG_TEXT_ARRAY = new LangText[0]; /** Gets language type (code) information from a Wiktionary article header * and from the result of search by regular expression stored in a matcher m. */ public static LanguageType getLanguageType(Matcher m,String page_title) { LanguageType lang_type = null; String english_lang_name = m.group(1); if(null == english_lang_name) return null; int len = english_lang_name.length(); if( len > 4 && english_lang_name.charAt(0) == '[') // e.g. ==[[Ewe]]== lang_type = LanguageType.getByEnglishName( english_lang_name.substring(2, len-2)); else lang_type = LanguageType.getByEnglishName( english_lang_name ); // i.e. skip the whole article if the first lang code is unknown if (null == lang_type) { if(!LanguageType.hasUnknownLangName(english_lang_name)) { LanguageType.addUnknownLangName(english_lang_name); System.out.println("Warning in WLanguageEn.getLanguageType(): The article '"+ page_title + "' has section with unknown language: " + english_lang_name + "."); } } return lang_type; } /** Splits an article text into language sections. * * @param page_title word which are described in this article 'text'*/ public static LangText[] splitToLanguageSections ( String page_title, StringBuffer text) { if(null == text || 0 == text.length()) { return NULL_LANG_TEXT_ARRAY; } List<LangText> lang_sections = new ArrayList<LangText>(); // result will be stored to Matcher m = ptrn_2nd_level.matcher(text.toString()); int i = 0; boolean b_next = m.find(); boolean b_at_least_one_lang = b_next; // at least one language section was recognized boolean b_known_lang = true; if(b_next) { LanguageType lang_type = getLanguageType(m, page_title); b_known_lang = null != lang_type; if(b_known_lang) { LangText lt = new LangText(lang_type); m.appendReplacement(lt.text, ""); // "First {{-ru-}}" (add the text before the first lang code) lang_sections.add(lt); b_next = m.find(); while(b_next) { if(b_known_lang) { i++; // text belongs to previous lang code: m.appendReplacement(lang_sections.get(i-1).text, ""); // i.e. {{-prev lang code-}} current text {{-current lang code } else { m.appendReplacement(new StringBuffer(), ""); // {{-unknown-}} just reset the text within the unknown lang {{-known-}} } lang_type = getLanguageType(m, page_title); b_known_lang = null != lang_type; b_next = m.find(); if (b_known_lang) { lt = new LangText(lang_type); //m.appendReplacement(lang_sections.get(i-1).text, ""); // text belongs to previous lang code: lang_sections.add(lt); // i.e. {{-prev lang code-}} current text {{-current lang code-}} if(!b_next) { m.appendTail(lang_sections.get(i).text); } } } } } if(0==i && b_known_lang && b_at_least_one_lang) { m.appendTail(lang_sections.get(i).text); } if(b_known_lang && 0 == lang_sections.size()) { System.out.println("Warning: Ok. I guess that this is an article about English word, without language code. Word = '" + page_title + "'; in WLanguageRu.splitToLanguageSections()"); LangText lt = new LangText(LanguageType.ru); lt.text = text; lang_sections.add(lt); } return (LangText[])lang_sections.toArray(NULL_LANG_TEXT_ARRAY); } }