/* WordBase.java - list of main fields common in Wiktionaries. * * Copyright (c) 2008 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under GNU General Public License. */ package wikokit.base.wikt.word; //import wikt.util.LangText; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikipedia.text.WikiParser; import wikokit.base.wikipedia.text.ReferenceParser; //import wikt.word.ru.WordRu; /** Article in Wiktionary. * * See http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained */ public class WordBase { /** Article title in Wiktionary. */ private String page_title; /** Language level of the word (includes: POS, meaning, translation). */ private WLanguage[] lang; /** Redirected page, i.e. target or destination page. * It is null for usual entries. * * Hard redirect defined by #REDIRECT", * @see TLangPOS.redirect_type and .lemma - a soft redirect. */ private String redirect_target; /** True, if there are templates {{form of|}} or {{plural of|}}, * there are no any other text in the definition. (enwikt) */ private boolean template_not_def; /** True if there are template (e.g. {{form of|}}, {{plural of|}}), * {{es-verb form of|}}) instead of definiton text (in enwikt). */ public boolean hasOnlyTemplatesWithoutDefinitions() { return template_not_def; } /** Parses the article text. * Creates and stores parsed data to the word (WordBase) * for the given Wiktionary (defined by wikt_lang language). */ public WordBase( String _page_title, LanguageType wikt_lang, // constant for the Wiktionary dump StringBuffer text) { page_title = _page_title; // remove <!-- comments --> and <ref> ... </ref> StringBuffer s = WikiParser.removeHTMLComments( ReferenceParser.removeReferences (text)); redirect_target = WRedirect.getRedirect(wikt_lang, page_title, s); if (null == redirect_target) { // it is not a redirect //LangText[] lang_sections = WLanguage.splitToLanguageSections(wikt_lang, page_title, s); lang = WLanguage.parse(wikt_lang, page_title, s); } template_not_def = WLanguage.hasOnlyTemplatesWithoutDefinitions(wikt_lang, lang); } /** Gets an article title in Wiktionary. */ public String getPageTitle() { return page_title; } /** Gets all languages. */ public WLanguage[] getAllLanguages() { return lang; } /** Checks is the entry a REDIRECT. */ public boolean isRedirect() { return null != redirect_target; } /** Gets a redirected page, i.e. target or destination page. * It is null for usual entries. */ public String getRedirect() { return redirect_target; } /** Creates word for the given Wiktionary (defined by language) * by parsing the Wiktionary article text. * Stores parsed data to the WordBase object. * * @param page_title the word itself */ /*public static WordBase create( String page_title, LanguageType wikt_lang, StringBuffer text) { LanguageType l = wikt_lang; WordBase w; if(l == LanguageType.ru) { return new WordRu(page_title); //} else if(l == LanguageType.en) { // return WordEn; //} //else if(code.equalsIgnoreCase( "simple" )) { // return WordSimple; // todo // ... } else { throw new NullPointerException("Null LanguageType"); } }*/ /** Word is empty if there are no recognized data in the parsed wiki text. */ public boolean isEmpty() { if(isRedirect()) return false; // REDIRECT is not an empty article. if (null != lang && lang.length > 0) return false; return true; } }