/* WRelationRu.java - corresponds to a semantic relations level of a word in
 * Russian Wiktionary.
 *
 * Copyright (c) 2009 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
 * Distributed under GNU General Public License.
 */

package wikokit.base.wikt.multi.ru;

import wikokit.base.wikt.constant.Relation;
import wikokit.base.wikt.word.WRelation;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikt.util.POSText;
import wikokit.base.wikt.util.WikiText;
import wikokit.base.wikipedia.util.StringUtilRegular;

import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import wikokit.base.wikt.multi.ru.name.LabelRu;
import wikokit.base.wikt.util.LabelsWikiText;

/** Semantic relations of Russian Wiktionary word.
 *
 * @see http://ru.wiktionary.org/wiki/Викисловарь:Правила оформления статей#Оформление семантических отношений
 */
public class WRelationRu {

    /** Shared empty result; also used as the type token for List.toArray(). */
    private final static WRelation[] NULL_WRELATION_ARRAY = new WRelation[0];

    /* Header patterns: each one matches a level 2-4 wiki header with the
     * Russian name of a relation (e.g. "==== Синонимы ====") up to and
     * including the end of the header line, so that Matcher.end() is
     * the position of the first character after the header. */
    private final static Pattern ptrn_synonymy = Pattern.compile("===?=?\\s*Синонимы\\s*===?=?\\s*\\n");
    private final static Pattern ptrn_antonymy = Pattern.compile("===?=?\\s*Антонимы\\s*===?=?\\s*\\n");
    private final static Pattern ptrn_hypernymy = Pattern.compile("===?=?\\s*Гиперонимы\\s*===?=?\\s*\\n");
    private final static Pattern ptrn_hyponymy = Pattern.compile("===?=?\\s*Гипонимы\\s*===?=?\\s*\\n");
    private final static Pattern ptrn_coordinate_term = Pattern.compile("===?=?\\s*Согипонимы\\s*===?=?\\s*\\n");
    private final static Pattern ptrn_holonymy = Pattern.compile("===?=?\\s*Холонимы\\s*===?=?\\s*\\n");
    private final static Pattern ptrn_meronymy = Pattern.compile("===?=?\\s*Меронимы\\s*===?=?\\s*\\n");

    /** Labels before the words: "label: word1, word2".
     * The label part may contain neither ':' nor '('.
     * Uses named capturing groups "label" and "word". */
    private final static Pattern ptrn_labels = Pattern.compile("^(?<label>[^:(]+):(?<word>.+)");

    /** Labels after a wikified word: "[[word]] (label1, label2)".
     * The bracketed label part is optional, so group "label" may be null. */
    private final static Pattern ptrn_labels_brackets = Pattern.compile("(?<word>\\[\\[[^\\]]+\\]\\])([ ]?(\\((?<label>[^\\)]+)\\))?)");

    /** Split by comma */
    private final static Pattern ptrn_comma = Pattern.compile("[,]+");

    /** The begin of any list of semantic relations: "# " */
    private final static Pattern ptrn_line_start = Pattern.compile(
            "^#\\s*");

    /** The empty line can contain a dash and spaces. */
    private final static Pattern ptrn_dashes = Pattern.compile(
            "^[-‐‒–—―]?\\s*$");

    /** Parses text (related to the POS), creates and fills a map of
     * semantic relations (WRelation), one entry per kind of relation
     * (synonymy, antonymy, hypernymy, hyponymy, coordinate terms,
     * holonymy, meronymy) which has at least one non-empty line.
     *
     * @param wikt_lang     language of Wiktionary
     * @param page_title    word which is described in this article
     * @param pt            POSText, the wiki text of one POS is read from pt.getText()
     * @return map from the kind of relation to the array of relations;
     *         a new empty map if the text is null or empty or
     *         if there are no relations. Each call returns its own map.
     */
    public static Map<Relation, WRelation[]> parse (
                    LanguageType wikt_lang,
                    String page_title,
                    POSText pt)
    {
        // ===Semantic properties===      (level III header)
        // ====Синонимы====               (level IV header, one per relation)

        // A fresh map is created for every call (also for the empty result),
        // so a caller which modifies the result cannot affect other callers.
        Map<Relation, WRelation[]> m_rel = new HashMap<Relation, WRelation[]>();

        StringBuffer text_source_sb = pt.getText();
        if(null == text_source_sb || 0 == text_source_sb.length()) {
            return m_rel;
        }
        String text = text_source_sb.toString();

        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_synonymy,        Relation.synonymy);
        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_antonymy,        Relation.antonymy);
        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_hypernymy,       Relation.hypernymy);
        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_hyponymy,        Relation.hyponymy);
        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_coordinate_term, Relation.coordinate_term);
        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_holonymy,        Relation.holonymy);
        putIfPresent(m_rel, wikt_lang, page_title, text, ptrn_meronymy,        Relation.meronymy);

        return m_rel;
    }

    /** Parses one kind of relation and stores the result into the map
     * only if the resulting array is not empty. */
    private static void putIfPresent (
                    Map<Relation, WRelation[]> m_rel,
                    LanguageType wikt_lang,
                    String page_title,
                    String text,
                    Pattern relation_header_pattern,
                    Relation relation)
    {
        WRelation[] r = parseOneKindOfRelation (wikt_lang, page_title, text,
                                                relation_header_pattern, relation);
        if(0 < r.length) {
            m_rel.put(relation, r);
        }
    }

    /** Parses text (related to the POS), creates and fills array of
     * semantic relations only for one kind of semantic relations (e.g. synonyms)
     * defined by the header pattern.
     *
     * The result has one element per line started from "#" (i.e. per meaning);
     * an element is null if the relation is absent for this meaning (e.g. "# -").
     *
     * @param wikt_lang     language of Wiktionary (it is not used by this method)
     * @param page_title    word which is described in this article 'text'
     * @param text          text of wiki article related to one POS
     * @param relation_header_pattern regular expression to find the header of semantic relation
     * @param relation      type of parsing relation, e.g. synonymy (it is not used by this method)
     * @return an empty array if relations are absent
     */
    public static WRelation[] parseOneKindOfRelation (
                    LanguageType wikt_lang,
                    String page_title,
                    String text,
                    Pattern relation_header_pattern,
                    Relation relation)
    {
        // 1. gets position in text after the header, e.g. after ====Синонимы====
        Matcher m = relation_header_pattern.matcher(text);
        if(!m.find()) {   // the section is absent
            return NULL_WRELATION_ARRAY;
        }

        // 2. get text till (1) next header or (2) empty line
        String relation_text = StringUtilRegular.getTextTillFirstHeaderOrEmptyLine(m.end(), text);
        if(0 == relation_text.length()) {
            return NULL_WRELATION_ARRAY;
        }

        List<WRelation> wr_list = new ArrayList<WRelation>();

        // 3. split into lines: "\n" (not "\n#")
        //    parse lines till the line which is not started from #
        String[] lines = relation_text.split("\n");

        boolean b_relations = false;    // true if at least one line has relations
        for(String s : lines) {
            Matcher m_start = ptrn_line_start.matcher(s);
            if(!m_start.find()) {
                break;  // this line starts not from "#". Stop.
            }

            WRelation wr = null;
            s = m_start.replaceFirst("");    // remove "# "
            if(s.length() > 0) {
                // 4. split list of synonyms into wikiwords (or wiki phrases?)
                wr = parseOneLine (page_title, s);
            }

            // null is added deliberately: it means that relation = "# -",
            // i.e. absent for this meaning; the list index corresponds to
            // the number of the "#" line.
            wr_list.add(wr);
            if(null != wr) {
                b_relations = true;
            }
        }

        if(!b_relations) { // only empty lists of relations
            return NULL_WRELATION_ARRAY;
        }

        return wr_list.toArray(NULL_WRELATION_ARRAY);
    }

    /** Parses one line of a semantic relations,
     * extracts a list of words (wikified words), creates and fills WRelation.
     *
     * @param page_title    word which is described in this article 'text'
     * @param text          semantic relation text line (e.g. list of synonyms)
     *                      without the leading "# "
     * @return WRelation or null if the list of semantic relations is empty or equal "-".
     */
    public static WRelation parseOneLine(
                    String page_title,
                    String text)
    {
        // 1. check emptiness: empty string, spaces only, a dash with
        //    optional spaces (all covered by ptrn_dashes) or template {{-}}
        if(0 == text.length()) {
            return null;
        }
        if(ptrn_dashes.matcher(text).find()) {
            return null;
        }
        if(text.equals("{{-}}")) {
            return null;
        }

        // 2. split by semicolon
        WikiText[] wt = WikiText.createSplitBySemicolon(page_title, text);
        if(0 == wt.length) {
            return null;
        }

        // 3. get text and labels
        List<LabelsWikiText> lwt_array = new ArrayList<LabelsWikiText>();
        for(WikiText _wiki_text : wt) {
            String _text = _wiki_text.getWikifiedText();
            if(null == _text || _text.length() == 0) {
                continue;   // nothing to parse in this part of the line
            }

            // choose the variant of regexp which fits this part of the line
            Matcher main_matcher;
            if(ptrn_labels.matcher(_text).find()) {
                // labels before words: "label: word1, word2;"
                // (a new matcher, in order to start the search from the beginning)
                main_matcher = ptrn_labels.matcher(_text);
            } else {
                // labels after word: "[[word]] (label1, label2);"
                main_matcher = ptrn_labels_brackets.matcher(_text);
            }

            // use chosen regexp
            while (main_matcher.find()) {
                String _words  = main_matcher.group("word");
                String _labels = main_matcher.group("label"); // may be null (brackets variant without labels)

                WikiText[] wt1 = WikiText.createSplitByComma(page_title, _words);
                for(WikiText _wiki_word : wt1) {
                    lwt_array.add(new LabelsWikiText(
                            LabelRu.createSplitByPattern(_labels, ptrn_comma), _wiki_word));
                }
            }
        }

        // NOTE(review): the result is not null even if lwt_array is empty
        // (no pattern has matched); callers treat it as a present relation.
        return new WRelation(null, lwt_array.toArray(new LabelsWikiText[lwt_array.size()]));
    }
}