/* WRelation.java - corresponds to a semantic relations level of a word in * English Wiktionary. * * Copyright (c) 2010 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under GNU General Public License. */ package wikokit.base.wikt.multi.en; import wikokit.base.wikt.constant.Relation; import wikokit.base.wikt.word.WRelation; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikt.util.POSText; import wikokit.base.wikt.util.WikiText; import wikokit.base.wikipedia.util.StringUtilRegular; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.util.ListIterator; import java.util.Map; import java.util.HashMap; import java.util.List; import java.util.ArrayList; import wikokit.base.wikt.constant.Label; import wikokit.base.wikt.util.LabelsWikiText; /** Semantic relations of English Wiktionary word. * * @see http://en.wiktionary.org/wiki/Wiktionary:Semantic_relations * @see http://en.wiktionary.org/wiki/Template_talk:sense */ public class WRelationEn { private final static WRelation[] NULL_WRELATION_ARRAY = new WRelation[0]; private final static Map<Relation, WRelation[]> NULL_MAP_RELATION_WRELATION_ARRAY = new HashMap<Relation, WRelation[]>(); /** Gets position after ====Synonyms==== */ private final static Pattern ptrn_synonymy = Pattern.compile( "(?m)^={3,5}\\s*Synonyms\\s*={3,5}\\s*$"); /** Gets position after ====Antonyms==== */ private final static Pattern ptrn_antonymy = Pattern.compile( "(?m)^={3,5}\\s*Antonyms\\s*={3,5}\\s*$"); /** Gets position after ====Hypernyms==== */ private final static Pattern ptrn_hypernymy = Pattern.compile( "(?m)^={3,5}\\s*Hypernyms\\s*={3,5}\\s*$"); /** Gets position after ====Hyponyms==== */ private final static Pattern ptrn_hyponymy = Pattern.compile( "(?m)^={3,5}\\s*Hyponyms\\s*={3,5}\\s*$"); /** Gets position after ====Holonyms==== */ private final static Pattern ptrn_holonymy = Pattern.compile( "(?m)^={3,5}\\s*Holonyms\\s*={3,5}\\s*$"); /** Gets position after ====Meronyms==== */ private final static Pattern ptrn_meronymy = Pattern.compile( "(?m)^={3,5}\\s*Meronyms\\s*={3,5}\\s*$"); /** Gets position after ====Troponyms==== */ private final static Pattern ptrn_troponymy = Pattern.compile( "(?m)^={3,5}\\s*Troponyms\\s*={3,5}\\s*$"); /** Gets position after ====Coordinate terms==== */ private final static Pattern ptrn_coordinate_term = Pattern.compile( "(?m)^={3,5}\\s*Coordinate\\s+terms\\s*={3,5}\\s*$"); /** Gets position after ====See also==== */ private final static Pattern ptrn_see_also = Pattern.compile( "(?m)^={3,5}\\s*See\\s+also\\s*={3,5}\\s*$"); /** Gets position after ====Derived terms==== */ private final static Pattern ptrn_derived_terms = Pattern.compile( "(?m)^={3,5}\\s*Derived\\s+terms\\s*={3,5}\\s*$"); /** Gets position after ====Related terms==== */ private final static Pattern ptrn_related_terms = Pattern.compile( "(?m)^={3,5}\\s*Related\\s+terms\\s*={3,5}\\s*$"); /** Gets position after ====Translations==== */ private final static Pattern ptrn_translations = Pattern.compile( "(?m)^={3,5}\\s*Translations\\s*={3,5}\\s*$"); /** The begin of any list of semantic relations: "* " */ private final static Pattern ptrn_line_start = Pattern.compile( "^\\*\\s*"); /** The summary of definition before the list of synonyms, e.g. "furp" from * "(''furp''): [[furp]], [[whoodleplunk]]" */ private final static Pattern ptrn_summary_in_italics = Pattern.compile( "^\\(''(.+?)''\\):\\s*"); /** The summary of definition before the list of synonyms in template:sense, * e.g. "forked, branched" from: * "{{sense|forked, branched}} [[cloven]], [[forked]]" */ private final static Pattern ptrn_summary_in_sense = Pattern.compile( "^\\Q{{sense|\\E(.+?)\\}\\}\\s*"); /** Chops the dot symbol (".") at the end of line (EOL). */ private final static Pattern ptrn_eol_dot = Pattern.compile( "(?m)\\s*\\.\\s*$"); /** Parses text (related to the POS), creates and fill array of * semantic relations (WRelation). * @param wikt_lang language of Wiktionary * @param page_title word which are described in this article 'text' * @param lang_section language of this section of an article * @param pt POSText defines POS stored in pt.text * @param relation_type type of parsing relation, e.g. synonymy * @return */ public static Map<Relation, WRelation[]> parse ( LanguageType wikt_lang, String page_title, POSText pt) { // ====Synonyms==== // ==== Level IV (or V?). Relation ==== if(null == pt.getText()) { return NULL_MAP_RELATION_WRELATION_ARRAY; } StringBuffer text_source_sb = pt.getText(); if(0 == text_source_sb.length()) { return NULL_MAP_RELATION_WRELATION_ARRAY; } // gets text till ====Translations==== String text = null; Matcher m = ptrn_translations.matcher(text_source_sb.toString()); if(!m.find()) // the section Translations is absent! text = text_source_sb.toString(); else text = text_source_sb.substring(0, m.start()); Map<Relation, WRelation[]> m_rel = new HashMap<Relation, WRelation[]>(); WRelation[] r; // synonymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_synonymy, Relation.synonymy); if(0 < r.length) m_rel.put(Relation.synonymy, r); // antonymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_antonymy, Relation.antonymy); if(0 < r.length) m_rel.put(Relation.antonymy, r); // hypernymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_hypernymy, Relation.hypernymy); if(0 < r.length) m_rel.put(Relation.hypernymy, r); // hyponymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_hyponymy, Relation.hyponymy); if(0 < r.length) m_rel.put(Relation.hyponymy, r); // holonymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_holonymy, Relation.holonymy); if(0 < r.length) m_rel.put(Relation.holonymy, r); // meronymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_meronymy, Relation.meronymy); if(0 < r.length) m_rel.put(Relation.meronymy, r); // troponymy r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_troponymy, Relation.meronymy); if(0 < r.length) m_rel.put(Relation.troponymy, r); // coordinate_term r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_coordinate_term, Relation.meronymy); if(0 < r.length) m_rel.put(Relation.coordinate_term, r); // gets text till min(====Derived terms====, ====Related terms====), // since "See also" can be used not only for semanticrelations, // but also for etymologically related words text = WRelationEn.substrTillFirstMatch(text, ptrn_derived_terms, ptrn_related_terms); /*boolean b_found = false; int pos1, pos2; pos1 = pos2 = text.length(); m = ptrn_derived_terms.matcher(text); if(m.find()) { pos1 = m.start(); b_found = true; } m = ptrn_related_terms.matcher(text); if(m.find()) { pos2 = m.start(); b_found = true; } if(b_found) text = text.substring(0, Math.min(pos1, pos2)); */ // otherwise_related (see also) r = parseOneKindOfRelation (wikt_lang, page_title, text, ptrn_see_also, Relation.meronymy); if(0 < r.length) m_rel.put(Relation.otherwise_related, r); return m_rel; } /** Gets text till the first match defined by patterns ptrn1 and ptr2. * If match fails then return the source text string. */ public static String substrTillFirstMatch ( String text, Pattern ptrn1, Pattern ptrn2) { boolean b_found = false; int pos1, pos2; pos1 = pos2 = text.length(); Matcher m = ptrn1.matcher(text); if(m.find()) { pos1 = m.start(); b_found = true; } m = ptrn2.matcher(text); if(m.find()) { pos2 = m.start(); b_found = true; } if(b_found) return text.substring(0, Math.min(pos1, pos2)); return text; } /** Parses text (related to the POS), creates and fill array of * semantic relations only for one kind of semantic relations (e.g. synonyms) * defined by the variable 'relation'. * @param wikt_lang language of Wiktionary * @param page_title word which are described in this article 'text' * @param lang_section language of this section of an article * @param text text of wiki article related to one POS * @param relation_header_pattern regular expression to find the header of semantic relation * @param relation type of parsing relation, e.g. synonymy * @return an empty array if relations are absent */ public static WRelation[] parseOneKindOfRelation ( LanguageType wikt_lang, String page_title, String text, Pattern relation_header_pattern, Relation relation) { // e.g.: // ====Synonyms==== // ==== Level IV. Relation ==== // 1. gets position in text after e.g. ====Synonyms==== Matcher m = relation_header_pattern.matcher(text); boolean b_next = m.find(); if(!b_next) { // the section is absent! //System.out.println("Warning in WRelationRu.parse(): The article '"+ // page_title + "' has no section ====Synonyms====."); return NULL_WRELATION_ARRAY; } // 1. get text till (1) next header or (2) empty line String relation_text = StringUtilRegular.getTextTillFirstHeaderOrEmptyLine(m.end()+1, text); if(0 == relation_text.length()) { // skip \n => +1 return NULL_WRELATION_ARRAY; } List<WRelation> wr_list = new ArrayList<WRelation>(); // 2. split into lines: "\n" (not "\n*") // parse lines till the line which is not started from * (?to recheck?) String[] lines = relation_text.split("\n"); boolean b_relations = false; for(String s : lines) { if(s.length() == 0) continue; Matcher m_start = ptrn_line_start.matcher(s); if(m_start.find()) { WRelation wr = null; s = m_start.replaceFirst(""); // remove "# " if(s.length() > 0) wr = parseOneLine (page_title, s); // 3. split list of synonyms into wikiwords (or wiki phrases?) //if(null != wr) wr_list.add(wr); // null means that relation = "# -", i.e. absent for this meaning if(null != wr) b_relations = true; } else { if(!b_relations) // it's one of first lines, before any relations continue; else break; // this line starts not from "#". Stop. } } if(!b_relations) { // only empty lists of relations return NULL_WRELATION_ARRAY; } wr_list = chompNullElementsEndOfList(wr_list); return (WRelation[])wr_list.toArray(NULL_WRELATION_ARRAY); } /** Chomps NULL elements at the end of list (wr_list), if there is any. */ private static List<WRelation> chompNullElementsEndOfList(List<WRelation> wr_list) { ListIterator i = wr_list.listIterator(wr_list.size()); if(i.hasPrevious()) { do { Object wr = i.previous(); if(null == wr) i.remove(); else break; } while(i.hasPrevious()); } return wr_list; } /** Structure for storing summary of meaning with list of (syn)onyms. */ private static class SummaryAndText { String meaning_summary; String onym_list; // e.g. synonym list } /** Parses one line of a semantic relations, * extracts a meaning summary and list of (syn)onyms (wikified words), * creates and fills SummaryAndText. * * @param text semantic relation text line (e.g. list of synonyms) * @return structure or null if the meaning summary is absent. */ private static SummaryAndText splitToSummaryAndOnymList (String text) { // extract meaning_summary, i.e. "flrink with cumplus" from // variant 1 (without "* ") // * (''flrink with cumplus''): [[flrink]], [[pigglehick]] // variant 2 // * {{sense|An oath or affirmation}} [[promise]], [[vow]], {{qualifier|informal}} [[word]] // * {{sense|forked, branched}} [[cloven]], [[forked]] String meaning_summary = "", onym_list = ""; // 1 Matcher m = ptrn_summary_in_sense.matcher(text); if(m.find()) { meaning_summary = m.group(1); onym_list = text.substring(m.end()); } else { // 2 m = ptrn_summary_in_italics.matcher(text); if(m.find()) { meaning_summary = m.group(1); onym_list = text.substring(m.end()); } } if(onym_list.length() == 0) return null; SummaryAndText st = new WRelationEn.SummaryAndText(); st.meaning_summary = meaning_summary; st.onym_list = onym_list; return st; } /** Replace template:l by usual [[wiki link]]. * * @param onym_list list of synonyms as a text string * @return the same text but without template:l. * * @see http://en.wiktionary.org/wiki/Template:l */ private static String replaceTemplateL( String onym_list) { if(onym_list.length() == 0) return onym_list; int start, end, prev_end, pipe2, pipe3; StringBuilder s = new StringBuilder(); start = onym_list.indexOf("{{l|"); end = 0; while( -1 != start) { // "|something" - optional // {{l|de|synonym|something}} -> [[synonym]] // | | // start end // pipe2 pipe3 prev_end + 2 prev_end = end; end = onym_list.indexOf("}}", start); pipe2 = onym_list.indexOf("|", start + 4); pipe3 = onym_list.indexOf("|", pipe2 + 1); if(-1 == pipe3 || pipe3 > end) // {{l|de|synonym}} - simple case pipe3 = end; if( prev_end + 2 < start ) // it is false in the first time, i.e. if there is only one synonym, not a list s.append( onym_list.substring(prev_end + 2, start) ); s.append("[["); if(pipe2 + 1 < 0 || pipe3 > onym_list.length() || pipe2 + 1 > pipe3) { System.out.println("\n\nError in WRelationEn.replaceTemplateL(): onym_list=" + onym_list); return onym_list; } else { s.append( onym_list.substring(pipe2 + 1, pipe3) ); } s.append("]]"); start = onym_list.indexOf("{{l|", end); } return s.toString(); } /** Parses one line of a semantic relations, * extracts a list of words (wikified words), creates and fills WRelation. * * @param page_title word which are described in this article 'text' * @param text semantic relation text line (e.g. list of synonyms) * @return WRelation or null if the list of semantic relations is empty or equal "-". * * The link to Wikisaurus is ommited now... to parse in future. */ private static WRelation parseOneLine( String page_title, String text) { // 1. check emptyness if(0 == text.length()) return null; if(text.contains("[[Wikisaurus:")) return null; // 2. extract meaning_summary, i.e. "flrink with cumplus" from // * (''flrink with cumplus''): [[flrink]], [[pigglehick]] SummaryAndText st = splitToSummaryAndOnymList(text); String meaning_summary = null; String onym_list = ""; if(null != st) { meaning_summary = st.meaning_summary; onym_list = st.onym_list; } else { onym_list = text; } // 3. chops the dot symbol (".") at the end of line (onym_list) Matcher m = ptrn_eol_dot.matcher(onym_list); if(m.find()) onym_list = m.replaceFirst(""); // {{l|de|synonym|something}} -> [[synonym]] if(onym_list.contains("{{l|")) onym_list = replaceTemplateL(onym_list); // 4. split by semicolon and comma WikiText[] wt = WikiText.createSplitByComma(page_title, onym_list); if(0 == wt.length) return null; // temp solution (without labels, i.e. label's array is empty) --------------------- LabelsWikiText[] lwt_array = new LabelsWikiText[wt.length]; int i=0; for(WikiText _wiki_text : wt) { List<Label> _labels = null; lwt_array[i] = new LabelsWikiText(_labels, _wiki_text); i++; } // -------------------------------------------------------- eo temp solution return new WRelation(meaning_summary, lwt_array); } }