/* WPOSEn.java - corresponds to a POS level of English Wiktionary word.
 *
 * Copyright (c) 2010 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
 * Distributed under GNU General Public License.
 */

package wikokit.base.wikt.multi.en;

import wikokit.base.wikt.util.POSText;
import wikokit.base.wikt.util.LangText;
import wikokit.base.wikt.constant.POS;

import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Locale;

/** Splits text to fragments related to different parts of speech (POS).
 * POS is a level 3 or 4 header in English Wiktionary:
 * <PRE>
 * 1)
 * ==English==
 * ===Etymology===
 * ===Noun===
 * ===Verb===
 *
 * ==Finnish==
 * ===Etymology===
 * ===Noun===           (level 3 in English Wiktionary: ===Noun===)
 *
 * 2)
 * In the case of multiple etymologies, all subordinate headers need to have
 * their levels increased by 1:
 * ===Etymology 1===
 * ====Pronunciation====
 * ====Noun====         POS=noun
 * ===Etymology 2===
 * ====Pronunciation====
 * ====Noun====         POS=noun
 * ====Verb====         POS=verb
 *                      (level 4 in English Wiktionary: ====Verb====)</PRE>
 *
 * @see http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained
 * @see http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained/POS_headers
 */
public class WPOSEn {

    private final static POSText[] NULL_POS_TEXT_ARRAY = new POSText[0];

    /** Immutable empty result, so that a caller cannot corrupt a shared instance. */
    private final static List<POSText> NULL_POS_TEXT_LIST = Collections.<POSText>emptyList();

    /** Headers which are known, but which are not names of parts of speech. */
    private final static String[] NOT_POS_HEADERS = {
        "Derived terms",
        "Related terms",
        "Translations",
        "References",
        "External links"
    };

    /** start of the POS block: [-\w {}],
     * -  (hyphen) is allowed in the header name too
     * \s (space) since there is "===Proper noun==="
     * {} (brackets) since there is "==={{abbreviation}}==="
     *
     * The captured name (group 1) may end with spaces, e.g. "=== Noun ===",
     * because the space belongs to the character class; see normalizeHeader().
     */
    private final static Pattern ptrn_3_or_4_level = Pattern.compile(
            // Vim: ^==\s*\([^=]\+\)\s*==\s*\Z
            // RE: ^====?\s*([-\w {}]+)\s*====?\s*
            "(?m)^====?\\s*([-\\w {}]+)\\s*====?\\s*");

    /** Converts the header name captured by ptrn_3_or_4_level to the form
     * used for the lookup in POSTemplateEn: surrounding spaces are removed
     * and the text is lower-cased.
     *
     * Locale.ENGLISH is used instead of the default locale, since e.g.
     * with the Turkish default locale "Interjection" would be lower-cased
     * to a word starting with the dotless "i" and would never be found.
     *
     * NOTE(review): it is assumed that POSTemplateEn.has() expects
     * lower-case names without surrounding spaces - confirm against
     * POSTemplateEn.
     *
     * @param raw_header text between the '=' signs of a header
     * @return trimmed, lower-cased header name
     */
    private static String normalizeHeader (String raw_header)
    {
        return raw_header.trim().toLowerCase(Locale.ENGLISH);
    }

    /** Gets true, if str is known header, e.g. "References",
     * but it's not a part of speech name, e.g. "Verb".
     * The comparison is case-insensitive.
     *
     * @param str header name, null is treated as an unknown header
     * @return true for "Derived terms", "Related terms", "Translations",
     *         "References" and "External links"; false otherwise
     */
    public static boolean isSecondLevelHeaderWordNotPOS (String str)
    {
        for(String known_header : NOT_POS_HEADERS) {
            // constant.equalsIgnoreCase(null) is false, so null is safe here
            if(known_header.equalsIgnoreCase(str))
                return true;
        }
        return false;
    }

    /** Cuts (if it is presented) the header (===POS Name===) and return
     * POSText list with one element.
     * Since it is known that this LangText object 'lt' contains at most
     * one POS section.
     *
     * If there is no known POS header, then the whole text is returned
     * with the type POS.unknown.
     *
     * @param lt .text with only one POS section
     * @param m  matcher of ptrn_3_or_4_level created for lt.text,
     *           it will be reset
     * @return list with exactly one element
     */
    private static List<POSText> cutHeaderFromAlonePOSSection (
                                    LangText lt,
                                    Matcher m)
    {
        List<POSText> pos_section_alone = new ArrayList<POSText>(1);

        m.reset();
        while(m.find()) {
            String pos_header = normalizeHeader(m.group(1));
            if(POSTemplateEn.has(pos_header)) {
                pos_section_alone.add(
                        new POSText(
                            POSTemplateEn.get(pos_header),
                            new StringBuffer(   // text after === POS ===
                                lt.text.toString().substring(m.end()))) );
                return pos_section_alone;
            }
        }

        // save all text for unknown POS
        pos_section_alone.add( new POSText( POS.unknown, lt.text) );
        return pos_section_alone;
    }

    /** Splits the text of one etymology section into POS sections.
     *
     * The text of a POS section starts after its header and ends before
     * the next known POS header (or at the end of the text). Headers which
     * are not POS names (e.g. ====Pronunciation====) do not split the
     * text, they stay inside the enclosing POS section. The text before
     * the first POS header is skipped when there are two or more POS
     * headers.
     *
     * @param page_title word which is described in this article 'text'
     *                   (is not used now)
     * @param lt .text will be parsed and split,
     *           .lang is not used now, may be in future...
     *           lt corresponds to one Etymology section
     * @return empty list for null or empty text, else at least one element
     */
    private static List<POSText> splitToPOSSections (
                    String page_title,
                    LangText lt)
    {
        if(null == lt || null == lt.text || 0 == lt.text.length()) {
            return NULL_POS_TEXT_LIST;
        }

        String source = lt.text.toString();
        Matcher m = ptrn_3_or_4_level.matcher(source);

        int n_pos = countPOSSections(m);
        if(n_pos <= 1)  // there is only one ===Third or fourth level POS header===
            return cutHeaderFromAlonePOSSection(lt, m);
                        // in this language in this etymology for this word

        // else: there are at least two POS sections:
        // 1. Gets POS and
        // 2. Splits lt.text into POS sections
        //
        // position of POS block in the lt.text
        // "<start_old> ===Noun=== <end_old> ... <start_new> ===Verb=== <end_new>"
        // POS block = substring(end_old, start_new)
        List<POSText> pos_sections = new ArrayList<POSText>(n_pos);

        boolean inside_pos = false;     // true after the first POS header
        POS current_pos = POS.unknown;  // POS of the section which is open now
        int body_start = 0;             // end of the header of the open section

        m.reset();
        while(m.find()) {
            String pos_header = normalizeHeader(m.group(1));
            if(!POSTemplateEn.has(pos_header))
                continue;               // not a POS header, it does not split

            if(inside_pos) {            // new POS header closes the previous section
                pos_sections.add(
                    new POSText(current_pos,
                                source.substring(body_start, m.start())));
            }
            current_pos = POSTemplateEn.get(pos_header);
            body_start = m.end();
            inside_pos = true;
        }

        if(inside_pos) {                // last section lasts till the end of text
            pos_sections.add(
                new POSText(current_pos, source.substring(body_start)));
        }
        return pos_sections;
    }

    /** Splits each etymology section into POS sections.
     * Then merge all POS sections into one big array.
     *
     * 1) Splits the following text to "Noun" and "Verb"
     * 2) Extracts part of speech "noun" and "verb"
     * <PRE>
     * ===Noun===
     * {{en-noun}}
     * ===Verb===
     * </PRE>
     *
     * Todo: save info about the link Etymology <-> POS.
     *
     * @param page_title word which is described in this article 'text'
     * @param etymology_sections .text of each element will be parsed and
     *           split, .lang is not used now, may be in future...
     * @return POS sections of all etymologies in the source order,
     *         empty array if there are no sections (or the argument is null)
     */
    public static POSText[] splitToPOSSections (
                    String page_title,
                    LangText[] etymology_sections)
    {
        if(null == etymology_sections || etymology_sections.length == 0)
            return NULL_POS_TEXT_ARRAY;

        List<POSText> pos_sections = new ArrayList<POSText>();
        for(LangText e : etymology_sections) {
            pos_sections.addAll( splitToPOSSections(page_title, e) );
        }

        if(pos_sections.isEmpty())
            return NULL_POS_TEXT_ARRAY;

        return pos_sections.toArray(NULL_POS_TEXT_ARRAY);
    }

    /** Counts number of headers which are names of parts of speech.
     *
     * @param m regular expression matcher ptrn_3_or_4_level, it is reset
     *          before counting and it is exhausted after counting
     * @return number of matched headers accepted by POSTemplateEn.has()
     */
    private static int countPOSSections (Matcher m)
    {
        int n_pos = 0;
        m.reset();      // the result should not depend on the matcher state
        while(m.find()) {
            if(POSTemplateEn.has( normalizeHeader(m.group(1)) ))
                n_pos ++;
        }
        return n_pos;
    }

    /** Gets first encountered POS name, the search is started from
     * the current position of the matcher.
     *
     * @param m regular expression matcher ptrn_3_or_4_level of POS header
     * @return POS of the first known POS header, or POS.unknown
     */
    private static POS getFirstPOS (Matcher m)
    {
        while(m.find()) {
            // the header is normalized in the same way as in other methods
            String pos_header = normalizeHeader(m.group(1));
            if(POSTemplateEn.has(pos_header))
                return POSTemplateEn.get(pos_header);
        }
        return POS.unknown;
    }

    /** Checks whether this name is a name of some part of speech.
     */
    /*public static POS guessPOS (//StringBuffer text)
    {
        POS p_type = POS.unknown;
        if(null == text || 0 == text.length()) {
            return new POSText(p_type, text.toString());
        }
        Matcher m = ptrn_morpho_then_2letters.matcher(text.toString());
        boolean b = m.find();
        if(b) {
            String two_letters = m.group(1);
            if(two_letters.equalsIgnoreCase("{{")) {
                // if \1=="{{" then get first letters till space
                // substring started after the symbol "{{"
                String pos_header = StringUtilRegular.getLettersTillSpace(text.substring(m.end()));
                if(POSTypeRu.has(pos_header)) {
                    p_type = POSTypeRu.get(pos_header);
                } else {
                    // old template of POS with hyphen, e.g. "{{adv-ru|}} instead of {{adv ru|}}
                    pos_header = StringUtilRegular.getLettersTillHyphen(text.substring(m.end()));
                    if(POSTypeRu.has(pos_header)) {
                        p_type = POSTypeRu.get(pos_header);
                    }
                }
            } else {
                // else get two_letters + the first Word
                // todo
                // ....
            }
        }
        return new POSText(p_type, text.toString());
    }*/
}