/* WTranslationEn.java - corresponds to a Translations level of a word in * English Wiktionary. * * Copyright (c) 2010 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under GNU General Public License. */ package wikokit.base.wikt.multi.en; import wikokit.base.wikipedia.language.LanguageType; import wikokit.base.wikt.util.POSText; import wikokit.base.wikipedia.util.StringUtilRegular; import wikokit.base.wikt.word.WTranslation; import wikokit.base.wikt.word.WTranslationEntry; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.util.List; import java.util.ArrayList; /** Translations of English Wiktionary word. * * @see http://en.wiktionary.org/wiki/Wiktionary:Translations */ public class WTranslationEn { private final static WTranslation[] NULL_WTRANSLATION_ARRAY = new WTranslation[0]; private final static WTranslationEntry[] NULL_WTRANSLATIONENTRY_ARRAY = new WTranslationEntry[0]; /** Gets position after ====Translations==== */ private final static Pattern ptrn_translation_level = Pattern.compile( "(?m)^={3,5}\\s*Translations\\s*={3,5}\\s*$"); /** Gets a header of translation box template "{{trans-top|header}}" * or "{{trans-top|header|}}" * or "{{trans-top}}" - header is absent * or "{{trans-top||}}" - header is absent * "* French: {{t|fr|orange|f}}" */ private final static Pattern ptrn_translation_box_header = Pattern.compile( "\\Q{{trans-top\\E\\|?(.*?)\\|?\\}\\}"); // RE: \Q{{trans-top\E\|?(.*?)\|?\}\} /** Parses text (related to the POS), creates and fill array of translations (WTranslation). * @param wikt_lang language of Wiktionary * @param page_title word which are described in this article 'text' * @param lang_section language of this section of an article * @param pt POSText defines POS stored in pt.text * @return */ public static WTranslation[] parse ( LanguageType wikt_lang, LanguageType lang_section, String page_title, POSText pt) { // === Level III. Translation === if(null == pt.getText()) { return NULL_WTRANSLATION_ARRAY; } StringBuffer text_source_sb = pt.getText(); if(0 == text_source_sb.length()) { return NULL_WTRANSLATION_ARRAY; } // 1. gets position in text after ====Translations==== String text_source = text_source_sb.toString(); Matcher m = ptrn_translation_level.matcher(text_source_sb); boolean b_next = m.find(); if(!b_next) { // there is no translation section! //if(lang_section == LanguageType.en) // System.out.println("Warning in WTranslationEn.parse(): The English word '"+ // page_title + "' has no section ====Translation====."); return NULL_WTRANSLATION_ARRAY; } // one more check that there is any translation if(!text_source.contains("{{trans-top|")) { //System.out.println("Warning in WTranslationEn.parse(): The English word '" + page_title + // "' has section ====Translation==== but there is no any translation box \"{{trans-top|\"."); return NULL_WTRANSLATION_ARRAY; } // x = gets position of the next 2nd - 5th level block == See also or Bibliography == // gets text till the last: "{{trans-bottom}}" String text = StringUtilRegular.getTextTillFirstHeaderPosition(m.end(), text_source); int len = text.length(); if(0 == len) return NULL_WTRANSLATION_ARRAY; List<WTranslation> wt_list = new ArrayList<WTranslation>(); int prev_end = 1; // previous end of previous translation box + len("\n")=1 boolean to_continue = true; while(to_continue) { // 3. gets next substring "{{trans-top|" int next_end = text.indexOf("{{trans-top|", prev_end + 1); if(-1 == next_end) { to_continue = false; next_end = len; } String trans_block = text.substring(prev_end, next_end); // 4. extracts lang code "|en=", e.g. wikified translation [[angel]] // return WTranslation or null if the translation text block was not found. WTranslation wt = WTranslation.parseOneTranslationBox(wikt_lang, page_title, trans_block); if(null != wt) wt_list.add(wt); if(to_continue) to_continue = -1 != next_end && next_end < len; prev_end = next_end; } /* if(!atLeastOneTranslationExists(wt_list)) return NULL_WTRANSLATION_ARRAY; if(wt_list.size() > 1 && !allTranslationsHaveHeader(wt_list)) { System.out.println("Warning in WTranslationRu.parse(): The article '"+ page_title + "' has several translation boxes, but not all of them have headers."); } */ return( (WTranslation[])wt_list.toArray(NULL_WTRANSLATION_ARRAY) ); } /** Parses one translation box, i.e. extracts languages and a list of * translations (wikified words) for each language, * creates and fills WTranslation. * * @param wikt_lang language of Wiktionary * @param page_title word which are described in this article 'text' * @param text translaton box text * @return WTranslation or null if the translation text block was not found. */ public static WTranslation parseOneTranslationBox(LanguageType wikt_lang, String page_title, String text) { String meaning_summary = ""; String text_wo_header; // 1. extract header (meaning summary, first line in translation box) Matcher m = ptrn_translation_box_header.matcher(text.toString()); boolean b_found = m.find(); // System.out.println("WTranslationRu.parseOneTranslationBox(): The article '"+page_title + "'."); if(b_found) { // there is a header meaning_summary = m.group(1); if(text.length() <= m.end() + 1) return null; // header without text text_wo_header = text.substring(m.end() + 1); // text without header } else text_wo_header = text; String[] lines = text_wo_header.split("\n"); List<WTranslationEntry> wte_list = null; for(String s : lines) { s = s.trim(); if(s.equalsIgnoreCase("{{trans-mid}}")) continue; if(s.equalsIgnoreCase("{{trans-bottom}}")) break; // for each language (for each line) WTranslationEntry wte = WTranslationEntry.parse(wikt_lang, page_title, s); if(null != wte) { if(null == wte_list) wte_list = new ArrayList<WTranslationEntry>(); wte_list.add(wte); } } if(null == wte_list || wte_list.size() == 0) return null; return new WTranslation( meaning_summary, (WTranslationEntry[])wte_list.toArray(NULL_WTRANSLATIONENTRY_ARRAY)); } }