/* WQuoteRu.java - corresponds to the phrase/sentence that illustrates a meaning * of a word in Russian Wiktionary. * * Copyright (c) 2009-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com> * Distributed under EPL/LGPL/GPL/AL/BSD multi-license. */ package wikokit.base.wikt.multi.ru; import wikokit.base.wikipedia.util.StringUtilRegular; import wikokit.base.wikipedia.util.StringUtil; import wikokit.base.wikt.word.WQuote; import wikokit.base.wikt.multi.ru.quote.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import wikokit.base.wikipedia.util.template.TemplateParser; /** Phrase or sentence that illustrates a meaning of a word in Russian Wiktionary. */ public class WQuoteRu { private static final boolean DEBUG = false; private final static WQuote[] NULL_WQUOTE_ARRAY = new WQuote[0]; private final static String[] NULL_STRING_ARRAY = new String[0]; /** Gets, extracts from 'text' a definition till first example sentence starting from {{пример|. */ public static String getDefinitionBeforeFirstQuote (String page_title, String text) { // Gets position before first example sentence {{пример|Самолёт-истребитель.}} int pos_quote = text.indexOf("{{пример|"); if(-1 == pos_quote) { // there is no quote section! // out of date quote template pos_quote = text.indexOf("{{пример перевод|"); if(-1 == pos_quote) { // there is no quote with translation section! if(DEBUG) System.out.println("Warning in WQuoteRu.getDefinitionBeforeFirstQuote(): The article '"+ page_title + "' has no quote '{{пример|' in a definition."); return text; } } return text.substring(0, pos_quote).trim(); } /** Checks wheather the text has closing brackets without open brackets, * so if text looks like "|The title]]" (open "[[" is absent), * then return true. */ private static boolean isAbsentOpenDoubleSquareBrackets(String text) { int pos = text.indexOf("]]"); if(-1 == pos) return false; return -1 == text.substring(0, pos).indexOf("[["); } /** Intellectual splitting of parameters of the template {{пример|}}. * It splits: {{пример|текст|автор|титул|дата|}}, * but it does not split: * 1) [[:s:The title|The title]] * 2) [[some wikified word|it is fine]] * * As a result the functions extracts quote parameters from the template "{{пример|". |текст=|перевод=|автор=|титул=|издание=|перев=|дата=|источник= */ private static String[] splitParameters(String text) { String[] pipe_chunks = text.split("\\|"); List<String> source_list = new LinkedList(Arrays.asList(pipe_chunks)); List<String> result_list = new ArrayList(); // merge adjacent chunks if chunk.prev.contains("[[") and chunk.next.has("]]") Iterator it_source = source_list.iterator(); while(it_source.hasNext()) { // for (Iterator it = chunk_list.iterator(); it.hasNext();) { //prev_value = next_value; String value = (String)it_source.next(); // if value looks like "|The title]]" (open "[[" is absent) // then it should be merged with previous chunk if(!isAbsentOpenDoubleSquareBrackets(value)) { result_list.add(value); } else { // result.last += value String prev = result_list.remove( result_list.size()-1 ); result_list.add( prev + "|" + value ); } } return (String[])result_list.toArray(NULL_STRING_ARRAY); } /** Removes highlighted marks from a sentence. * 1) Sentence with '''words'''. -> Sentence with <start_replacement>words</end_replacement>. * 2) Sentence with {{выдел|words}}. -> Sentence with <start_replacement>words</end_replacement>. */ public static String removeHighlightedMarksFromSentence(String str, String start_replacement, String end_replacement) { if(str.contains("{{выдел|")) { String s = str.replace("{{выдел|", start_replacement).replace("}}", end_replacement); // because, there are "{{-}}" -> "{{-" in the text return s.replace("{{-", " - "); } else if(str.contains("'''")) { return str.replace("'''", ""); } return str; } /** Additional treatment of the sentence text: * 1)  ,   -> " " * 2) {{-}} -> " - " * 3) poetry: "//" -> "\n" */ public static String transformSentenceText(boolean is_sqlite, String str) { str = StringUtil.replaceSpecialChars( str ); if(str.contains("{{-}}")) str = str.replace("{{-}}", " - "); if(str.contains(" // ")) str = str.replace(" // ", "\n"); if(str.contains("//")) str = str.replace("//", "\n"); if(is_sqlite && str.contains("\\\"")) // \" -> " (SQLite feature) str = str.replace("\\\"", "\""); return str; } /** Additional treatment of the sentence text: * 1)  ,   -> " " * 2) {{-}} -> " — " @see ru.wiktionary.org/wiki/template:- * 3) poetry: "//" -> "<br>" */ public static String transformSentenceTextToHTML(boolean is_sqlite, String str) { str = StringUtil.replaceSpecialChars( str ); if(str.contains("{{-}}")) str = str.replace("{{-}}", " — "); if(str.contains(" // ")) str = str.replace(" // ", "<br>"); if(str.contains("//")) str = str.replace("//", "<br>"); if(is_sqlite && str.contains("\\\"")) // \" -> " (SQLite feature) str = str.replace("\\\"", "\""); return str; } /** Replaces quotation template:" by quotations, * e.g. 'Фрегат {{"|Паллада}}' -> * 'Фрегат "Паллада"'; * * @param text source text with quotation template * @param pos_quote position of the the quotation template in the 'text', quote != -1 * @return text withtout template, but with quotes */ private static String replaceQuoteTemplateByQuotationMarks(String text, int pos_quote) { // int pos = str.indexOf("{{\"|"); // -1 != pos_quote int pos_quote_end = text.indexOf("}}", pos_quote+3); // end of template if(-1 != pos_quote_end) { // yes, replace // str : remove <pos_quote, pos_quote_end+2> StringBuilder sb_without_template = new StringBuilder(text.length() - 4); // 4 = - length("{{\"|" + "}}") + length('""') sb_without_template.append( text.substring(0, pos_quote) ); sb_without_template.append( '"' ); sb_without_template.append( text.substring(pos_quote + 4, pos_quote_end) ); sb_without_template.append( '"' ); sb_without_template.append( text.substring(pos_quote_end + 2) ); return sb_without_template.toString(); } return text; } /** Replaces all quotation template:" by quotations if there is any, * e.g. 'Фрегат {{"|Паллада}}' -> * 'Фрегат "Паллада"'; * * @param text source text with quotation template * @return text without template, but with quotes */ private static String replaceAllQuoteTemplateByQuotationMarks(String text) { int pos_quote = text.indexOf("{{\"|"); while(-1 != pos_quote) { text = replaceQuoteTemplateByQuotationMarks(text, pos_quote); pos_quote = text.indexOf("{{\"|"); } return text; } /** Replaces quotation template:кавычки|ru| by quotations, * e.g. {{кавычки|ru|Jam temp'esta}} -> * "Jam temp'esta" * * @param text source text with quotation template * @param pos_quote position of the the quotation template in the 'text', quote != -1 * @return text withtout template, but with quotes */ private static String replaceKavychkiTemplateByQuotationMarks(String text, int pos_quote) { // int pos = str.indexOf("{{кавычки|"); // -1 != pos_quote int pos_quote_end = text.indexOf("}}", pos_quote+3); // end of template if(-1 != pos_quote_end) { // yes, replace // | pipe between (optional) // pos_quote pos_quote_end // "{{кавычки|ru|Jam temp'esta}},{{-}}отвечала ему...|Л. Юзефович|Казароза|2002" // "{{кавычки|Jam temp'esta}},{{-}}отвечала ему...|Л. Юзефович|Казароза|2002" int pos_pipe = text.indexOf("|", pos_quote+10); // 10 = length("{{кавычки|") if(pos_pipe >= pos_quote_end) pos_pipe = -1; // it's not our pipe, it's after this template // str : remove <pos_quote, pos_quote_end+2> StringBuilder sb_without_template = new StringBuilder(text.length()); sb_without_template.append( text.substring(0, pos_quote) ); sb_without_template.append( '"' ); if(-1 == pos_pipe){ // {{кавычки|Jam 10 = len("{{кавычки|") sb_without_template.append( text.substring(pos_quote + 10, pos_quote_end) ); } else { sb_without_template.append( text.substring(pos_pipe + 1, pos_quote_end) ); } sb_without_template.append( '"' ); sb_without_template.append( text.substring(pos_quote_end + 2) ); return sb_without_template.toString(); } return text; } /** Replaces all quotation кавычки|ru| by quotations if there is any, * e.g. {{кавычки|ru|Jam temp'esta}} -> * "Jam temp'esta" * * @param text source text with quotation template * @return text without template, but with quotes */ private static String replaceAllKavychkiTemplateByQuotationMarks(String text) { int pos_quote = text.indexOf("{{кавычки|"); while(-1 != pos_quote) { text = replaceKavychkiTemplateByQuotationMarks(text, pos_quote); pos_quote = text.indexOf("{{кавычки|"); } return text; } /** Extracts quote parameters from the template "{{пример|" * without start "{{пример|" and end "}}" elements of the template. * * There are two variants: * {{пример|текст|автор|титул|дата|}} - without parameters names * {{пример|текст=|перевод=|автор=|титул=|издание=|перев=|дата=|источник=}} - with names * * @param page_title word which is described in this article * @param sb_line template without start "{{пример|" and "}}" * * @return filled WQuote, null if there are no text in the example sentence */ private static WQuote parseQuoteParameters(String page_title, StringBuilder sb) { String text = ""; String translation = ""; String transcription = ""; String publisher = ""; String source = ""; AuthorAndWikilink author_and_wikilink = new AuthorAndWikilink(); TitleAndWikilink title_and_wikilink = new TitleAndWikilink(); YearsRange years_range = new YearsRange(); String str = sb.toString(); // 0a. before splitting by "|", replace {{выдел| by {{выдел! if(str.contains("{{выдел|")) str = str.replace("{{выдел|", "{{выдел!"); // 0b. before splitting by "|" // expand parameters in the template "{{библия|", replace pipes "|" by dots "." if(str.toLowerCase().contains("{{библия")) { str = TemplateParser.expandTemplateParams(str, "библия2", "|", "."); str = TemplateParser.expandTemplateParams(str, "библия", "|", "."); } // 0c. before splitting by "|", replace template:" by quotations, e.g. 'Фрегат {{"|Паллада}}' -> 'Фрегат "Паллада"'; if(-1 != str.indexOf("{{\"|")) str = replaceAllQuoteTemplateByQuotationMarks(str); if(-1 != str.indexOf("{{кавычки|")) str = replaceAllKavychkiTemplateByQuotationMarks(str); // 0d. before splitting by "|" - expand the parameter in the template "{{Cyrs|" if(str.startsWith("{{Cyrs")) { str = TemplateParser.expandTemplateWithOneParameter(str, "Cyrs"); } // 1. split //String[] params = str.split("\\|"); String[] params = splitParameters(str); // 2. fills hash int param_counter = 0; // counter of unnamed parameters for_label: for(String p : params) { int pos_equal = p.indexOf("="); if(-1 == pos_equal) { // there is no equal sign for this parameter param_counter ++; switch (param_counter) { // {{пример|1 текст|2 автор|3 титул|4 дата|}} case 1: if(p.length() == 0) break for_label; text = p; break; case 2: author_and_wikilink.parseAuthorName(p); break; case 3: title_and_wikilink.parseTitle(p); break; case 4: years_range.parseYearsRange(page_title, p.trim()); break; } } else { if(pos_equal+1 >= p.length() || pos_equal < 3) // 4 == shortest parameter length = "дата".lenth() continue; // split by "=" String param_name = p.substring(0, pos_equal); String value = p.substring(pos_equal+1).trim(); // {{пример|текст=|перевод=|автор=|титул=|издание=|перев=|дата=|источник=}} if(param_name.equalsIgnoreCase("текст")) { text = value; } else if(param_name.equalsIgnoreCase("перевод")) { translation = value; } else if(param_name.equalsIgnoreCase("автор")) { author_and_wikilink.parseAuthorName(value); } else if(param_name.equalsIgnoreCase("титул")) { title_and_wikilink.parseTitle(value); } else if(param_name.equalsIgnoreCase("издание")) { publisher = value; } else if(param_name.equalsIgnoreCase("дата")) { years_range.parseYearsRange(page_title, value.trim()); } else if(param_name.equalsIgnoreCase("источник")) { source = value; } } } if (text.length() == 0) return null; // last. return format back text = text.replace("{{выдел!", "{{выдел|"); if(translation.length() > 0) translation = translation.replace("{{выдел!", "{{выдел|"); if(transcription.length() > 0) transcription = transcription.replace("{{выдел!", "{{выдел|"); return new WQuote ( text, translation, transcription, author_and_wikilink.author, author_and_wikilink.author_wikilink, title_and_wikilink.title, title_and_wikilink.title_wikilink, publisher, source, years_range.year_from, years_range.year_to); } /** Extracts quotations from 'text' after the definition, * each quotation starts from "{{пример|". */ public static WQuote[] getQuotes (String page_title, String text) { List<WQuote> quote_list = null; if(-1 != text.indexOf("{{пример|}}") || -1 != text.indexOf("{{пример||перевод=}}")) return NULL_WQUOTE_ARRAY; // examples are empty // Gets position of the first example sentence {{пример|Самолёт-истребитель.}} int pos_quote = text.indexOf("{{пример|"); if(-1 == pos_quote) // there is no quote section! return NULL_WQUOTE_ARRAY; StringBuilder sb = new StringBuilder( text.substring(pos_quote + 9).trim() // 9 == "{{пример|".length() ); if(sb.length() < 3) // sb == "some text }}", length >=3 return NULL_WQUOTE_ARRAY; String[] lines = sb.toString().split("\\{\\{пример\\|"); for(String line : lines) { StringBuilder sb_line = new StringBuilder(line); pos_quote = sb_line.lastIndexOf("}}"); if(-1 == pos_quote) // there is no close brackets, skip continue; if(pos_quote < 2) // too short, skip continue; sb_line.setLength(pos_quote); WQuote wq = parseQuoteParameters(page_title, sb_line); if(null != wq) { if(null == quote_list) quote_list = new ArrayList<WQuote>(); quote_list.add(wq); } } if(null == quote_list) return NULL_WQUOTE_ARRAY; return( (WQuote[])quote_list.toArray(NULL_WQUOTE_ARRAY) ); } }