/* ImageParser.java - parser of wiki Image [[Image:...]].
 *
 * Copyright (c) 2005-2008 Andrew Krizhanovsky /aka at mail.iias.spb.su/
 * Distributed under GNU Public License.
 */

package wikokit.base.wikipedia.text;

import wikokit.base.wikipedia.language.LanguageType;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/** Parser of wiki image tags, e.g. [[Image:...]] or [[Изображение:...]].
 *
 * The image tag is removed from the text; if the tag contains a caption
 * (the text after the last pipe '|' that precedes any nested "[[") then
 * the caption is kept in place of the tag.
 */
public class ImageParser {

    // start of the image
    private final static Pattern ptrn_image_ru = Pattern.compile(
            "\\[\\[Изображение:");
    private final static Pattern ptrn_image_en = Pattern.compile(
            "\\[\\[Image:");

    // end of the image (or a delimiter inside it)
    private final static Pattern ptrn_image_boundaries = Pattern.compile(
            "\\||\\[\\[|\\]\\]");     // pipe |, or open [[, or close ]]

    /** Removes image tags and keeps the caption of each image.
     * This func should be called before WikiParser.parseDoubleBrackets().
     *
     * For Russian both the English ("Image") and the Russian ("Изображение")
     * prefixes are processed; for English and Simple English only "Image".
     * For any other language a warning is printed to stdout and the text is
     * returned unchanged.
     *
     * @param text      wiki text to be parsed, may be null or empty
     * @param wiki_lang the word "Image" depends on wiki language,
     *                  e.g. "Dosiero" (Esperanto), "Изображение" (Russian) etc.
     * @return text without image tags; a new empty buffer if the text
     *         is null or empty; the very same buffer if nothing was parsed
     */
    public static StringBuffer parseImageDescription(
                                    StringBuffer text,LanguageType wiki_lang)
    {
        if(null == text || 0 == text.length()) {
            // A fresh buffer per call: StringBuffer is mutable, so a shared
            // static instance could be modified by one caller and would then
            // leak that content to every other caller.
            return new StringBuffer();
        }

        if( wiki_lang == LanguageType.ru) {
            // parse English and Russian images
            text = parseImageDescription(text, ptrn_image_en);
            return parseImageDescription(text, ptrn_image_ru);
        } else {
            if(wiki_lang == LanguageType.en || wiki_lang == LanguageType.simple) {
                return parseImageDescription(text, ptrn_image_en);
            } else {
                // print message: todo
                System.out.println("Warning (wikipedia.text.ImageParser.parseImageDescription()): is valid only for English and Russian. Todo.");
            }
        }
        return text;
    }

    /** Removes every image tag which starts with the given pattern.
     *
     * After each start of the image the text is scanned for the boundaries:
     * pipe '|', open "[[" and close "]]". The nesting counter starts at 1
     * (the image's own "[[") and the image ends when it drops to 0.
     * A pipe met before any nested "[[" discards the text collected so far,
     * i.e. only the last parameter (the caption) survives. Pipes met after
     * a nested "[[" are kept, e.g. Image:a.jpg|[[Lemma|Word]].
     *
     * A warning is printed to stdout if the nesting counter is not zero
     * at the end (the counter of the last scanned image is tested).
     *
     * @param text          non-null wiki text
     * @param p_image_start pattern of the start of the image, e.g. "[[Image:"
     * @return the same text object if there are no images,
     *         else a new buffer with the images replaced by their captions
     */
    private static StringBuffer parseImageDescription(
                                    StringBuffer text,Pattern p_image_start)
    {
        final String w_closed_too_many = "Warning (wikipedia.text.ImageParser.parseImageDescription()): number of opened brackets '[[' < than closed brackets ']]' in image";
        final String w_opened_too_many = "Warning (wikipedia.text.ImageParser.parseImageDescription()): number of opened brackets '[[' > than closed brackets ']]' in image";

        int n_nested = 0;

        Matcher m_start = p_image_start.matcher(text.toString());
        boolean b_start = m_start.find();
        if(!b_start)
            return text;

        StringBuffer sb = new StringBuffer();
        while(b_start) {
            // copy the text before the image, drop the image prefix itself
            m_start.appendReplacement(sb, "");

            // the rest of the text after the image prefix
            StringBuffer after_image = new StringBuffer();
            m_start.appendTail(after_image);

            Matcher m = ptrn_image_boundaries.matcher(after_image.toString());
            boolean b_internal = m.find();
            boolean b_desc_exist   = false; // description of the image
            boolean b_desc_started = false; // the text is started after first open brackets

            if(b_internal) {
                n_nested = 1;   // [[Image: - already 1 open bracket
                StringBuffer s_candidate_desc = new StringBuffer();  // candidate of text after last pipe | in Image

                while(b_internal) {
                    String g0 = m.group(0);
                    if('|' == g0.charAt(0)) {
                        b_desc_exist = true;
                        if(b_desc_started) {    // pipe within desc, e.g. Image:a.jpg|[[Lemma|Word]]
                            m.appendReplacement(s_candidate_desc, g0);
                        } else {
                            // vertical line, pipe | then start new candidate of description
                            s_candidate_desc.setLength(0);
                            m.appendReplacement(new StringBuffer(), "");    // clear regex buffer
                        }
                    } else {
                        if('[' == g0.charAt(0)) {   // opened '['
                            n_nested ++;
                            b_desc_started = true;
                        } else {                    // closed ']'
                            n_nested --;
                        }

                        if(n_nested == 0) {         // [[Image:...]] closed
                            b_internal = false;
                            if (b_desc_exist) {
                                // keep the caption, drop the closing brackets
                                m.appendReplacement(s_candidate_desc, "");
                            } else {
                                m.appendReplacement(new StringBuffer(), "");    // clear regex buffer
                            }
                        } else {
                            // nested brackets are part of the caption
                            m.appendReplacement(s_candidate_desc, g0);
                        }
                    }
                    // stop at the end of the image or when there are no more boundaries
                    b_internal = b_internal && m.find();
                }
                sb.append(s_candidate_desc);
            }

            // continue to search images in the text which is not consumed yet
            StringBuffer remain = new StringBuffer();
            m.appendTail(remain);
            m_start = p_image_start.matcher(remain.toString());
            b_start = m_start.find();
        }
        // the last matcher found nothing, so this appends the whole remainder
        m_start.appendTail(sb);

        if(n_nested < 0) {
            System.out.println(w_closed_too_many);
        } else {
            if(n_nested > 0) {
                System.out.println(w_opened_too_many);
            }
        }

        return sb;
    }
}