/*
* WikiParser.java - parses texts in wiki format.
*
* Copyright (c) 2005-2008 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikokit.base.wikipedia.text;
import wikokit.base.wikipedia.language.LanguageType;
import wikokit.base.wikipedia.language.WikimediaSisterProject;
import wikokit.base.wikipedia.util.StringUtil;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
//import java.util.regex.PatternSyntaxException;
/** Converts wiki-texts to plain texts without [[]], interwiki,
 * &lt;code&gt;..&lt;/code&gt;, templates, etc.
 *
 * Definitions:
 * [[...]] - wikilink,
 * [http:// site name] - hyperlink.
 *
 * All methods are static and based on precompiled regular expressions.
 * Methods which get null or empty text return a new empty StringBuffer
 * (never a shared instance), unless it is stated otherwise.
 */
public class WikiParser {

    // Regular expression metacharacters, which should be escaped
    // in the patterns below: ([{ ^-$|]})?*+. and the backslash.

    /** Interwiki link with two or three letters language code, e.g. [[et:Talvepalee]]. */
    private static final Pattern ptrn_remove_interwiki = Pattern.compile("\\[\\[\\w\\w\\w?:.+?\\]\\]");

    /** The same as ptrn_remove_interwiki, but the link title is captured by the group 1. */
    private static final Pattern ptrn_remove_brackets_in_interwiki = Pattern.compile("\\[\\[\\w\\w\\w?:(.+?)\\]\\]");

    /** English category link, e.g. [[Category:Russia]]. */
    private static final Pattern ptrn_remove_category_en = Pattern.compile("\\[\\[Category:.+?\\]\\]");

    /** Russian category link, the prefix is the Russian word for "Category". */
    private static final Pattern ptrn_remove_category_ru = Pattern.compile("\\[\\[Категория:.+?\\]\\]");

    // Blocks which are removed with their content; DOTALL lets '.' cross line breaks.
    private static final Pattern ptrn_tag_code     = Pattern.compile("<code>.+?</code>", Pattern.DOTALL);
    private static final Pattern ptrn_html_comment = Pattern.compile("<!--.+?-->", Pattern.DOTALL);
    private static final Pattern ptrn_pre_code     = Pattern.compile("<pre>.+?</pre>", Pattern.DOTALL);
    private static final Pattern ptrn_source_code  = Pattern.compile("<source.+?</source>", Pattern.DOTALL);

    /** Simple wikilink without ':' and '|' inside the link, e.g. [[Tsar]] -> Tsar. */
    private static final Pattern ptrn_remove_brackets_in_wikilinks = Pattern.compile("\\[\\[([^:|]+?)\\]\\]");

    /** Any text within [[double brackets]], the text is captured by the group 1. */
    private static final Pattern ptrn_double_brackets = Pattern.compile("\\[\\[(.+?)\\]\\]");

    /** Any text within [single brackets], the text is captured by the group 1. */
    private static final Pattern ptrn_single_brackets = Pattern.compile( "\\[(.+?)\\]" );

    /** Innermost {{template}}, i.e. the text within brackets has no '{'. */
    private static final Pattern ptrn_double_curly_brackets = Pattern.compile("\\{\\{([^\\{]+?)\\}\\}", Pattern.DOTALL);

    /** Combining acute accent (U+0301), it is written as an escape sequence
     * in order to be independent of the encoding of this source file. */
    private static final Pattern ptrn_accent_sign = Pattern.compile("\u0301");

    private static final Pattern ptrn_triple_apostrophe = Pattern.compile("'''(.+?)'''");
    private static final Pattern ptrn_double_apostrophe = Pattern.compile("''(.+?)''");

    /** Hostname (without spaces) contains the dot '.' at least once, except the last symbol. */
    private static final Pattern ptrn_site_name = Pattern.compile("(\\A|\\s)\\S+?[.]\\S+?[^.]([\\s,!?]|\\z)");

    /** Removes interwiki, e.g. "[[et:Talvepalee]] text" -> " text",
     * where language code (e.g. 'et') can have two or three letters.
     *
     * @param text wiki text, it should not be null
     * @return new buffer with the text without interwiki links
     */
    public static StringBuffer removeInterwiki(StringBuffer text) {
        Matcher m = ptrn_remove_interwiki.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Expands interwiki by removing interwiki brackets and language code,
     * e.g. "[[et:Talvepalee]] text" -> "Talvepalee text".
     *
     * @param text wiki text, it should not be null
     * @return new buffer, where each interwiki link is replaced by its title
     */
    public static StringBuffer removeBracketsInInterwiki(StringBuffer text) {
        Matcher m = ptrn_remove_brackets_in_interwiki.matcher(text.toString());
        return new StringBuffer(m.replaceAll("$1"));
    }

    /** Removes categories for selected language,
     * e.g. English: "[[Category:Russia]] text" -> " text".
     * Only English ("en", "simple") and Russian ("ru") are supported now,
     * the text is returned unchanged for other languages.
     *
     * @param text wiki text, it should not be null
     * @param lang language of the wiki text
     * @return new buffer without category links, or the same object 'text'
     *         if there is no category pattern for the language
     */
    public static StringBuffer removeCategory(StringBuffer text, LanguageType lang) {
        // NOTE(review): a String is passed to lang.equals(), it is presumed
        // that LanguageType compares it with the language code - confirm.
        Pattern p = null;
        if (lang.equals("en") || lang.equals("simple")) {
            p = ptrn_remove_category_en;
        } else if (lang.equals("ru")) {
            p = ptrn_remove_category_ru;
        }
        if (null == p) {
            // unsupported language: there is nothing to remove
            return text;
        }
        return new StringBuffer(p.matcher(text.toString()).replaceAll(""));
    }

    /** Removes XML element: the opening &lt;tag&gt; with text till the next
     * closing &lt;/tag&gt;. The opening tag should have no attributes.
     *
     * @param text wiki text, it should not be null
     * @param tag  name of the tag; it is inserted into the regular expression
     *             as is (without quoting), so metacharacters in the name will
     *             be treated as a part of the regular expression
     * @return new buffer without the elements, or the same object 'text'
     *         if the tag is null or empty
     */
    public static StringBuffer removeXMLTag(StringBuffer text,String tag) {
        if (null == tag || tag.length() == 0) {
            return text;
        }
        Pattern p = Pattern.compile("<"+tag+">.+?</"+tag+">", Pattern.DOTALL);
        Matcher m = p.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Removes XML tag &lt;code&gt; with text till the next &lt;/code&gt;,
     * e.g. "a &lt;code&gt;x+y&lt;/code&gt; b" -> "a  b".
     */
    public static StringBuffer removeXMLTagCode(StringBuffer text) {
        Matcher m = ptrn_tag_code.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Removes all comments: &lt;!-- ... --&gt;. */
    public static StringBuffer removeHTMLComments(StringBuffer text) {
        Matcher m = ptrn_html_comment.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Removes preformatted code (e.g. xml): &lt;pre&gt; ... &lt;/pre&gt;. */
    public static StringBuffer removePreCode(StringBuffer text) {
        Matcher m = ptrn_pre_code.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Removes all source codes: &lt;source ... &lt;/source&gt;,
     * the opening tag can have attributes.
     */
    public static StringBuffer removeSourceCode(StringBuffer text) {
        Matcher m = ptrn_source_code.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Expands simple wiki links by removing brackets, e.g. [[run]] -> run.
     * Links with ':' or '|' (e.g. [[Russian language|Russian]]) are not
     * changed by this function.
     *
     * @param text wiki text, it should not be null
     * @return new buffer with expanded simple wiki links
     * @deprecated Use parseDoubleBrackets()
     */
    @Deprecated
    public static StringBuffer removeBracketsInWikiLink(StringBuffer text) {
        Matcher m = ptrn_remove_brackets_in_wikilinks.matcher(text.toString());
        return new StringBuffer(m.replaceAll("$1"));
    }

    /** Expands / removes hyperlinks. Expands hyperlinks with text, e.g.
     * "[http://site name of site]" -> " name of site" (the link is replaced
     * by a space and the visible text).
     * Removes links without text, e.g. [www.site].
     *
     * @param text wiki text
     * @return new buffer with parsed hyperlinks; empty buffer if the text
     *         is null or empty
     */
    public static StringBuffer parseSingleBrackets(StringBuffer text)
    {
        if (null == text || 0 == text.length()) {
            return new StringBuffer();
        }
        Matcher m = ptrn_single_brackets.matcher(text.toString());  // [(.+?)]
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // The raw (non-escaped) text within [single brackets] is used,
            // since the visible text is appended directly to the buffer and
            // only the literal " " or "" is passed to appendReplacement().
            String inner = m.group(1);
            String visible = "";
            if (inner.contains(" ")) {
                // skip URL (the first word), then remove the rest site.names.com
                String after_url = StringUtil.getTextAfterFirstSpace(inner);
                visible = ptrn_site_name.matcher(after_url).replaceAll(" ").trim();
            }
            if (visible.length() > 0) {
                m.appendReplacement(sb, " ");   // *[http://www.site.com text] -> * SPACE text
                sb.append(visible);
            } else {
                m.appendReplacement(sb, "");    // link without text is removed
            }
        }
        m.appendTail(sb);
        return sb;
    }

    /** Removes and expands interwiki, categories, and wiki links in wiki texts.<br>
     *
     * 1. Expands links to Wikimedia sister projects,
     * see [[w:Wikipedia:Interwikimedia_links|text to expand]] -> "text to expand".<br><br>
     *
     * 2. Removes or expands interwiki (see b_remove_not_expand_iwiki).<br><br>
     *
     * 3. Removes categories for selected language,
     * e.g. English: "[[Category:Russia]] text" -> " text".<br><br>
     *
     * 4. Expands wiki links by removing brackets. There are two cases:
     * (1) remove brackets, e.g. [[run]] -> run and
     * (2) [[run|running]] -> running, or [[Russian language|Russian]] -> Russian,
     * i.e. the visible (to reader) words will remain.<br><br>
     *
     * A link with ':' which is neither a category of the selected language,
     * nor a sister project link, nor an interwiki is left unchanged
     * (with brackets).<br><br>
     *
     * It is recommended to call StringUtil.escapeCharDollarAndBackslash(text)
     * before this function.
     *
     * See also WikiWord.parseDoubleBrackets
     *
     * @param text wiki text
     * @param lang defines parsed wiki language, it is needed to remove
     * category for the selected language; only English ("en", "simple")
     * and Russian ("ru") categories are recognized now
     * @param b_remove_not_expand_iwiki if true then
     * removes interwiki, e.g. "[[et:Talvepalee]] text" -> " text";
     * if false then
     * expands interwiki by removing interwiki brackets and language code,
     * e.g. "[[et:Talvepalee]] text" -> "Talvepalee text"
     * @return new buffer with parsed links; empty buffer if the text
     *         is null or empty
     */
    public static StringBuffer parseDoubleBrackets(
                                    StringBuffer text,
                                    LanguageType lang,
                                    boolean b_remove_not_expand_iwiki)
    {
        if (null == text || 0 == text.length()) {
            return new StringBuffer();
        }
        Matcher m = ptrn_double_brackets.matcher(text.toString());  // [[(.+?)]]
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // g: text within [[brackets]], it is escaped, since it (or its
            // part) will be passed to appendReplacement() as a replacement
            String g = StringUtil.escapeCharDollarAndBackslash(m.group(1)).toString();

            if (-1 != g.indexOf(':')) {
                String before = StringUtil.getTextBeforeFirstColumn(g);
                String after  = StringUtil.getTextAfterFirstColumn(g);

                boolean b_category =
                    ((lang.equals("en") || lang.equals("simple")) && before.equalsIgnoreCase("Category")) ||
                     (lang.equals("ru") && before.equalsIgnoreCase("Категория"));

                if (b_category) {
                    m.appendReplacement(sb, "");    // remove [[Category:Title]]
                } else if (WikimediaSisterProject.existsCode(before)) {
                    m.appendReplacement(sb, WikimediaSisterProject.getLinkText(before, after));
                } else if (LanguageType.has(before)) {
                    // interwiki
                    m.appendReplacement(sb, b_remove_not_expand_iwiki ? "" : after);
                }
                // else: nothing is appended for an unknown prefix, so the
                // whole link will be copied unchanged by the next
                // appendReplacement() or by appendTail()
            } else if (-1 != g.indexOf('|')) {
                // [[run|running]] -> running
                m.appendReplacement(sb, StringUtil.getTextAfterFirstVerticalPipe(g));
            } else {
                // [[run]] -> run
                m.appendReplacement(sb, g);
            }
        }
        m.appendTail(sb);
        return sb;
    }

    /** Removes texts within curly brackets, e.g. {{templates}}.
     * Only innermost templates (without '{' inside) are removed by one call,
     * so "{{a {{b}} c}}" will be converted to "{{a  c}}".<br><br>
     *
     * Todo: expand templates (optionally).
     *
     * @param text wiki text
     * @return new buffer without innermost templates; the same object 'text'
     *         if there are no templates; empty buffer if the text is null
     *         or empty
     */
    public static StringBuffer parseCurlyBrackets(StringBuffer text)
    {
        if (null == text || 0 == text.length()) {
            return new StringBuffer();
        }
        Matcher m = ptrn_double_curly_brackets.matcher(text.toString());
        if (!m.find()) {
            return text;
        }
        // replaceAll() resets the matcher, so the first match is not lost
        return new StringBuffer(m.replaceAll(""));
    }

    /** Removes boundaries of something (e.g. double or triple apostrophes)
     * used in pairs, e.g. ''italics'' -> italics.<br><br>
     *
     * It is recommended to call StringUtil.escapeCharDollarAndBackslash(text)
     * before this function.
     *
     * @param text wiki text
     * @param p pattern, where the group 1 captures the text within boundaries
     * @return new buffer without boundaries; the same object 'text'
     *         if the pattern is not found; empty buffer if the text is null
     *         or empty
     */
    private static StringBuffer parseBounds(StringBuffer text, Pattern p)
    {
        if (null == text || 0 == text.length()) {
            return new StringBuffer();
        }
        Matcher m = p.matcher(text.toString());
        if (!m.find()) {
            return text;
        }
        StringBuffer sb = new StringBuffer();
        do {
            // g1: text within ''some boundaries'', it is escaped, since
            // it is used as a replacement string
            String g1 = StringUtil.escapeCharDollarAndBackslash(m.group(1)).toString();
            m.appendReplacement(sb, g1);
        } while (m.find());
        m.appendTail(sb);
        return sb;
    }

    /** Removes double apostrophes used in pairs, e.g. ''italics'' -> italics.
     * It is recommended to call StringUtil.escapeCharDollarAndBackslash(text)
     * before this function.
     */
    public static StringBuffer parseDoubleApostrophe(StringBuffer text)
    {
        return parseBounds(text, ptrn_double_apostrophe);
    }

    /** Removes triple apostrophes used in pairs, e.g. '''bold''' -> bold.
     * It is recommended to call StringUtil.escapeCharDollarAndBackslash(text)
     * before this function.
     */
    public static StringBuffer parseTripleApostrophe(StringBuffer text)
    {
        return parseBounds(text, ptrn_triple_apostrophe);
    }

    /** Removes the sign of acute accent (stress mark) for Russian wiki texts,
     * it is placed in the begin of article often, e.g. the stress mark
     * over a vowel in the bold title of the article will be removed.
     *
     * @param text wiki text
     * @param wiki_lang language of the wiki text
     * @return the same object 'text' if the language is not Russian;
     *         else new buffer without accent signs (empty buffer if
     *         the text is null or empty)
     */
    public static StringBuffer removeAcuteAccent(
                                    StringBuffer text,LanguageType wiki_lang)
    {
        if (wiki_lang != LanguageType.ru) {     // skip English Wiki
            return text;
        }
        if (null == text || 0 == text.length()) {
            return new StringBuffer();
        }
        Matcher m = ptrn_accent_sign.matcher(text.toString());
        return new StringBuffer(m.replaceAll(""));
    }

    /** Converts wiki text to plain text: removes comments, code, templates,
     * tables and categories, removes / expands interwiki, expands wiki links
     * and hyperlinks.<br><br>
     *
     * 1. Removes / expands interwiki (see b_remove_not_expand_iwiki).<br><br>
     *
     * 2. Removes categories for selected language,
     * e.g. English: "[[Category:Russia]] text" -> " text".<br><br>
     *
     * 3. Expands wiki links by removing brackets. There are two cases:
     * (1) remove brackets, e.g. [[run]] -> run and <br>
     * (2) [[run|running]] -> running, or [[Russian language|Russian]] -> Russian,
     * i.e. the visible (to reader) words will remain.
     *
     * @param wiki_text wiki text to be converted
     * @param lang defines parsed wiki language, it is needed to remove
     * category for the selected language; only English ("en", "simple")
     * and Russian ("ru") categories are recognized now
     * @param b_remove_not_expand_iwiki if true then removes interwiki,
     * e.g. "[[et:Talvepalee]] text" -> " text"; else expands interwiki by
     * removing interwiki brackets and language code,
     * e.g. "[[et:Talvepalee]] text" -> "Talvepalee text"
     * @return new buffer with the trimmed plain text; empty buffer
     *         if the wiki text is null or empty
     */
    public static StringBuffer convertWikiToText(
                                    StringBuffer wiki_text,
                                    LanguageType lang,
                                    boolean b_remove_not_expand_iwiki)
    {
        if (null == wiki_text || 0 == wiki_text.length()) {
            return new StringBuffer();
        }

        // I. removing of blocks with their content
        StringBuffer s = removeHTMLComments(wiki_text);
        s = removePreCode(s);
        s = removeSourceCode(s);
        s = removeXMLTagCode(s);

        // II. transformation and removing
        s = ReferenceParser.expandMoveToEndOfText(s);

        s = parseCurlyBrackets(s);
        s = parseCurlyBrackets(s);              // second pass: {{template in {{template}}}}
        s = TableParser.removeWikiTables(s);    // after CurlyBrackets, remarks in func

        s = removeAcuteAccent(s, lang);
        s = parseTripleApostrophe(s);           // ''' before '', since ''' contains ''
        s = parseDoubleApostrophe(s);

        s = ImageParser.parseImageDescription(s, lang);
        s = parseDoubleBrackets(s, lang, b_remove_not_expand_iwiki);
        String str = WikiParser.parseSingleBrackets(s).toString().trim();

        str = XMLTagsParser.replaceCharFromXML(str, ' ');   // for GATE XML parsers

        return new StringBuffer(str);
    }
}