/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* For internal use only -- Used by the {@link WikiPage} class.
* Can also be used as a stand alone class to parse wiki formatted text.
*
* @author Delip Rao
*/
public class WikiTextParser {
private String wikiText = null;
private ArrayList<String> pageCats = null;
private ArrayList<String> pageLinks = null;
private boolean redirect = false;
private String redirectString = null;
private static Pattern redirectPattern =
Pattern.compile("#REDIRECT\\s+\\[\\[(.*?)\\]\\]");
private boolean stub = false;
private boolean disambiguation = false;
private static Pattern stubPattern = Pattern.compile("\\-stub\\}\\}");
private static Pattern disambCatPattern = Pattern.compile("\\{\\{disambig\\}\\}");
private InfoBox infoBox = null;
public WikiTextParser(String wtext) {
wikiText = wtext;
Matcher matcher = redirectPattern.matcher(wikiText);
if (matcher.find()) {
redirect = true;
if (matcher.groupCount() == 1)
redirectString = matcher.group(1);
}
matcher = stubPattern.matcher(wikiText);
stub = matcher.find();
matcher = disambCatPattern.matcher(wikiText);
disambiguation = matcher.find();
}
public boolean isRedirect() {
return redirect;
}
public boolean isStub() {
return stub;
}
public String getRedirectText() {
return redirectString;
}
public String getText() {
return wikiText;
}
public ArrayList<String> getCategories() {
if (pageCats == null) parseCategories();
return pageCats;
}
public ArrayList<String> getLinks() {
if (pageLinks == null) parseLinks();
return pageLinks;
}
private void parseCategories() {
pageCats = new ArrayList<String>();
Pattern catPattern = Pattern.compile("\\[\\[Category:(.*?)\\]\\]", Pattern.MULTILINE);
Matcher matcher = catPattern.matcher(wikiText);
while (matcher.find()) {
String[] temp = matcher.group(1).split("\\|");
pageCats.add(temp[0]);
}
}
private void parseLinks() {
pageLinks = new ArrayList<String>();
Pattern catPattern = Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.MULTILINE);
Matcher matcher = catPattern.matcher(wikiText);
while (matcher.find()) {
String[] temp = matcher.group(1).split("\\|");
if (temp == null || temp.length == 0) continue;
String link = temp[0];
if (link.contains(":") == false) {
pageLinks.add(link);
}
}
}
public String getPlainText() {
String text = wikiText.replaceAll(">", ">");
text = text.replaceAll("<", "<");
text = text.replaceAll("<ref>.*?</ref>", " ");
text = text.replaceAll("</?.*?>", " ");
text = text.replaceAll("\\{\\{.*?\\}\\}", " ");
text = text.replaceAll("\\[\\[.*?:.*?\\]\\]", " ");
text = text.replaceAll("\\[\\[(.*?)\\]\\]", "$1");
text = text.replaceAll("\\s(.*?)\\|(\\w+\\s)", " $2");
text = text.replaceAll("\\[.*?\\]", " ");
text = text.replaceAll("\\'+", "");
return text;
}
public InfoBox getInfoBox() {
//parseInfoBox is expensive. Doing it only once like other parse* methods
if (infoBox == null)
infoBox = parseInfoBox();
return infoBox;
}
private InfoBox parseInfoBox() {
String INFOBOX_CONST_STR = "{{Infobox";
int startPos = wikiText.indexOf(INFOBOX_CONST_STR);
if (startPos < 0) return null;
int bracketCount = 2;
int endPos = startPos + INFOBOX_CONST_STR.length();
for (; endPos < wikiText.length(); endPos++) {
switch (wikiText.charAt(endPos)) {
case '}':
bracketCount--;
break;
case '{':
bracketCount++;
break;
default:
}
if (bracketCount == 0) break;
}
String infoBoxText = wikiText.substring(startPos, endPos + 1);
infoBoxText = stripCite(infoBoxText); // strip clumsy {{cite}} tags
// strip any html formatting
infoBoxText = infoBoxText.replaceAll(">", ">");
infoBoxText = infoBoxText.replaceAll("<", "<");
infoBoxText = infoBoxText.replaceAll("<ref.*?>.*?</ref>", " ");
infoBoxText = infoBoxText.replaceAll("</?.*?>", " ");
return new InfoBox(infoBoxText);
}
private String stripCite(String text) {
String CITE_CONST_STR = "{{cite";
int startPos = text.indexOf(CITE_CONST_STR);
if (startPos < 0) return text;
int bracketCount = 2;
int endPos = startPos + CITE_CONST_STR.length();
for (; endPos < text.length(); endPos++) {
switch (text.charAt(endPos)) {
case '}':
bracketCount--;
break;
case '{':
bracketCount++;
break;
default:
}
if (bracketCount == 0) break;
}
text = text.substring(0, startPos - 1) + text.substring(endPos);
return stripCite(text);
}
public boolean isDisambiguationPage() {
return disambiguation;
}
public String getTranslatedTitle(String languageCode) {
Pattern pattern = Pattern.compile("^\\[\\[" + languageCode + ":(.*?)\\]\\]$", Pattern.MULTILINE);
Matcher matcher = pattern.matcher(wikiText);
if (matcher.find()) {
return matcher.group(1);
}
return null;
}
}