package edu.jhu.nlp.wikipedia;
import java.util.HashMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses MediaWiki-formatted text and extracts categories, links, redirect
 * information, plain text and the infobox from it.
 * <p>
 * Used internally by the {@link WikiPage} class, but can also be used as a
 * stand-alone class to parse wiki formatted text.
 * <p>
 * The redirect, stub and disambiguation flags are computed eagerly in the
 * constructor; categories, links and the infobox are parsed on first access
 * and cached. The lazy caches are not synchronized.
 *
 * @author Delip Rao
 */
public class WikiTextParser {

    private static final Pattern REDIRECT_PATTERN =
            Pattern.compile("#REDIRECT\\s+\\[\\[(.*?)\\]\\]");
    private static final Pattern STUB_PATTERN = Pattern.compile("\\-stub\\}\\}");
    private static final Pattern DISAMBIGUATION_PATTERN =
            Pattern.compile("\\{\\{disambig\\}\\}");
    private static final Pattern CATEGORY_PATTERN =
            Pattern.compile("\\[\\[Category:(.*?)\\]\\]", Pattern.MULTILINE);
    private static final Pattern LINK_PATTERN =
            Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.MULTILINE);

    private static final String INFOBOX_OPENER = "{{Infobox";
    private static final String CITE_OPENER = "{{cite";

    private final String wikiText;
    private final boolean redirect;
    private final String redirectString;
    private final boolean stub;
    private final boolean disambiguation;

    // Lazily populated caches; null means "not parsed yet".
    private Vector<String> pageCats = null;
    private Vector<String> pageLinks = null;
    // Raw Vector is kept because it is part of the public return type of getLinksWithText().
    @SuppressWarnings("rawtypes")
    private HashMap<String, Vector> pageLinksWithTexts = null;
    private InfoBox infoBox = null;

    /**
     * Creates a parser for the given wiki text and immediately determines
     * whether the text is a redirect, a stub or a disambiguation page.
     *
     * @param wtext the raw wiki markup; must not be null
     * @throws NullPointerException if {@code wtext} is null
     */
    public WikiTextParser(String wtext) {
        if (wtext == null) {
            throw new NullPointerException("wtext must not be null");
        }
        wikiText = wtext;

        Matcher redirectMatcher = REDIRECT_PATTERN.matcher(wikiText);
        redirect = redirectMatcher.find();
        redirectString = redirect ? redirectMatcher.group(1) : null;

        stub = STUB_PATTERN.matcher(wikiText).find();
        disambiguation = DISAMBIGUATION_PATTERN.matcher(wikiText).find();
    }

    /**
     * @return true if the text contains a {@code #REDIRECT [[...]]} directive
     */
    public boolean isRedirect() {
        return redirect;
    }

    /**
     * @return true if the text contains a template ending in {@code -stub}}}
     */
    public boolean isStub() {
        return stub;
    }

    /**
     * @return the target inside the redirect's double brackets, or null if the
     *         text is not a redirect
     */
    public String getRedirectText() {
        return redirectString;
    }

    /**
     * @return the unmodified wiki text this parser was created with
     */
    public String getText() {
        return wikiText;
    }

    /**
     * Returns the names of all categories referenced as
     * {@code [[Category:Name]]} or {@code [[Category:Name|sort key]]}.
     * Only the part before the first pipe is returned.
     *
     * @return the category names in order of appearance; never null
     */
    public Vector<String> getCategories() {
        if (pageCats == null) {
            parseCategories();
        }
        return pageCats;
    }

    /**
     * Returns the targets of all {@code [[...]]} links whose target does not
     * contain a colon (so e.g. {@code Category:} links are excluded).
     *
     * @return the link targets in order of appearance; never null
     */
    public Vector<String> getLinks() {
        if (pageLinks == null) {
            parseLinks();
        }
        return pageLinks;
    }

    /**
     * Returns link targets together with their display texts. The map has two
     * entries, {@code "pageLinks"} and {@code "pageLinkTexts"}, each holding a
     * {@code Vector<String>}; element {@code i} of one corresponds to element
     * {@code i} of the other. For links without a pipe the text equals the target.
     *
     * @return the map described above; never null
     */
    @SuppressWarnings("rawtypes")
    public HashMap<String, Vector> getLinksWithText() {
        if (pageLinksWithTexts == null) {
            parseLinksWithText();
        }
        return pageLinksWithTexts;
    }

    /** Collects category names into {@link #pageCats}. */
    private void parseCategories() {
        Vector<String> categories = new Vector<String>();
        Matcher matcher = CATEGORY_PATTERN.matcher(wikiText);
        while (matcher.find()) {
            String[] parts = matcher.group(1).split("\\|");
            // split() drops trailing empty strings, so "[[Category:|]]" yields
            // an empty array; skip it instead of indexing out of bounds.
            if (parts.length == 0) {
                continue;
            }
            categories.add(parts[0]);
        }
        pageCats = categories;
    }

    /** Collects colon-free link targets into {@link #pageLinks}. */
    private void parseLinks() {
        Vector<String> links = new Vector<String>();
        Matcher matcher = LINK_PATTERN.matcher(wikiText);
        while (matcher.find()) {
            String[] parts = matcher.group(1).split("\\|");
            if (parts.length == 0) {
                continue; // link body consisted only of pipes
            }
            String target = parts[0];
            if (!target.contains(":")) {
                links.add(target);
            }
        }
        pageLinks = links;
    }

    /** Collects colon-free link targets and their display texts into {@link #pageLinksWithTexts}. */
    @SuppressWarnings("rawtypes")
    private void parseLinksWithText() {
        Vector<String> targets = new Vector<String>();
        Vector<String> texts = new Vector<String>();
        Matcher matcher = LINK_PATTERN.matcher(wikiText);
        while (matcher.find()) {
            String[] parts = matcher.group(1).split("\\|");
            if (parts.length == 0) {
                continue; // link body consisted only of pipes
            }
            String target = parts[0];
            // "[[target|text]]" carries its own display text; otherwise the target is shown.
            String text = parts.length > 1 ? parts[1] : target;
            if (!target.contains(":")) {
                targets.add(target);
                texts.add(text);
            }
        }
        HashMap<String, Vector> result = new HashMap<String, Vector>();
        result.put("pageLinks", targets);
        result.put("pageLinkTexts", texts);
        pageLinksWithTexts = result;
    }

    /**
     * Returns an approximation of the page's plain text by stripping wiki and
     * HTML markup with a fixed sequence of regular-expression substitutions.
     * The patterns do not match across line breaks.
     *
     * @return the wiki text with markup removed
     */
    public String getPlainText() {
        // Decode escaped angle brackets first so that escaped tags are
        // recognised by the tag-stripping patterns below.
        String text = wikiText.replace("&gt;", ">");
        text = text.replace("&lt;", "<");
        text = text.replaceAll("<ref>.*?</ref>", " ");       // inline references
        text = text.replaceAll("</?.*?>", " ");              // any remaining tags
        text = text.replaceAll("\\{\\{.*?\\}\\}", " ");      // templates
        text = text.replaceAll("\\[\\[.*?:.*?\\]\\]", " ");  // links containing a colon
        text = text.replaceAll("\\[\\[(.*?)\\]\\]", "$1");   // unwrap remaining links
        // Appears intended to drop the target of a piped link and keep its label.
        text = text.replaceAll("\\s(.*?)\\|(\\w+\\s)", " $2");
        text = text.replaceAll("\\[.*?\\]", " ");            // single-bracket (external) links
        text = text.replaceAll("\\'+", "");                  // runs of apostrophes (bold/italic markup)
        return text;
    }

    /**
     * Returns the page's infobox, parsing it on first access.
     *
     * @return the infobox, or null if the text has no {@code {{Infobox}
     *         template or its braces are unbalanced
     */
    public InfoBox getInfoBox() {
        // parseInfoBox is expensive, so it is done only once like the other parse* methods.
        // A null result is not cached and will be recomputed on the next call.
        if (infoBox == null) {
            infoBox = parseInfoBox();
        }
        return infoBox;
    }

    /**
     * Locates the first {@code {{Infobox ... }}} template by brace counting,
     * strips {@code {{cite}} templates and HTML markup from it and wraps the
     * result in an {@link InfoBox}.
     *
     * @return the parsed infobox, or null if none is present or it is malformed
     */
    private InfoBox parseInfoBox() {
        int startPos = wikiText.indexOf(INFOBOX_OPENER);
        if (startPos < 0) {
            return null;
        }
        // The opener already contributed two open braces.
        int bracketCount = 2;
        int endPos = startPos + INFOBOX_OPENER.length();
        for (; endPos < wikiText.length(); endPos++) {
            char c = wikiText.charAt(endPos);
            if (c == '}') {
                bracketCount--;
            } else if (c == '{') {
                bracketCount++;
            }
            if (bracketCount == 0) {
                break;
            }
        }
        // Unbalanced braces happen due to malformed Infoboxes in wiki text. See Issue #10.
        // Giving up parsing is the easier thing to do. An infobox that closes exactly
        // at the end of the text is well-formed and is still parsed.
        if (bracketCount != 0) {
            return null;
        }
        // endPos is the index of the final closing brace, hence the +1.
        String infoBoxText = wikiText.substring(startPos, endPos + 1);
        infoBoxText = stripCite(infoBoxText); // strip clumsy {{cite}} tags
        // strip any html formatting
        infoBoxText = infoBoxText.replace("&gt;", ">");
        infoBoxText = infoBoxText.replace("&lt;", "<");
        infoBoxText = infoBoxText.replaceAll("<ref.*?>.*?</ref>", " ");
        infoBoxText = infoBoxText.replaceAll("</?.*?>", " ");
        return new InfoBox(infoBoxText);
    }

    /**
     * Removes every {@code {{cite ...}}} template, including nested templates
     * inside it, from the given text. An unterminated cite template is removed
     * up to the end of the text.
     *
     * @param text the text to clean
     * @return the text without cite templates
     */
    private String stripCite(String text) {
        int startPos = text.indexOf(CITE_OPENER);
        while (startPos >= 0) {
            // The opener already contributed two open braces.
            int bracketCount = 2;
            int endPos = startPos + CITE_OPENER.length();
            for (; endPos < text.length(); endPos++) {
                char c = text.charAt(endPos);
                if (c == '}') {
                    bracketCount--;
                } else if (c == '{') {
                    bracketCount++;
                }
                if (bracketCount == 0) {
                    break;
                }
            }
            // Cut exactly [startPos, endPos]: keep the character preceding the
            // template and drop its final closing brace.
            int resumeAt = Math.min(endPos + 1, text.length());
            text = text.substring(0, startPos) + text.substring(resumeAt);
            // Search from the beginning again: joining the two halves may form a new opener.
            startPos = text.indexOf(CITE_OPENER);
        }
        return text;
    }

    /**
     * @return true if the text contains the {@code {{disambig}}} template
     */
    public boolean isDisambiguationPage() {
        return disambiguation;
    }

    /**
     * Looks up an interlanguage link of the form {@code [[code:Title]]} that
     * occupies a whole line and returns its title.
     *
     * @param languageCode the language prefix to look for (matched literally)
     * @return the title of the first matching link, or null if there is none
     *         or {@code languageCode} is null
     */
    public String getTranslatedTitle(String languageCode) {
        if (languageCode == null) {
            return null;
        }
        // Quote the code so characters such as '.' or '|' are not interpreted as regex syntax.
        Pattern pattern = Pattern.compile(
                "^\\[\\[" + Pattern.quote(languageCode) + ":(.*?)\\]\\]$", Pattern.MULTILINE);
        Matcher matcher = pattern.matcher(wikiText);
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }
}