package info.bliki.wiki.filter;
import info.bliki.commons.validator.routines.EmailValidator;
import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.EndTagToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.htmlcleaner.TagToken;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.DefaultEventListener;
import info.bliki.wiki.model.IEventListener;
import info.bliki.wiki.model.ITableOfContent;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.HTMLBlockTag;
import info.bliki.wiki.tags.HTMLTag;
import info.bliki.wiki.tags.HrTag;
import info.bliki.wiki.tags.PTag;
import info.bliki.wiki.tags.WPBoldItalicTag;
import info.bliki.wiki.tags.WPPreTag;
import info.bliki.wiki.tags.WPTag;
import info.bliki.wiki.tags.util.Attribute;
import info.bliki.wiki.tags.util.IBodyTag;
import info.bliki.wiki.tags.util.INoBodyParsingTag;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.TagStack;
import info.bliki.wiki.tags.util.WikiTagNode;
import java.util.List;
/**
* A Wikipedia syntax parser for the second pass in the parsing of a Wikipedia
* source text.
*
* @see TemplateParser for the first pass
*/
public class WikipediaParser extends AbstractParser implements IParser {
// Table-of-contents tag; assigned by parseSectionHeaders() and
// parseSpecialIdentifiers(), and handed to wpTable() by parseTable().
private ITableOfContent fTableOfContentTag = null;
// Running count of section headers; pre-incremented for every header that
// parseSectionHeaders() appends to the model.
private int fHeadCounter = 0;
/**
* Enable HTML tags. Constant <code>true</code>; it guards the '&lt;' branch
* of getNextToken().
*/
private final boolean fHtmlCodes = true;
// Set through setNoToC(); forwarded to IWikiModel.appendHead().
private boolean fNoToC = false;
// Value returned by isTemplate(); supplied to the constructor.
private boolean fRenderTemplate = false;
// Set when __TOC__ or __FORCETOC__ is recognized by parseSpecialIdentifiers().
private boolean fForceToC = false;
// Listener notified about wiki links, headers and templates; never null once
// a constructor has run (DefaultEventListener.CONST is the fallback).
private IEventListener fEventListener = null;
/**
* Create a parser without an explicit event listener. Delegates to the
* three-argument constructor with a <code>null</code> listener, which makes
* that constructor fall back to <code>DefaultEventListener.CONST</code>.
*
* @param stringSource
*          the wiki text to parse
* @param renderTemplate
*          the value later reported by {@link #isTemplate()}
*/
public WikipediaParser(String stringSource, boolean renderTemplate) {
this(stringSource, renderTemplate, null);
}
/**
 * Create a parser for the given wiki text.
 *
 * @param stringSource
 *          the wiki text to parse
 * @param renderTemplate
 *          the value later reported by {@link #isTemplate()}
 * @param wikiListener
 *          listener for link/header/template events; when <code>null</code>
 *          <code>DefaultEventListener.CONST</code> is used instead
 */
public WikipediaParser(String stringSource, boolean renderTemplate,
        IEventListener wikiListener) {
    super(stringSource);
    fRenderTemplate = renderTemplate;
    fEventListener = (wikiListener != null) ? wikiListener : DefaultEventListener.CONST;
}
/**
 * Hand the pending read-ahead text to the pre-formatted block parser.
 *
 * @param diff
 *          number of characters to subtract from the current parser position
 *          to obtain the end of the pending text
 * @return <code>true</code> if pending text was active and either nothing had
 *         to be emitted or it was parsed successfully; <code>false</code> if
 *         no pending text is active or the pre-tag parser rejected the text
 *         with an <code>InvalidPreWikiTag</code>
 */
private boolean createPreContentToken(final int diff) {
    if (!fWhiteStart) {
        return false;
    }
    final int length = fCurrentPosition - diff - fWhiteStartPosition;
    if (length <= 0) {
        // nothing to emit; the pending-text flag deliberately stays set
        return true;
    }
    try {
        final String rawWikiText =
            fStringSource.substring(fWhiteStartPosition, fWhiteStartPosition + length);
        WikipediaPreTagParser.parseRecursive(rawWikiText, fWikiModel);
        fWhiteStart = false;
        return true;
    } catch (InvalidPreWikiTag ipwt) {
        // the text is not valid inside a wiki <pre> block
        return false;
    }
}
/**
* Scan forward from the current position and return the next inline token.
*
* Block-level constructs (tables, headers, lists, horizontal rules,
* pre-formatted blocks, links, HTML comments) are handled by the
* <code>parse...</code> helpers, which append to the wiki model themselves;
* after such a helper succeeds the loop simply continues. The method returns
* only for inline markup (<code>TokenBOLDITALIC</code>, <code>TokenBOLD</code>,
* <code>TokenITALIC</code>, <code>TokenDELETEDLINE</code>,
* <code>TokenUNDERLNE</code>), after HTML tag handling
* (<code>TokenIgnore</code>) or when the source is exhausted
* (<code>TokenEOF</code>).
*
* An <code>IndexOutOfBoundsException</code> raised while reading
* <code>fSource</code> is used as the end-of-input signal.
*
* @return one of the <code>Token...</code> constants listed above
*/
public int getNextToken() // throws InvalidInputException
{
fWhiteStart = true;
fWhiteStartPosition = fCurrentPosition;
try {
while (true) {
fCurrentCharacter = fSource[fCurrentPosition++];
// ---------Identify the next token-------------
// first pass: constructs that are only meaningful at specific line
// positions (most helpers check isStartOfLine() themselves)
switch (fCurrentCharacter) {
case '\n':
// check at the end of line, if there is open wiki bold or italic
// markup
reduceTokenStackBoldItalic();
break;
case '{':
// dummy parsing of wikipedia templates for event listeners
if (parseTemplate()) {
} else {
// wikipedia table handling
if (parseTable()) {
continue;
} else if (parseCode()) {
continue;
} else if (parseBlockQuote()) {
continue;
}
}
break;
case '_': // TOC identifiers __NOTOC__, __FORCETOC__ ...
if (parseSpecialIdentifiers()) {
continue;
}
break;
case '=': // wikipedia header ?
if (parseSectionHeaders()) {
continue;
}
break;
case WPList.DL_DD_CHAR: // start of <dl><dd> list
case WPList.DL_DT_CHAR: // start of <dl><dt> list
case WPList.OL_CHAR: // start of <ol> list
case WPList.UL_CHAR: // start of <ul> list
if (parseLists()) {
continue;
}
break;
// case ':':
// if (parseSimpleDefinitionLists()) {
// continue;
// }
// break;
// case ';':
// if (parseDefinitionLists()) {
// continue;
// }
// break;
case '-': // parse ---- as <hr>
if (parseHorizontalRuler()) {
continue;
}
break;
case ' ': // pre-formatted text?
case '\t':
if (parsePreformattedWikiBlock()) {
continue;
}
break;
}
// paragraph handling: only on the first recursion level and only when the
// character just read is the first one of a line
if (isStartOfLine() && fWikiModel.getRecursionLevel() == 1) {
if (isEmptyLine(1)) {
if (fWikiModel.stackSize() > 0
&& (fWikiModel.peekNode() instanceof PTag)) {
// close <p> tag:
createContentToken(2);
fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
}
} else {
if (fWikiModel.stackSize() == 0) {
addParagraph();
// if (fWikiModel.getRecursionLevel() == 1) {
// addParagraph();
// } else {
// if (fCurrentPosition > 1) {
// addParagraph();
// }
// }
} else {
if (isEmptyLine(2)) {
if (fWikiModel.stackSize() > 0
&& (fWikiModel.peekNode() instanceof PTag)) {
// add <br> tag for one newline
createContentToken(2);
fWikiModel.pushNode(new HTMLTag("br"));
fWikiModel.popNode();
}
}
TagToken tag = fWikiModel.peekNode();
if (tag instanceof WPPreTag) {
addPreformattedText();
// } else if (tag instanceof PTag) {
// createContentToken(fWhiteStart, fWhiteStartPosition, 2);
// reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
} else {
// open a new paragraph only if the tag on top of the stack is listed as
// an allowed parent of <p> ("|name|" entries in getParents())
String allowedParents =
Configuration.HTML_PARAGRAPH_OPEN.getParents();
if (allowedParents != null) {
int index = -1;
index = allowedParents.indexOf("|" + tag.getName() + "|");
if (index >= 0) {
addParagraph();
}
}
}
}
}
}
// ---------Identify the next token-------------
// second pass: inline markup that may occur anywhere in a line
switch (fCurrentCharacter) {
case '[':
if (parseWikiLink()) {
continue;
}
break;
case '\'':
// runs of apostrophes: 2 = italic, 3 = bold, 5 = bold+italic; a run of
// exactly 4 is treated as bold after stepping back one character
if (getNextChar('\'')) {
if (getNextChar('\'')) {
if (getNextChar('\'')) {
if (getNextChar('\'')) {
createContentToken(5);
return TokenBOLDITALIC;
}
fCurrentPosition -= 1;
fWhiteStart = true;
createContentToken(3);
return TokenBOLD;
}
createContentToken(3);
return TokenBOLD;
}
createContentToken(2);
return TokenITALIC;
}
break;
case '%':
// "%%" toggles underline, "%%%" toggles deleted text
if (getNextChar('%')) {
if (getNextChar('%')) {
createContentToken(3);
return TokenDELETEDLINE;
}
createContentToken(2);
return TokenUNDERLNE;
}
break;
case '<':
if (fHtmlCodes) {
// remembered so the scanner can be reset if no tag is recognized
int htmlStartPosition = fCurrentPosition;
// HTML tags are allowed
try {
switch (fStringSource.charAt(fCurrentPosition)) {
case '!': // <!-- HTML comment -->
if (parseHTMLCommentTags()) {
continue;
}
break;
default:
if (fSource[fCurrentPosition] != '/') {
// opening HTML tag
WikiTagNode tagNode = parseTag(fCurrentPosition);
if (tagNode != null) {
String tagName = tagNode.getTagName();
TagToken tag = fWikiModel.getTokenMap().get(tagName);
if (tag != null) {
// work on a copy so the prototype in the token map stays untouched
tag = (TagToken) tag.clone();
if (tag instanceof TagNode) {
TagNode node = (TagNode) tag;
List<NodeAttribute> attributes =
tagNode.getAttributesEx();
Attribute attr;
String temp;
// NOTE(review): index 0 is skipped - presumably that entry describes the
// tag itself rather than an attribute; confirm against
// WikiTagNode.getAttributesEx()
for (int i = 1; i < attributes.size(); i++) {
attr = attributes.get(i);
temp = attr.getValue();
if (temp != null) {
temp = parseNowiki(temp);
}
node.addAttribute(attr.getName(), temp, true);
}
}
if (tag instanceof HTMLTag) {
((HTMLTag) tag).setTemplate(isTemplate());
}
createContentToken(1);
fCurrentPosition = fScannerPosition;
String allowedParents = tag.getParents();
if (allowedParents != null) {
fWikiModel.reduceTokenStack(tag);
}
createTag(tag, tagNode, tagNode.getEndPosition());
return TokenIgnore;
} else {
// tag name not registered in the model's token map: skip to the matching
// end tag and emit the skipped range as plain content
fWhiteStart = true;
skipUntilEndOfTag(tagNode, tagNode.getEndPosition());
createContentToken(0);
return TokenIgnore;
}
// break;
}
} else {
// closing HTML tag
WikiTagNode tagNode = parseTag(++fCurrentPosition);
if (tagNode != null) {
String tagName = tagNode.getTagName();
TagToken tag = fWikiModel.getTokenMap().get(tagName);
if (tag != null) {
createContentToken(2);
fCurrentPosition = fScannerPosition;
if (fWikiModel.stackSize() > 0) {
TagToken topToken = fWikiModel.peekNode();
if (topToken.getName().equals(tag.getName())) {
fWikiModel.popNode();
return TokenIgnore;
} else {
if (tag.isReduceTokenStack()) {
reduceStackUntilToken(tag);
}
}
} else {
}
return TokenIgnore;
}
break;
}
}
}
} catch (IndexOutOfBoundsException e) {
// do nothing
}
// no tag recognized: rewind to just behind the '<'
fCurrentPosition = htmlStartPosition;
}
break;
default:
// auto-linking is only attempted at the first letter of a word
if (Character.isLetter(fCurrentCharacter)) {
if (fCurrentPosition < 2
|| !Character.isLetterOrDigit(fSource[fCurrentPosition - 2])) {
if (fCurrentCharacter == 'i' || fCurrentCharacter == 'I') {
// ISBN ?
if (parseISBNLinks()) {
continue;
}
}
if (parseURIScheme()) {
// a URI scheme registered in the wiki model (ftp, http,
// https,...)
continue;
}
if (fWikiModel.isCamelCaseEnabled()
&& Character.isUpperCase(fCurrentCharacter)
&& fWikiModel.getRecursionLevel() <= 1) {
if (parseCamelCaseLink()) {
continue;
}
}
}
}
}
// restart the pending text at the current character if it is not active
if (!fWhiteStart) {
fWhiteStart = true;
fWhiteStartPosition = fCurrentPosition - 1;
}
}
// -----------------end switch while try--------------------
} catch (IndexOutOfBoundsException e) {
// end of scanner text
}
// flush whatever plain text is still pending
try {
createContentToken(1);
} catch (IndexOutOfBoundsException e) {
// end of scanner text
}
return TokenEOF;
}
/**
 * Strip <code>&lt;nowiki&gt;</code> / <code>&lt;/nowiki&gt;</code> markers
 * from a string while keeping the text they enclose.
 *
 * An opening marker without a matching closing marker is removed as well and
 * the remainder of the input is copied unchanged.
 *
 * @param input
 *          the text to clean; must not be <code>null</code>
 * @return the input itself if it contains no opening marker, otherwise a new
 *         string without the markers
 */
private String parseNowiki(String input) {
    final String openTag = "<nowiki>";
    final String closeTag = "</nowiki>";
    int open = input.indexOf(openTag);
    if (open < 0) {
        return input;
    }
    final StringBuilder result = new StringBuilder(input.length());
    int copiedUpTo = 0;
    do {
        // text in front of the opening marker
        result.append(input, copiedUpTo, open);
        copiedUpTo = open + openTag.length();
        final int close = input.indexOf(closeTag, open + 1);
        if (close < 0) {
            break;
        }
        // text enclosed by the marker pair
        result.append(input, copiedUpTo, close);
        copiedUpTo = close + closeTag.length();
        open = input.indexOf(openTag, close + 1);
    } while (open >= 0);
    result.append(input, copiedUpTo, input.length());
    return result.toString();
}
/**
 * Flush the pending text (ending two characters before the current
 * position), reduce the tag stack for an opening paragraph and push a new
 * <code>PTag</code>.
 */
private void addParagraph() {
    createContentToken(2);
    fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
    final PTag paragraph = new PTag();
    fWikiModel.pushNode(paragraph);
}
/**
 * Emit the pending content of a wiki pre-formatted block with trailing
 * whitespace removed, then reduce the tag stack for an opening paragraph and
 * push a new <code>PTag</code>.
 */
private void addPreformattedText() {
    if (fWhiteStart) {
        final int savedPosition = fCurrentPosition;
        // walk backwards over trailing whitespace; end up one past the last
        // non-whitespace character (or at the start of the pending text)
        int trimmedEnd = fCurrentPosition - 2;
        for (; trimmedEnd > fWhiteStartPosition; trimmedEnd--) {
            if (!Character.isWhitespace(fSource[trimmedEnd])) {
                trimmedEnd++;
                break;
            }
        }
        try {
            // temporarily move the cursor so that the content token ends at the
            // trimmed position
            fCurrentPosition = trimmedEnd;
            createContentToken(0);
        } finally {
            fCurrentPosition = savedPosition;
        }
    }
    fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
    fWikiModel.pushNode(new PTag());
}
/**
 * Skip an HTML comment (<code>&lt;!-- ... --&gt;</code>).
 *
 * On entry the cursor points at the <code>'!'</code> that follows the
 * already consumed <code>'&lt;'</code>. On success the text in front of the
 * comment is flushed as content and the cursor is left behind the closing
 * <code>--&gt;</code>; the comment itself produces no output.
 *
 * The previous implementation extracted the comment body into an unused
 * string and guarded on a <code>substring()</code> result being non-null,
 * which is always true; both were removed.
 *
 * @return <code>true</code> if a complete comment was skipped. When
 *         <code>false</code> is returned the cursor may have been advanced;
 *         the caller is expected to reset it.
 */
private boolean parseHTMLCommentTags() {
    final int bangPosition = fCurrentPosition;
    if (!fStringSource.startsWith("<!--", bangPosition - 1)) {
        return false;
    }
    fCurrentPosition += 3;
    if (!readUntil("-->")) {
        return false;
    }
    // drop everything from the '<' (one character before bangPosition) up to
    // the current position from the pending text
    createContentToken(fCurrentPosition - bangPosition + 1);
    return true;
}
/**
 * Try to read an <code>ISBN nnn-nnn...</code> reference that starts with the
 * current character (case-insensitive "ISBN" followed by a single space and
 * a run of digits and dashes) and append it to the model as an ISBN link.
 *
 * @return <code>true</code> if an ISBN link was appended; otherwise the
 *         cursor is restored and <code>false</code> is returned
 */
private boolean parseISBNLinks() {
    final int savedPosition = fCurrentPosition;
    boolean prefixMatched = false;
    try {
        // the pre-increments walk the cursor over "sbn " while testing
        if ((fCurrentCharacter == 'i' || fCurrentCharacter == 'I')
            && (fSource[fCurrentPosition] == 's' || fSource[fCurrentPosition] == 'S')
            && (fSource[++fCurrentPosition] == 'b' || fSource[fCurrentPosition] == 'B')
            && (fSource[++fCurrentPosition] == 'n' || fSource[fCurrentPosition] == 'N')
            && fSource[++fCurrentPosition] == ' ') {
            fCurrentPosition++;
            createContentToken(5);
            prefixMatched = true;
            char digit;
            do {
                digit = fSource[fCurrentPosition++];
            } while ((digit >= '0' && digit <= '9') || digit == '-');
        }
    } catch (IndexOutOfBoundsException e) {
        // ran off the end of the source while scanning
    }
    if (!prefixMatched) {
        // rollback work :-)
        fCurrentPosition = savedPosition;
        return false;
    }
    final String isbnText =
        fStringSource.substring(savedPosition - 1, fCurrentPosition - 1);
    fCurrentPosition--;
    fWikiModel.appendISBNLink(isbnText);
    return true;
}
/**
 * Try to read a bare <code>mailto:</code> link that starts with the current
 * character. The link extends up to the next whitespace character (or the
 * end of the source) and is accepted only if the address validates with
 * <code>EmailValidator</code>.
 *
 * Fix: on failure both the cursor <em>and</em> the current character are
 * restored. Previously <code>fCurrentCharacter</code> stayed overwritten with
 * the first character behind "mailto:", so the caller's follow-up checks
 * (e.g. the upper-case test guarding CamelCase links) looked at the wrong
 * character.
 *
 * @return <code>true</code> if a mailto link was appended to the model
 */
private boolean parseMailtoLinks() {
    final int urlStartPosition = fCurrentPosition;
    final char startCharacter = fCurrentCharacter;
    int tempPosition = fCurrentPosition;
    boolean foundUrl = false;
    try {
        // the pre-increments walk the cursor over "ailto" while testing
        if ((fCurrentCharacter == 'm' || fCurrentCharacter == 'M')
            && (fSource[fCurrentPosition] == 'a' || fSource[fCurrentPosition] == 'A')
            && (fSource[++fCurrentPosition] == 'i' || fSource[fCurrentPosition] == 'I')
            && (fSource[++fCurrentPosition] == 'l' || fSource[fCurrentPosition] == 'L')
            && (fSource[++fCurrentPosition] == 't' || fSource[fCurrentPosition] == 'T')
            && (fSource[++fCurrentPosition] == 'o' || fSource[fCurrentPosition] == 'O')) {
            tempPosition += 6;
            fCurrentCharacter = fSource[tempPosition++];
            foundUrl = true;
            while (!Character.isWhitespace(fSource[tempPosition++])) {
                // advance to the first whitespace character
            }
        }
    } catch (IndexOutOfBoundsException ignored) {
        // the link runs up to the end of the source
    }
    if (foundUrl) {
        final String urlString =
            fStringSource.substring(urlStartPosition - 1, tempPosition - 1);
        final String email = urlString.substring(7); // behind "mailto:"
        if (EmailValidator.getInstance().isValid(email)) {
            createContentToken(5);
            fWhiteStart = false;
            fCurrentPosition = tempPosition - 1;
            fWikiModel.appendMailtoLink(urlString, urlString, true);
            return true;
        }
    }
    // rollback work :-)
    fCurrentPosition = urlStartPosition;
    fCurrentCharacter = startCharacter;
    return false;
}
/**
 * Try to read a bare URI (<code>scheme:specific-part</code>) that starts with
 * the current character and append it as an external link. "mailto" is tried
 * first through {@link #parseMailtoLinks()}.
 *
 * See <a href="http://en.wikipedia.org/wiki/URI_scheme">URI scheme</a>
 *
 * Fix: when no link is produced the current character is restored together
 * with the cursor. Previously <code>fCurrentCharacter</code> could stay
 * overwritten with a character from inside the rejected URI, so the caller's
 * follow-up checks (e.g. the upper-case test guarding CamelCase links) looked
 * at the wrong character.
 *
 * @return <code>true</code> if a registered URI scheme was found in the wiki
 *         models configuration and a link was appended
 */
private boolean parseURIScheme() {
    final char startCharacter = fCurrentCharacter;
    if (fCurrentCharacter == 'm' || fCurrentCharacter == 'M') {
        // mailto ?
        if (parseMailtoLinks()) {
            return true;
        }
    }
    final int urlStartPosition = fCurrentPosition;
    int tempPosition = fCurrentPosition;
    String uriSchemeName = "";
    int index = -1;
    boolean foundUrl = false;
    try {
        index = indexOfUntilNoLetter(':', fCurrentPosition);
        if (index > 0) {
            uriSchemeName = fStringSource.substring(fCurrentPosition - 1, index);
            if (fWikiModel.isValidUriScheme(uriSchemeName)) {
                // found something like "ftp", "http", "https"
                tempPosition += uriSchemeName.length() + 1;
                fCurrentCharacter = fSource[tempPosition++];
                createContentToken(1);
                fWhiteStart = false;
                foundUrl = true;
                while (Encoder.isUrlIdentifierPart(fSource[tempPosition++])) {
                    // advance to the first character that cannot be part of a URL
                }
            }
        }
    } catch (IndexOutOfBoundsException ignored) {
        // the URI runs up to the end of the source
    }
    if (foundUrl) {
        final String restString =
            fStringSource.substring(urlStartPosition - 1, tempPosition - 1);
        final String uriSchemeSpecificPart =
            fStringSource.substring(index + 1, tempPosition - 1);
        if (fWikiModel.isValidUriSchemeSpecificPart(uriSchemeName, uriSchemeSpecificPart)) {
            fWhiteStart = false;
            fCurrentPosition = tempPosition - 1;
            fWikiModel.appendExternalLink(uriSchemeName, restString, restString, true);
            return true;
        }
    }
    // rollback work :-)
    fCurrentPosition = urlStartPosition;
    fCurrentCharacter = startCharacter;
    return false;
}
/**
 * Try to turn the word that starts with the current character into an
 * internal CamelCase link. The word qualifies if at least one of the
 * letters/digits following the first character is upper case (the caller
 * already verified that the first character is upper case).
 *
 * @return <code>true</code> if a link was appended and the cursor was moved
 *         behind the word; <code>false</code> (cursor untouched) otherwise
 */
private boolean parseCamelCaseLink() {
    final int wordStart = fCurrentPosition - 1;
    int scan = fCurrentPosition;
    boolean sawFurtherUpperCase = false;
    try {
        for (char c = fSource[scan++]; Character.isLetterOrDigit(c); c = fSource[scan++]) {
            if (Character.isUpperCase(c)) {
                // at least 2 upper case characters appear in the word
                sawFurtherUpperCase = true;
            }
        }
    } catch (IndexOutOfBoundsException iobe) {
        // the word runs up to the end of the source
    }
    if (!sawFurtherUpperCase) {
        return false;
    }
    createContentToken(1);
    fWhiteStart = false;
    fCurrentPosition = scan - 1;
    final String name = fStringSource.substring(wordStart, fCurrentPosition);
    fWikiModel.appendInternalLink(name, null, name, null, false);
    return true;
}
/**
 * Parse a wiki section starting with a '[' character: either a
 * <code>[[...]]</code> wiki link (delegated to {@link #parseWikiTag()}), a
 * bbcode tag (if enabled in the model) or a single-bracket external link.
 *
 * @return <code>true</code> if a correct link was found
 */
private boolean parseWikiLink() {
    final int linkStart = fCurrentPosition;
    if (getNextChar('[')) {
        return parseWikiTag();
    }
    if (getNextCharAsWhitespace()) {
        // "[ " is never a link; step back over the whitespace
        fCurrentPosition--;
        return false;
    }
    createContentToken(1);
    fWhiteStart = false;
    if (readUntilCharOrStopAtEOL(']')) {
        final String name = fStringSource.substring(linkStart, fCurrentPosition - 1);
        // bbcode start
        if (fWikiModel.parseBBCodes() && name.length() > 0) {
            // parse start tokens like phpBB forum syntax style (bbcode)
            final char first = name.charAt(0);
            if ('a' <= first && first <= 'z') {
                // first character must be a letter
                final StringBuilder bbCode = new StringBuilder(name.length());
                bbCode.append(first);
                if (parsePHPBBCode(name, bbCode)) {
                    return true;
                }
            }
        }
        // bbcode end
        if (handleHTTPLink(name)) {
            return true;
        }
    }
    fCurrentPosition = linkStart;
    return false;
}
/**
* Parse a wiki section starting with a '[[' sequence.
*
* On entry the cursor points at the first character behind "[[". If the end
* of the link is found, the event listener is notified, the link (plus an
* optional lower-case suffix directly behind "]]") is passed to the model and
* <code>true</code> is returned. Otherwise the pending text is restarted at
* the first '[' and <code>false</code> is returned.
*
* @return <code>true</code> if a correct link was found
*/
private boolean parseWikiTag() {
int startLinkPosition = fCurrentPosition;
int endLinkPosition;
// wikipedia link style
// flush the text in front of the "[[" sequence
createContentToken(2);
int temp = fCurrentPosition;
if (findWikiLinkEnd()) {
// fCurrentPosition now points behind the closing "]]"
endLinkPosition = fCurrentPosition - 2;
String name = fStringSource.substring(startLinkPosition, endLinkPosition);
// test for a suffix string behind the Wiki link. Useful for plurals.
// Example:
// Dolphins are [[aquatic mammal]]s that are closely related to [[whale]]s
// and [[porpoise]]s.
// remember the position directly behind "]]" (i.e. in front of a suffix)
temp = fCurrentPosition;
String suffix = "";
try {
fCurrentCharacter = fSource[fCurrentPosition];
if (Character.isLowerCase(fCurrentCharacter)) {
fCurrentPosition++;
StringBuilder suffixBuffer = new StringBuilder(16);
suffixBuffer.append(fCurrentCharacter);
while (true) {
fCurrentCharacter = fSource[fCurrentPosition++];
if (!Character.isLowerCase(fCurrentCharacter)) {
fCurrentPosition--;
break;
}
suffixBuffer.append(fCurrentCharacter);
}
suffix = suffixBuffer.toString();
}
} catch (IndexOutOfBoundsException e) {
// end of source reached while reading the suffix: the cursor is reset to
// directly behind "]]" and the suffix stays empty
fCurrentPosition = temp;
}
fEventListener.onWikiLink(
fSource,
startLinkPosition,
endLinkPosition,
suffix);
if (!fWikiModel.appendRawWikipediaLink(name, suffix)) {
// the model reported false: un-consume the suffix (true is still
// returned to the caller)
fCurrentPosition = temp;
}
return true;
} else {
// no link end found: restart the pending text at the first '[' and move
// the cursor one character past the position recorded in temp
fWhiteStart = true;
fWhiteStartPosition = startLinkPosition - 2;
fCurrentPosition = temp + 1;
}
return false;
}
/**
* Try to parse a wiki pre-formatted block, i.e. consecutive lines that each
* start with a space or tab character.
*
* The method only acts if the current character is the first one of a
* non-empty line and the top of the tag stack is either absent, not an
* <code>HTMLBlockTag</code>, or a <code>PTag</code>. A <code>WPPreTag</code>
* is pushed while the block's lines are handed to
* {@link #createPreContentToken(int)} and popped again in the
* <code>finally</code> clause.
*
* @return <code>true</code> if the block was consumed, <code>false</code> if
*         the preconditions were not met or a line was rejected by
*         <code>createPreContentToken</code>
*/
private boolean parsePreformattedWikiBlock() {
if (isStartOfLine() && !isEmptyLine(1)) {
if (fWikiModel.stackSize() == 0
|| !(fWikiModel.peekNode() instanceof HTMLBlockTag)
|| (fWikiModel.peekNode() instanceof PTag)) {
createContentToken(2);
fWikiModel.reduceTokenStack(Configuration.HTML_PRE_OPEN);
// don't use Configuration.HTML_PRE_OPEN here
// rendering differs between these tags!
fWikiModel.pushNode(new WPPreTag());
char ch = ' ';
try {
// one iteration per line of the block
while (ch == ' ' || ch == '\t') {
// SPACE or TAB => check if it's a pre-formatted text
fWhiteStart = true;
fWhiteStartPosition = fCurrentPosition;
ch = fSource[fCurrentPosition++];
// advance to the end of the current line (or the end of the source)
while (ch != '\n' && fCurrentPosition < fSource.length) {
ch = fSource[fCurrentPosition++];
}
if (fCurrentPosition == fSource.length) {
// scanner reached end of text
if (!createPreContentToken(0)) {
// NOTE(review): on rejection the source character in front of the line is
// overwritten with '\n' before giving up - presumably so that the line is
// re-scanned as ordinary text; confirm the intent
fCurrentPosition = fWhiteStartPosition;
fSource[fWhiteStartPosition - 1] = '\n';
return false;
}
} else {
// look at the first character of the following line
ch = fSource[fCurrentPosition++];
if (ch == ' ' || ch == '\t') {
// the block continues on the next line
if (!createPreContentToken(1)) {
fCurrentPosition = fWhiteStartPosition;
fSource[fWhiteStartPosition - 1] = '\n';
return false;
}
} else {
// skip the newline character at the end of the pre-formatted
// block
if (!createPreContentToken(2)) {
fCurrentPosition = fWhiteStartPosition;
fSource[fWhiteStartPosition - 1] = '\n';
return false;
} else {
fCurrentPosition--;
return true;
}
}
}
}
} catch (IndexOutOfBoundsException e) {
fCurrentPosition--;
} finally {
// runs on every exit path of the try block, including the early returns
fWikiModel.popNode();
}
}
return true;
}
return false;
}
/**
 * Parse <code>----</code> (four or more dashes alone on a line) as an
 * &lt;hr&gt; tag.
 *
 * On entry the first dash has already been consumed. On success the text in
 * front of the rule is flushed, the tag stack is reduced, the rule is
 * appended and the cursor is placed behind the terminating newline.
 *
 * Fix: when the line turns out not to be a horizontal rule the cursor is now
 * left where it was. The previous code wrote its scan index back to
 * <code>fCurrentPosition</code>, which silently skipped the one to three
 * characters it had just inspected (so e.g. a '[' or '\'' directly behind a
 * leading dash was never tokenized) and made the caller's subsequent
 * start-of-line paragraph check fail for lines beginning with a dash.
 *
 * @return <code>true</code> if a horizontal rule was appended
 */
private boolean parseHorizontalRuler() {
    if (!isStartOfLine()) {
        return false;
    }
    final int savedPosition = fCurrentPosition;
    try {
        if (fSource[savedPosition] == '-'
            && fSource[savedPosition + 1] == '-'
            && fSource[savedPosition + 2] == '-') {
            final int pos = isEndOfLine('-', savedPosition + 3);
            if (pos > 0) {
                final HrTag hr = new HrTag();
                createContentToken(2);
                fWikiModel.reduceTokenStack(hr);
                fCurrentPosition = pos;
                fWikiModel.append(hr);
                fWhiteStart = false;
                return true;
            }
        }
    } catch (IndexOutOfBoundsException ignored) {
        // fewer than four dashes before the end of the source
    }
    fCurrentPosition = savedPosition;
    return false;
}
/**
 * Parse a wiki list that starts at the beginning of the current line.<br/>
 * <br/>
 * Example:<br/>
 *
 * <pre>
 * * first line
 * * second line
 * ** third line
 * </pre>
 *
 * @return <code>true</code> if a non-empty list was parsed and appended to
 *         the model
 */
private boolean parseLists() {
    if (!isStartOfLine()) {
        return false;
    }
    // set scanner pointer to '\n' character:
    setPosition(fCurrentPosition - 2);
    final WPList list = wpList();
    if (list == null || list.isEmpty()) {
        return false;
    }
    createContentToken(1);
    fWikiModel.reduceTokenStack(list);
    fCurrentPosition = getPosition() - 1;
    fWikiModel.append(list);
    return true;
}
/**
* Parses a wiki header line into "h1, h2, h3, h4, h5, h6" HTML
* tags. <br/>
* <br/>
* Example wiki syntax header line: <br/>
* <code>== Test header 2 ==</code>
*
* The header level is the number of '=' characters that match pairwise at
* the start and at the (whitespace-trimmed) end of the line, capped at 6.
* The listener's <code>onHeader()</code> is invoked and the header is passed
* to <code>IWikiModel.appendHead()</code>, whose result becomes the current
* table-of-contents tag.
*
* @return <code>true</code> if a header line could be parsed correctly,
*         <code>false</code> otherwise.
*/
private boolean parseSectionHeaders() {
if (isStartOfLine()) {
// index of the '=' that was just consumed
int headerStartPosition = fCurrentPosition - 1;
int endIndex = fStringSource.indexOf("\n", fCurrentPosition);
if (endIndex < 0) {
endIndex = fStringSource.length();
}
// walk backwards from the end of the line to the last non-whitespace
// character
int headerEndPosition = endIndex;
char ch;
while (headerEndPosition > 0) {
ch = fSource[--headerEndPosition];
if (!Character.isWhitespace(ch)) {
break;
}
}
if (headerEndPosition < 0 || headerEndPosition <= headerStartPosition) {
return false;
}
int level = 0;
int startPosition = headerStartPosition;
int endPosition = headerEndPosition + 1;
// strip matching '=' pairs from both ends; every pair raises the level
while (headerStartPosition < headerEndPosition) {
if (fSource[headerStartPosition] == '='
&& fSource[headerEndPosition] == '=') {
level++;
headerStartPosition++;
headerEndPosition--;
} else {
// make the end index exclusive again
headerEndPosition++;
break;
}
}
if (level == 0) {
return false;
}
if (level > 6) {
level = 6;
}
createContentToken(1);
reduceTokenStack();
String head = "";
if (headerEndPosition >= headerStartPosition) {
if (headerEndPosition > headerStartPosition) {
head =
fStringSource.substring(headerStartPosition, headerEndPosition);
} else {
// both indices met on a single character (the loop above ended without
// taking its else branch), e.g. "=a="
head = String.valueOf(fStringSource.charAt(headerStartPosition));
}
}
fEventListener.onHeader(
fSource,
startPosition,
endPosition,
headerStartPosition,
headerEndPosition,
level);
// continue scanning at the end of the header line
fCurrentPosition = endIndex;
// head is never null at this point, so the header is always appended
if (head != null) {
fTableOfContentTag =
fWikiModel.appendHead(
head,
level,
fNoToC,
++fHeadCounter,
startPosition,
endPosition);
}
return true;
}
return false;
}
/**
 * Try to parse a wiki table that starts at the beginning of the current
 * line and append it to the model.
 *
 * @return <code>true</code> if a table was parsed; the cursor then points
 *         behind the table
 */
private boolean parseTable() {
    if (!isStartOfLine()) {
        return false;
    }
    // wiki table ?
    setPosition(fCurrentPosition - 1);
    final WPTable table = wpTable(fTableOfContentTag);
    if (table == null) {
        return false;
    }
    createContentToken(1);
    fWikiModel.reduceTokenStack(table);
    // set pointer behind: "\n|}"
    fCurrentPosition = getPosition();
    fWikiModel.append(table);
    // table.filter(fSource, fWikiModel);
    return true;
}
/**
 * Try to parse a code block that starts at the beginning of the current
 * line and append it to the model.
 *
 * Fix: the method now reports success. It used to return <code>false</code>
 * even after the block had been appended and the cursor moved behind it, so
 * the caller's <code>continue</code> branch was unreachable and the scanner
 * went on treating the old current character as ordinary text at the new
 * position.
 *
 * @return <code>true</code> if a code block was parsed; the cursor then
 *         points behind the block
 */
private boolean parseCode() {
    if (!isStartOfLine()) {
        return false;
    }
    setPosition(fCurrentPosition - 1);
    final WPCodeBlock code = codeBlock();
    if (code == null) {
        return false;
    }
    createContentToken(1);
    fWikiModel.reduceTokenStack(code);
    fCurrentPosition = getPosition();
    fWikiModel.append(code);
    return true;
}
/**
 * Try to parse a block quote that starts at the beginning of the current
 * line and append it to the model.
 *
 * Fix: the method now reports success. It used to return <code>false</code>
 * even after the quote had been appended and the cursor moved behind it, so
 * the caller's <code>continue</code> branch was unreachable and the scanner
 * went on treating the old current character as ordinary text at the new
 * position.
 *
 * @return <code>true</code> if a block quote was parsed; the cursor then
 *         points behind the quote
 */
private boolean parseBlockQuote() {
    if (!isStartOfLine()) {
        return false;
    }
    setPosition(fCurrentPosition - 1);
    final WPBlockQuote quote = blockQuote();
    if (quote == null) {
        return false;
    }
    createContentToken(1);
    fWikiModel.reduceTokenStack(quote);
    fCurrentPosition = getPosition();
    fWikiModel.append(quote);
    return true;
}
/**
 * Dummy parsing of Wikipedia templates for event listeners: if the current
 * '{' starts a <code>{{...}}</code> template (and not a <code>{{{</code>
 * sequence) whose end can be found, the listener's
 * <code>onTemplate()</code> is invoked. The cursor is never changed.
 *
 * Fix: the two look-ahead reads are bounds-checked. Without the check a '{'
 * (or "{{") at the very end of the source raised the
 * <code>IndexOutOfBoundsException</code> that the token loop interprets as
 * end of input while the cursor had not advanced past the end, which cut the
 * last character off the emitted text.
 *
 * @return <code>true</code> if a template was reported to the listener
 */
private boolean parseTemplate() {
    final int templateStartPosition = fCurrentPosition + 1;
    if (templateStartPosition >= fSource.length) {
        // not enough input left for "{{" plus one more character
        return false;
    }
    if (fSource[fCurrentPosition] != '{' || fSource[templateStartPosition] == '{') {
        return false;
    }
    final int templateEndPosition =
        findNestedTemplateEnd(fSource, templateStartPosition);
    if (templateEndPosition <= 0) {
        return false;
    }
    fEventListener.onTemplate(fSource, templateStartPosition, templateEndPosition - 2);
    return true;
}
/**
 * Parse special identifiers like __TOC__, __NOTOC__, __FORCETOC__ and any
 * behavior switch accepted by <code>IWikiModel.parseBehaviorSwitch()</code>.
 *
 * On entry the first '_' has already been consumed. An identifier consists
 * of a second '_', a (possibly empty) run of upper-case ASCII letters and a
 * closing "__".
 *
 * Fixes:
 * <ul>
 * <li>The cursor is no longer modified unless an identifier is recognized.
 * The previous code advanced it over the second '_' and never stepped back,
 * so that character was skipped by the tokenizer (e.g.
 * <code>___NOTOC__</code> was not recognized).</li>
 * <li>Running off the end of the source while scanning now simply yields
 * <code>false</code> instead of raising the exception that the token loop
 * interprets as end of input with a misplaced cursor.</li>
 * </ul>
 *
 * @return <code>true</code> if an identifier was recognized and consumed
 */
private boolean parseSpecialIdentifiers() {
    // index of the first character behind the leading "__"
    final int identStart = fCurrentPosition + 1;
    int scan = identStart;
    try {
        if (fSource[fCurrentPosition] != '_') {
            return false;
        }
        char ch;
        do {
            ch = fSource[scan++];
        } while (ch >= 'A' && ch <= 'Z');
        // ch is the first character behind the letters; it and its successor
        // must form the closing "__"
        if (ch != '_' || fSource[scan] != '_') {
            return false;
        }
    } catch (IndexOutOfBoundsException ignored) {
        // source ended in the middle of a potential identifier
        return false;
    }
    final String tocIdent = fStringSource.substring(identStart, scan - 1);
    final int behindIdentifier = scan + 1;
    if (fWikiModel.parseBehaviorSwitch(tocIdent)) {
        // flush the text in front of the first '_' (one character back)
        createContentToken(1);
        fCurrentPosition = behindIdentifier;
        return true;
    }
    for (int i = 0; i < TOC_IDENTIFIERS.length; i++) {
        if (!TOC_IDENTIFIERS[i].equals(tocIdent)) {
            continue;
        }
        createContentToken(1);
        fCurrentPosition = behindIdentifier;
        switch (i) {
        case 0: // TOC
            fTableOfContentTag = fWikiModel.createTableOfContent(true);
            fForceToC = true;
            break;
        case 1: // NOTOC
            setNoToC(true);
            break;
        case 2: // FORCETOC
            fForceToC = true;
            break;
        }
        return true;
    }
    return false;
}
/**
 * Check if the character that was just consumed (the one in front of the
 * scanner's cursor) is the first character of a line, i.e. it is the first
 * character of the source or it is preceded by a newline.
 *
 * @return <code>true</code> if the scanners cursor points to the beginning of
 *         a line, <code>false</code> otherwise.
 */
private boolean isStartOfLine() {
    if (fCurrentPosition == 1) {
        // very first character of the source
        return true;
    }
    return fCurrentPosition >= 2 && fSource[fCurrentPosition - 2] == '\n';
}
/**
 * Check that, starting at <code>currentPosition</code>, the rest of the line
 * consists only of repetitions of <code>testChar</code> followed by optional
 * whitespace and a terminating newline.
 *
 * @param testChar
 *          the character that may be repeated
 * @param currentPosition
 *          index in the source at which to start checking
 * @return the index directly behind the terminating newline, or
 *         <code>-1</code> if another character is found first or the source
 *         ends before a newline is seen
 */
private int isEndOfLine(char testChar, int currentPosition) {
    int pos = currentPosition;
    try {
        while (fSource[pos] == testChar) {
            pos++;
        }
        for (;;) {
            final char ch = fSource[pos++];
            if (ch == '\n') {
                return pos;
            }
            if (!Character.isWhitespace(ch)) {
                return -1;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // source ended without a newline
    }
    return -1;
}
/**
 * Determine the body of an opening tag and pass tag and body on to
 * {@link #handleTag(TagToken, WikiTagNode, String)}.
 *
 * For a body tag (<code>IBodyTag</code>) that is not an empty XML tag the
 * body reaches from <code>startMacroPosition</code> to the matching end tag
 * (searched case-insensitively) or, if there is none, to the end of the
 * source; the cursor is moved behind the end tag respectively to the end of
 * the source. For every other tag the body is <code>null</code> and the
 * cursor is set to <code>startMacroPosition</code>.
 *
 * @param tag
 *          the tag instance to insert into the model
 * @param tagNode
 *          the scanned tag
 * @param startMacroPosition
 *          index of the first character behind the opening tag
 */
private void createTag(TagToken tag, WikiTagNode tagNode,
        int startMacroPosition) {
    final String command = tagNode.getTagName();
    String body = null;
    // instanceof already yields false for a null tag
    if ((tag instanceof IBodyTag) && !tagNode.isEmptyXmlTag()) {
        final String closing = command + '>';
        final int closeIndex =
            Util.indexOfIgnoreCase(fStringSource, "</", closing, startMacroPosition);
        if (closeIndex < 0) {
            body = fStringSource.substring(startMacroPosition, fSource.length);
            fCurrentPosition = fSource.length;
        } else {
            body = fStringSource.substring(startMacroPosition, closeIndex);
            // skip "</" + name + ">"
            fCurrentPosition = closeIndex + closing.length() + 2;
        }
    } else {
        fCurrentPosition = startMacroPosition;
    }
    handleTag(tag, tagNode, body);
}
/**
 * Move the cursor behind the end tag that matches <code>tagNode</code>
 * (searched case-insensitively from <code>startMacroPosition</code>), or to
 * the end of the source if there is no such end tag. Nothing happens for an
 * empty XML tag.
 *
 * @param tagNode
 *          the scanned opening tag
 * @param startMacroPosition
 *          index at which the search for the end tag starts
 */
private void skipUntilEndOfTag(WikiTagNode tagNode, int startMacroPosition) {
    final String command = tagNode.getTagName();
    if (tagNode.isEmptyXmlTag()) {
        return;
    }
    final String closing = command + '>';
    final int closeIndex =
        Util.indexOfIgnoreCase(fStringSource, "</", closing, startMacroPosition);
    // "+ 2" accounts for the leading "</"
    fCurrentPosition = (closeIndex >= 0) ? closeIndex + closing.length() + 2 : fSource.length;
}
/**
 * Interpret the content of a single-bracket link (<code>[url alias]</code>)
 * and append the matching link to the model.
 *
 * Recognized forms are protocol-relative URLs (<code>//host/...</code>),
 * <code>mailto:</code> links with a valid e-mail address and URIs whose
 * scheme and scheme-specific part are accepted by the model. Text behind the
 * first space is used as the link alias.
 *
 * Changes compared to the previous version:
 * <ul>
 * <li>Removed a stray <code>parseURIScheme()</code> call whose result was
 * ignored. It operated on the scanner position behind the closing ']' and
 * not on <code>name</code>, and could at worst append a second link and move
 * the cursor.</li>
 * <li>The e-mail address is always taken as <code>urlString.substring(7)</code>;
 * the former two-branch computation produced the same value because the URL
 * has already been cut at the first space.</li>
 * </ul>
 *
 * @param name
 *          the text between '[' and ']'; may be <code>null</code>
 * @return <code>true</code> if a link was appended to the model
 */
private boolean handleHTTPLink(String name) {
    if (name == null) {
        return false;
    }
    String urlString = name.trim();
    String uriSchemeName = "";
    boolean isEmail = false;
    boolean foundUrl = false;
    int index = -1;
    final boolean protocolRelativeURL =
        urlString.length() >= 2 && urlString.charAt(0) == '/' && urlString.charAt(1) == '/';
    if (protocolRelativeURL) {
        // issue 89
        foundUrl = true;
    } else {
        try {
            index = urlString.indexOf(':', 1);
            if (index > 0) {
                uriSchemeName = urlString.substring(0, index);
                if (uriSchemeName.equalsIgnoreCase("mailto")) {
                    isEmail = true;
                    foundUrl = true;
                } else if (fWikiModel.isValidUriScheme(uriSchemeName)) {
                    foundUrl = true;
                }
            }
        } catch (IndexOutOfBoundsException ignored) {
            // kept from the original implementation: treat as "no URL found"
        }
    }
    if (!foundUrl) {
        return false;
    }
    // Wikipedia link style: name separated by space?
    final int spaceIndex = urlString.indexOf(' ');
    final String alias;
    if (spaceIndex != -1) {
        alias = urlString.substring(spaceIndex + 1);
        urlString = urlString.substring(0, spaceIndex);
    } else if (protocolRelativeURL) {
        alias = urlString.substring(2);
    } else {
        alias = urlString;
    }
    if (isEmail) {
        // urlString starts with "mailto:" (7 characters)
        final String email = urlString.substring(7);
        if (EmailValidator.getInstance().isValid(email)) {
            fWikiModel.appendMailtoLink(urlString, alias, false);
            return true;
        }
        return false;
    }
    if (protocolRelativeURL) {
        fWikiModel.appendExternalLink(uriSchemeName, urlString, alias, false);
        return true;
    }
    final String uriSchemeSpecificPart = urlString.substring(index + 1);
    if (fWikiModel.isValidUriSchemeSpecificPart(uriSchemeName, uriSchemeSpecificPart)) {
        fWikiModel.appendExternalLink(uriSchemeName, urlString, alias, false);
        return true;
    }
    return false;
}
/**
 * Insert a tag into the wiki model.
 *
 * An <code>EndTagToken</code> is appended directly. Any other tag is pushed
 * onto the tag stack; a non-null body is either attached verbatim (for
 * <code>INoBodyParsingTag</code>) or parsed recursively after trimming.
 * Body tags (<code>IBodyTag</code>) are popped again afterwards.
 *
 * Failures are reported in the output as a
 * <code>&lt;div class="error"&gt;</code> element and printed to the standard
 * error stream. Fix: the generic handler used to print the stack trace
 * twice; it is now printed once.
 *
 * @param tag
 *          the tag instance to insert
 * @param tagNode
 *          the scanned tag (used for its name in error messages)
 * @param bodyString
 *          the raw tag body or <code>null</code> if the tag has none
 */
private void handleTag(TagToken tag, WikiTagNode tagNode, String bodyString) {
    final String command = tagNode.getTagName();
    try {
        if (tag instanceof EndTagToken) {
            fWikiModel.append(tag);
        } else {
            fWikiModel.pushNode(tag);
            if (null != bodyString) {
                if (tag instanceof INoBodyParsingTag) {
                    ((TagNode) tag).addChild(new ContentToken(bodyString));
                } else {
                    // recursively filter tags within the tags body string
                    WikipediaParser.parseRecursive(
                        bodyString.trim(),
                        fWikiModel,
                        false,
                        true);
                }
            }
            if (tag instanceof IBodyTag) {
                fWikiModel.popNode();
            }
        }
    } catch (IllegalArgumentException e) {
        TagNode divTagNode = new TagNode("div");
        divTagNode.addAttribute("class", "error", true);
        divTagNode.addChild(new ContentToken("IllegalArgumentException: "
            + command
            + " - "
            + e.getMessage()));
        fWikiModel.append(divTagNode);
        e.printStackTrace();
    } catch (Throwable e) {
        // Throwable is caught on purpose as in the original code, so that a
        // failure inside one tag does not abort rendering of the whole page
        e.printStackTrace();
        TagNode divTagNode = new TagNode("div");
        divTagNode.addAttribute("class", "error", true);
        divTagNode.addChild(new ContentToken(command + ": " + e.getMessage()));
        fWikiModel.append(divTagNode);
    }
}
/**
 * Drive the tokenizer until the end of the source and translate the inline
 * formatting tokens into pushes/pops on the model's tag stack. Every other
 * token returned by {@link #getNextToken()} is ignored here. Finally the
 * remaining tag stack is reduced.
 */
@Override
public void runParser() {
    int token;
    while ((token = getNextToken()) != TokenEOF) {
        switch (token) {
        case TokenBOLDITALIC:
            applyBoldItalicToken();
            break;
        case TokenBOLD:
            applyBoldToken();
            break;
        case TokenITALIC:
            applyItalicToken();
            break;
        case TokenUNDERLNE:
            if (fWikiModel.stackSize() > 0
                && fWikiModel.peekNode().equals(UNDERLINE)) {
                fWikiModel.popNode();
            } else {
                fWikiModel.pushNode(new WPTag("u"));
            }
            break;
        case TokenDELETEDLINE:
            if (fWikiModel.stackSize() > 0
                && fWikiModel.peekNode().equals(DELETEDLINE)) {
                fWikiModel.popNode();
            } else {
                fWikiModel.pushNode(new WPTag("del"));
            }
            break;
        }
    }
    reduceTokenStack();
    if (!fNoToC && fTableOfContentTag != null && (fHeadCounter > 3 || fForceToC)) {
        /** bug fix */
        // fTableOfContentTag.setShowToC(true);
    }
}

/**
 * Handle a five-apostrophe token: close an open bold+italic tag, close an
 * open bold/italic pair, turn an open bold into italic (or vice versa), or
 * open a new bold+italic tag.
 */
private void applyBoldItalicToken() {
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(BOLDITALIC)) {
        fWikiModel.popNode();
        return;
    }
    if (fWikiModel.stackSize() > 1
        && ((fWikiModel.peekNode().equals(BOLD)
            && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(ITALIC))
            || (fWikiModel.peekNode().equals(ITALIC)
                && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(BOLD)))) {
        // bold and italic are open on top of each other: close both
        fWikiModel.popNode();
        fWikiModel.popNode();
        return;
    }
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(BOLD)) {
        fWikiModel.popNode();
        fWikiModel.pushNode(new WPTag("i"));
        return;
    }
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(ITALIC)) {
        fWikiModel.popNode();
        fWikiModel.pushNode(new WPTag("b"));
        return;
    }
    fWikiModel.pushNode(new WPBoldItalicTag());
}

/**
 * Handle a three-apostrophe token: an open bold+italic tag becomes italic,
 * an open bold tag is closed, otherwise a bold tag is opened.
 */
private void applyBoldToken() {
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(BOLDITALIC)) {
        fWikiModel.popNode();
        fWikiModel.pushNode(new WPTag("i"));
        // fResultBuffer.append("</b>");
        return;
    }
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(BOLD)) {
        fWikiModel.popNode();
        return;
    }
    fWikiModel.pushNode(new WPTag("b"));
}

/**
 * Handle a two-apostrophe token: an open bold+italic tag becomes bold, an
 * open italic tag is closed, otherwise an italic tag is opened.
 */
private void applyItalicToken() {
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(BOLDITALIC)) {
        fWikiModel.popNode();
        fWikiModel.pushNode(new WPTag("b"));
        return;
    }
    if (fWikiModel.stackSize() > 0
        && fWikiModel.peekNode().equals(ITALIC)) {
        fWikiModel.popNode();
        return;
    }
    fWikiModel.pushNode(new WPTag("i"));
}
/**
* @return the current value of the "no table of contents" flag, as set by
*         {@link #setNoToC(boolean)}
*/
@Override
public boolean isNoToC() {
return fNoToC;
}
/**
* Set the "no table of contents" flag. Within this class it is set to
* <code>true</code> when <code>__NOTOC__</code> is recognized, and its value
* is forwarded to <code>IWikiModel.appendHead()</code> for every header.
*
* @param noToC
*          the new flag value
*/
@Override
public void setNoToC(boolean noToC) {
fNoToC = noToC;
}
/**
 * Call the parser on the first recursion level, where the text can contain a
 * table of contents (TOC).
 *
 * <br/>
 * <br/>
 * <b>Note:</b> in this level the wiki model will call the
 * <code>setUp()</code> method before parsing and the <code>tearDown()</code>
 * method after the parser has finished (also if parsing fails).
 *
 * If the template pass throws, its output is replaced by an error
 * <code>&lt;span&gt;</code> naming the exception class. The text is parsed
 * only if <code>AbstractParser.parseRedirect()</code> returns
 * <code>null</code> for it.
 *
 * @param rawWikiText
 *          the raw text of the article
 * @param wikiModel
 *          a suitable wiki model for the given wiki article text
 * @param parseTemplates
 *          parse the template expansion step
 * @param templateParserBuffer
 *          if the <code>templateParserBuffer != null</code> the
 *          <code>templateParserBuffer</code> will be used to append the
 *          result of the template expansion step
 */
public static void parse(String rawWikiText, IWikiModel wikiModel,
        boolean parseTemplates, Appendable templateParserBuffer) {
    try {
        // initialize the wiki model
        wikiModel.setUp();
        String textToParse = rawWikiText;
        if (parseTemplates) {
            final Appendable buf = (templateParserBuffer != null)
                ? templateParserBuffer
                : new StringBuilder(rawWikiText.length() + rawWikiText.length() / 10);
            try {
                TemplateParser.parse(rawWikiText, wikiModel, buf, wikiModel
                    .isTemplateTopic());
                textToParse = buf.toString();
            } catch (Exception ioe) {
                ioe.printStackTrace();
                textToParse =
                    "<span class=\"error\">TemplateParser exception: "
                        + ioe.getClass().getSimpleName()
                        + "</span>";
            }
        }
        if (AbstractParser.parseRedirect(textToParse, wikiModel) == null) {
            parseRecursive(textToParse, wikiModel, false, false);
        }
    } finally {
        // clean up wiki model if necessary
        wikiModel.tearDown();
    }
}
/**
* Call the parser on the subsequent recursion levels, where the subtexts (of
* templates, table cells, list items or image captions) don't contain a table
* of contents (TOC).
*
* <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
* <code>tearDown()</code> methods for the subsequent recursive parser steps.
*
* Convenience overload: delegates to
* {@link #parseRecursive(String, IWikiModel, boolean, boolean)} with
* <code>createOnlyLocalStack = false</code> and <code>noTOC = true</code>
* and discards the returned tag stack.
*
* @param rawWikitext
*          the wiki text to parse
* @param wikiModel
*          the wiki model that receives the parsed content
*/
public static void parseRecursive(String rawWikitext, IWikiModel wikiModel) {
parseRecursive(rawWikitext, wikiModel, false, true);
}
/**
* Call the parser on the subsequent recursion levels, where the subtexts (of
* templates, table cells, list items or image captions) don't contain a table
* of contents (TOC).
*
* <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
* <code>tearDown()</code> methods for the subsequent recursive parser steps.
*
* A new parser instance is obtained from
* <code>wikiModel.createNewInstance(rawWikitext)</code> and its
* <code>parseRecursiveInternal()</code> method does the actual work.
*
* @param rawWikitext
*          the wiki text to parse
* @param wikiModel
*          the wiki model that creates the parser and receives the content
* @param createOnlyLocalStack
*          passed through to <code>parseRecursiveInternal()</code>
*          (presumably: collect the result in a separate tag stack instead
*          of the model's current one - confirm in AbstractParser)
* @param noTOC
*          passed through to <code>parseRecursiveInternal()</code>
*          (presumably: suppress table-of-contents handling - confirm in
*          AbstractParser)
* @return the tag stack returned by <code>parseRecursiveInternal()</code>
*/
public static TagStack parseRecursive(String rawWikitext,
IWikiModel wikiModel, boolean createOnlyLocalStack, boolean noTOC) {
AbstractParser parser = wikiModel.createNewInstance(rawWikitext);
return parser
.parseRecursiveInternal(wikiModel, createOnlyLocalStack, noTOC);
}
/**
* Determine if the currently parsed wiki text is a template text. The value
* is the <code>renderTemplate</code> flag given to the constructor; it is
* passed on to every <code>HTMLTag</code> created in getNextToken().
*
* @return <code>true</code> if the currently parsed wiki text is a template
*/
@Override
public boolean isTemplate() {
return fRenderTemplate;
}
}