package info.bliki.wiki.filter; import info.bliki.commons.validator.routines.EmailValidator; import info.bliki.htmlcleaner.ContentToken; import info.bliki.htmlcleaner.EndTagToken; import info.bliki.htmlcleaner.TagNode; import info.bliki.htmlcleaner.TagToken; import info.bliki.wiki.model.Configuration; import info.bliki.wiki.model.DefaultEventListener; import info.bliki.wiki.model.IEventListener; import info.bliki.wiki.model.ITableOfContent; import info.bliki.wiki.model.IWikiModel; import info.bliki.wiki.tags.HTMLBlockTag; import info.bliki.wiki.tags.HTMLTag; import info.bliki.wiki.tags.HrTag; import info.bliki.wiki.tags.PTag; import info.bliki.wiki.tags.WPBoldItalicTag; import info.bliki.wiki.tags.WPPreTag; import info.bliki.wiki.tags.WPTag; import info.bliki.wiki.tags.util.Attribute; import info.bliki.wiki.tags.util.IBodyTag; import info.bliki.wiki.tags.util.INoBodyParsingTag; import info.bliki.wiki.tags.util.NodeAttribute; import info.bliki.wiki.tags.util.TagStack; import info.bliki.wiki.tags.util.WikiTagNode; import java.util.List; /** * A Wikipedia syntax parser for the second pass in the parsing of a Wikipedia * source text. * * @see TemplateParser for the first pass */ public class WikipediaParser extends AbstractParser implements IParser { private ITableOfContent fTableOfContentTag = null; private int fHeadCounter = 0; /** * Enable HTML tags */ private final boolean fHtmlCodes = true; private boolean fNoToC = false; private boolean fRenderTemplate = false; private boolean fForceToC = false; private IEventListener fEventListener = null; public WikipediaParser(String stringSource, boolean renderTemplate) { this(stringSource, renderTemplate, null); } public WikipediaParser(String stringSource, boolean renderTemplate, IEventListener wikiListener) { super(stringSource); fRenderTemplate = renderTemplate; if (wikiListener == null) { fEventListener = DefaultEventListener.CONST; } else { fEventListener = wikiListener; } } /** * Copy the read ahead content in the resulting HTML text token. * * @param diff * subtract <code>diff</code> form the current parser position to get * the HTML text token end position. */ private boolean createPreContentToken(final int diff) { if (fWhiteStart) { try { final int count = fCurrentPosition - diff - fWhiteStartPosition; if (count > 0) { String rawWikiText = fStringSource.substring(fWhiteStartPosition, fWhiteStartPosition + count); WikipediaPreTagParser.parseRecursive(rawWikiText, fWikiModel); fWhiteStart = false; } return true; } catch (InvalidPreWikiTag ipwt) { } } return false; } public int getNextToken() // throws InvalidInputException { fWhiteStart = true; fWhiteStartPosition = fCurrentPosition; try { while (true) { fCurrentCharacter = fSource[fCurrentPosition++]; // ---------Identify the next token------------- switch (fCurrentCharacter) { case '\n': // check at the end of line, if there is open wiki bold or italic // markup reduceTokenStackBoldItalic(); break; case '{': // dummy parsing of wikipedia templates for event listeners if (parseTemplate()) { } else { // wikipedia table handling if (parseTable()) { continue; } else if (parseCode()) { continue; } else if (parseBlockQuote()) { continue; } } break; case '_': // TOC identifiers __NOTOC__, __FORCETOC__ ... if (parseSpecialIdentifiers()) { continue; } break; case '=': // wikipedia header ? if (parseSectionHeaders()) { continue; } break; case WPList.DL_DD_CHAR: // start of <dl><dd> list case WPList.DL_DT_CHAR: // start of <dl><dt> list case WPList.OL_CHAR: // start of <ol> list case WPList.UL_CHAR: // start of <ul> list if (parseLists()) { continue; } break; // case ':': // if (parseSimpleDefinitionLists()) { // continue; // } // break; // case ';': // if (parseDefinitionLists()) { // continue; // } // break; case '-': // parse ---- as <hr> if (parseHorizontalRuler()) { continue; } break; case ' ': // pre-formatted text? case '\t': if (parsePreformattedWikiBlock()) { continue; } break; } if (isStartOfLine() && fWikiModel.getRecursionLevel() == 1) { if (isEmptyLine(1)) { if (fWikiModel.stackSize() > 0 && (fWikiModel.peekNode() instanceof PTag)) { // close <p> tag: createContentToken(2); fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); } } else { if (fWikiModel.stackSize() == 0) { addParagraph(); // if (fWikiModel.getRecursionLevel() == 1) { // addParagraph(); // } else { // if (fCurrentPosition > 1) { // addParagraph(); // } // } } else { if (isEmptyLine(2)) { if (fWikiModel.stackSize() > 0 && (fWikiModel.peekNode() instanceof PTag)) { // add <br> tag for one newline createContentToken(2); fWikiModel.pushNode(new HTMLTag("br")); fWikiModel.popNode(); } } TagToken tag = fWikiModel.peekNode(); if (tag instanceof WPPreTag) { addPreformattedText(); // } else if (tag instanceof PTag) { // createContentToken(fWhiteStart, fWhiteStartPosition, 2); // reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); } else { String allowedParents = Configuration.HTML_PARAGRAPH_OPEN.getParents(); if (allowedParents != null) { int index = -1; index = allowedParents.indexOf("|" + tag.getName() + "|"); if (index >= 0) { addParagraph(); } } } } } } // ---------Identify the next token------------- switch (fCurrentCharacter) { case '[': if (parseWikiLink()) { continue; } break; case '\'': if (getNextChar('\'')) { if (getNextChar('\'')) { if (getNextChar('\'')) { if (getNextChar('\'')) { createContentToken(5); return TokenBOLDITALIC; } fCurrentPosition -= 1; fWhiteStart = true; createContentToken(3); return TokenBOLD; } createContentToken(3); return TokenBOLD; } createContentToken(2); return TokenITALIC; } break; case '%': if (getNextChar('%')) { if (getNextChar('%')) { createContentToken(3); return TokenDELETEDLINE; } createContentToken(2); return TokenUNDERLNE; } break; case '<': if (fHtmlCodes) { int htmlStartPosition = fCurrentPosition; // HTML tags are allowed try { switch (fStringSource.charAt(fCurrentPosition)) { case '!': // <!-- HTML comment --> if (parseHTMLCommentTags()) { continue; } break; default: if (fSource[fCurrentPosition] != '/') { // opening HTML tag WikiTagNode tagNode = parseTag(fCurrentPosition); if (tagNode != null) { String tagName = tagNode.getTagName(); TagToken tag = fWikiModel.getTokenMap().get(tagName); if (tag != null) { tag = (TagToken) tag.clone(); if (tag instanceof TagNode) { TagNode node = (TagNode) tag; List<NodeAttribute> attributes = tagNode.getAttributesEx(); Attribute attr; String temp; for (int i = 1; i < attributes.size(); i++) { attr = attributes.get(i); temp = attr.getValue(); if (temp != null) { temp = parseNowiki(temp); } node.addAttribute(attr.getName(), temp, true); } } if (tag instanceof HTMLTag) { ((HTMLTag) tag).setTemplate(isTemplate()); } createContentToken(1); fCurrentPosition = fScannerPosition; String allowedParents = tag.getParents(); if (allowedParents != null) { fWikiModel.reduceTokenStack(tag); } createTag(tag, tagNode, tagNode.getEndPosition()); return TokenIgnore; } else { fWhiteStart = true; skipUntilEndOfTag(tagNode, tagNode.getEndPosition()); createContentToken(0); return TokenIgnore; } // break; } } else { // closing HTML tag WikiTagNode tagNode = parseTag(++fCurrentPosition); if (tagNode != null) { String tagName = tagNode.getTagName(); TagToken tag = fWikiModel.getTokenMap().get(tagName); if (tag != null) { createContentToken(2); fCurrentPosition = fScannerPosition; if (fWikiModel.stackSize() > 0) { TagToken topToken = fWikiModel.peekNode(); if (topToken.getName().equals(tag.getName())) { fWikiModel.popNode(); return TokenIgnore; } else { if (tag.isReduceTokenStack()) { reduceStackUntilToken(tag); } } } else { } return TokenIgnore; } break; } } } } catch (IndexOutOfBoundsException e) { // do nothing } fCurrentPosition = htmlStartPosition; } break; default: if (Character.isLetter(fCurrentCharacter)) { if (fCurrentPosition < 2 || !Character.isLetterOrDigit(fSource[fCurrentPosition - 2])) { if (fCurrentCharacter == 'i' || fCurrentCharacter == 'I') { // ISBN ? if (parseISBNLinks()) { continue; } } if (parseURIScheme()) { // a URI scheme registered in the wiki model (ftp, http, // https,...) continue; } if (fWikiModel.isCamelCaseEnabled() && Character.isUpperCase(fCurrentCharacter) && fWikiModel.getRecursionLevel() <= 1) { if (parseCamelCaseLink()) { continue; } } } } } if (!fWhiteStart) { fWhiteStart = true; fWhiteStartPosition = fCurrentPosition - 1; } } // -----------------end switch while try-------------------- } catch (IndexOutOfBoundsException e) { // end of scanner text } try { createContentToken(1); } catch (IndexOutOfBoundsException e) { // end of scanner text } return TokenEOF; } /** * Parse nowiki tags. * * @param input * @return */ private String parseNowiki(String input) { int indx = input.indexOf("<nowiki>"); int indx2; int lastIndx = 0; if (indx >= 0) { StringBuilder buf = new StringBuilder(input.length()); while (indx >= 0) { buf.append(input.substring(lastIndx, indx)); lastIndx = indx + 8; // <nowiki> length indx2 = input.indexOf("</nowiki>", indx + 1); if (indx2 >= 0) { buf.append(input.substring(lastIndx, indx2)); lastIndx = indx2 + 9;// </nowiki> length } else { break; } indx = input.indexOf("<nowiki>", indx2 + 1); } buf.append(input.substring(lastIndx, input.length())); return buf.toString(); } return input; } private void addParagraph() { createContentToken(2); fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); fWikiModel.pushNode(new PTag()); } /** * Add the content of the wiki <pre> block. Trim the content at the * right side. */ private void addPreformattedText() { if (fWhiteStart) { int currentPos = fCurrentPosition; int whiteEndPosition = fCurrentPosition - 2; while (whiteEndPosition > fWhiteStartPosition) { if (!Character.isWhitespace(fSource[whiteEndPosition])) { whiteEndPosition++; break; } whiteEndPosition--; } try { fCurrentPosition = whiteEndPosition; createContentToken(0); } finally { fCurrentPosition = currentPos; } } fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); fWikiModel.pushNode(new PTag()); } private boolean parseHTMLCommentTags() { int htmlStartPosition = fCurrentPosition; String htmlCommentString = fStringSource.substring(fCurrentPosition - 1, fCurrentPosition + 3); if (htmlCommentString.equals("<!--")) { fCurrentPosition += 3; if (readUntil("-->")) { String htmlCommentContent = fStringSource.substring(htmlStartPosition + 3, fCurrentPosition - 3); if (htmlCommentContent != null) { createContentToken(fCurrentPosition - htmlStartPosition + 1); return true; } } } return false; } private boolean parseISBNLinks() { final int urlStartPosition = fCurrentPosition; boolean foundISBN = false; try { if ((fCurrentCharacter == 'i' || fCurrentCharacter == 'I') && (fSource[fCurrentPosition] == 's' || fSource[fCurrentPosition] == 'S') && (fSource[++fCurrentPosition] == 'b' || fSource[fCurrentPosition] == 'B') && (fSource[++fCurrentPosition] == 'n' || fSource[fCurrentPosition] == 'N') && fSource[++fCurrentPosition] == ' ') { fCurrentPosition++; createContentToken(5); foundISBN = true; char ch; ch = fSource[fCurrentPosition++]; while ((ch >= '0' && ch <= '9') || ch == '-') { ch = fSource[fCurrentPosition++]; } } } catch (IndexOutOfBoundsException e) { } if (foundISBN) { String urlString = fStringSource.substring(urlStartPosition - 1, fCurrentPosition - 1); fCurrentPosition--; fWikiModel.appendISBNLink(urlString); return true; } // rollback work :-) fCurrentPosition = urlStartPosition; return false; } private boolean parseMailtoLinks() { final int urlStartPosition = fCurrentPosition; int tempPosition = fCurrentPosition; boolean foundUrl = false; try { if ((fCurrentCharacter == 'm' || fCurrentCharacter == 'M') && (fSource[fCurrentPosition] == 'a' || fSource[fCurrentPosition] == 'A') && (fSource[++fCurrentPosition] == 'i' || fSource[fCurrentPosition] == 'I') && (fSource[++fCurrentPosition] == 'l' || fSource[fCurrentPosition] == 'L') && (fSource[++fCurrentPosition] == 't' || fSource[fCurrentPosition] == 'T') && (fSource[++fCurrentPosition] == 'o' || fSource[fCurrentPosition] == 'O')) { tempPosition += 6; fCurrentCharacter = fSource[tempPosition++]; foundUrl = true; while (!Character.isWhitespace(fSource[tempPosition++])) { } } } catch (IndexOutOfBoundsException e) { } if (foundUrl) { String urlString = fStringSource.substring(urlStartPosition - 1, tempPosition - 1); String email = urlString.substring(7); if (EmailValidator.getInstance().isValid(email)) { createContentToken(5); fWhiteStart = false; fCurrentPosition = tempPosition; fCurrentPosition--; fWikiModel.appendMailtoLink(urlString, urlString, true); return true; } } // rollback work :-) fCurrentPosition = urlStartPosition; return false; } /** * See <a href="http://en.wikipedia.org/wiki/URI_scheme">URI scheme</a> * * @return <code>true</code> if a registered URI scheme was found in the wiki * models configuration.. */ private boolean parseURIScheme() { if (fCurrentCharacter == 'm' || fCurrentCharacter == 'M') { // mailto ? if (parseMailtoLinks()) { return true; } } int urlStartPosition = fCurrentPosition; int tempPosition = fCurrentPosition; String uriSchemeName = ""; int index = -1; boolean foundUrl = false; try { index = indexOfUntilNoLetter(':', fCurrentPosition); if (index > 0) { uriSchemeName = fStringSource.substring(fCurrentPosition - 1, index); if (fWikiModel.isValidUriScheme(uriSchemeName)) { // found something like "ftp", "http", "https" tempPosition += uriSchemeName.length() + 1; fCurrentCharacter = fSource[tempPosition++]; createContentToken(1); fWhiteStart = false; foundUrl = true; while (Encoder.isUrlIdentifierPart(fSource[tempPosition++])) { } } } } catch (IndexOutOfBoundsException e) { } if (foundUrl) { String restString = fStringSource.substring(urlStartPosition - 1, tempPosition - 1); String uriSchemeSpecificPart = fStringSource.substring(index + 1, tempPosition - 1); if (fWikiModel.isValidUriSchemeSpecificPart( uriSchemeName, uriSchemeSpecificPart)) { fWhiteStart = false; fCurrentPosition = tempPosition; fCurrentPosition--; fWikiModel.appendExternalLink( uriSchemeName, restString, restString, true); return true; } } // rollback work :-) fCurrentPosition = urlStartPosition; return false; } private boolean parseCamelCaseLink() { int startLinkPosition = fCurrentPosition - 1; int temp = fCurrentPosition; boolean isCamelCase = false; try { char ch = fSource[temp++]; while (Character.isLetterOrDigit(ch)) { if (Character.isUpperCase(ch)) { // at least 2 upper case characters appear in the word isCamelCase = true; } ch = fSource[temp++]; } } catch (IndexOutOfBoundsException iobe) { } if (isCamelCase) { createContentToken(1); fWhiteStart = false; fCurrentPosition = temp - 1; String name = fStringSource.substring(startLinkPosition, fCurrentPosition); fWikiModel.appendInternalLink(name, null, name, null, false); return true; } return false; } /** * Parse a wiki section starting with a '[' character * * @return <code>true</code> if a correct link was found */ private boolean parseWikiLink() { int startLinkPosition = fCurrentPosition; if (getNextChar('[')) { return parseWikiTag(); } else if (getNextCharAsWhitespace()) { fCurrentPosition--; return false; } else { createContentToken(1); fWhiteStart = false; if (readUntilCharOrStopAtEOL(']')) { String name = fStringSource.substring(startLinkPosition, fCurrentPosition - 1); // bbcode start if (fWikiModel.parseBBCodes() && name.length() > 0) { // parse start tokens like phpBB forum syntax style (bbcode) char ch = name.charAt(0); if ('a' <= ch && ch <= 'z') { // first character must be a letter StringBuilder bbCode = new StringBuilder(name.length()); bbCode.append(ch); if (parsePHPBBCode(name, bbCode)) { return true; } } } // bbcode end if (handleHTTPLink(name)) { return true; } } fCurrentPosition = startLinkPosition; } return false; } /** * Parse a wiki section starting with a '[[' sequence * * @return <code>true</code> if a correct link was found */ private boolean parseWikiTag() { int startLinkPosition = fCurrentPosition; int endLinkPosition; // wikipedia link style createContentToken(2); int temp = fCurrentPosition; if (findWikiLinkEnd()) { endLinkPosition = fCurrentPosition - 2; String name = fStringSource.substring(startLinkPosition, endLinkPosition); // test for a suffix string behind the Wiki link. Useful for plurals. // Example: // Dolphins are [[aquatic mammal]]s that are closely related to [[whale]]s // and [[porpoise]]s. temp = fCurrentPosition; String suffix = ""; try { fCurrentCharacter = fSource[fCurrentPosition]; if (Character.isLowerCase(fCurrentCharacter)) { fCurrentPosition++; StringBuilder suffixBuffer = new StringBuilder(16); suffixBuffer.append(fCurrentCharacter); while (true) { fCurrentCharacter = fSource[fCurrentPosition++]; if (!Character.isLowerCase(fCurrentCharacter)) { fCurrentPosition--; break; } suffixBuffer.append(fCurrentCharacter); } suffix = suffixBuffer.toString(); } } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; } fEventListener.onWikiLink( fSource, startLinkPosition, endLinkPosition, suffix); if (!fWikiModel.appendRawWikipediaLink(name, suffix)) { fCurrentPosition = temp; } return true; } else { fWhiteStart = true; fWhiteStartPosition = startLinkPosition - 2; fCurrentPosition = temp + 1; } return false; } private boolean parsePreformattedWikiBlock() { if (isStartOfLine() && !isEmptyLine(1)) { if (fWikiModel.stackSize() == 0 || !(fWikiModel.peekNode() instanceof HTMLBlockTag) || (fWikiModel.peekNode() instanceof PTag)) { createContentToken(2); fWikiModel.reduceTokenStack(Configuration.HTML_PRE_OPEN); // don't use Configuration.HTML_PRE_OPEN here // rendering differs between these tags! fWikiModel.pushNode(new WPPreTag()); char ch = ' '; try { while (ch == ' ' || ch == '\t') { // SPACE or TAB => check if it's a pre-formatted text fWhiteStart = true; fWhiteStartPosition = fCurrentPosition; ch = fSource[fCurrentPosition++]; while (ch != '\n' && fCurrentPosition < fSource.length) { ch = fSource[fCurrentPosition++]; } if (fCurrentPosition == fSource.length) { // scanner reached end of text if (!createPreContentToken(0)) { fCurrentPosition = fWhiteStartPosition; fSource[fWhiteStartPosition - 1] = '\n'; return false; } } else { ch = fSource[fCurrentPosition++]; if (ch == ' ' || ch == '\t') { if (!createPreContentToken(1)) { fCurrentPosition = fWhiteStartPosition; fSource[fWhiteStartPosition - 1] = '\n'; return false; } } else { // skip the newline character at the end of the pre-formatted // block if (!createPreContentToken(2)) { fCurrentPosition = fWhiteStartPosition; fSource[fWhiteStartPosition - 1] = '\n'; return false; } else { fCurrentPosition--; return true; } } } } } catch (IndexOutOfBoundsException e) { fCurrentPosition--; } finally { fWikiModel.popNode(); } } return true; } return false; } /** * Parse <code>----</code> as <hr> tag * * @return */ private boolean parseHorizontalRuler() { if (isStartOfLine()) { int tempCurrPosition = fCurrentPosition; try { if (fSource[tempCurrPosition++] == '-' && fSource[tempCurrPosition++] == '-' && fSource[tempCurrPosition++] == '-') { int pos = isEndOfLine('-', tempCurrPosition); if (pos > 0) { HrTag hr = new HrTag(); createContentToken(2); fWikiModel.reduceTokenStack(hr); fCurrentPosition = pos; fWikiModel.append(hr); fWhiteStart = false; return true; } } } catch (IndexOutOfBoundsException e) { } fCurrentPosition = tempCurrPosition; } return false; } /** * Parse a wiki list <br/> * <br/> * Example:<br/> * * <pre> * * first line * * second line * ** third line * </pre> * * @return */ private boolean parseLists() { // set scanner pointer to '\n' character: if (isStartOfLine()) { setPosition(fCurrentPosition - 2); WPList list = wpList(); if (list != null && !list.isEmpty()) { createContentToken(1); fWikiModel.reduceTokenStack(list); fCurrentPosition = getPosition() - 1; fWikiModel.append(list); return true; } } return false; } /** * Parses a wiki header line into "h1, h2, h3, h4, h5, h6" HTML * tags. <br/> * <br/> * Example wiki syntax header line: <br/> * <code>== Test header 2 ==</code> * * @return <code>true</code> if a header line could be parsed correctly, * <code>false</code> otherwise. */ private boolean parseSectionHeaders() { if (isStartOfLine()) { int headerStartPosition = fCurrentPosition - 1; int endIndex = fStringSource.indexOf("\n", fCurrentPosition); if (endIndex < 0) { endIndex = fStringSource.length(); } int headerEndPosition = endIndex; char ch; while (headerEndPosition > 0) { ch = fSource[--headerEndPosition]; if (!Character.isWhitespace(ch)) { break; } } if (headerEndPosition < 0 || headerEndPosition <= headerStartPosition) { return false; } int level = 0; int startPosition = headerStartPosition; int endPosition = headerEndPosition + 1; while (headerStartPosition < headerEndPosition) { if (fSource[headerStartPosition] == '=' && fSource[headerEndPosition] == '=') { level++; headerStartPosition++; headerEndPosition--; } else { headerEndPosition++; break; } } if (level == 0) { return false; } if (level > 6) { level = 6; } createContentToken(1); reduceTokenStack(); String head = ""; if (headerEndPosition >= headerStartPosition) { if (headerEndPosition > headerStartPosition) { head = fStringSource.substring(headerStartPosition, headerEndPosition); } else { head = String.valueOf(fStringSource.charAt(headerStartPosition)); } } fEventListener.onHeader( fSource, startPosition, endPosition, headerStartPosition, headerEndPosition, level); fCurrentPosition = endIndex; if (head != null) { fTableOfContentTag = fWikiModel.appendHead( head, level, fNoToC, ++fHeadCounter, startPosition, endPosition); } return true; } return false; } private boolean parseTable() { if (isStartOfLine()) { // wiki table ? setPosition(fCurrentPosition - 1); WPTable table = wpTable(fTableOfContentTag); if (table != null) { createContentToken(1); fWikiModel.reduceTokenStack(table); // set pointer behind: "\n|}" fCurrentPosition = getPosition(); fWikiModel.append(table); // table.filter(fSource, fWikiModel); return true; } } return false; } private boolean parseCode() { if (isStartOfLine()) { setPosition(fCurrentPosition - 1); WPCodeBlock code = codeBlock(); if (code != null) { createContentToken(1); fWikiModel.reduceTokenStack(code); fCurrentPosition = getPosition(); fWikiModel.append(code); } } return false; } private boolean parseBlockQuote() { if (isStartOfLine()) { setPosition(fCurrentPosition - 1); WPBlockQuote code = blockQuote(); if (code != null) { createContentToken(1); fWikiModel.reduceTokenStack(code); fCurrentPosition = getPosition(); fWikiModel.append(code); } } return false; } private boolean parseTemplate() { // dummy parsing of Wikipedia templates for event listeners // doesn't change fCurrentPosition if (fSource[fCurrentPosition] == '{') { int templateStartPosition = fCurrentPosition + 1; if (fSource[templateStartPosition] != '{') { int templateEndPosition = findNestedTemplateEnd(fSource, templateStartPosition); if (templateEndPosition > 0) { fEventListener.onTemplate( fSource, templateStartPosition, templateEndPosition - 2); return true; } } } return false; } /** * Parse special identifiers like __TOC__, __NOTOC__, __FORCETOC__ * * @return */ private boolean parseSpecialIdentifiers() { if (fSource[fCurrentPosition] == '_') { fCurrentPosition++; int tocEndPosition = fCurrentPosition; char ch; while (true) { ch = fSource[tocEndPosition++]; if (ch >= 'A' && ch <= 'Z') { continue; } break; } if (ch == '_' && fSource[tocEndPosition] == '_') { String tocIdent = fStringSource.substring(fCurrentPosition, tocEndPosition - 1); if (fWikiModel.parseBehaviorSwitch(tocIdent)) { createContentToken(2); fCurrentPosition = tocEndPosition + 1; return true; } boolean tocRecognized = false; for (int i = 0; i < TOC_IDENTIFIERS.length; i++) { if (TOC_IDENTIFIERS[i].equals(tocIdent)) { createContentToken(2); tocRecognized = true; fCurrentPosition = tocEndPosition + 1; switch (i) { case 0: // TOC fTableOfContentTag = fWikiModel.createTableOfContent(true); fForceToC = true; break; case 1: // NOTOC setNoToC(true); break; case 2: // FORCETOC fForceToC = true; break; } break; } } if (tocRecognized) { return true; } } } return false; } /** * Check if the scanners cursor position is at the beginning of a line. * * @return <code>true</code> if the scanners cursor points to the beginning of * a line, <code>false</code> otherwise. */ private boolean isStartOfLine() { if (fCurrentPosition >= 2) { if (fSource[fCurrentPosition - 2] == '\n') { return true; } } else if (fCurrentPosition == 1) { return true; } return false; } private int isEndOfLine(char testChar, int currentPosition) { int tempPosition = currentPosition; try { char ch; while (true) { ch = fSource[tempPosition]; if (ch != testChar) { break; } tempPosition++; } while (true) { ch = fSource[tempPosition++]; if (ch == '\n') { return tempPosition; } else if (!Character.isWhitespace(ch)) { return -1; } } } catch (IndexOutOfBoundsException e) { } return -1; } private void createTag(TagToken tag, WikiTagNode tagNode, int startMacroPosition) { String endTag; String macroBodyString = ""; int index0; String command = tagNode.getTagName(); if ((tag != null) && (tag instanceof IBodyTag) && (!tagNode.isEmptyXmlTag())) { endTag = command + '>'; index0 = Util.indexOfIgnoreCase(fStringSource, "</", endTag, startMacroPosition); if (index0 >= 0) { macroBodyString = fStringSource.substring(startMacroPosition, index0); fCurrentPosition = index0 + endTag.length() + 2; } else { macroBodyString = fStringSource.substring(startMacroPosition, fSource.length); fCurrentPosition = fSource.length; } } else { macroBodyString = null; fCurrentPosition = startMacroPosition; } handleTag(tag, tagNode, macroBodyString); } private void skipUntilEndOfTag(WikiTagNode tagNode, int startMacroPosition) { String endTag; int index0; String command = tagNode.getTagName(); if (!tagNode.isEmptyXmlTag()) { endTag = command + '>'; index0 = Util.indexOfIgnoreCase(fStringSource, "</", endTag, startMacroPosition); if (index0 >= 0) { fCurrentPosition = index0 + endTag.length() + 2; } else { fCurrentPosition = fSource.length; } } } private boolean handleHTTPLink(String name) { String urlString; String uriSchemeName = ""; if (name != null) { boolean isEmail = false; int index = -1; boolean foundUrl = false; boolean protocolRelativeURL = false; urlString = name.trim(); if (urlString.length() >= 2 && urlString.charAt(0) == '/' && urlString.charAt(1) == '/') { // issue 89 foundUrl = true; protocolRelativeURL = true; } else { try { index = urlString.indexOf(':', 1); if (index > 0) { uriSchemeName = urlString.substring(0, index); if (uriSchemeName.equalsIgnoreCase("mailto")) { isEmail = true; foundUrl = true; } else { if (fWikiModel.isValidUriScheme(uriSchemeName)) { foundUrl = true; } } } } catch (IndexOutOfBoundsException e) { } } if (foundUrl) { // Wikipedia link style: name separated by space? int pipeIndex = urlString.indexOf(' '); String alias = ""; if (pipeIndex != (-1)) { alias = urlString.substring(pipeIndex + 1); urlString = urlString.substring(0, pipeIndex); } else { if (protocolRelativeURL) { alias = urlString.substring(2); } else { alias = urlString; } } if (isEmail) { String email; if (pipeIndex > 7) { email = urlString.substring(7, pipeIndex); } else { email = urlString.substring(7); } if (EmailValidator.getInstance().isValid(email)) { fWikiModel.appendMailtoLink(urlString, alias, false); return true; } } else { if (protocolRelativeURL) { fWikiModel.appendExternalLink( uriSchemeName, urlString, alias, false); return true; } parseURIScheme(); String uriSchemeSpecificPart = urlString.substring(index + 1); if (fWikiModel.isValidUriSchemeSpecificPart( uriSchemeName, uriSchemeSpecificPart)) { fWikiModel.appendExternalLink( uriSchemeName, urlString, alias, false); return true; } } } } return false; } private void handleTag(TagToken tag, WikiTagNode tagNode, String bodyString) { String command = tagNode.getTagName(); try { if (tag instanceof EndTagToken) { fWikiModel.append(tag); } else { fWikiModel.pushNode(tag); if (null != bodyString) { if (tag instanceof INoBodyParsingTag) { ((TagNode) tag).addChild(new ContentToken(bodyString)); } else { // recursively filter tags within the tags body string WikipediaParser.parseRecursive( bodyString.trim(), fWikiModel, false, true); } } if (tag instanceof IBodyTag) { fWikiModel.popNode(); } } } catch (IllegalArgumentException e) { TagNode divTagNode = new TagNode("div"); divTagNode.addAttribute("class", "error", true); divTagNode.addChild(new ContentToken("IllegalArgumentException: " + command + " - " + e.getMessage())); fWikiModel.append(divTagNode); e.printStackTrace(); } catch (Throwable e) { e.printStackTrace(); TagNode divTagNode = new TagNode("div"); divTagNode.addAttribute("class", "error", true); divTagNode.addChild(new ContentToken(command + ": " + e.getMessage())); fWikiModel.append(divTagNode); e.printStackTrace(); } } @Override public void runParser() { int token = TokenSTART; while ((token = getNextToken()) != TokenEOF) { switch (token) { case TokenBOLDITALIC: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) { fWikiModel.popNode(); } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(BOLD) && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(ITALIC)) { fWikiModel.popNode(); fWikiModel.popNode(); } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(ITALIC) && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(BOLD)) { fWikiModel.popNode(); fWikiModel.popNode(); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("i")); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("b")); } else { fWikiModel.pushNode(new WPBoldItalicTag()); } break; case TokenBOLD: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("i")); // fResultBuffer.append("</b>"); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) { fWikiModel.popNode(); } else { fWikiModel.pushNode(new WPTag("b")); } break; case TokenITALIC: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("b")); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) { fWikiModel.popNode(); } else { fWikiModel.pushNode(new WPTag("i")); } break; case TokenUNDERLNE: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(UNDERLINE)) { fWikiModel.popNode(); } else { fWikiModel.pushNode(new WPTag("u")); } break; case TokenDELETEDLINE: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(DELETEDLINE)) { fWikiModel.popNode(); } else { fWikiModel.pushNode(new WPTag("del")); } break; } } reduceTokenStack(); if (!fNoToC && fTableOfContentTag != null) { if (fHeadCounter > 3 || fForceToC) { /** bug fix */ // fTableOfContentTag.setShowToC(true); } } } @Override public boolean isNoToC() { return fNoToC; } @Override public void setNoToC(boolean noToC) { fNoToC = noToC; } /** * Call the parser on the first recursion level, where the text can contain a * table of contents (TOC). * * <br/> * <br/> * <b>Note:</b> in this level the wiki model will call the * <code>setUp()</code> method before parsing and the <code>tearDown()</code> * method after the parser has finished. * * @param rawWikitext * the raw text of the article * @param wikiModel * a suitable wiki model for the given wiki article text * @param parseTemplates * parse the template expansion step * @param templateParserBuffer * if the <code>templateParserBuffer != null</code> the * <code>templateParserBuffer</code> will be used to append the * result of the template expansion step * */ public static void parse(String rawWikiText, IWikiModel wikiModel, boolean parseTemplates, Appendable templateParserBuffer) { try { // initialize the wiki model wikiModel.setUp(); if (parseTemplates) { Appendable buf; if (templateParserBuffer != null) { buf = templateParserBuffer; } else { buf = new StringBuilder(rawWikiText.length() + rawWikiText.length() / 10); } String pass1Text = null; try { TemplateParser.parse(rawWikiText, wikiModel, buf, wikiModel .isTemplateTopic()); pass1Text = buf.toString(); } catch (Exception ioe) { ioe.printStackTrace(); pass1Text = "<span class=\"error\">TemplateParser exception: " + ioe.getClass().getSimpleName() + "</span>"; } String redirectedLink = AbstractParser.parseRedirect(pass1Text, wikiModel); if (redirectedLink == null) { parseRecursive(pass1Text, wikiModel, false, false); } } else { if (AbstractParser.parseRedirect(rawWikiText, wikiModel) == null) { parseRecursive(rawWikiText, wikiModel, false, false); } } } finally { // clean up wiki model if necessary wikiModel.tearDown(); } } /** * Call the parser on the subsequent recursion levels, where the subtexts (of * templates, table cells, list items or image captions) don't contain a table * of contents (TOC) * * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or * <code>tearDown()</code> methods for the subsequent recursive parser steps. * * @param rawWikitext * @param wikiModel * @return */ public static void parseRecursive(String rawWikitext, IWikiModel wikiModel) { parseRecursive(rawWikitext, wikiModel, false, true); } /** * Call the parser on the subsequent recursion levels, where the subtexts (of * templates, table cells, list items or image captions) don't contain a table * of contents (TOC) * * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or * <code>tearDown()</code> methods for the subsequent recursive parser steps. * * @param rawWikitext * @param wikiModel * @param noTOC * @param appendStack * @return * @return */ public static TagStack parseRecursive(String rawWikitext, IWikiModel wikiModel, boolean createOnlyLocalStack, boolean noTOC) { AbstractParser parser = wikiModel.createNewInstance(rawWikitext); return parser .parseRecursiveInternal(wikiModel, createOnlyLocalStack, noTOC); } /** * Determine if the currently parsed wiki text is a template text. * * @return <code>true</code> if the currently parsed wiki text is a template */ @Override public boolean isTemplate() { return fRenderTemplate; } }