package info.bliki.wiki.filter;

import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.EndTagToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.htmlcleaner.TagToken;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.DefaultEventListener;
import info.bliki.wiki.model.IEventListener;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.HTMLBlockTag;
import info.bliki.wiki.tags.HTMLTag;
import info.bliki.wiki.tags.WPBoldItalicTag;
import info.bliki.wiki.tags.WPTag;
import info.bliki.wiki.tags.util.Attribute;
import info.bliki.wiki.tags.util.IBodyTag;
import info.bliki.wiki.tags.util.INoBodyParsingTag;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.TagStack;
import info.bliki.wiki.tags.util.WikiTagNode;

import java.util.List;

/**
 * A Wikipedia syntax parser for parsing in wiki preformatted blocks (rendered
 * as &lt;pre&gt;...&lt;/pre&gt;).
 *
 * <p>Only a restricted subset of wiki markup is recognized here: wiki links
 * (<code>[[...]]</code>), bold/italic apostrophe markup, the
 * <code>%%</code>/<code>%%%</code> underline/strike markup, inline HTML tags and
 * HTML comments. Scanner state (<code>fCurrentPosition</code>,
 * <code>fWhiteStart*</code>, <code>fSource</code>, ...) is inherited from
 * {@link AbstractParser}.
 */
public class WikipediaPreTagParser extends AbstractParser {
    /**
     * Enable HTML tags
     */
    // Always true in this parser; the '<' branch of getNextToken() is guarded by it.
    private final boolean fHtmlCodes = true;

    // Listener notified of recognized wiki links (never null after construction).
    private IEventListener fEventListener = null;

    /**
     * Creates a parser over the given raw wikitext with the default (no-op)
     * event listener.
     *
     * @param stringSource the raw wikitext to scan
     */
    public WikipediaPreTagParser(String stringSource) {
        this(stringSource, null);
    }

    /**
     * Creates a parser over the given raw wikitext.
     *
     * @param stringSource the raw wikitext to scan
     * @param wikiListener listener for wiki-link events; if <code>null</code>,
     *        {@link DefaultEventListener#CONST} is used
     */
    public WikipediaPreTagParser(String stringSource, IEventListener wikiListener) {
        super(stringSource);
        if (wikiListener == null) {
            fEventListener = DefaultEventListener.CONST;
        } else {
            fEventListener = wikiListener;
        }
    }

    /**
     * Scans forward from the current position and returns the next recognized
     * token constant (e.g. <code>TokenBOLD</code>, <code>TokenITALIC</code>,
     * <code>TokenEOF</code>). Plain text is accumulated into content tokens as
     * a side effect via <code>createContentToken</code>; HTML tags and wiki
     * links are appended directly to the wiki model and reported as
     * <code>TokenIgnore</code>.
     *
     * <p>End of input is detected by letting the array access throw
     * {@link IndexOutOfBoundsException} — the surrounding catch turns it into
     * <code>TokenEOF</code>.
     */
    public int getNextToken() // throws InvalidInputException
    {
        fWhiteStart = true;
        fWhiteStartPosition = fCurrentPosition;
        try {
            while (true) {
                fCurrentCharacter = fSource[fCurrentPosition++];

                // ---------Identify the next token-------------
                switch (fCurrentCharacter) {
                case '\n':
                    // check at the end of line, if there is open wiki bold or italic
                    // markup
                    reduceTokenStackBoldItalic();
                    break;
                }
                // ---------Identify the next token-------------
                switch (fCurrentCharacter) {
                case '[':
                    if (parseWikiLink()) {
                        continue;
                    }
                    break;
                case '\'':
                    // Apostrophe runs: ''=italic, '''=bold, '''''=bold+italic.
                    // A run of exactly four is treated as bold with one literal
                    // apostrophe pushed back (fCurrentPosition -= 1).
                    if (getNextChar('\'')) {
                        if (getNextChar('\'')) {
                            if (getNextChar('\'')) {
                                if (getNextChar('\'')) {
                                    createContentToken(5);
                                    return TokenBOLDITALIC;
                                }
                                fCurrentPosition -= 1;
                                fWhiteStart = true;
                                createContentToken(3);
                                return TokenBOLD;
                            }
                            createContentToken(3);
                            return TokenBOLD;
                        }
                        createContentToken(2);
                        return TokenITALIC;
                    }
                    break;
                case '%':
                    // %% toggles underline, %%% toggles strike-through.
                    if (getNextChar('%')) {
                        if (getNextChar('%')) {
                            createContentToken(3);
                            return TokenDELETEDLINE;
                        }
                        createContentToken(2);
                        return TokenUNDERLNE;
                    }
                    break;
                case '<':
                    if (fHtmlCodes) {
                        // Remember where the tag candidate starts so the scanner
                        // can be rewound if it turns out not to be a valid tag.
                        int htmlStartPosition = fCurrentPosition;
                        // HTML tags are allowed
                        try {
                            switch (fStringSource.charAt(fCurrentPosition)) {
                            case '!':
                                // <!-- HTML comment -->
                                if (parseHTMLCommentTags()) {
                                    continue;
                                }
                                break;
                            default:
                                if (fSource[fCurrentPosition] != '/') {
                                    // opening HTML tag
                                    WikiTagNode tagNode = parseTag(fCurrentPosition);
                                    if (tagNode != null) {
                                        String tagName = tagNode.getTagName();
                                        TagToken tag = fWikiModel.getTokenMap().get(tagName);
                                        // Block-level tags are deliberately NOT handled
                                        // inside a <pre> context.
                                        if ((tag != null) && !(tag instanceof HTMLBlockTag)) {
                                            tag = (TagToken) tag.clone();
                                            if (tag instanceof TagNode) {
                                                TagNode node = (TagNode) tag;
                                                List<NodeAttribute> attributes = tagNode.getAttributesEx();
                                                Attribute attr;
                                                // NOTE(review): starts at index 1 — presumably
                                                // getAttributesEx() stores the tag name itself at
                                                // index 0; confirm against htmlcleaner before changing.
                                                for (int i = 1; i < attributes.size(); i++) {
                                                    attr = attributes.get(i);
                                                    node.addAttribute(attr.getName(), attr.getValue(), true);
                                                }
                                            }
                                            if (tag instanceof HTMLTag) {
                                                // ((HTMLTag) tag).setTemplate(isTemplate());
                                            }
                                            // Flush pending plain text, then jump past the tag
                                            // (parseTag left the end in fScannerPosition).
                                            createContentToken(1);
                                            fCurrentPosition = fScannerPosition;

                                            String allowedParents = tag.getParents();
                                            if (allowedParents != null) {
                                                reduceTokenStack(tag);
                                            }
                                            createTag(tag, tagNode, tagNode.getEndPosition());
                                            return TokenIgnore;
                                        }
                                        break;
                                    }
                                } else {
                                    // closing HTML tag
                                    WikiTagNode tagNode = parseTag(++fCurrentPosition);
                                    if (tagNode != null) {
                                        String tagName = tagNode.getTagName();
                                        TagToken tag = fWikiModel.getTokenMap().get(tagName);
                                        if ((tag != null) && !(tag instanceof HTMLBlockTag)) {
                                            createContentToken(2);
                                            fCurrentPosition = fScannerPosition;

                                            if (fWikiModel.stackSize() > 0) {
                                                TagToken topToken = fWikiModel.peekNode();
                                                if (topToken.getName().equals(tag.getName())) {
                                                    // Well-nested close: pop the matching open tag.
                                                    fWikiModel.popNode();
                                                    return TokenIgnore;
                                                } else {
                                                    // Mis-nested close: optionally unwind the stack
                                                    // down to the matching tag.
                                                    if (tag.isReduceTokenStack()) {
                                                        reduceStackUntilToken(tag);
                                                    }
                                                }
                                            } else {
                                                // Stray closing tag with nothing open: ignored.
                                            }
                                            return TokenIgnore;
                                        }
                                        break;
                                    }
                                }
                            }
                        } catch (IndexOutOfBoundsException e) {
                            // do nothing — '<' near end of input is plain text
                        }
                        // Not a recognized tag: rewind and treat '<' as content.
                        fCurrentPosition = htmlStartPosition;
                    }
                    break;
                default:
                    // NOTE(review): ISBN / URI-scheme / CamelCase link detection is
                    // intentionally disabled in the pre-tag parser (kept for reference):
                    // if (Character.isLetter(fCurrentCharacter)) {
                    // if (fCurrentPosition < 2 ||
                    // !Character.isLetterOrDigit(fSource[fCurrentPosition - 2])) {
                    // if (fCurrentCharacter == 'i' || fCurrentCharacter == 'I') {
                    // // ISBN ?
                    // if (parseISBNLinks()) {
                    // continue;
                    // }
                    // }
                    //
                    // if (parseURIScheme()) {
                    // // a URI scheme registered in the wiki model (ftp, http,
                    // // https,...)
                    // continue;
                    // }
                    //
                    // if (fWikiModel.isCamelCaseEnabled() &&
                    // Character.isUpperCase(fCurrentCharacter)
                    // && fWikiModel.getRecursionLevel() <= 1) {
                    // if (parseCamelCaseLink()) {
                    // continue;
                    // }
                    // }
                    // }
                    // }
                }

                // Start a new plain-text run right before the character just read.
                if (!fWhiteStart) {
                    fWhiteStart = true;
                    fWhiteStartPosition = fCurrentPosition - 1;
                }
            }
            // -----------------end switch while try--------------------
        } catch (IndexOutOfBoundsException e) {
            // end of scanner text
        }
        try {
            // Flush any trailing plain text before reporting EOF.
            createContentToken(1);
        } catch (IndexOutOfBoundsException e) {
            // end of scanner text
        }
        return TokenEOF;
    }

    /**
     * Tries to consume an HTML comment (<code>&lt;!-- ... --&gt;</code>)
     * starting at the current position (the scanner has already consumed the
     * leading <code>'&lt;'</code>).
     *
     * @return <code>true</code> if a complete comment was consumed and emitted
     *         as a content token; <code>false</code> leaves the position at the
     *         character after '&lt;' (caller rewinds)
     */
    private boolean parseHTMLCommentTags() {
        int htmlStartPosition = fCurrentPosition;
        // fCurrentPosition - 1 points at '<'; probe the 4-char "<!--" prefix.
        String htmlCommentString = fStringSource.substring(fCurrentPosition - 1, fCurrentPosition + 3);

        if (htmlCommentString.equals("<!--")) {
            fCurrentPosition += 3;
            if (readUntil("-->")) {
                String htmlCommentContent = fStringSource.substring(htmlStartPosition + 3, fCurrentPosition - 3);
                if (htmlCommentContent != null) {
                    createContentToken(fCurrentPosition - htmlStartPosition + 1);
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Parse a wiki section starting with a '[' character
     *
     * <p>A second '[' delegates to {@link #parseWikiTag()} ("[[...]]" link).
     * Otherwise a single-bracket "[...]" span is scanned up to ']' on the same
     * line and, if BBCode parsing is enabled in the model, interpreted as a
     * phpBB-style start token.
     *
     * @return <code>true</code> if a correct link was found
     */
    private boolean parseWikiLink() {
        int startLinkPosition = fCurrentPosition;
        if (getNextChar('[')) {
            return parseWikiTag();
        } else {
            createContentToken(1);
            fWhiteStart = false;

            if (readUntilCharOrStopAtEOL(']')) {
                String name = fStringSource.substring(startLinkPosition, fCurrentPosition - 1);

                // bbcode start
                if (fWikiModel.parseBBCodes() && name.length() > 0) {
                    // parse start tokens like phpBB forum syntax style (bbcode)
                    char ch = name.charAt(0);
                    if ('a' <= ch && ch <= 'z') {
                        // first character must be a letter
                        StringBuilder bbCode = new StringBuilder(name.length());
                        bbCode.append(ch);
                        if (parsePHPBBCode(name, bbCode)) {
                            return true;
                        }
                    }
                }
                // bbcode end

                // if (handleHTTPLink(name)) {
                // return true;
                // }
            }
            // Nothing recognized: rewind so '[' is handled as plain text.
            fCurrentPosition = startLinkPosition;
        }
        return false;
    }

    /**
     * Parse a wiki section starting with a '[[' sequence
     *
     * <p>On success the link (plus any lowercase "plural" suffix directly
     * behind the closing brackets) is appended to the wiki model and reported
     * to the event listener. If the model rejects the link (e.g. a special
     * image link), an {@link InvalidPreWikiTag} is thrown so the caller can
     * abandon pre-tag parsing.
     *
     * @return <code>true</code> if a correct link was found
     */
    private boolean parseWikiTag() {
        int startLinkPosition = fCurrentPosition;
        int endLinkPosition;
        // wikipedia link style
        createContentToken(2);
        int temp = fCurrentPosition;
        if (findWikiLinkEnd()) {
            endLinkPosition = fCurrentPosition - 2;
            String name = fStringSource.substring(startLinkPosition, endLinkPosition);
            // test for a suffix string behind the Wiki link. Useful for plurals.
            // Example:
            // Dolphins are [[aquatic mammal]]s that are closely related to [[whale]]s
            // and [[porpoise]]s.
            temp = fCurrentPosition;
            String suffix = "";
            try {
                fCurrentCharacter = fSource[fCurrentPosition];
                if (Character.isLowerCase(fCurrentCharacter)) {
                    fCurrentPosition++;
                    StringBuilder suffixBuffer = new StringBuilder(16);
                    suffixBuffer.append(fCurrentCharacter);
                    // Consume the run of lowercase letters directly after "]]".
                    while (true) {
                        fCurrentCharacter = fSource[fCurrentPosition++];
                        if (!Character.isLowerCase(fCurrentCharacter)) {
                            fCurrentPosition--;
                            break;
                        }
                        suffixBuffer.append(fCurrentCharacter);
                    }
                    suffix = suffixBuffer.toString();
                }
            } catch (IndexOutOfBoundsException e) {
                // Suffix ran into end of input: keep position right after "]]".
                fCurrentPosition = temp;
            }
            fEventListener.onWikiLink(fSource, startLinkPosition, endLinkPosition, suffix);
            if (!fWikiModel.appendRawWikipediaLink(name, suffix)) {
                fCurrentPosition = temp;
                // this is probably a special image link
                throw new InvalidPreWikiTag("parseWikiTag");
            }
            return true;
        } else {
            // No closing "]]": restart the plain-text run at the '[['.
            fWhiteStart = true;
            fWhiteStartPosition = startLinkPosition - 2;
            fCurrentPosition = temp + 1;
        }
        return false;
    }

    /**
     * Extracts the raw body for a just-parsed opening tag and forwards to
     * {@link #handleTag}.
     *
     * <p>For body tags (non-empty {@link IBodyTag}) the body is the text up to
     * the matching case-insensitive closing tag (or to end of input if the
     * closing tag is missing), and the scanner is advanced past it. For all
     * other tags the body is <code>null</code>.
     *
     * @param tag                the cloned model tag token (may be null)
     * @param tagNode            the parsed opening tag
     * @param startMacroPosition source offset where the tag body starts
     */
    private void createTag(TagToken tag, WikiTagNode tagNode, int startMacroPosition) {
        String endTag;
        String macroBodyString = "";
        int index0;
        String command = tagNode.getTagName();
        if ((tag != null) && (tag instanceof IBodyTag) && (!tagNode.isEmptyXmlTag())) {
            endTag = command + '>';
            index0 = Util.indexOfIgnoreCase(fStringSource, "</", endTag, startMacroPosition);
            if (index0 >= 0) {
                macroBodyString = fStringSource.substring(startMacroPosition, index0);
                // Skip body + "</" + tag name + ">".
                fCurrentPosition = index0 + endTag.length() + 2;
            } else {
                // Unclosed body tag: consume everything to end of input.
                macroBodyString = fStringSource.substring(startMacroPosition, fSource.length);
                fCurrentPosition = fSource.length;
            }
        } else {
            macroBodyString = null;
            fCurrentPosition = startMacroPosition;
        }

        handleTag(tag, tagNode, macroBodyString);
    }

    /**
     * Pushes/appends the tag into the wiki model and processes its body.
     *
     * <p>{@link INoBodyParsingTag} bodies are attached verbatim; other bodies
     * are parsed recursively with this parser. Any failure is converted into a
     * <code>&lt;div class="error"&gt;</code> node so one broken tag cannot
     * abort rendering.
     *
     * @param tag        the tag token to add (pushed, or appended for end tags)
     * @param tagNode    the parsed tag (used only for its name in errors)
     * @param bodyString raw body text, or <code>null</code> for bodiless tags
     */
    private void handleTag(TagToken tag, WikiTagNode tagNode, String bodyString) {
        String command = tagNode.getTagName();
        try {
            if (tag instanceof EndTagToken) {
                fWikiModel.append(tag);
            } else {
                fWikiModel.pushNode(tag);
                if (null != bodyString) {
                    if (tag instanceof INoBodyParsingTag) {
                        // Body is taken literally, no wiki parsing inside.
                        ((TagNode) tag).addChild(new ContentToken(bodyString));
                    } else {
                        // recursively filter tags within the tags body string
                        WikipediaPreTagParser.parseRecursive(bodyString.trim(), fWikiModel, false, true);
                    }
                }
                if (tag instanceof IBodyTag) {
                    fWikiModel.popNode();
                }
            }
        } catch (IllegalArgumentException e) {
            TagNode divTagNode = new TagNode("div");
            divTagNode.addAttribute("class", "error", true);
            divTagNode.addChild(new ContentToken("IllegalArgumentException: " + command + " - " + e.getMessage()));
            fWikiModel.append(divTagNode);
            e.printStackTrace();
        } catch (Throwable e) {
            e.printStackTrace();
            TagNode divTagNode = new TagNode("div");
            divTagNode.addAttribute("class", "error", true);
            divTagNode.addChild(new ContentToken(command + ": " + e.getMessage()));
            fWikiModel.append(divTagNode);
            e.printStackTrace();
        }
    }

    /**
     * Main parse loop: consumes tokens from {@link #getNextToken()} until EOF,
     * translating bold/italic/underline/strike tokens into push/pop operations
     * on the wiki model's tag stack, then reduces any still-open markup.
     *
     * <p>The BOLDITALIC cases handle all the ways ''''' can close or reopen
     * partially-open <code>&lt;b&gt;</code>/<code>&lt;i&gt;</code> nesting.
     */
    @Override
    public void runParser() {
        int token = TokenSTART;
        while ((token = getNextToken()) != TokenEOF) {
            switch (token) {
            case TokenBOLDITALIC:
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
                    fWikiModel.popNode();
                    // fResultBuffer.append("</i></b>");
                } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(BOLD)
                        && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(ITALIC)) {
                    fWikiModel.popNode();
                    fWikiModel.popNode();
                    // fResultBuffer.append("</b></i>");
                } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(ITALIC)
                        && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(BOLD)) {
                    fWikiModel.popNode();
                    fWikiModel.popNode();
                    // fResultBuffer.append("</i></b>");
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) {
                    // ''''' after open bold: close bold, open italic.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("i"));
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) {
                    // ''''' after open italic: close italic, open bold.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("b"));
                } else {
                    fWikiModel.pushNode(new WPBoldItalicTag());
                    // fResultBuffer.append("<b><i>");
                }
                break;
            case TokenBOLD:
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
                    // ''' inside ''''': close the bold half, keep italic open.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("i"));
                    // fResultBuffer.append("</b>");
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) {
                    fWikiModel.popNode();
                    // fResultBuffer.append("</b>");
                } else {
                    fWikiModel.pushNode(new WPTag("b"));
                    // fResultBuffer.append("<b>");
                }
                break;
            case TokenITALIC:
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
                    // '' inside ''''': close the italic half, keep bold open.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("b"));
                    // fResultBuffer.append("</i>");
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) {
                    fWikiModel.popNode();
                    // fResultBuffer.append("</i>");
                } else {
                    fWikiModel.pushNode(new WPTag("i"));
                    // fResultBuffer.append("<i>");
                }
                break;
            case TokenUNDERLNE:
                // %% toggles underline on/off.
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(UNDERLINE)) {
                    fWikiModel.popNode();
                } else {
                    fWikiModel.pushNode(UNDERLINE);
                }
                break;
            case TokenDELETEDLINE:
                // %%% toggles strike-through on/off.
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(DELETEDLINE)) {
                    fWikiModel.popNode();
                } else {
                    fWikiModel.pushNode(DELETEDLINE);
                }
                break;
            }
        }
        // Close any markup still open at end of input.
        reduceTokenStack();
    }

    /**
     * Pre-tag content never produces a table of contents.
     *
     * @return always <code>false</code>
     */
    public boolean isNoToC() {
        return false;
    }

    /** No-op: the TOC setting is irrelevant inside a pre block. */
    @Override
    public void setNoToC(boolean noToC) {
    }

    /**
     * Call the parser on the subsequent recursion levels, where the subtexts (of
     * templates, table cells, list items or image captions) don't contain a table
     * of contents (TOC)
     *
     * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
     * <code>tearDown()</code> methods for the subsequent recursive parser steps.
     *
     * @param rawWikitext the wiki snippet to parse
     * @param wikiModel   the model receiving the parsed nodes
     */
    public static void parseRecursive(String rawWikitext, IWikiModel wikiModel) {
        parseRecursive(rawWikitext, wikiModel, false, true);
    }

    /**
     * Call the parser on the subsequent recursion levels, where the subtexts (of
     * templates, table cells, list items or image captions) don't contain a table
     * of contents (TOC)
     *
     * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
     * <code>tearDown()</code> methods for the subsequent recursive parser steps.
     *
     * @param rawWikitext          the wiki snippet to parse
     * @param wikiModel            the model receiving the parsed nodes
     * @param createOnlyLocalStack if <code>true</code>, the result is not
     *                             appended to the model's global stack
     * @param noTOC                unused here; kept for signature symmetry with
     *                             other parsers
     * @return the local tag stack produced for this snippet
     */
    public static TagStack parseRecursive(String rawWikitext, IWikiModel wikiModel, boolean createOnlyLocalStack,
            boolean noTOC) {
        AbstractParser parser = new WikipediaPreTagParser(rawWikitext);
        return parser.parseRecursiveInternal(wikiModel, createOnlyLocalStack, noTOC);
    }

    /**
     * Runs this parser against the model using a fresh local stack, guarding
     * against runaway recursion and converting unexpected failures into an
     * error <code>&lt;span&gt;</code> node.
     *
     * <p>{@link InvalidPreWikiTag} is rethrown (after forcing local-only mode
     * so the partial result is not appended globally); all other exceptions and
     * errors are swallowed into the output. The model's original stack is
     * always restored in the <code>finally</code> block.
     *
     * @param wikiModel            the model to parse into
     * @param createOnlyLocalStack if <code>true</code>, do not append the local
     *                             stack to the model's global stack
     * @param noTOC                unused in this implementation
     * @return the local stack containing the parsed nodes
     */
    @Override
    public TagStack parseRecursiveInternal(IWikiModel wikiModel, boolean createOnlyLocalStack, boolean noTOC) {
        // local stack for this wiki snippet
        TagStack localStack = new TagStack();
        // global wiki model stack
        TagStack globalWikiModelStack = wikiModel.swapStack(localStack);
        try {
            // fix for infinite recursion
            // if (wikiModel.incrementParserRecursionLevel() >
            // Configuration.PARSER_RECURSION_LIMIT) {
            // TagNode error = new TagNode("span");
            // error.addAttribute("class", "error", true);
            // error.addChild(new
            // ContentToken("Error - total recursion count limit exceeded parsing wiki tags."));
            // localStack.append(error);
            // return localStack;
            // }

            int level = wikiModel.incrementRecursionLevel();
            if (level > Configuration.PARSER_RECURSION_LIMIT) {
                TagNode error = new TagNode("span");
                error.addAttribute("class", "error", true);
                error.addChild(new ContentToken("Error - recursion limit exceeded parsing wiki tags."));
                localStack.append(error);
                return localStack;
            }
            // WikipediaParser parser = new WikipediaParser(rawWikitext,
            // wikiModel.isTemplateTopic(), wikiModel.getWikiListener());
            setModel(wikiModel);
            runParser();
            return localStack;
        } catch (InvalidPreWikiTag ipwt) {
            // Abort: make sure the partial local stack is NOT appended globally
            // (the flag is read in the finally block below).
            createOnlyLocalStack = true;
            throw ipwt;
        } catch (Exception e) {
            e.printStackTrace();
            TagNode error = new TagNode("span");
            error.addAttribute("class", "error", true);
            error.addChild(new ContentToken(e.getClass().getSimpleName()));
            localStack.append(error);
        } catch (Error e) {
            e.printStackTrace();
            TagNode error = new TagNode("span");
            error.addAttribute("class", "error", true);
            error.addChild(new ContentToken(e.getClass().getSimpleName()));
            localStack.append(error);
        } finally {
            wikiModel.decrementRecursionLevel();
            if (!createOnlyLocalStack) {
                // append the resursively parsed local stack to the global wiki
                // model
                // stack
                globalWikiModelStack.append(localStack);
            }
            wikiModel.swapStack(globalWikiModelStack);
        }
        return localStack;
    }
}