package info.bliki.wiki.filter;

import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.EndTagToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.htmlcleaner.TagToken;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.DefaultEventListener;
import info.bliki.wiki.model.IEventListener;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.HTMLBlockTag;
import info.bliki.wiki.tags.HTMLTag;
import info.bliki.wiki.tags.WPBoldItalicTag;
import info.bliki.wiki.tags.WPTag;
import info.bliki.wiki.tags.util.Attribute;
import info.bliki.wiki.tags.util.IBodyTag;
import info.bliki.wiki.tags.util.INoBodyParsingTag;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.TagStack;
import info.bliki.wiki.tags.util.WikiTagNode;

import java.util.List;

/**
 * A Wikipedia syntax parser for parsing in wiki preformatted blocks (rendered
 * as &lt;pre&gt;...&lt;/pre&gt;).
 *
 * <p>Only a restricted subset of wiki markup is recognized here: wiki links
 * (<code>[[...]]</code>), bold/italic apostrophe markup, the
 * <code>%%</code>/<code>%%%</code> underline/strike markup, inline HTML tags and
 * HTML comments. Scanner state (<code>fCurrentPosition</code>,
 * <code>fWhiteStart*</code>, <code>fSource</code>, ...) is inherited from
 * {@link AbstractParser}.
 */
public class WikipediaPreTagParser extends AbstractParser {
    /**
     * Enable HTML tags
     */
    // Always true in this parser; the '<' branch of getNextToken() is guarded by it.
    private final boolean fHtmlCodes = true;

    // Listener notified of recognized wiki links (never null after construction).
    private IEventListener fEventListener = null;

    /**
     * Creates a parser over the given raw wikitext with the default (no-op)
     * event listener.
     *
     * @param stringSource the raw wikitext to scan
     */
    public WikipediaPreTagParser(String stringSource) {
        this(stringSource, null);
    }

    /**
     * Creates a parser over the given raw wikitext.
     *
     * @param stringSource the raw wikitext to scan
     * @param wikiListener listener for wiki-link events; if <code>null</code>,
     *        {@link DefaultEventListener#CONST} is used
     */
    public WikipediaPreTagParser(String stringSource, IEventListener wikiListener) {
        super(stringSource);
        if (wikiListener == null) {
            fEventListener = DefaultEventListener.CONST;
        } else {
            fEventListener = wikiListener;
        }
    }

    /**
     * Scans forward from the current position and returns the next recognized
     * token constant (e.g. <code>TokenBOLD</code>, <code>TokenITALIC</code>,
     * <code>TokenEOF</code>). Plain text is accumulated into content tokens as
     * a side effect via <code>createContentToken</code>; HTML tags and wiki
     * links are appended directly to the wiki model and reported as
     * <code>TokenIgnore</code>.
     *
     * <p>End of input is detected by letting the array access throw
     * {@link IndexOutOfBoundsException} — the surrounding catch turns it into
     * <code>TokenEOF</code>.
     */
    public int getNextToken() // throws InvalidInputException
    {
        fWhiteStart = true;
        fWhiteStartPosition = fCurrentPosition;
        try {
            while (true) {
                fCurrentCharacter = fSource[fCurrentPosition++];

                // ---------Identify the next token-------------
                switch (fCurrentCharacter) {
                case '\n':
                    // check at the end of line, if there is open wiki bold or italic
                    // markup
                    reduceTokenStackBoldItalic();
                    break;
                }
                // ---------Identify the next token-------------
                switch (fCurrentCharacter) {
                case '[':
                    if (parseWikiLink()) {
                        continue;
                    }
                    break;
                case '\'':
                    // Apostrophe runs: ''=italic, '''=bold, '''''=bold+italic.
                    // A run of exactly four is treated as bold with one literal
                    // apostrophe pushed back (fCurrentPosition -= 1).
                    if (getNextChar('\'')) {
                        if (getNextChar('\'')) {
                            if (getNextChar('\'')) {
                                if (getNextChar('\'')) {
                                    createContentToken(5);
                                    return TokenBOLDITALIC;
                                }
                                fCurrentPosition -= 1;
                                fWhiteStart = true;
                                createContentToken(3);
                                return TokenBOLD;
                            }
                            createContentToken(3);
                            return TokenBOLD;
                        }
                        createContentToken(2);
                        return TokenITALIC;
                    }
                    break;
                case '%':
                    // %% toggles underline, %%% toggles strike-through.
                    if (getNextChar('%')) {
                        if (getNextChar('%')) {
                            createContentToken(3);
                            return TokenDELETEDLINE;
                        }
                        createContentToken(2);
                        return TokenUNDERLNE;
                    }
                    break;
                case '<':
                    if (fHtmlCodes) {
                        // Remember where the tag candidate starts so the scanner
                        // can be rewound if it turns out not to be a valid tag.
                        int htmlStartPosition = fCurrentPosition;
                        // HTML tags are allowed
                        try {
                            switch (fStringSource.charAt(fCurrentPosition)) {
                            case '!':
                                // <!-- HTML comment -->
                                if (parseHTMLCommentTags()) {
                                    continue;
                                }
                                break;
                            default:
                                if (fSource[fCurrentPosition] != '/') {
                                    // opening HTML tag
                                    WikiTagNode tagNode = parseTag(fCurrentPosition);
                                    if (tagNode != null) {
                                        String tagName = tagNode.getTagName();
                                        TagToken tag = fWikiModel.getTokenMap().get(tagName);
                                        // Block-level tags are deliberately NOT handled
                                        // inside a <pre> context.
                                        if ((tag != null) && !(tag instanceof HTMLBlockTag)) {
                                            tag = (TagToken) tag.clone();
                                            if (tag instanceof TagNode) {
                                                TagNode node = (TagNode) tag;
                                                List<NodeAttribute> attributes = tagNode.getAttributesEx();
                                                Attribute attr;
                                                // NOTE(review): starts at index 1 — presumably
                                                // getAttributesEx() stores the tag name itself at
                                                // index 0; confirm against htmlcleaner before changing.
                                                for (int i = 1; i < attributes.size(); i++) {
                                                    attr = attributes.get(i);
                                                    node.addAttribute(attr.getName(), attr.getValue(), true);
                                                }
                                            }
                                            if (tag instanceof HTMLTag) {
                                                // ((HTMLTag) tag).setTemplate(isTemplate());
                                            }
                                            // Flush pending plain text, then jump past the tag
                                            // (parseTag left the end in fScannerPosition).
                                            createContentToken(1);
                                            fCurrentPosition = fScannerPosition;

                                            String allowedParents = tag.getParents();
                                            if (allowedParents != null) {
                                                reduceTokenStack(tag);
                                            }
                                            createTag(tag, tagNode, tagNode.getEndPosition());
                                            return TokenIgnore;
                                        }
                                        break;
                                    }
                                } else {
                                    // closing HTML tag
                                    WikiTagNode tagNode = parseTag(++fCurrentPosition);
                                    if (tagNode != null) {
                                        String tagName = tagNode.getTagName();
                                        TagToken tag = fWikiModel.getTokenMap().get(tagName);
                                        if ((tag != null) && !(tag instanceof HTMLBlockTag)) {
                                            createContentToken(2);
                                            fCurrentPosition = fScannerPosition;

                                            if (fWikiModel.stackSize() > 0) {
                                                TagToken topToken = fWikiModel.peekNode();
                                                if (topToken.getName().equals(tag.getName())) {
                                                    // Well-nested close: pop the matching open tag.
                                                    fWikiModel.popNode();
                                                    return TokenIgnore;
                                                } else {
                                                    // Mis-nested close: optionally unwind the stack
                                                    // down to the matching tag.
                                                    if (tag.isReduceTokenStack()) {
                                                        reduceStackUntilToken(tag);
                                                    }
                                                }
                                            } else {
                                                // Stray closing tag with nothing open: ignored.
                                            }
                                            return TokenIgnore;
                                        }
                                        break;
                                    }
                                }
                            }
                        } catch (IndexOutOfBoundsException e) {
                            // do nothing — '<' near end of input is plain text
                        }
                        // Not a recognized tag: rewind and treat '<' as content.
                        fCurrentPosition = htmlStartPosition;
                    }
                    break;
                default:
                    // NOTE(review): ISBN / URI-scheme / CamelCase link detection is
                    // intentionally disabled in the pre-tag parser (kept for reference):
                    // if (Character.isLetter(fCurrentCharacter)) {
                    // if (fCurrentPosition < 2 ||
                    // !Character.isLetterOrDigit(fSource[fCurrentPosition - 2])) {
                    // if (fCurrentCharacter == 'i' || fCurrentCharacter == 'I') {
                    // // ISBN ?
                    // if (parseISBNLinks()) {
                    // continue;
                    // }
                    // }
                    //
                    // if (parseURIScheme()) {
                    // // a URI scheme registered in the wiki model (ftp, http,
                    // // https,...)
                    // continue;
                    // }
                    //
                    // if (fWikiModel.isCamelCaseEnabled() &&
                    // Character.isUpperCase(fCurrentCharacter)
                    // && fWikiModel.getRecursionLevel() <= 1) {
                    // if (parseCamelCaseLink()) {
                    // continue;
                    // }
                    // }
                    // }
                    // }
                }

                // Start a new plain-text run right before the character just read.
                if (!fWhiteStart) {
                    fWhiteStart = true;
                    fWhiteStartPosition = fCurrentPosition - 1;
                }
            }
            // -----------------end switch while try--------------------
        } catch (IndexOutOfBoundsException e) {
            // end of scanner text
        }
        try {
            // Flush any trailing plain text before reporting EOF.
            createContentToken(1);
        } catch (IndexOutOfBoundsException e) {
            // end of scanner text
        }
        return TokenEOF;
    }

    /**
     * Tries to consume an HTML comment (<code>&lt;!-- ... --&gt;</code>)
     * starting at the current position (the scanner has already consumed the
     * leading <code>'&lt;'</code>).
     *
     * @return <code>true</code> if a complete comment was consumed and emitted
     *         as a content token; <code>false</code> leaves the position at the
     *         character after '&lt;' (caller rewinds)
     */
    private boolean parseHTMLCommentTags() {
        int htmlStartPosition = fCurrentPosition;
        // fCurrentPosition - 1 points at '<'; probe the 4-char "<!--" prefix.
        String htmlCommentString = fStringSource.substring(fCurrentPosition - 1, fCurrentPosition + 3);

        if (htmlCommentString.equals("<!--")) {
            fCurrentPosition += 3;
            if (readUntil("-->")) {
                String htmlCommentContent = fStringSource.substring(htmlStartPosition + 3, fCurrentPosition - 3);
                if (htmlCommentContent != null) {
                    createContentToken(fCurrentPosition - htmlStartPosition + 1);
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Parse a wiki section starting with a '[' character
     *
     * <p>A second '[' delegates to {@link #parseWikiTag()} ("[[...]]" link).
     * Otherwise a single-bracket "[...]" span is scanned up to ']' on the same
     * line and, if BBCode parsing is enabled in the model, interpreted as a
     * phpBB-style start token.
     *
     * @return <code>true</code> if a correct link was found
     */
    private boolean parseWikiLink() {
        int startLinkPosition = fCurrentPosition;
        if (getNextChar('[')) {
            return parseWikiTag();
        } else {
            createContentToken(1);
            fWhiteStart = false;

            if (readUntilCharOrStopAtEOL(']')) {
                String name = fStringSource.substring(startLinkPosition, fCurrentPosition - 1);

                // bbcode start
                if (fWikiModel.parseBBCodes() && name.length() > 0) {
                    // parse start tokens like phpBB forum syntax style (bbcode)
                    char ch = name.charAt(0);
                    if ('a' <= ch && ch <= 'z') {
                        // first character must be a letter
                        StringBuilder bbCode = new StringBuilder(name.length());
                        bbCode.append(ch);
                        if (parsePHPBBCode(name, bbCode)) {
                            return true;
                        }
                    }
                }
                // bbcode end

                // if (handleHTTPLink(name)) {
                // return true;
                // }
            }
            // Nothing recognized: rewind so '[' is handled as plain text.
            fCurrentPosition = startLinkPosition;
        }
        return false;
    }

    /**
     * Parse a wiki section starting with a '[[' sequence
     *
     * <p>On success the link (plus any lowercase "plural" suffix directly
     * behind the closing brackets) is appended to the wiki model and reported
     * to the event listener. If the model rejects the link (e.g. a special
     * image link), an {@link InvalidPreWikiTag} is thrown so the caller can
     * abandon pre-tag parsing.
     *
     * @return <code>true</code> if a correct link was found
     */
    private boolean parseWikiTag() {
        int startLinkPosition = fCurrentPosition;
        int endLinkPosition;
        // wikipedia link style
        createContentToken(2);
        int temp = fCurrentPosition;
        if (findWikiLinkEnd()) {
            endLinkPosition = fCurrentPosition - 2;
            String name = fStringSource.substring(startLinkPosition, endLinkPosition);
            // test for a suffix string behind the Wiki link. Useful for plurals.
            // Example:
            // Dolphins are [[aquatic mammal]]s that are closely related to [[whale]]s
            // and [[porpoise]]s.
            temp = fCurrentPosition;
            String suffix = "";
            try {
                fCurrentCharacter = fSource[fCurrentPosition];
                if (Character.isLowerCase(fCurrentCharacter)) {
                    fCurrentPosition++;
                    StringBuilder suffixBuffer = new StringBuilder(16);
                    suffixBuffer.append(fCurrentCharacter);
                    // Consume the run of lowercase letters directly after "]]".
                    while (true) {
                        fCurrentCharacter = fSource[fCurrentPosition++];
                        if (!Character.isLowerCase(fCurrentCharacter)) {
                            fCurrentPosition--;
                            break;
                        }
                        suffixBuffer.append(fCurrentCharacter);
                    }
                    suffix = suffixBuffer.toString();
                }
            } catch (IndexOutOfBoundsException e) {
                // Suffix ran into end of input: keep position right after "]]".
                fCurrentPosition = temp;
            }
            fEventListener.onWikiLink(fSource, startLinkPosition, endLinkPosition, suffix);
            if (!fWikiModel.appendRawWikipediaLink(name, suffix)) {
                fCurrentPosition = temp;
                // this is probably a special image link
                throw new InvalidPreWikiTag("parseWikiTag");
            }
            return true;
        } else {
            // No closing "]]": restart the plain-text run at the '[['.
            fWhiteStart = true;
            fWhiteStartPosition = startLinkPosition - 2;
            fCurrentPosition = temp + 1;
        }
        return false;
    }

    /**
     * Extracts the raw body for a just-parsed opening tag and forwards to
     * {@link #handleTag}.
     *
     * <p>For body tags (non-empty {@link IBodyTag}) the body is the text up to
     * the matching case-insensitive closing tag (or to end of input if the
     * closing tag is missing), and the scanner is advanced past it. For all
     * other tags the body is <code>null</code>.
     *
     * @param tag                the cloned model tag token (may be null)
     * @param tagNode            the parsed opening tag
     * @param startMacroPosition source offset where the tag body starts
     */
    private void createTag(TagToken tag, WikiTagNode tagNode, int startMacroPosition) {
        String endTag;
        String macroBodyString = "";
        int index0;
        String command = tagNode.getTagName();
        if ((tag != null) && (tag instanceof IBodyTag) && (!tagNode.isEmptyXmlTag())) {
            endTag = command + '>';
            index0 = Util.indexOfIgnoreCase(fStringSource, "</", endTag, startMacroPosition);
            if (index0 >= 0) {
                macroBodyString = fStringSource.substring(startMacroPosition, index0);
                // Skip body + "</" + tag name + ">".
                fCurrentPosition = index0 + endTag.length() + 2;
            } else {
                // Unclosed body tag: consume everything to end of input.
                macroBodyString = fStringSource.substring(startMacroPosition, fSource.length);
                fCurrentPosition = fSource.length;
            }
        } else {
            macroBodyString = null;
            fCurrentPosition = startMacroPosition;
        }

        handleTag(tag, tagNode, macroBodyString);
    }

    /**
     * Pushes/appends the tag into the wiki model and processes its body.
     *
     * <p>{@link INoBodyParsingTag} bodies are attached verbatim; other bodies
     * are parsed recursively with this parser. Any failure is converted into a
     * <code>&lt;div class="error"&gt;</code> node so one broken tag cannot
     * abort rendering.
     *
     * @param tag        the tag token to add (pushed, or appended for end tags)
     * @param tagNode    the parsed tag (used only for its name in errors)
     * @param bodyString raw body text, or <code>null</code> for bodiless tags
     */
    private void handleTag(TagToken tag, WikiTagNode tagNode, String bodyString) {
        String command = tagNode.getTagName();
        try {
            if (tag instanceof EndTagToken) {
                fWikiModel.append(tag);
            } else {
                fWikiModel.pushNode(tag);
                if (null != bodyString) {
                    if (tag instanceof INoBodyParsingTag) {
                        // Body is taken literally, no wiki parsing inside.
                        ((TagNode) tag).addChild(new ContentToken(bodyString));
                    } else {
                        // recursively filter tags within the tags body string
                        WikipediaPreTagParser.parseRecursive(bodyString.trim(), fWikiModel, false, true);
                    }
                }
                if (tag instanceof IBodyTag) {
                    fWikiModel.popNode();
                }
            }
        } catch (IllegalArgumentException e) {
            TagNode divTagNode = new TagNode("div");
            divTagNode.addAttribute("class", "error", true);
            divTagNode.addChild(new ContentToken("IllegalArgumentException: " + command + " - " + e.getMessage()));
            fWikiModel.append(divTagNode);
            e.printStackTrace();
        } catch (Throwable e) {
            e.printStackTrace();
            TagNode divTagNode = new TagNode("div");
            divTagNode.addAttribute("class", "error", true);
            divTagNode.addChild(new ContentToken(command + ": " + e.getMessage()));
            fWikiModel.append(divTagNode);
            e.printStackTrace();
        }
    }

    /**
     * Main parse loop: consumes tokens from {@link #getNextToken()} until EOF,
     * translating bold/italic/underline/strike tokens into push/pop operations
     * on the wiki model's tag stack, then reduces any still-open markup.
     *
     * <p>The BOLDITALIC cases handle all the ways ''''' can close or reopen
     * partially-open <code>&lt;b&gt;</code>/<code>&lt;i&gt;</code> nesting.
     */
    @Override
    public void runParser() {
        int token = TokenSTART;
        while ((token = getNextToken()) != TokenEOF) {
            switch (token) {
            case TokenBOLDITALIC:
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
                    fWikiModel.popNode();
                    // fResultBuffer.append("</i></b>");
                } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(BOLD)
                        && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(ITALIC)) {
                    fWikiModel.popNode();
                    fWikiModel.popNode();
                    // fResultBuffer.append("</b></i>");
                } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(ITALIC)
                        && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(BOLD)) {
                    fWikiModel.popNode();
                    fWikiModel.popNode();
                    // fResultBuffer.append("</i></b>");
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) {
                    // ''''' after open bold: close bold, open italic.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("i"));
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) {
                    // ''''' after open italic: close italic, open bold.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("b"));
                } else {
                    fWikiModel.pushNode(new WPBoldItalicTag());
                    // fResultBuffer.append("<b><i>");
                }
                break;
            case TokenBOLD:
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
                    // ''' inside ''''': close the bold half, keep italic open.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("i"));
                    // fResultBuffer.append("</b>");
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) {
                    fWikiModel.popNode();
                    // fResultBuffer.append("</b>");
                } else {
                    fWikiModel.pushNode(new WPTag("b"));
                    // fResultBuffer.append("<b>");
                }
                break;
            case TokenITALIC:
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
                    // '' inside ''''': close the italic half, keep bold open.
                    fWikiModel.popNode();
                    fWikiModel.pushNode(new WPTag("b"));
                    // fResultBuffer.append("</i>");
                } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) {
                    fWikiModel.popNode();
                    // fResultBuffer.append("</i>");
                } else {
                    fWikiModel.pushNode(new WPTag("i"));
                    // fResultBuffer.append("<i>");
                }
                break;
            case TokenUNDERLNE:
                // %% toggles underline on/off.
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(UNDERLINE)) {
                    fWikiModel.popNode();
                } else {
                    fWikiModel.pushNode(UNDERLINE);
                }
                break;
            case TokenDELETEDLINE:
                // %%% toggles strike-through on/off.
                if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(DELETEDLINE)) {
                    fWikiModel.popNode();
                } else {
                    fWikiModel.pushNode(DELETEDLINE);
                }
                break;
            }
        }
        // Close any markup still open at end of input.
        reduceTokenStack();
    }

    /**
     * Pre-tag content never produces a table of contents.
     *
     * @return always <code>false</code>
     */
    public boolean isNoToC() {
        return false;
    }

    /** No-op: the TOC setting is irrelevant inside a pre block. */
    @Override
    public void setNoToC(boolean noToC) {
    }

    /**
     * Call the parser on the subsequent recursion levels, where the subtexts (of
     * templates, table cells, list items or image captions) don't contain a table
     * of contents (TOC)
     *
     * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
     * <code>tearDown()</code> methods for the subsequent recursive parser steps.
     *
     * @param rawWikitext the wiki snippet to parse
     * @param wikiModel   the model receiving the parsed nodes
     */
    public static void parseRecursive(String rawWikitext, IWikiModel wikiModel) {
        parseRecursive(rawWikitext, wikiModel, false, true);
    }

    /**
     * Call the parser on the subsequent recursion levels, where the subtexts (of
     * templates, table cells, list items or image captions) don't contain a table
     * of contents (TOC)
     *
     * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
     * <code>tearDown()</code> methods for the subsequent recursive parser steps.
     *
     * @param rawWikitext          the wiki snippet to parse
     * @param wikiModel            the model receiving the parsed nodes
     * @param createOnlyLocalStack if <code>true</code>, the result is not
     *                             appended to the model's global stack
     * @param noTOC                unused here; kept for signature symmetry with
     *                             other parsers
     * @return the local tag stack produced for this snippet
     */
    public static TagStack parseRecursive(String rawWikitext, IWikiModel wikiModel, boolean createOnlyLocalStack,
            boolean noTOC) {
        AbstractParser parser = new WikipediaPreTagParser(rawWikitext);
        return parser.parseRecursiveInternal(wikiModel, createOnlyLocalStack, noTOC);
    }

    /**
     * Runs this parser against the model using a fresh local stack, guarding
     * against runaway recursion and converting unexpected failures into an
     * error <code>&lt;span&gt;</code> node.
     *
     * <p>{@link InvalidPreWikiTag} is rethrown (after forcing local-only mode
     * so the partial result is not appended globally); all other exceptions and
     * errors are swallowed into the output. The model's original stack is
     * always restored in the <code>finally</code> block.
     *
     * @param wikiModel            the model to parse into
     * @param createOnlyLocalStack if <code>true</code>, do not append the local
     *                             stack to the model's global stack
     * @param noTOC                unused in this implementation
     * @return the local stack containing the parsed nodes
     */
    @Override
    public TagStack parseRecursiveInternal(IWikiModel wikiModel, boolean createOnlyLocalStack, boolean noTOC) {
        // local stack for this wiki snippet
        TagStack localStack = new TagStack();
        // global wiki model stack
        TagStack globalWikiModelStack = wikiModel.swapStack(localStack);
        try {
            // fix for infinite recursion
            // if (wikiModel.incrementParserRecursionLevel() >
            // Configuration.PARSER_RECURSION_LIMIT) {
            // TagNode error = new TagNode("span");
            // error.addAttribute("class", "error", true);
            // error.addChild(new
            // ContentToken("Error - total recursion count limit exceeded parsing wiki tags."));
            // localStack.append(error);
            // return localStack;
            // }

            int level = wikiModel.incrementRecursionLevel();
            if (level > Configuration.PARSER_RECURSION_LIMIT) {
                TagNode error = new TagNode("span");
                error.addAttribute("class", "error", true);
                error.addChild(new ContentToken("Error - recursion limit exceeded parsing wiki tags."));
                localStack.append(error);
                return localStack;
            }
            // WikipediaParser parser = new WikipediaParser(rawWikitext,
            // wikiModel.isTemplateTopic(), wikiModel.getWikiListener());
            setModel(wikiModel);
            runParser();
            return localStack;
        } catch (InvalidPreWikiTag ipwt) {
            // Abort: make sure the partial local stack is NOT appended globally
            // (the flag is read in the finally block below).
            createOnlyLocalStack = true;
            throw ipwt;
        } catch (Exception e) {
            e.printStackTrace();
            TagNode error = new TagNode("span");
            error.addAttribute("class", "error", true);
            error.addChild(new ContentToken(e.getClass().getSimpleName()));
            localStack.append(error);
        } catch (Error e) {
            e.printStackTrace();
            TagNode error = new TagNode("span");
            error.addAttribute("class", "error", true);
            error.addChild(new ContentToken(e.getClass().getSimpleName()));
            localStack.append(error);
        } finally {
            wikiModel.decrementRecursionLevel();
            if (!createOnlyLocalStack) {
                // append the resursively parsed local stack to the global wiki
                // model
                // stack
                globalWikiModelStack.append(localStack);
            }
            wikiModel.swapStack(globalWikiModelStack);
        }
        return localStack;
    }
}