/* * This library is part of OpenCms - * the Open Source Content Management System * * Copyright (c) Alkacon Software GmbH (http://www.alkacon.com) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * For further information about Alkacon Software GmbH, please see the * company website: http://www.alkacon.com * * For further information about OpenCms, please see the * project website: http://www.opencms.org * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.opencms.util; import org.opencms.main.CmsLog; import java.util.Set; import java.util.TreeSet; import java.util.Vector; import org.apache.commons.logging.Log; import org.htmlparser.Attribute; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.Tag; import org.htmlparser.lexer.Page; import org.htmlparser.scanners.Scanner; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.SimpleNodeIterator; import org.htmlparser.visitors.NodeVisitor; /** * * A tag factory for htmlparser that is able to "remove tags".<p> * * Create an instance, add the {@link org.htmlparser.Tag} instances to remove and assign this * factory to the {@link org.htmlparser.Parser} before starting a visit. A demo usage is shown in * <code>CmsTagReplaceParser</code>.<p> * * The tags are not actually removed: They are linked in the document object model tree of the HTML * that the parser generates. They just will not accept any {@link NodeVisitor} instances and * therefore be invisible in any output a visitor will generate from the visited tree.<p> * * The decision whether a tag is removed can be controlled in two ways: * <ol> * <li> * <code>{@link #addTagRemoval(Tag)}</code><br/> * <p> * The given tag will be removed ("invisible in the DOM"). * </p> * </li> * <li> * <code>{@link #addTagPreserve(Tag)}</code><br/> * <p> * The given tag will be kept as-is. The following behaviour happens if this method is used: * <ol> * <li> * Once <code>{@link #addTagPreserve(Tag)}</code> has been called all Tags that are not added * to this method will be removed. <strong>We are in include mode then</strong>. * </li> * <li> * The Tags provided to <code>{@link #addTagRemoval(Tag)}</code> will only have the * power to hide exactly the same tags that are given to <code>{@link #addTagPreserve(Tag)}</code>: * <strong>Deny is stronger than allow.</strong> * </li> * </ol> * </p> * </li> * </ol> * * @since 6.1.8 */ public final class CmsHtmlTagRemoveFactory extends PrototypicalNodeFactory { /** * A Tag implementation that will not accept any {@link NodeVisitor} stopping by.<p> * * When visiting the corresponding tree of tags, this tag will be there but the visitor will not * see it as it is not accepted. This allows "elimination" of this tag in the output the visitor * generates from the document object model (e.g. HTML code again).<p> * * Potential child tags will be visible to visitors (unless they are instances of this class).<p> * * @since 6.1.8 */ private static final class CmsInvisibleTag implements Tag { /** Generated serial version UID. */ private static final long serialVersionUID = -3397880117291165819L; /** The real underlying tag. */ private Tag m_decorated; /** * Constructor with the delegate to wrap. * <p> * * Every property is accessed transparently from the delegate, except that visitors are not * welcome. * <p> * * @param delegate the tag to hide. */ CmsInvisibleTag(Tag delegate) { m_decorated = delegate; } /** * @see org.htmlparser.Tag#accept(org.htmlparser.visitors.NodeVisitor) */ public void accept(NodeVisitor visitor) { // be invisible but show the children (if they like visits) NodeList children = m_decorated.getChildren(); if (children == null) { return; } SimpleNodeIterator itChildren = children.elements(); while (itChildren.hasMoreNodes()) { itChildren.nextNode().accept(visitor); } } /** * @see org.htmlparser.Tag#breaksFlow() */ public boolean breaksFlow() { return m_decorated.breaksFlow(); } /** * @see org.htmlparser.Node#clone() */ public Object clone() throws CloneNotSupportedException { return m_decorated.clone(); } /** * @see org.htmlparser.Node#collectInto(org.htmlparser.util.NodeList, * org.htmlparser.NodeFilter) */ public void collectInto(NodeList arg0, NodeFilter arg1) { m_decorated.collectInto(arg0, arg1); } /** * @see org.htmlparser.Node#doSemanticAction() */ public void doSemanticAction() throws ParserException { m_decorated.doSemanticAction(); } /** * @see org.htmlparser.Tag#getAttribute(java.lang.String) */ public String getAttribute(String arg0) { return m_decorated.getAttribute(arg0); } /** * @see org.htmlparser.Tag#getAttributeEx(java.lang.String) */ public Attribute getAttributeEx(String arg0) { return m_decorated.getAttributeEx(arg0); } /** * @see org.htmlparser.Tag#getAttributesEx() */ public Vector getAttributesEx() { return m_decorated.getAttributesEx(); } /** * @see org.htmlparser.Node#getChildren() */ public NodeList getChildren() { return m_decorated.getChildren(); } /** * @see org.htmlparser.Tag#getEnders() */ public String[] getEnders() { return m_decorated.getEnders(); } /** * @see org.htmlparser.Tag#getEndingLineNumber() */ public int getEndingLineNumber() { return m_decorated.getEndingLineNumber(); } /** * @see org.htmlparser.Node#getEndPosition() */ public int getEndPosition() { return m_decorated.getEndPosition(); } /** * @see org.htmlparser.Tag#getEndTag() */ public Tag getEndTag() { return m_decorated.getEndTag(); } /** * @see org.htmlparser.Tag#getEndTagEnders() */ public String[] getEndTagEnders() { return m_decorated.getEndTagEnders(); } /** * @see org.htmlparser.Node#getFirstChild() */ public Node getFirstChild() { return m_decorated.getFirstChild(); } /** * @see org.htmlparser.Tag#getIds() */ public String[] getIds() { return m_decorated.getIds(); } /** * @see org.htmlparser.Node#getLastChild() */ public Node getLastChild() { return m_decorated.getLastChild(); } /** * @see org.htmlparser.Node#getNextSibling() */ public Node getNextSibling() { return m_decorated.getNextSibling(); } /** * @see org.htmlparser.Node#getPage() */ public Page getPage() { return m_decorated.getPage(); } /** * @see org.htmlparser.Node#getParent() */ public Node getParent() { return m_decorated.getParent(); } /** * @see org.htmlparser.Node#getPreviousSibling() */ public Node getPreviousSibling() { return m_decorated.getPreviousSibling(); } /** * @see org.htmlparser.Tag#getRawTagName() */ public String getRawTagName() { return m_decorated.getRawTagName(); } /** * @see org.htmlparser.Tag#getStartingLineNumber() */ public int getStartingLineNumber() { return m_decorated.getStartingLineNumber(); } /** * @see org.htmlparser.Node#getStartPosition() */ public int getStartPosition() { return m_decorated.getStartPosition(); } /** * @see org.htmlparser.Tag#getTagName() */ public String getTagName() { return m_decorated.getTagName(); } /** * @see org.htmlparser.Node#getText() */ public String getText() { return m_decorated.getText(); } /** * @see org.htmlparser.Tag#getThisScanner() */ public Scanner getThisScanner() { return m_decorated.getThisScanner(); } /** * @see org.htmlparser.Tag#isEmptyXmlTag() */ public boolean isEmptyXmlTag() { return m_decorated.isEmptyXmlTag(); } /** * @see org.htmlparser.Tag#isEndTag() */ public boolean isEndTag() { return m_decorated.isEndTag(); } /** * @see org.htmlparser.Tag#removeAttribute(java.lang.String) */ public void removeAttribute(String arg0) { m_decorated.removeAttribute(arg0); } /** * @see org.htmlparser.Tag#setAttribute(java.lang.String, java.lang.String) */ public void setAttribute(String arg0, String arg1) { m_decorated.setAttribute(arg0, arg1); } /** * @see org.htmlparser.Tag#setAttribute(java.lang.String, java.lang.String, char) */ public void setAttribute(String arg0, String arg1, char arg2) { m_decorated.setAttribute(arg0, arg1, arg2); } /** * @see org.htmlparser.Tag#setAttributeEx(org.htmlparser.Attribute) */ public void setAttributeEx(Attribute arg0) { m_decorated.setAttributeEx(arg0); } /** * @see org.htmlparser.Tag#setAttributesEx(java.util.Vector) */ public void setAttributesEx(Vector arg0) { m_decorated.setAttributesEx(arg0); } /** * @see org.htmlparser.Node#setChildren(org.htmlparser.util.NodeList) */ public void setChildren(NodeList arg0) { m_decorated.setChildren(arg0); } /** * @see org.htmlparser.Tag#setEmptyXmlTag(boolean) */ public void setEmptyXmlTag(boolean arg0) { m_decorated.setEmptyXmlTag(arg0); } /** * @see org.htmlparser.Node#setEndPosition(int) */ public void setEndPosition(int arg0) { m_decorated.setEndPosition(arg0); } /** * @see org.htmlparser.Tag#setEndTag(org.htmlparser.Tag) */ public void setEndTag(Tag arg0) { m_decorated.setEndTag(arg0); } /** * @see org.htmlparser.Node#setPage(org.htmlparser.lexer.Page) */ public void setPage(Page arg0) { m_decorated.setPage(arg0); } /** * @see org.htmlparser.Node#setParent(org.htmlparser.Node) */ public void setParent(Node arg0) { m_decorated.setParent(arg0); } /** * @see org.htmlparser.Node#setStartPosition(int) */ public void setStartPosition(int arg0) { m_decorated.setStartPosition(arg0); } /** * @see org.htmlparser.Tag#setTagName(java.lang.String) */ public void setTagName(String arg0) { m_decorated.setTagName(arg0); } /** * @see org.htmlparser.Node#setText(java.lang.String) */ public void setText(String arg0) { m_decorated.setText(arg0); } /** * @see org.htmlparser.Tag#setThisScanner(org.htmlparser.scanners.Scanner) */ public void setThisScanner(Scanner arg0) { m_decorated.setThisScanner(arg0); } /** * @see org.htmlparser.Node#toHtml() */ public String toHtml() { return m_decorated.toHtml(); } /** * @see org.htmlparser.Node#toHtml(boolean) */ public String toHtml(boolean value) { return m_decorated.toHtml(value); } /** * @see org.htmlparser.Node#toPlainTextString() */ public String toPlainTextString() { return m_decorated.toPlainTextString(); } /** * @see org.htmlparser.Node#toString() */ public String toString() { return m_decorated.toString(); } } /** The log object for this class. */ private static final Log LOG = CmsLog.getLog(CmsHtmlTagRemoveFactory.class); /** Generated serial version UID. */ private static final long serialVersionUID = 6961158563666656633L; /** The tags to hide tothe node visitors. */ private Set m_invisibleTags; /** The tags to show to the node visitors. */ private Set m_visibleTags; /** * Create a new factory with all tags registered. * <p> * */ public CmsHtmlTagRemoveFactory() { super(); m_invisibleTags = new TreeSet(); m_visibleTags = new TreeSet(); } /** * Add a tag that will be visible for {@link NodeVisitor} instances. * <p> * * Not only "this" tag will be visible but all parsed Tags that have the same name (case * insensitive). * <p> * * The given tag will be kept as-is. The following behaviour happens if this method is used: * <ol> * <li> * Once <code>{@link #addTagPreserve(Tag)}</code> has been called all Tags that are not added * to this method will be removed. <strong>We are in include mode then</strong>. * </li> * <li> * The Tags provided to <code>{@link #addTagRemoval(Tag)}</code> will only have the * power to hide exactly the same tags that are given to <code>{@link #addTagPreserve(Tag)}</code>: * <strong>Deny is stronger than allow.</strong> * </li> * </ol> * <p> * * * @param tag the tag that will be visible for all {@link NodeVisitor} instances. * * @return true if the tag was added to the internal set of tags to keep, false if not (was * contained before, has no name,...). */ public boolean addTagPreserve(final Tag tag) { boolean result = false; String tagName = tag.getTagName(); if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) { result = m_visibleTags.add(tagName.toLowerCase()); } return result; } /** * Add a tag that will be invisible for {@link NodeVisitor} instances. * <p> * * Not only "this" tag will be invisible but all parsed Tags that have the same name (case * insensitive). * <p> * * @param tag the tag that will be visible for all {@link NodeVisitor} instances. * * @return true if the tag was added to the internal set of tags to remove, false if not (was * contained before, has no name,...). */ public boolean addTagRemoval(final Tag tag) { boolean result = false; String tagName = tag.getTagName(); if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) { result = m_invisibleTags.add(tagName.toLowerCase()); } return result; } /** * @see org.htmlparser.PrototypicalNodeFactory#createTagNode(org.htmlparser.lexer.Page, int, * int, java.util.Vector) */ public Tag createTagNode(Page arg0, int arg1, int arg2, Vector arg3) { try { String tagName = ((Attribute)arg3.get(0)).getName().toLowerCase(); // end tags have names like "/a".... if (tagName.charAt(0) == '/') { tagName = tagName.substring(1); } Tag result = super.createTagNode(arg0, arg1, arg2, arg3); if (!keepTag(tagName)) { result = new CmsInvisibleTag(result); } return result; } catch (RuntimeException rte) { if (LOG.isErrorEnabled()) { // log here, as htmlparser 1.5 did swallow exceptions from here and threw NPEs from // other places LOG.error(rte); } throw rte; } } /** * Encapsulation of the "preserve / remove" logic.<p> * * @param tagName the lower case name of the tag to keep or hide * * @return if true the given Tag will be kept, if false it will be removed */ private boolean keepTag(final String tagName) { boolean result = false; // include mode: if (m_visibleTags.size() > 0) { if (m_visibleTags.contains(tagName)) { result = true; } else { result = false; } } // Power of hide: if no visible tags configured this works as a normal remove, // if visible tags are configured this can change a visible tag to be invisible if (m_invisibleTags.contains(tagName)) { result = false; } return result; } }