/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cyberneko.html.filters;
import java.util.Hashtable;
import mf.org.apache.xerces.xni.*;
/**
* This class is a document filter capable of removing specified
* elements from the processing stream. There are two options for
* processing document elements:
* <ul>
* <li>specifying those elements which should be accepted and,
* optionally, which attributes of that element should be
* kept; and
* <li>specifying those elements whose tags and content should be
* completely removed from the event stream.
* </ul>
* <p>
* The first option allows the application to specify which elements
* appearing in the event stream should be accepted and, therefore,
* passed on to the next stage in the pipeline. All elements
* <em>not</em> in the list of acceptable elements have their start
* and end tags stripped from the event stream <em>unless</em> those
* elements appear in the list of elements to be removed.
* <p>
* The second option allows the application to specify which elements
* should be completely removed from the event stream. When an element
* appears that is to be removed, the element's start and end tag as
* well as all of that element's content is removed from the event
* stream.
* <p>
* A common use of this filter would be to only allow rich-text
* and linking elements as well as the character content to pass
* through the filter — all other elements would be stripped.
* The following code shows how to configure this filter to perform
* this task:
* <pre>
* ElementRemover remover = new ElementRemover();
* remover.acceptElement("b", null);
* remover.acceptElement("i", null);
* remover.acceptElement("u", null);
* remover.acceptElement("a", new String[] { "href" });
* </pre>
* <p>
* However, this would still allow the text content of other
* elements to pass through, which may not be desirable. In order
* to further "clean" the input, the <code>removeElement</code>
* option can be used. The following piece of code adds the ability
* to completely remove any <SCRIPT> tags and content
* from the stream.
* <pre>
* remover.removeElement("script");
* </pre>
* <p>
* <strong>Note:</strong>
* All text and accepted element children of a stripped element is
* retained. To completely remove an element's content, use the
* <code>removeElement</code> method.
* <p>
* <strong>Note:</strong>
* Care should be taken when using this filter because the output
* may not be a well-balanced tree. Specifically, if the application
* removes the <HTML> element (with or without retaining its
* children), the resulting document event stream will no longer be
* well-formed.
*
* @author Andy Clark
*
* @version $Id: ElementRemover.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
*/
public class ElementRemover
extends DefaultFilter {
//
// Constants
//
/** A "null" object. */
protected static final Object NULL = new Object();
//
// Data
//
// information
/** Accepted elements. */
protected Hashtable fAcceptedElements = new Hashtable();
/** Removed elements. */
protected Hashtable fRemovedElements = new Hashtable();
// state
/** The element depth. */
protected int fElementDepth;
/** The element depth at element removal. */
protected int fRemovalElementDepth;
//
// Public methods
//
/**
* Specifies that the given element should be accepted and, optionally,
* which attributes of that element should be kept.
*
* @param element The element to accept.
* @param attributes The list of attributes to be kept or null if no
* attributes should be kept for this element.
*
* see #removeElement
*/
public void acceptElement(String element, String[] attributes) {
Object key = element.toLowerCase();
Object value = NULL;
if (attributes != null) {
String[] newarray = new String[attributes.length];
for (int i = 0; i < attributes.length; i++) {
newarray[i] = attributes[i].toLowerCase();
}
value = attributes;
}
fAcceptedElements.put(key, value);
} // acceptElement(String,String[])
/**
* Specifies that the given element should be completely removed. If an
* element is encountered during processing that is on the remove list,
* the element's start and end tags as well as all of content contained
* within the element will be removed from the processing stream.
*
* @param element The element to completely remove.
*/
public void removeElement(String element) {
Object key = element.toLowerCase();
Object value = NULL;
fRemovedElements.put(key, value);
} // removeElement(String)
//
// XMLDocumentHandler methods
//
// since Xerces-J 2.2.0
/** Start document. */
public void startDocument(XMLLocator locator, String encoding,
NamespaceContext nscontext, Augmentations augs)
throws XNIException {
fElementDepth = 0;
fRemovalElementDepth = Integer.MAX_VALUE;
super.startDocument(locator, encoding, nscontext, augs);
} // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
// old methods
/** Start document. */
public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
throws XNIException {
startDocument(locator, encoding, null, augs);
} // startDocument(XMLLocator,String,Augmentations)
/** Start prefix mapping. */
public void startPrefixMapping(String prefix, String uri, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.startPrefixMapping(prefix, uri, augs);
}
} // startPrefixMapping(String,String,Augmentations)
/** Start element. */
public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
super.startElement(element, attributes, augs);
}
fElementDepth++;
} // startElement(QName,XMLAttributes,Augmentations)
/** Empty element. */
public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
super.emptyElement(element, attributes, augs);
}
} // emptyElement(QName,XMLAttributes,Augmentations)
/** Comment. */
public void comment(XMLString text, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.comment(text, augs);
}
} // comment(XMLString,Augmentations)
/** Processing instruction. */
public void processingInstruction(String target, XMLString data, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.processingInstruction(target, data, augs);
}
} // processingInstruction(String,XMLString,Augmentations)
/** Characters. */
public void characters(XMLString text, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.characters(text, augs);
}
} // characters(XMLString,Augmentations)
/** Ignorable whitespace. */
public void ignorableWhitespace(XMLString text, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.ignorableWhitespace(text, augs);
}
} // ignorableWhitespace(XMLString,Augmentations)
/** Start general entity. */
public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.startGeneralEntity(name, id, encoding, augs);
}
} // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
/** Text declaration. */
public void textDecl(String version, String encoding, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.textDecl(version, encoding, augs);
}
} // textDecl(String,String,Augmentations)
/** End general entity. */
public void endGeneralEntity(String name, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.endGeneralEntity(name, augs);
}
} // endGeneralEntity(String,Augmentations)
/** Start CDATA section. */
public void startCDATA(Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.startCDATA(augs);
}
} // startCDATA(Augmentations)
/** End CDATA section. */
public void endCDATA(Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.endCDATA(augs);
}
} // endCDATA(Augmentations)
/** End element. */
public void endElement(QName element, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth && elementAccepted(element.rawname)) {
super.endElement(element, augs);
}
fElementDepth--;
if (fElementDepth == fRemovalElementDepth) {
fRemovalElementDepth = Integer.MAX_VALUE;
}
} // endElement(QName,Augmentations)
/** End prefix mapping. */
public void endPrefixMapping(String prefix, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.endPrefixMapping(prefix, augs);
}
} // endPrefixMapping(String,Augmentations)
//
// Protected methods
//
/** Returns true if the specified element is accepted. */
protected boolean elementAccepted(String element) {
Object key = element.toLowerCase();
return fAcceptedElements.containsKey(key);
} // elementAccepted(String):boolean
/** Returns true if the specified element should be removed. */
protected boolean elementRemoved(String element) {
Object key = element.toLowerCase();
return fRemovedElements.containsKey(key);
} // elementRemoved(String):boolean
/** Handles an open tag. */
protected boolean handleOpenTag(QName element, XMLAttributes attributes) {
if (elementAccepted(element.rawname)) {
Object key = element.rawname.toLowerCase();
Object value = fAcceptedElements.get(key);
if (value != NULL) {
String[] anames = (String[])value;
int attributeCount = attributes.getLength();
LOOP: for (int i = 0; i < attributeCount; i++) {
String aname = attributes.getQName(i).toLowerCase();
for (int j = 0; j < anames.length; j++) {
if (anames[j].equals(aname)) {
continue LOOP;
}
}
attributes.removeAttributeAt(i--);
attributeCount--;
}
}
else {
attributes.removeAllAttributes();
}
return true;
}
else if (elementRemoved(element.rawname)) {
fRemovalElementDepth = fElementDepth;
}
return false;
} // handleOpenTag(QName,XMLAttributes):boolean
} // class DefaultFilter