// CompositeTag.java example
//
// Explorer
// EclipseTrader-master
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v $
// $Author: derrickoswald $
// $Date: 2005/06/20 01:56:32 $
// $Revision: 1.81 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tags;

import java.util.Locale;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Text;
import org.htmlparser.Tag;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.scanners.CompositeTagScanner;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.NodeVisitor;

/**
 * The base class for tags that have an end tag.
 * Provides extra accessors for the children above and beyond what the basic
 * {@link Tag} provides. Also handles the conversion of its children for
 * the {@link #toHtml toHtml} method.
 */
public class CompositeTag extends TagNode
{
    /**
     * The tag that causes this tag to finish.
     * May be a virtual tag generated by the scanning logic.
     */
    protected Tag mEndTag;

    /**
     * The default scanner for composite tags.
     * A single shared instance that the constructor installs on every
     * newly created composite tag.
     */
    protected final static CompositeTagScanner mDefaultCompositeScanner = new CompositeTagScanner ();

    /**
     * Create a composite tag that uses the shared default composite scanner.
     */
    public CompositeTag ()
    {
        setThisScanner (mDefaultCompositeScanner);
    }

    /**
     * Get an iterator over the children of this node.
     * @return An iterator over the children of this node. When this tag has
     * no child list the iterator is over a new, empty list (never
     * <code>null</code>).
     */
    public SimpleNodeIterator children ()
    {
        NodeList kids;

        kids = getChildren ();
        if (null == kids)
            kids = new NodeList ();

        return (kids.elements ());
    }

    /**
     * Get the child of this node at the given position.
     * @param index The index in the node list of the child.
     * @return The child at that index, or <code>null</code> if this tag has
     * no child list.
     */
    public Node getChild (int index)
    {
        NodeList kids;

        kids = getChildren ();

        return ((null == kids) ? null : kids.elementAt (index));
    }

    /**
     * Get the children as an array of <code>Node</code> objects.
     * @return The children in an array; a zero length array if this tag has
     * no child list.
     */
    public Node [] getChildrenAsNodeArray ()
    {
        NodeList kids;

        kids = getChildren ();

        return ((null == kids) ? new Node[0] : kids.toNodeArray ());
    }

    /**
     * Remove the child at the position given.
     * Does nothing if this tag has no child list.
     * @param i The index of the child to remove.
     */
    public void removeChild (int i)
    {
        NodeList kids;

        kids = getChildren ();
        if (null != kids)
            kids.remove (i);
    }

    /**
     * Return the child tags as an iterator.
     * Same result as {@link #children}, to which this method delegates.
     * @return An iterator over the children.
     */
    public SimpleNodeIterator elements ()
    {
        return (children ());
    }

    /**
     * Return the textual contents of this tag and its children.
     * The result is the concatenation of <code>toPlainTextString()</code>
     * of each direct child, in order.
     * @return The 'browser' text contents of this tag.
     */
    public String toPlainTextString ()
    {
        StringBuffer ret;

        ret = new StringBuffer ();
        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
            ret.append (e.nextNode ().toPlainTextString ());

        return (ret.toString ());
    }

    /**
     * Add the HTML of the children of this node to the buffer.
     * Note: virtual tags (start position equal to end position) are
     * <em>not</em> filtered out here.
     * @param sb The buffer to append to.
     */
    protected void putChildrenInto (StringBuffer sb)
    {
        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
            sb.append (e.nextNode ().toHtml ());
    }

    /**
     * Add the HTML of the end tag of this node to the buffer.
     * Nothing is appended if there is no end tag.
     * Note: a virtual end tag (start position equal to end position) is
     * <em>not</em> filtered out here.
     * @param sb The buffer to append to.
     */
    protected void putEndTagInto (StringBuffer sb)
    {
        Tag end;

        end = getEndTag ();
        // guard locally so that callers other than toHtml() cannot trip over a missing end tag
        if (null != end)
            sb.append (end.toHtml ());
    }

    /**
     * Return this tag as HTML code.
     * For an empty XML tag (<code>isEmptyXmlTag()</code>) only the start
     * tag is emitted.
     * @return This tag and its contents (children) and the end tag
     * as HTML code.
     */
    public String toHtml ()
    {
        StringBuffer sb;

        sb = new StringBuffer ();
        sb.append (super.toHtml ());
        if (!isEmptyXmlTag ())
        {
            putChildrenInto (sb);
            if (null != getEndTag ())
                putEndTagInto (sb);
        }

        return (sb.toString ());
    }

    /**
     * Searches the direct children for a tag with a matching NAME attribute.
     * Returns the first match; the comparison is an exact
     * <code>String.equals()</code> on the attribute value.
     * @param name Attribute value to match in the tag.
     * @return The first child tag whose NAME attribute equals
     * <code>name</code>, or <code>null</code> if there is none.
     */
    public Tag searchByName (String name)
    {
        Node node;
        Tag tag;
        String nameAttribute;

        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
        {
            node = e.nextNode ();
            if (node instanceof Tag)
            {
                tag = (Tag)node;
                nameAttribute = tag.getAttribute ("NAME");
                if ((null != nameAttribute) && nameAttribute.equals (name))
                    return (tag);
            }
        }

        return (null);
    }

    /**
     * Searches for all nodes whose text representation contains the search string.
     * Collects all nodes containing the search string into a NodeList.
     * This search is <b>case-insensitive</b> and the search string and the
     * node text are converted to uppercase using an English locale.
     * For example, if you wish to find any textareas in a form tag containing
     * "hello world", the code would be:
     * <code>
     * NodeList nodeList = formTag.searchFor("Hello World");
     * </code>
     * @param searchString Search criterion.
     * @return A collection of nodes whose string contents or
     * representation have the <code>searchString</code> in them.
     */
    public NodeList searchFor (String searchString)
    {
        return (searchFor (searchString, false));
    }

    /**
     * Searches for all nodes whose text representation contains the search string.
     * Collects all nodes containing the search string into a NodeList.
     * For example, if you wish to find any textareas in a form tag containing
     * "hello world", the code would be:
     * <code>
     * NodeList nodeList = formTag.searchFor("Hello World");
     * </code>
     * @param searchString Search criterion.
     * @param caseSensitive If <code>true</code> this search should be case
     * sensitive. Otherwise, the search string and the node text are converted
     * to uppercase using an English locale.
     * @return A collection of nodes whose string contents or
     * representation have the <code>searchString</code> in them.
     */
    public NodeList searchFor (String searchString, boolean caseSensitive)
    {
        return (searchFor (searchString, caseSensitive, Locale.ENGLISH));
    }

    /**
     * Searches for all nodes whose text representation contains the search string.
     * Collects all nodes containing the search string into a NodeList.
     * Only the direct children of this tag are examined; each is tested
     * using its <code>toPlainTextString()</code>.
     * For example, if you wish to find any textareas in a form tag containing
     * "hello world", the code would be:
     * <code>
     * NodeList nodeList = formTag.searchFor("Hello World");
     * </code>
     * @param searchString Search criterion.
     * @param caseSensitive If <code>true</code> this search should be case
     * sensitive. Otherwise, the search string and the node text are converted
     * to uppercase using the locale provided.
     * @param locale The locale for uppercase conversion.
     * @return A collection of nodes whose string contents or
     * representation have the <code>searchString</code> in them.
     */
    public NodeList searchFor (String searchString, boolean caseSensitive, Locale locale)
    {
        String needle;
        String text;
        Node node;
        NodeList ret;

        ret = new NodeList ();
        // normalise the criterion once, outside the loop
        needle = caseSensitive ? searchString : searchString.toUpperCase (locale);
        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
        {
            node = e.nextNode ();
            text = node.toPlainTextString ();
            if (!caseSensitive)
                text = text.toUpperCase (locale);
            if (-1 != text.indexOf (needle))
                ret.add (node);
        }

        return (ret);
    }

    /**
     * Collect all child nodes that are of a certain type.
     * Matching is delegated to a {@link NodeClassFilter} built from
     * <code>classType</code>.
     * @param classType The class to search for.
     * @param recursive If <code>true</code>, recursively search through the
     * children; otherwise only the direct children are considered.
     * @return A list of children found; an empty list if this tag has no
     * child list.
     */
    public NodeList searchFor (Class classType, boolean recursive)
    {
        NodeList kids;

        kids = getChildren ();
        if (null == kids)
            return (new NodeList ());

        return (kids.extractAllNodesThatMatch (new NodeClassFilter (classType), recursive));
    }

    /**
     * Returns the node number of the first node containing the given text.
     * This can be useful to index into the composite tag and get other children.
     * Text is compared without case sensitivity and conversion to uppercase
     * uses an English locale.
     * @param text The text to search for.
     * @return int The node index in the children list of the node containing
     * the text or -1 if not found.
     * @see #findPositionOf (String, Locale)
     */
    public int findPositionOf (String text)
    {
        return (findPositionOf (text, Locale.ENGLISH));
    }

    /**
     * Returns the node number of the first node containing the given text.
     * This can be useful to index into the composite tag and get other children.
     * Text is compared without case sensitivity and conversion to uppercase
     * uses the supplied locale.
     * @param text The text to search for.
     * @param locale The locale to use in converting to uppercase.
     * @return int The node index in the children list of the node containing
     * the text or -1 if not found.
     */
    public int findPositionOf (String text, Locale locale)
    {
        String needle;
        int position;

        needle = text.toUpperCase (locale);
        position = 0;
        for (SimpleNodeIterator e = children (); e.hasMoreNodes (); position++)
            if (-1 != e.nextNode ().toPlainTextString ().toUpperCase (locale).indexOf (needle))
                return (position);

        return (-1);
    }

    /**
     * Returns the node number of a child node given the node object.
     * This would typically be used in conjunction with digupStringNode,
     * after which the string node's parent can be used to find the
     * string node's position. Faster than calling findPositionOf(text)
     * again. Note that the position is at a linear level alone - there
     * is no recursion in this method. Nodes are compared by identity
     * (<code>==</code>), not by <code>equals()</code>.
     * @param searchNode The child node to find.
     * @return The offset of the child tag or -1 if it was not found.
     */
    public int findPositionOf (Node searchNode)
    {
        int position;

        position = 0;
        for (SimpleNodeIterator e = children (); e.hasMoreNodes (); position++)
            if (e.nextNode () == searchNode)
                return (position);

        return (-1);
    }

    /**
     * Get child at given index.
     * Same result as {@link #getChild}, to which this method delegates.
     * @param index The index into the child node list.
     * @return Node The child node at the given index or null if this tag
     * has no child list.
     */
    public Node childAt (int index)
    {
        return (getChild (index));
    }

    /**
     * Collect this node and its child nodes (if-applicable) into the list parameter,
     * provided the node satisfies the filtering criteria.
     * <p>This mechanism allows powerful filtering code to be written very easily,
     * without bothering about collection of embedded tags separately.
     * e.g. when we try to get all the links on a page, it is not possible to
     * get it at the top-level, as many tags (like form tags), can contain
     * links embedded in them. We could get the links out by checking if the
     * current node is a {@link CompositeTag}, and going through its children.
     * So this method provides a convenient way to do this.</p>
     * <p>Using collectInto(), programs get a lot shorter. Now, the code to
     * extract all links from a page would look like:</p>
     * <pre>
     * NodeList list = new NodeList();
     * NodeFilter filter = new TagNameFilter ("A");
     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
     *      e.nextNode().collectInto(list, filter);
     * </pre>
     * <p>Thus, <code>list</code> will hold all the link nodes, irrespective of how
     * deep the links are embedded.</p>
     * <p>Another way to accomplish the same objective is:</p>
     * <pre>
     * NodeList list = new NodeList();
     * NodeFilter filter = new NodeClassFilter (LinkTag.class);
     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
     *      e.nextNode().collectInto(list, filter);
     * </pre>
     * <p>This is slightly less specific because the LinkTag class may be
     * registered for more than one node name, e.g. &lt;LINK&gt; tags too.</p>
     * <p>The order of collection is: this tag, then each child, then the
     * end tag (skipped when there is none or when this tag is its own end
     * tag).</p>
     * @param list The list to add nodes to.
     * @param filter The filter to apply.
     * @see org.htmlparser.filters
     */
    public void collectInto (NodeList list, NodeFilter filter)
    {
        Tag end;

        super.collectInto (list, filter);
        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
            e.nextNode ().collectInto (list, filter);
        end = getEndTag ();
        if ((null != end) && (this != end)) // 2nd guard handles <tag/>
            end.collectInto (list, filter);
    }

    /**
     * Return the HTML code for the children of this tag.
     * @return A string with the HTML code for the contents of this tag.
     */
    public String getChildrenHTML ()
    {
        StringBuffer ret;

        ret = new StringBuffer ();
        // toHtml() is declared on Node, so no downcast of the child is needed
        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
            ret.append (e.nextNode ().toHtml ());

        return (ret.toString ());
    }

    /**
     * Tag visiting code.
     * If <code>shouldRecurseSelf()</code> returns true the visitor is first
     * asked to visit this tag via <code>visitTag()</code>. Then, if
     * <code>shouldRecurseChildren()</code> returns true, the child list is
     * walked invoking <code>accept()</code> on each of the children,
     * finishing up with an <code>accept()</code> call on the end tag
     * (skipped when there is none or when this tag is its own end tag).
     * @param visitor The <code>NodeVisitor</code> object to be signalled
     * for each child and possibly this tag.
     */
    public void accept (NodeVisitor visitor)
    {
        Tag end;

        if (visitor.shouldRecurseSelf ())
            visitor.visitTag (this);
        if (visitor.shouldRecurseChildren ())
        {
            // children() yields an empty iterator when there is no child list
            for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
                e.nextNode ().accept (visitor);
            end = getEndTag ();
            if ((null != end) && (this != end)) // 2nd guard handles <tag/>
                end.accept (visitor);
        }
    }

    /**
     * Return the number of child nodes in this tag.
     * @return The child node count; zero if this tag has no child list.
     */
    public int getChildCount ()
    {
        NodeList kids;

        kids = getChildren ();

        return ((null == kids) ? 0 : kids.size ());
    }

    /**
     * Get the end tag for this tag.
     * For example, if the node is {@.html <LABEL>The label</LABEL>}, then
     * this method would return the {@.html </LABEL>} end tag.
     * @return The end tag for this node, possibly <code>null</code> if none
     * has been set.
     * <em>Note: If the start and end position of the end tag is the same,
     * then the end tag was injected (it's a virtual end tag).</em>
     */
    public Tag getEndTag ()
    {
        return (mEndTag);
    }

    /**
     * Set the end tag for this tag.
     * @param tag The new end tag for this tag.
     * Note: no checking is performed so you can generate bad HTML by setting
     * the end tag with a name not equal to the name of the start tag,
     * i.e. {@.html <LABEL>The label</TITLE>}
     */
    public void setEndTag (Tag tag)
    {
        mEndTag = tag;
    }

    /**
     * Finds text nodes containing the search text, however deeply embedded
     * they might be, and returns them. The text nodes retain links to
     * their parents, so further navigation is possible.
     * The search uses {@link #searchFor(String)} at each level, so it is
     * case-insensitive with an English locale.
     * @param searchText The text to search for.
     * @return The list of text nodes (recursively) found; a zero length
     * array if there are none.
     */
    public Text[] digupStringNode (String searchText)
    {
        NodeList matches;
        NodeList found;
        Node node;
        Text[] nested;
        Text[] ret;

        matches = searchFor (searchText);
        found = new NodeList ();
        for (int i = 0; i < matches.size (); i++)
        {
            node = matches.elementAt (i);
            if (node instanceof Text)
                found.add (node);
            else if (node instanceof CompositeTag)
            {
                // descend: the match came from text somewhere below this child
                nested = ((CompositeTag)node).digupStringNode (searchText);
                for (int j = 0; j < nested.length; j++)
                    found.add (nested[j]);
            }
        }

        ret = new Text[found.size ()];
        for (int i = 0; i < ret.length; i++)
            ret[i] = (Text)found.elementAt (i);

        return (ret);
    }

    /**
     * Return a string representation of the contents of this tag, its
     * children and its end tag suitable for debugging.
     * @return A textual representation of the tag.
     * @see #toString(int, StringBuffer)
     */
    public String toString ()
    {
        StringBuffer ret;

        ret = new StringBuffer (1024);
        toString (0, ret);

        return (ret.toString ());
    }

    /**
     * Return the text contained in this tag.
     * Computed as the start tag's HTML with its first and last character
     * removed.
     * @return The complete contents of the tag (within the angle brackets).
     */
    public String getText ()
    {
        String html;

        html = super.toHtml ();

        return (html.substring (1, html.length () - 1));
    }

    /**
     * Return the text between the start tag and the end tag.
     * The text is read from the page, from the end position of this tag
     * to the start position of the end tag.
     * @return The contents of the CompositeTag, or an empty string when
     * there is no end tag or when this tag is its own end tag
     * (i.e. <code>&lt;tag/&gt;</code>), since nothing lies between the two.
     */
    public String getStringText ()
    {
        // no separate end tag means there is no "between" range to read
        if ((null == mEndTag) || (this == mEndTag))
            return ("");

        return (getPage ().getText (getEndPosition (), mEndTag.getStartPosition ()));
    }

    /**
     * Return a string representation of the contents of this tag, its
     * children and its end tag suitable for debugging.
     * Each node is written on its own line, indented two spaces per level.
     * Composite children recurse at <code>level + 1</code>; other children
     * and the end tag are written one level deeper than this tag.
     * Note: a virtual end tag (start position equal to end position) is
     * <em>not</em> filtered out.
     * @param level The indentation level to use.
     * @param buffer The buffer to append to.
     */
    public void toString (int level, StringBuffer buffer)
    {
        String eol;
        Node node;
        Tag end;

        // look the separator up once rather than for every emitted line
        eol = System.getProperty ("line.separator");

        for (int i = 0; i < level; i++)
            buffer.append ("  ");
        buffer.append (super.toString ());
        buffer.append (eol);

        for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
        {
            node = e.nextNode ();
            if (node instanceof CompositeTag)
                ((CompositeTag)node).toString (level + 1, buffer);
            else
            {
                for (int i = 0; i <= level; i++)
                    buffer.append ("  ");
                buffer.append (node);
                buffer.append (eol);
            }
        }

        end = getEndTag ();
        if ((null != end) && (this != end)) // 2nd guard handles <tag/>
        {
            for (int i = 0; i <= level; i++)
                buffer.append ("  ");
            buffer.append (end.toString ());
            buffer.append (eol);
        }
    }
}