// CompositeTagScanner.java example
//
// Explorer
// EclipseTrader-master
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/10 23:20:44 $
// $Revision: 1.90 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.scanners;

import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.scanners.Scanner;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * The main scanning logic for nested tags.
 * When asked to scan, this class gathers nodes into a hierarchy of tags:
 * nodes read from the lexer become children of the tag being scanned until
 * that tag's end tag (real or virtual) is reached.
 */
public class CompositeTagScanner extends TagScanner
{
    /**
     * Determine whether to use JVM or NodeList stack.
     * This can be set to true to get the original behaviour of
     * recursion into composite tags on the JVM stack.
     * This may lead to stack overflow (StackOverflowError) problems in some
     * cases, e.g. on Windows.
     * When false, nesting of tags handled by this same scanner is tracked on
     * the <code>stack</code> argument of {@link #scan} instead.
     */
    private static final boolean mUseJVMStack = false;

    /**
     * Determine whether unexpected end tags should cause stack roll-up.
     * This can be set to true to get the original behaviour of gathering
     * end tags into whatever tag is open.
     * When false (and the NodeList stack is in use), an unexpected end tag
     * triggers a search up the parse stack for a tag it can close.
     * This can be expensive, but should only be needed in the presence of
     * bad HTML.
     */
    private static final boolean mLeaveEnds = false;

    /**
     * Create a composite tag scanner.
     */
    public CompositeTagScanner ()
    {
    }

    /**
     * Collect the children.
     * <p>An initial test is performed for an empty XML tag, in which case
     * the start tag and end tag of the returned tag are the same and it has
     * no children.<p>
     * If it's not an empty XML tag, the lexer is repeatedly asked for
     * subsequent nodes until an end tag is found or a node is encountered
     * that matches the tag ender set or end tag ender set.
     * In the latter case, a virtual end tag is created.
     * Each node found that is not the end tag is added to
     * the list of children. The end tag is special and not a child.<p>
     * Nodes that also have a CompositeTagScanner as their scanner are
     * recursed into, which provides the nested structure of an HTML page.
     * This method operates in two possible modes, depending on a private boolean.
     * It can recurse on the JVM stack, which has caused some overflow problems
     * in the past, or it can use the supplied stack argument to nest scanning
     * of child tags within itself. The former is left as an option in the code,
     * mostly to help subsequent modifiers visualize what the internal nesting
     * is doing.
     * @param tag The tag this scanner is responsible for.
     * @param lexer The source of subsequent nodes.
     * @param stack The parse stack. May contain pending tags that enclose
     * this tag.
     * @return The resultant tag (may be unchanged). When the NodeList stack
     * is in use this may be an enclosing tag taken from <code>stack</code>
     * rather than the tag passed in.
     * @throws ParserException Passed on from the lexer, a nested scanner or
     * the finishing of a tag.
     */
    public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException
    {
        // 'node' doubles as the loop control: null means "the current tag is
        // complete (or input is exhausted)", non-null means "keep scanning".
        Node node;
        Tag next;
        String name;
        Scanner scanner;
        // 'ret' is the tag currently being filled with children; with the
        // NodeList stack it changes as nested tags are entered and left.
        Tag ret;
        
        ret = tag;

        if (ret.isEmptyXmlTag ())
            // an empty XML tag acts as its own end tag and has no children
            ret.setEndTag (ret);
        else
            do
            {
                node = lexer.nextNode (false);
                if (null != node)
                {
                    if (node instanceof Tag)
                    {
                        next = (Tag)node;
                        name = next.getTagName ();
                        // check for normal end tag
                        if (next.isEndTag () && name.equals (ret.getTagName ()))
                        {
                            ret.setEndTag (next);
                            node = null;
                        }
                        else if (isTagToBeEndedFor (ret, next)) // check DTD
                        {
                            // backup one node. insert a virtual end tag later
                            // (the node is read again once the current tag is closed)
                            lexer.setPosition (next.getStartPosition ());
                            node = null;
                        }
                        else if (!next.isEndTag ())
                        {
                            // now recurse if there is a scanner for this type of tag
                            scanner = next.getThisScanner ();
                            if (null != scanner)
                            {
                                if (mUseJVMStack)
                                {   // JVM stack recursion
                                    node = scanner.scan (next, lexer, stack);
                                    addChild (ret, node);
                                }
                                else
                                {
                                    // fake recursion:
                                    if (scanner == this)
                                    {
                                        if (next.isEmptyXmlTag ())
                                        {
                                            // nothing to nest into: complete it on the spot
                                            next.setEndTag (next);
                                            finishTag (next, lexer);
                                            addChild (ret, next);
                                        }
                                        else
                                        {
                                            // remember the enclosing tag and
                                            // carry on scanning inside the new one
                                            stack.add (ret);
                                            ret = next;
                                        }
                                    }
                                    else
                                    {   // normal recursion if switching scanners
                                        node = scanner.scan (next, lexer, stack);
                                        addChild (ret, node);
                                    }
                                }
                            }
                            else
                                addChild (ret, next);
                        }
                        else
                        {
                            if (!mUseJVMStack && !mLeaveEnds)
                            {
                                // Since all non-end tags are consumed by the
                                // previous clause, we're here because we have an
                                // end tag with no opening tag... this could be bad.
                                // There are two cases...
                                // 1) The tag hasn't been registered, in which case
                                // we just add it as a simple child, like its
                                // opening tag
                                // 2) There may be an opening tag further up the
                                // parse stack that needs closing.
                                // So, we ask the factory for a node like this one
                                // (since end tags never have scanners) and see
                                // if its scanner is a composite tag scanner.
                                // If it is we walk up the parse stack looking for
                                // something that needs this end tag to finish it.
                                // If there is something, we close off all the tags
                                // walked over and continue on as if nothing
                                // happened.
                                Vector attributes = new Vector ();
                                attributes.addElement (new Attribute (name, null));
                                Tag opener = lexer.getNodeFactory ().createTagNode (
                                    lexer.getPage (), next.getStartPosition (), next.getEndPosition (),
                                    attributes);

                                scanner = opener.getThisScanner ();
                                if ((null != scanner) && (scanner == this))
                                {
                                    // uh-oh
                                    // search from the innermost pending tag outwards;
                                    // index stays -1 if nothing on the stack matches
                                    int index = -1;
                                    for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--)
                                    {
                                        // short circuit here... assume everything on the stack has this as its scanner
                                        // we'll need to stop if either of those conditions isn't met
                                        Tag boffo = (Tag)stack.elementAt (i);
                                        if (name.equals (boffo.getTagName ()))
                                            index = i;
                                        else if (isTagToBeEndedFor (boffo, next)) // check DTD
                                            index = i;
                                    }
                                    if (-1 != index)
                                    {
                                        // finish off the current one first
                                        finishTag (ret, lexer);
                                        addChild ((Tag)stack.elementAt (stack.size () - 1), ret);
                                        // close every pending tag above the match,
                                        // attaching each to the tag below it
                                        for (int i = stack.size () - 1; i > index; i--)
                                        {
                                            Tag fred = (Tag)stack.remove (i);
                                            finishTag (fred, lexer);
                                            addChild ((Tag)stack.elementAt (i - 1), fred);
                                        }
                                        // the matched tag becomes the current tag and
                                        // is treated as complete (node = null)
                                        // NOTE(review): the end tag 'next' is neither
                                        // attached via setEndTag nor re-read here, so the
                                        // matched tag is presumably closed by a virtual
                                        // end tag in finishTag - confirm this is intended.
                                        ret = (Tag)stack.remove (index);
                                        node = null;
                                    }
                                    else
                                        addChild (ret, next); // default behaviour
                                }
                                else
                                    addChild (ret, next); // default behaviour
                            }
                            else
                                addChild (ret, next);
                        }
                    }
                    else
                    {
                        // not a tag: attach it to the current tag as a plain child
                        addChild (ret, node);
                        node.doSemanticAction ();
                    }
                }

                if (!mUseJVMStack)
                {
                    // handle coming out of fake recursion
                    if (null == node)
                    {
                        int depth = stack.size ();
                        if (0 != depth)
                        {
                            // peek at the innermost pending tag; leaving 'node'
                            // non-null keeps the loop going for that tag
                            node = stack.elementAt (depth - 1);
                            if (node instanceof Tag)
                            {
                                Tag precursor = (Tag)node;
                                scanner = precursor.getThisScanner ();
                                if (scanner == this)
                                {
                                    // pop it, hang the finished tag on it and
                                    // resume scanning the enclosing tag
                                    stack.remove (depth - 1);
                                    finishTag (ret, lexer);
                                    addChild (precursor, ret);
                                    ret = precursor;
                                }
                                else
                                    node = null; // normal recursion
                            }
                            else
                                node = null; // normal recursion
                        }
                    }
                }
            }
            while (null != node);

        // finish whichever tag is current once the loop (if any) has ended
        finishTag (ret, lexer);

        return (ret);
    }

    /**
     * Add a child to the given tag.
     * The parent's child list is created if it doesn't have one yet,
     * the child's parent is set and the child is appended to the list.
     * @param parent The parent tag.
     * @param child The child node.
     */
    protected void addChild (Tag parent, Node child)
    {
        if (null == parent.getChildren ())
            parent.setChildren (new NodeList ());
        child.setParent (parent);
        parent.getChildren ().add (child);
    }

    /**
     * Finish off a tag.
     * Perhaps add a virtual end tag (only if the tag has no end tag yet;
     * it is anchored at the lexer's current cursor position).
     * Set the end tag parent as this tag.
     * Perform the semantic action.
     * @param tag The tag to finish off.
     * @param lexer A lexer positioned at the end of the tag.
     * @throws ParserException Passed on from creating the virtual end tag
     * or from the semantic action.
     */
    protected void finishTag (Tag tag, Lexer lexer)
        throws
            ParserException
    {
        if (null == tag.getEndTag ())
            tag.setEndTag (createVirtualEndTag (tag, lexer, lexer.getPage (), lexer.getCursor ().getPosition ()));
        tag.getEndTag ().setParent (tag);
        tag.doSemanticAction ();
    }

    /**
     * Creates an end tag with the same name as the given tag.
     * @param tag The tag to end.
     * @param lexer The object containing the node factory.
     * @param page The page the tag is on (virtually).
     * @param position The offset into the page at which the tag is to
     * be anchored.
     * @return An end tag with the name '"/" + tag.getRawTagName()' and a start
     * and end position at the given position. The fact these positions are
     * equal may be used to distinguish it as a virtual tag later on.
     * @throws ParserException Declared for the node factory call.
     */
    protected Tag createVirtualEndTag (Tag tag, Lexer lexer, Page page, int position)
        throws
            ParserException
    {
        Tag ret;
        String name;
        Vector attributes;
        
        // the tag name is carried as the single attribute handed to the factory
        name = "/" + tag.getRawTagName ();
        attributes = new Vector ();
        attributes.addElement (new Attribute (name, (String)null));
        ret = lexer.getNodeFactory ().createTagNode (
                                    page, position, position, attributes);
        
        return (ret);
    }

    /**
     * Determine if the current tag should be terminated by the given tag.
     * Examines the 'enders' or 'end tag enders' lists of the current tag
     * for a match with the given tag. Which list is chosen depends on whether
     * tag is an end tag ('end tag enders') or not ('enders').
     * Names are compared without regard to case.
     * @param current The tag that might need to be ended.
     * @param tag The candidate tag that might end the current one.
     * @return <code>true</code> if the name of the given tag is a member of
     * the appropriate list.
     */
    public final boolean isTagToBeEndedFor (Tag current, Tag tag)
    {
        String name;
        String[] ends;
        boolean ret;

        ret = false;

        name = tag.getTagName ();
        if (tag.isEndTag ())
            ends = current.getEndTagEnders ();
        else
            ends = current.getEnders ();
        // linear search; stop at the first matching name
        for (int i = 0; i < ends.length; i++)
            if (name.equalsIgnoreCase (ends[i]))
            {
                ret = true;
                break;
            }
        
        return (ret);
    }
}