// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/10 23:20:44 $
// $Revision: 1.90 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.scanners;
import java.util.Vector;
import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.scanners.Scanner;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* The main scanning logic for nested tags.
* When asked to scan, this class gathers nodes into a heirarchy of tags.
*/
public class CompositeTagScanner extends TagScanner
{
/**
* Determine whether to use JVM or NodeList stack.
* This can be set to true to get the original behaviour of
* recursion into composite tags on the JVM stack.
* This may lead to StackOverFlowException problems in some cases
* i.e. Windows.
*/
private static final boolean mUseJVMStack = false;
/**
* Determine whether unexpected end tags should cause stack roll-up.
* This can be set to true to get the original behaviour of gathering
* end tags into whatever tag is open.
* This can be expensive, but should only be needed in the presence of
* bad HTML.
*/
private static final boolean mLeaveEnds = false;
/**
* Create a composite tag scanner.
*/
public CompositeTagScanner ()
{
}
/**
* Collect the children.
* <p>An initial test is performed for an empty XML tag, in which case
* the start tag and end tag of the returned tag are the same and it has
* no children.<p>
* If it's not an empty XML tag, the lexer is repeatedly asked for
* subsequent nodes until an end tag is found or a node is encountered
* that matches the tag ender set or end tag ender set.
* In the latter case, a virtual end tag is created.
* Each node found that is not the end tag is added to
* the list of children. The end tag is special and not a child.<p>
* Nodes that also have a CompositeTagScanner as their scanner are
* recursed into, which provides the nested structure of an HTML page.
* This method operates in two possible modes, depending on a private boolean.
* It can recurse on the JVM stack, which has caused some overflow problems
* in the past, or it can use the supplied stack argument to nest scanning
* of child tags within itself. The former is left as an option in the code,
* mostly to help subsequent modifiers visualize what the internal nesting
* is doing.
* @param tag The tag this scanner is responsible for.
* @param lexer The source of subsequent nodes.
* @param stack The parse stack. May contain pending tags that enclose
* this tag.
* @return The resultant tag (may be unchanged).
*/
public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException
{
Node node;
Tag next;
String name;
Scanner scanner;
Tag ret;
ret = tag;
if (ret.isEmptyXmlTag ())
ret.setEndTag (ret);
else
do
{
node = lexer.nextNode (false);
if (null != node)
{
if (node instanceof Tag)
{
next = (Tag)node;
name = next.getTagName ();
// check for normal end tag
if (next.isEndTag () && name.equals (ret.getTagName ()))
{
ret.setEndTag (next);
node = null;
}
else if (isTagToBeEndedFor (ret, next)) // check DTD
{
// backup one node. insert a virtual end tag later
lexer.setPosition (next.getStartPosition ());
node = null;
}
else if (!next.isEndTag ())
{
// now recurse if there is a scanner for this type of tag
scanner = next.getThisScanner ();
if (null != scanner)
{
if (mUseJVMStack)
{ // JVM stack recursion
node = scanner.scan (next, lexer, stack);
addChild (ret, node);
}
else
{
// fake recursion:
if (scanner == this)
{
if (next.isEmptyXmlTag ())
{
next.setEndTag (next);
finishTag (next, lexer);
addChild (ret, next);
}
else
{
stack.add (ret);
ret = next;
}
}
else
{ // normal recursion if switching scanners
node = scanner.scan (next, lexer, stack);
addChild (ret, node);
}
}
}
else
addChild (ret, next);
}
else
{
if (!mUseJVMStack && !mLeaveEnds)
{
// Since all non-end tags are consumed by the
// previous clause, we're here because we have an
// end tag with no opening tag... this could be bad.
// There are two cases...
// 1) The tag hasn't been registered, in which case
// we just add it as a simple child, like it's
// opening tag
// 2) There may be an opening tag further up the
// parse stack that needs closing.
// So, we ask the factory for a node like this one
// (since end tags never have scanners) and see
// if it's scanner is a composite tag scanner.
// If it is we walk up the parse stack looking for
// something that needs this end tag to finish it.
// If there is something, we close off all the tags
// walked over and continue on as if nothing
// happened.
Vector attributes = new Vector ();
attributes.addElement (new Attribute (name, null));
Tag opener = lexer.getNodeFactory ().createTagNode (
lexer.getPage (), next.getStartPosition (), next.getEndPosition (),
attributes);
scanner = opener.getThisScanner ();
if ((null != scanner) && (scanner == this))
{
// uh-oh
int index = -1;
for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--)
{
// short circuit here... assume everything on the stack has this as it's scanner
// we'll need to stop if either of those conditions isn't met
Tag boffo = (Tag)stack.elementAt (i);
if (name.equals (boffo.getTagName ()))
index = i;
else if (isTagToBeEndedFor (boffo, next)) // check DTD
index = i;
}
if (-1 != index)
{
// finish off the current one first
finishTag (ret, lexer);
addChild ((Tag)stack.elementAt (stack.size () - 1), ret);
for (int i = stack.size () - 1; i > index; i--)
{
Tag fred = (Tag)stack.remove (i);
finishTag (fred, lexer);
addChild ((Tag)stack.elementAt (i - 1), fred);
}
ret = (Tag)stack.remove (index);
node = null;
}
else
addChild (ret, next); // default behaviour
}
else
addChild (ret, next); // default behaviour
}
else
addChild (ret, next);
}
}
else
{
addChild (ret, node);
node.doSemanticAction ();
}
}
if (!mUseJVMStack)
{
// handle coming out of fake recursion
if (null == node)
{
int depth = stack.size ();
if (0 != depth)
{
node = stack.elementAt (depth - 1);
if (node instanceof Tag)
{
Tag precursor = (Tag)node;
scanner = precursor.getThisScanner ();
if (scanner == this)
{
stack.remove (depth - 1);
finishTag (ret, lexer);
addChild (precursor, ret);
ret = precursor;
}
else
node = null; // normal recursion
}
else
node = null; // normal recursion
}
}
}
}
while (null != node);
finishTag (ret, lexer);
return (ret);
}
/**
* Add a child to the given tag.
* @param parent The parent tag.
* @param child The child node.
*/
protected void addChild (Tag parent, Node child)
{
if (null == parent.getChildren ())
parent.setChildren (new NodeList ());
child.setParent (parent);
parent.getChildren ().add (child);
}
/**
* Finish off a tag.
* Perhap add a virtual end tag.
* Set the end tag parent as this tag.
* Perform the semantic acton.
* @param tag The tag to finish off.
* @param lexer A lexer positioned at the end of the tag.
*/
protected void finishTag (Tag tag, Lexer lexer)
throws
ParserException
{
if (null == tag.getEndTag ())
tag.setEndTag (createVirtualEndTag (tag, lexer, lexer.getPage (), lexer.getCursor ().getPosition ()));
tag.getEndTag ().setParent (tag);
tag.doSemanticAction ();
}
/**
* Creates an end tag with the same name as the given tag.
* @param tag The tag to end.
* @param lexer The object containg the node factory.
* @param page The page the tag is on (virtually).
* @param position The offset into the page at which the tag is to
* be anchored.
* @return An end tag with the name '"/" + tag.getTagName()' and a start
* and end position at the given position. The fact these positions are
* equal may be used to distinguish it as a virtual tag later on.
*/
protected Tag createVirtualEndTag (Tag tag, Lexer lexer, Page page, int position)
throws
ParserException
{
Tag ret;
String name;
Vector attributes;
name = "/" + tag.getRawTagName ();
attributes = new Vector ();
attributes.addElement (new Attribute (name, (String)null));
ret = lexer.getNodeFactory ().createTagNode (
page, position, position, attributes);
return (ret);
}
/**
* Determine if the current tag should be terminated by the given tag.
* Examines the 'enders' or 'end tag enders' lists of the current tag
* for a match with the given tag. Which list is chosen depends on whether
* tag is an end tag ('end tag enders') or not ('enders').
* @param current The tag that might need to be ended.
* @param tag The candidate tag that might end the current one.
* @return <code>true</code> if the name of the given tag is a member of
* the appropriate list.
*/
public final boolean isTagToBeEndedFor (Tag current, Tag tag)
{
String name;
String[] ends;
boolean ret;
ret = false;
name = tag.getTagName ();
if (tag.isEndTag ())
ends = current.getEndTagEnders ();
else
ends = current.getEnders ();
for (int i = 0; i < ends.length; i++)
if (name.equalsIgnoreCase (ends[i]))
{
ret = true;
break;
}
return (ret);
}
}