// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Somik Raha // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/AbstractNode.java,v $ // $Author: derrickoswald $ // $Date: 2005/10/26 22:01:23 $ // $Revision: 1.5 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.nodes; import java.io.Serializable; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.lexer.Page; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.NodeVisitor; /** * The concrete base class for all types of nodes (tags, text remarks). * This class provides basic functionality to hold the {@link Page}, the * starting and ending position in the page, the parent and the list of * {@link NodeList children}. */ public abstract class AbstractNode implements Node, Serializable { /** * The page this node came from. */ protected Page mPage; /** * The beginning position of the tag in the line */ protected int nodeBegin; /** * The ending position of the tag in the line */ protected int nodeEnd; /** * The parent of this node. */ protected Node parent; /** * The children of this node. */ protected NodeList children; /** * Create an abstract node with the page positions given. * Remember the page and start & end cursor positions. * @param page The page this tag was read from. * @param start The starting offset of this node within the page. * @param end The ending offset of this node within the page. */ public AbstractNode (Page page, int start, int end) { mPage = page; nodeBegin = start; nodeEnd = end; parent = null; children = null; } /** * Clone this object. * Exposes java.lang.Object clone as a public method. * @return A clone of this object. * @exception CloneNotSupportedException This shouldn't be thrown since * the {@link Node} interface extends Cloneable. */ public Object clone() throws CloneNotSupportedException { return (super.clone ()); } /** * Returns a string representation of the node. * It allows a simple string transformation * of a web page, regardless of node type.<br> * Typical application code (for extracting only the text from a web page) * would then be simplified to:<br> * <pre> * Node node; * for (Enumeration e = parser.elements (); e.hasMoreElements (); ) * { * node = (Node)e.nextElement(); * System.out.println (node.toPlainTextString ()); * // or do whatever processing you wish with the plain text string * } * </pre> * @return The 'browser' content of this node. */ public abstract String toPlainTextString (); /** * Return the HTML that generated this node. * This method will make it easier when using html parser to reproduce html * pages (with or without modifications). * Applications reproducing html can use this method on nodes which are to * be used or transferred as they were recieved, with the original html. * @return The HTML code for this node. */ public abstract String toHtml (); /** * Return a string representation of the node. * Subclasses must define this method, and this is typically to be used in the manner<br> * <pre>System.out.println(node)</pre> * @return A textual representation of the node suitable for debugging */ public abstract String toString (); /** * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node * satisfies the filtering criteria.<P> * * This mechanism allows powerful filtering code to be written very easily, * without bothering about collection of embedded tags separately. * e.g. when we try to get all the links on a page, it is not possible to * get it at the top-level, as many tags (like form tags), can contain * links embedded in them. We could get the links out by checking if the * current node is a {@link org.htmlparser.tags.CompositeTag}, and going through its children. * So this method provides a convenient way to do this.<P> * * Using collectInto(), programs get a lot shorter. Now, the code to * extract all links from a page would look like: * <pre> * NodeList collectionList = new NodeList(); * NodeFilter filter = new TagNameFilter ("A"); * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) * e.nextNode().collectInto(collectionList, filter); * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how * deep the links are embedded.<P> * * Another way to accomplish the same objective is: * <pre> * NodeList collectionList = new NodeList(); * NodeFilter filter = new TagClassFilter (LinkTag.class); * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) * e.nextNode().collectInto(collectionList, filter); * </pre> * This is slightly less specific because the LinkTag class may be * registered for more than one node name, e.g. <LINK> tags too. * @param list The node list to collect acceptable nodes into. * @param filter The filter to determine which nodes are retained. */ public void collectInto (NodeList list, NodeFilter filter) { if (filter.accept (this)) list.add (this); } /** * Get the page this node came from. * @return The page that supplied this node. */ public Page getPage () { return (mPage); } /** * Set the page this node came from. * @param page The page that supplied this node. */ public void setPage (Page page) { mPage = page; } /** * Gets the starting position of the node. * @return The start position. */ public int getStartPosition () { return (nodeBegin); } /** * Sets the starting position of the node. * @param position The new start position. */ public void setStartPosition (int position) { nodeBegin = position; } /** * Gets the ending position of the node. * @return The end position. */ public int getEndPosition () { return (nodeEnd); } /** * Sets the ending position of the node. * @param position The new end position. */ public void setEndPosition (int position) { nodeEnd = position; } /** * Visit this node. * @param visitor The visitor that is visiting this node. */ public abstract void accept (NodeVisitor visitor); /** * Get the parent of this node. * This will always return null when parsing without scanners, * i.e. if semantic parsing was not performed. * The object returned from this method can be safely cast to a <code>CompositeTag</code>. * @return The parent of this node, if it's been set, <code>null</code> otherwise. */ public Node getParent () { return (parent); } /** * Sets the parent of this node. * @param node The node that contains this node. Must be a <code>CompositeTag</code>. */ public void setParent (Node node) { parent = node; } /** * Get the children of this node. * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise. */ public NodeList getChildren () { return (children); } /** * Set the children of this node. * @param children The new list of children this node contains. */ public void setChildren (NodeList children) { this.children = children; } /** * Get the first child of this node. * @return The first child in the list of children contained by this node, * <code>null</code> otherwise. */ public Node getFirstChild () { if (children == null) return null; if (children.size() == 0) return null; return children.elementAt(0); } /** * Get the last child of this node. * @return The last child in the list of children contained by this node, * <code>null</code> otherwise. */ public Node getLastChild () { if (children == null) return null; int numChildren = children.size(); if (numChildren == 0) return null; return children.elementAt(numChildren - 1); } /** * Get the previous sibling to this node. * @return The previous sibling to this node if one exists, * <code>null</code> otherwise. */ public Node getPreviousSibling () { Node parentNode = this.getParent(); if (parentNode == null)//root node return null; NodeList siblings = parentNode.getChildren(); if (siblings == null)//this should actually be an error return null; int numSiblings = siblings.size(); if (numSiblings < 2)//need at least one other node to have a chance of having any siblings return null; int positionInParent = -1; for (int i = 0; i < numSiblings; i++) { if (siblings.elementAt(i) == this) { positionInParent = i; break; } } if (positionInParent < 1)//no previous siblings return null; return siblings.elementAt(positionInParent - 1); } /** * Get the next sibling to this node. * @return The next sibling to this node if one exists, * <code>null</code> otherwise. */ public Node getNextSibling () { Node parentNode = this.getParent(); if (parentNode == null)//root node return null; NodeList siblings = parentNode.getChildren(); if (siblings == null)//this should actually be an error return null; int numSiblings = siblings.size(); if (numSiblings < 2)//need at least one other node to have a chance of having any siblings return null; int positionInParent = -1; for (int i = 0; i < numSiblings; i++) { if (siblings.elementAt(i) == this) { positionInParent = i; break; } } if (positionInParent == -1)//this should actually be an error return null; if (positionInParent == (numSiblings - 1))//no next sibling return null; return siblings.elementAt(positionInParent + 1); } /** * Returns the text of the node. * @return The text of this node. The default is <code>null</code>. */ public String getText () { return null; } /** * Sets the string contents of the node. * @param text The new text for the node. */ public void setText(String text) { } /** * Perform the meaning of this tag. * The default action is to do nothing. * @exception ParserException <em>Not used.</em> Provides for subclasses * that may want to indicate an exceptional condition. */ public void doSemanticAction () throws ParserException { } }