// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/TagNode.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/10 23:20:44 $
// $Revision: 1.6 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//

package org.htmlparser.nodes;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.scanners.Scanner;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SpecialHashtable;
import org.htmlparser.visitors.NodeVisitor;

/**
 * TagNode represents a generic tag.
 * If no scanner is registered for a given tag name, this is what you get.
 * This is also the base class for all tags created by the parser.
 */
public class TagNode
    extends
        AbstractNode
    implements
        Tag
{
    /**
     * An empty set of tag names.
     */
    private final static String[] NONE = new String[0];

    /**
     * The scanner for this tag.
     */
    private Scanner mScanner;

    /**
     * The default scanner for non-composite tags.
     */
    protected final static Scanner mDefaultScanner = new TagScanner ();

    /**
     * The tag attributes.
     * Objects of type {@link Attribute}.
     * The first element is the tag name, subsequent elements being either
     * whitespace or real attributes.
     */
    protected Vector mAttributes;

    /**
     * Set of tags that breaks the flow.
     * Keys are upper case tag names, every value is <code>Boolean.TRUE</code>.
     */
    protected static Hashtable breakTags;
    static
    {
        String[] names =
        {
            "BLOCKQUOTE", "BODY", "BR", "CENTER", "DD", "DIR", "DIV", "DL",
            "DT", "FORM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR",
            "HTML", "ISINDEX", "LI", "MENU", "NOFRAMES", "OL", "P", "PRE",
            "TD", "TH", "TITLE", "UL",
        };

        breakTags = new Hashtable (30);
        for (int i = 0; i < names.length; i++)
            breakTags.put (names[i], Boolean.TRUE);
    }

    /**
     * Create an empty tag.
     */
    public TagNode ()
    {
        this (null, -1, -1, new Vector ());
    }

    /**
     * Create a tag with the location and attributes provided.
     * If the attribute list is <code>null</code> or empty, a tag name is
     * installed: the first name from {@link #getIds getIds()} if there is
     * one, otherwise the empty string.
     * @param page The page this tag was read from.
     * @param start The starting offset of this node within the page.
     * @param end The ending offset of this node within the page.
     * @param attributes The list of attributes that were parsed in this tag.
     * @see Attribute
     */
    public TagNode (Page page, int start, int end, Vector attributes)
    {
        super (page, start, end);

        mScanner = mDefaultScanner;
        mAttributes = attributes;
        if ((null == mAttributes) || (0 == mAttributes.size ()))
        {
            String[] names;

            names = getIds ();
            if ((null != names) && (0 != names.length))
                setTagName (names[0]);
            else
                setTagName (""); // make sure it's not null
        }
    }

    /**
     * Create a tag like the one provided.
     * The new tag uses the same attribute list object as the given tag.
     * @param tag The tag to emulate.
     * @param scanner The scanner for this tag.
     */
    public TagNode (TagNode tag, TagScanner scanner)
    {
        this (tag.getPage (), tag.getTagBegin (), tag.getTagEnd (), tag.getAttributesEx ());
        setThisScanner (scanner);
    }

    /**
     * Returns the value of an attribute.
     * Asking for SpecialHashtable.TAGNAME returns the raw tag name, or
     * <code>null</code> if the tag has no attributes at all.
     * @param name Name of attribute, case insensitive.
     * @return The value associated with the attribute or <code>null</code>
     * if it does not exist, or has no value.
     */
    public String getAttribute (String name)
    {
        Attribute attribute;
        String ret;

        ret = null;

        if (name.equalsIgnoreCase (SpecialHashtable.TAGNAME))
            // getRawTagName() guards against an empty attribute list
            ret = getRawTagName ();
        else
        {
            attribute = getAttributeEx (name);
            if (null != attribute)
                ret = attribute.getValue ();
        }

        return (ret);
    }

    /**
     * Set attribute with given key, value pair.
     * Figures out a quote character to use if necessary.
     * Quoting is only applied when the value contains whitespace. Double
     * quotes are preferred, then single quotes. If the value contains both,
     * its double quotes are replaced by the <code>&amp;quot;</code> character
     * reference and it is wrapped in double quotes.
     * @param key The name of the attribute.
     * @param value The value of the attribute.
     */
    public void setAttribute (String key, String value)
    {
        char ch;
        boolean needed;
        boolean singleq;
        boolean doubleq;
        String ref;
        StringBuffer buffer;
        char quote;
        Attribute attribute;

        // first determine if there's whitespace in the value
        // and while we're at it find a suitable quote character
        needed = false;
        singleq = true;
        doubleq = true;
        if (null != value)
            for (int i = 0; i < value.length (); i++)
            {
                ch = value.charAt (i);
                if (Character.isWhitespace (ch))
                    needed = true;
                else if ('\'' == ch)
                    singleq = false;
                else if ('"' == ch)
                    doubleq = false;
            }

        // now apply quoting
        if (needed)
        {
            if (doubleq)
                quote = '"';
            else if (singleq)
                quote = '\'';
            else
            {
                // uh-oh, we need to convert some quotes into character references
                // convert all double quotes into &quot;
                quote = '"';
                ref = "&quot;"; // Translate.encode (quote);
                // JDK 1.4: value = value.replaceAll ("\"", ref);
                buffer = new StringBuffer (value.length () * 5);
                for (int i = 0; i < value.length (); i++)
                {
                    ch = value.charAt (i);
                    if (quote == ch)
                        buffer.append (ref);
                    else
                        buffer.append (ch);
                }
                value = buffer.toString ();
            }
        }
        else
            quote = 0;

        attribute = getAttributeEx (key);
        if (null != attribute)
        {
            // see if we can splice it in rather than replace it
            attribute.setValue (value);
            if (0 != quote)
                attribute.setQuote (quote);
        }
        else
            setAttribute (key, value, quote);
    }

    /**
     * Remove the attribute with the given key, if it exists.
     * @param key The name of the attribute.
     */
    public void removeAttribute (String key)
    {
        Attribute attribute;

        attribute = getAttributeEx (key);
        if (null != attribute)
            getAttributesEx ().remove (attribute);
    }

    /**
     * Set attribute with given key, value pair where the value is quoted by quote.
     * @param key The name of the attribute.
     * @param value The value of the attribute.
     * @param quote The quote character to be used around value.
     * If zero, it is an unquoted value.
     */
    public void setAttribute (String key, String value, char quote)
    {
        setAttribute (new Attribute (key, value, quote));
    }

    /**
     * Returns the attribute with the given name.
     * The search covers every element of the attribute list, including the
     * zeroth (tag name) element, and stops at the first match.
     * @param name Name of attribute, case insensitive.
     * @return The attribute or <code>null</code> if it does not exist.
     */
    public Attribute getAttributeEx (String name)
    {
        Vector attributes;
        int size;
        Attribute attribute;
        String string;
        Attribute ret;

        ret = null;

        attributes = getAttributesEx ();
        if (null != attributes)
        {
            size = attributes.size ();
            for (int i = 0; (i < size) && (null == ret); i++)
            {
                attribute = (Attribute)attributes.elementAt (i);
                string = attribute.getName ();
                if ((null != string) && name.equalsIgnoreCase (string))
                    ret = attribute;
            }
        }

        return (ret);
    }

    /**
     * Set an attribute.
     * @param attribute The attribute to set.
     * @see #setAttribute(Attribute)
     */
    public void setAttributeEx (Attribute attribute)
    {
        setAttribute (attribute);
    }

    /**
     * Set an attribute.
     * This replaces an attribute of the same name (every occurrence after
     * the zeroth element, compared case insensitively). If there is none,
     * the attribute is appended, preceded by a single space attribute when
     * the list does not already end in whitespace.
     * To set the zeroth attribute (the tag name), use setTagName().
     * @param attribute The attribute to set.
     */
    public void setAttribute (Attribute attribute)
    {
        boolean replaced;
        Vector attributes;
        int length;
        String name;
        Attribute test;
        String test_name;

        replaced = false;
        attributes = getAttributesEx ();
        length = attributes.size ();
        if (0 < length)
        {
            name = attribute.getName ();
            // start at 1, the zeroth element is the tag name
            for (int i = 1; i < attributes.size (); i++)
            {
                test = (Attribute)attributes.elementAt (i);
                test_name = test.getName ();
                if ((null != test_name) && test_name.equalsIgnoreCase (name))
                {
                    attributes.setElementAt (attribute, i);
                    replaced = true;
                }
            }
        }
        if (!replaced)
        {
            // add whitespace between attributes
            if ((0 != length) && !((Attribute)attributes.elementAt (length - 1)).isWhitespace ())
                attributes.addElement (new Attribute (" "));
            attributes.addElement (attribute);
        }
    }

    /**
     * Gets the attributes in the tag.
     * @return Returns the list of {@link Attribute Attributes} in the tag.
     * The first element is the tag name, subsequent elements being either
     * whitespace or real attributes. This is the tag's own list, not a copy.
     */
    public Vector getAttributesEx ()
    {
        return (mAttributes);
    }

    /**
     * Gets the attributes in the tag.
     * This is not the preferred method to get attributes, see {@link
     * #getAttributesEx getAttributesEx} which returns a list of {@link
     * Attribute} objects, which offer more information than the simple
     * <code>String</code> objects available from this <code>Hashtable</code>.
     * @return Returns a list of name/value pairs representing the attributes.
     * These are not in order, the keys (names) are converted to uppercase and the values
     * are not quoted, even if they need to be. The table <em>will</em> return
     * <code>null</code> if there was no value for an attribute (no equals
     * sign or nothing to the right of the equals sign). A special entry with
     * a key of SpecialHashtable.TAGNAME ("$&lt;TAGNAME&gt;$") holds the tag name.
     * The conversion to uppercase is performed with an ENGLISH locale.
     */
    public Hashtable getAttributes ()
    {
        Vector attributes;
        Attribute attribute;
        String value;
        Hashtable ret;

        ret = new SpecialHashtable ();
        attributes = getAttributesEx ();
        if (0 < attributes.size ())
        {
            // special handling for the node name
            attribute = (Attribute)attributes.elementAt (0);
            ret.put (SpecialHashtable.TAGNAME, attribute.getName ().toUpperCase (Locale.ENGLISH));
            // the rest
            for (int i = 1; i < attributes.size (); i++)
            {
                attribute = (Attribute)attributes.elementAt (i);
                if (!attribute.isWhitespace ())
                {
                    value = attribute.getValue ();
                    // a Hashtable cannot hold null, so use the marker values
                    if (attribute.isEmpty ())
                        value = SpecialHashtable.NOTHING;
                    if (null == value)
                        value = SpecialHashtable.NULLVALUE;
                    ret.put (attribute.getName ().toUpperCase (Locale.ENGLISH), value);
                }
            }
        }
        else
            ret.put (SpecialHashtable.TAGNAME, "");

        return (ret);
    }

    /**
     * Return the name of this tag.
     * <p>
     * <em>
     * Note: This value is converted to uppercase and does not
     * begin with "/" if it is an end tag. Nor does it end with
     * a slash in the case of an XML type tag.
     * To get at the original text of the tag name use
     * {@link #getRawTagName getRawTagName()}.
     * The conversion to uppercase is performed with an ENGLISH locale.
     * </em>
     * @return The tag name, or <code>null</code> if the raw tag name is
     * <code>null</code>.
     */
    public String getTagName ()
    {
        String ret;

        ret = getRawTagName ();
        if (null != ret)
        {
            ret = ret.toUpperCase (Locale.ENGLISH);
            if (ret.startsWith ("/"))
                ret = ret.substring (1);
            if (ret.endsWith ("/"))
                ret = ret.substring (0, ret.length () - 1);
        }

        return (ret);
    }

    /**
     * Return the name of this tag.
     * @return The name of the zeroth attribute, or <code>null</code> if the
     * attribute list is empty.
     */
    public String getRawTagName ()
    {
        Vector attributes;
        String ret;

        ret = null;

        attributes = getAttributesEx ();
        if (0 != attributes.size ())
            ret = ((Attribute)attributes.elementAt (0)).getName ();

        return (ret);
    }

    /**
     * Set the name of this tag.
     * This creates or replaces the first attribute of the tag (the
     * zeroth element of the attribute vector).
     * An existing zeroth element is replaced only when it has neither a
     * value nor a quote; otherwise the name is inserted in front of it.
     * @param name The tag name.
     */
    public void setTagName (String name)
    {
        Attribute attribute;
        Vector attributes;
        Attribute zeroth;

        attribute = new Attribute (name, null, (char)0);
        attributes = getAttributesEx ();
        if (null == attributes)
        {
            attributes = new Vector ();
            setAttributesEx (attributes);
        }
        if (0 == attributes.size ())
            // nothing added yet
            attributes.addElement (attribute);
        else
        {
            zeroth = (Attribute)attributes.elementAt (0);
            // check for attribute that looks like a name
            if ((null == zeroth.getValue ()) && (0 == zeroth.getQuote ()))
                attributes.setElementAt (attribute, 0);
            else
                attributes.insertElementAt (attribute, 0);
        }
    }

    /**
     * Return the text contained in this tag.
     * This is the result of {@link #toHtml toHtml()} with its first and
     * last characters removed.
     * @return The complete contents of the tag (within the angle brackets).
     */
    public String getText ()
    {
        String ret;

        ret = toHtml ();
        ret = ret.substring (1, ret.length () - 1);

        return (ret);
    }

    /**
     * Sets the attributes.
     * A special entry with a key of SpecialHashtable.TAGNAME ("$&lt;TAGNAME&gt;$")
     * sets the tag name.
     * A value wrapped in matching single or double quotes has the quotes
     * stripped and recorded as the attribute's quote character. A
     * <code>null</code> value (which the table returned by
     * {@link #getAttributes getAttributes()} yields for valueless
     * attributes) produces an unquoted attribute without a value.
     * @param attributes The attribute collection to set.
     */
    public void setAttributes (Hashtable attributes)
    {
        Vector att;
        String key;
        String value;
        char quote;
        Attribute attribute;

        att = new Vector ();
        for (Enumeration e = attributes.keys (); e.hasMoreElements (); )
        {
            key = (String)e.nextElement ();
            value = (String)attributes.get (key);
            quote = (char)0;
            // guard against null: the table may report no value for a key
            if ((null != value) && (2 <= value.length ()))
            {
                if (value.startsWith ("'") && value.endsWith ("'"))
                    quote = '\'';
                else if (value.startsWith ("\"") && value.endsWith ("\""))
                    quote = '"';
                if (0 != quote)
                    value = value.substring (1, value.length () - 1);
            }
            if (key.equals (SpecialHashtable.TAGNAME))
            {
                // the tag name always goes first
                attribute = new Attribute (value, null, quote);
                att.insertElementAt (attribute, 0);
            }
            else
            {
                // add whitespace between attributes
                attribute = new Attribute (" ");
                att.addElement (attribute);
                attribute = new Attribute (key, value, quote);
                att.addElement (attribute);
            }
        }
        this.mAttributes = att;
    }

    /**
     * Sets the attributes.
     * The given list is stored as is (it is not copied). As with
     * {@link #getAttributesEx getAttributesEx()}, it holds {@link Attribute}
     * objects with the first element being the tag name.
     * @param attribs The attribute collection to set.
     */
    public void setAttributesEx (Vector attribs)
    {
        mAttributes = attribs;
    }

    /**
     * Sets the nodeBegin.
     * @param tagBegin The nodeBegin to set
     */
    public void setTagBegin (int tagBegin)
    {
        nodeBegin = tagBegin;
    }

    /**
     * Gets the nodeBegin.
     * @return The nodeBegin value.
     */
    public int getTagBegin ()
    {
        return (nodeBegin);
    }

    /**
     * Sets the nodeEnd.
     * @param tagEnd The nodeEnd to set
     */
    public void setTagEnd (int tagEnd)
    {
        nodeEnd = tagEnd;
    }

    /**
     * Gets the nodeEnd.
     * @return The nodeEnd value.
     */
    public int getTagEnd ()
    {
        return (nodeEnd);
    }

    /**
     * Parses the given text to create the tag contents.
     * The page, start and end positions and attribute list of this tag are
     * replaced by those of the first node lexed from the text.
     * @param text A string of the form &lt;TAGNAME xx="yy"&gt;.
     * @exception IllegalArgumentException If the text cannot be lexed, or
     * the first node lexed from it is not a tag.
     */
    public void setText (String text)
    {
        Lexer lexer;
        Object node;
        TagNode output;

        lexer = new Lexer (text);
        try
        {
            node = lexer.nextNode ();
        }
        catch (ParserException pe)
        {
            throw new IllegalArgumentException (pe.getMessage ());
        }
        // reject empty input and non-tag nodes explicitly rather than
        // failing with a NullPointerException or ClassCastException
        if (!(node instanceof TagNode))
            throw new IllegalArgumentException ("text does not parse as a tag: " + text);
        output = (TagNode)node;
        mPage = output.getPage ();
        nodeBegin = output.getStartPosition ();
        nodeEnd = output.getEndPosition ();
        mAttributes = output.getAttributesEx ();
    }

    /**
     * Get the plain text from this node.
     * @return An empty string (tag contents do not display in a browser).
     * If you want this tags HTML equivalent, use {@link #toHtml toHtml()}.
     */
    public String toPlainTextString ()
    {
        return ("");
    }

    /**
     * Render the tag as HTML.
     * A call to a tag's <code>toHtml()</code> method will render it in HTML.
     * The result is "&lt;", every attribute in list order, then "&gt;".
     * @return The tag as an HTML fragment.
     * @see org.htmlparser.Node#toHtml()
     */
    public String toHtml ()
    {
        int length;
        int size;
        Vector attributes;
        Attribute attribute;
        StringBuffer ret;

        // first pass: size the buffer, 2 is for the angle brackets
        length = 2;
        attributes = getAttributesEx ();
        size = attributes.size ();
        for (int i = 0; i < size; i++)
        {
            attribute = (Attribute)attributes.elementAt (i);
            length += attribute.getLength ();
        }
        // second pass: render
        ret = new StringBuffer (length);
        ret.append ("<");
        for (int i = 0; i < size; i++)
        {
            attribute = (Attribute)attributes.elementAt (i);
            attribute.toString (ret);
        }
        ret.append (">");

        return (ret.toString ());
    }

    /**
     * Print the contents of the tag.
     * The result has the form <code>Tag (start,end): text</code> (or
     * <code>End (...)</code> for an end tag). If it would exceed 80
     * characters, the text is cut short and "..." is appended.
     * @return A string describing the tag. For text that looks like HTML use #toHtml().
     */
    public String toString ()
    {
        String text;
        String type;
        Cursor start;
        Cursor end;
        StringBuffer ret;

        text = getText ();
        ret = new StringBuffer (20 + text.length ());
        if (isEndTag ())
            type = "End";
        else
            type = "Tag";
        start = new Cursor (getPage (), getStartPosition ());
        end = new Cursor (getPage (), getEndPosition ());
        ret.append (type);
        ret.append (" (");
        ret.append (start);
        ret.append (",");
        ret.append (end);
        ret.append ("): ");
        if (80 < ret.length () + text.length ())
        {
            // clamp at zero so a long position prefix cannot make the
            // substring end index negative
            text = text.substring (0, Math.max (0, 77 - ret.length ()));
            ret.append (text);
            ret.append ("...");
        }
        else
            ret.append (text);

        return (ret.toString ());
    }

    /**
     * Determines if the given tag breaks the flow of text.
     * @return <code>true</code> if following text would start on a new line,
     * <code>false</code> otherwise.
     */
    public boolean breaksFlow ()
    {
        return (breakTags.containsKey (getTagName ()));
    }

    /**
     * Returns table of attributes in the tag
     * @return Hashtable
     * @deprecated This method is deprecated. Use getAttributes() instead.
     */
    public Hashtable getParsed ()
    {
        return getAttributes ();
    }

    /**
     * Default tag visiting code.
     * Based on <code>isEndTag()</code>, calls either <code>visitTag()</code> or
     * <code>visitEndTag()</code>.
     * @param visitor The visitor that is visiting this node.
     */
    public void accept (NodeVisitor visitor)
    {
        if (isEndTag ())
            visitor.visitEndTag (this);
        else
            visitor.visitTag (this);
    }

    /**
     * Is this an empty xml tag of the form &lt;tag/&gt;.
     * @return true if the last character of the last attribute is a '/'.
     * An empty attribute list, or a last attribute whose name is
     * <code>null</code> or empty, yields <code>false</code>.
     */
    public boolean isEmptyXmlTag ()
    {
        Vector attributes;
        int size;
        String name;
        boolean ret;

        ret = false;

        attributes = getAttributesEx ();
        size = attributes.size ();
        if (0 < size)
        {
            name = ((Attribute)attributes.elementAt (size - 1)).getName ();
            // endsWith() is safe for an empty name, unlike charAt (length - 1)
            if (null != name)
                ret = name.endsWith ("/");
        }

        return (ret);
    }

    /**
     * Set this tag to be an empty xml node, or not.
     * Adds or removes an ending slash on the tag.
     * @param emptyXmlTag If true, ensures there is an ending slash in the node,
     * i.e. &lt;tag/&gt;, otherwise removes it.
     */
    public void setEmptyXmlTag (boolean emptyXmlTag)
    {
        Vector attributes;
        int size;
        Attribute attribute;
        String name;

        attributes = getAttributesEx ();
        size = attributes.size ();
        if (0 == size)
        {
            // nothing there, add if requested
            if (emptyXmlTag)
                attributes.addElement (new Attribute ("/", null));
            return;
        }

        attribute = (Attribute)attributes.elementAt (size - 1);
        name = attribute.getName ();
        if (null == name)
        {
            // ends with whitespace, add if requested
            if (emptyXmlTag)
                attributes.addElement (new Attribute ("/", null));
        }
        else if ((null == attribute.getValue ()) && name.endsWith ("/"))
        {
            // already exists, remove if requested
            if (!emptyXmlTag)
            {
                attributes.removeElementAt (size - 1);
                if (1 < name.length ())
                {
                    // this shouldn't happen, but covers the case
                    // where no whitespace separates the slash
                    // from the previous attribute
                    name = name.substring (0, name.length () - 1);
                    attributes.addElement (new Attribute (name, null));
                }
            }
        }
        else
        {
            // ends with a (possibly valued) attribute,
            // add whitespace + slash if requested
            if (emptyXmlTag)
            {
                attributes.addElement (new Attribute (" "));
                attributes.addElement (new Attribute ("/", null));
            }
        }
    }

    /**
     * Predicate to determine if this tag is an end tag (i.e. &lt;/HTML&gt;).
     * @return <code>true</code> if the raw tag name starts with a slash.
     */
    public boolean isEndTag ()
    {
        String raw;

        raw = getRawTagName ();

        return ((null != raw) && raw.startsWith ("/"));
    }

    /**
     * Get the line number where this tag starts.
     * @return The (zero based) line number in the page where this tag starts.
     */
    public int getStartingLineNumber ()
    {
        return (getPage ().row (getStartPosition ()));
    }

    /**
     * Get the line number where this tag ends.
     * @return The (zero based) line number in the page where this tag ends.
     */
    public int getEndingLineNumber ()
    {
        return (getPage ().row (getEndPosition ()));
    }

    /**
     * Return the set of names handled by this tag.
     * Since this is a generic tag, it has no ids.
     * @return The names to be matched that create tags of this type.
     */
    public String[] getIds ()
    {
        return (NONE);
    }

    /**
     * Return the set of tag names that cause this tag to finish.
     * These are the normal (non end tags) that if encountered while
     * scanning (a composite tag) will cause the generation of a virtual
     * tag.
     * Since this is a non-composite tag, the default is no enders.
     * @return The names of following tags that stop further scanning.
     */
    public String[] getEnders ()
    {
        return (NONE);
    }

    /**
     * Return the set of end tag names that cause this tag to finish.
     * These are the end tags that if encountered while
     * scanning (a composite tag) will cause the generation of a virtual
     * tag.
     * Since this is a non-composite tag, it has no end tag enders.
     * @return The names of following end tags that stop further scanning.
     */
    public String[] getEndTagEnders ()
    {
        return (NONE);
    }

    /**
     * Return the scanner associated with this tag.
     * @return The scanner associated with this tag.
     */
    public Scanner getThisScanner ()
    {
        return (mScanner);
    }

    /**
     * Set the scanner associated with this tag.
     * @param scanner The scanner for this tag.
     */
    public void setThisScanner (Scanner scanner)
    {
        mScanner = scanner;
    }

    /**
     * Get the end tag for this (composite) tag.
     * For a non-composite tag this always returns <code>null</code>.
     * @return The tag that terminates this composite tag, i.e. &lt;/HTML&gt;.
     */
    public Tag getEndTag ()
    {
        return (null);
    }

    /**
     * Set the end tag for this (composite) tag.
     * For a non-composite tag this is a no-op.
     * @param end The tag that terminates this composite tag, i.e. &lt;/HTML&gt;.
     */
    public void setEndTag (Tag end)
    {
    }
}