// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Attribute.java,v $
// $Author: derrickoswald $
// $Date: 2005/11/15 02:09:10 $
// $Revision: 1.8 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//

package org.htmlparser;

import java.io.Serializable;

/**
 * An attribute within a tag.
 * Holds the name, assignment string, value and quote character.
 * <p>
 * This class was made deliberately simple. Except for
 * {@link #setRawValue RawValue}, the properties are completely orthogonal,
 * that is: each property is independent of the others. This means you have
 * enough rope here to hang yourself, and it's very easy to create
 * malformed HTML. Where it's obvious, warnings and notes have been provided
 * in the setters javadocs, but it is up to you -- the programmer --
 * to ensure that the contents of the four fields will yield valid HTML
 * (if that's what you want).
 * <p>
 * Be especially mindful of quotes and assignment strings. These are handled
 * by the constructors where it's obvious, but in general, you need to set
 * them explicitly when building an attribute. For example to construct
 * the attribute <b><code>label="A multi word value."</code></b> you could use:
 * <pre>
 *     attribute = new Attribute ();
 *     attribute.setName ("label");
 *     attribute.setAssignment ("=");
 *     attribute.setValue ("A multi word value.");
 *     attribute.setQuote ('"');
 * </pre>
 * or
 * <pre>
 *     attribute = new Attribute ();
 *     attribute.setName ("label");
 *     attribute.setAssignment ("=");
 *     attribute.setRawValue ("A multi word value.");
 * </pre>
 * or
 * <pre>
 *     attribute = new Attribute ("label", "A multi word value.");
 * </pre>
 * Note that the assignment value and quoting need to be set separately when
 * building the attribute from scratch using the properties.
 * <p>
 * <table width="100.0%" align="Center" border="1">
 *   <caption>Valid States for Attributes.</caption>
 *   <tr>
 *     <th align="Center">Description</th>
 *     <th align="Center">toString()</th>
 *     <th align="Center">Name</th>
 *     <th align="Center">Assignment</th>
 *     <th align="Center">Value</th>
 *     <th align="Center">Quote</th>
 *   </tr>
 *   <tr>
 *     <td align="Center">whitespace attribute</td>
 *     <td align="Center">value</td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center">"value"</td>
 *     <td align="Center"><code>0</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">standalone attribute</td>
 *     <td align="Center">name</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center"><code>0</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">empty attribute</td>
 *     <td align="Center">name=</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center">"="</td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center"><code>0</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">empty single quoted attribute</td>
 *     <td align="Center">name=''</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center">"="</td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center"><code>'</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">empty double quoted attribute</td>
 *     <td align="Center">name=""</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center">"="</td>
 *     <td align="Center"><code>null</code></td>
 *     <td align="Center"><code>"</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">naked attribute</td>
 *     <td align="Center">name=value</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center">"="</td>
 *     <td align="Center">"value"</td>
 *     <td align="Center"><code>0</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">single quoted attribute</td>
 *     <td align="Center">name='value'</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center">"="</td>
 *     <td align="Center">"value"</td>
 *     <td align="Center"><code>'</code></td>
 *   </tr>
 *   <tr>
 *     <td align="Center">double quoted attribute</td>
 *     <td align="Center">name="value"</td>
 *     <td align="Center">"name"</td>
 *     <td align="Center">"="</td>
 *     <td align="Center">"value"</td>
 *     <td align="Center"><code>"</code></td>
 *   </tr>
 * </table>
 * <br>In words:
 * <br>If Name is null, and Assignment is null, and Quote is zero,
 *   it's whitespace and Value has the whitespace text -- value
 * <br>If Name is not null, and both Assignment and Value are null
 *   it's a standalone attribute -- name
 * <br>If Name is not null, and Assignment is an equals sign, and Quote is zero
 *   it's an empty attribute -- name=
 * <br>If Name is not null, and Assignment is an equals sign,
 *   and Value is "" or null, and Quote is '
 *   it's an empty single quoted attribute -- name=''
 * <br>If Name is not null, and Assignment is an equals sign,
 *   and Value is "" or null, and Quote is "
 *   it's an empty double quoted attribute -- name=""
 * <br>If Name is not null, and Assignment is an equals sign,
 *   and Value is something, and Quote is zero
 *   it's a naked attribute -- name=value
 * <br>If Name is not null, and Assignment is an equals sign,
 *   and Value is something, and Quote is '
 *   it's a single quoted attribute -- name='value'
 * <br>If Name is not null, and Assignment is an equals sign,
 *   and Value is something, and Quote is "
 *   it's a double quoted attribute -- name="value"
 * <br>All other states are invalid HTML.
 * <p>
 * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">
 * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
 * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2:<p>
 * <cite>
 * 3.2.2 Attributes<p>
 * Elements may have associated properties, called attributes, which may
 * have values (by default, or set by authors or scripts). Attribute/value
 * pairs appear before the final "&gt;" of an element's start tag. Any number
 * of (legal) attribute value pairs, separated by spaces, may appear in an
 * element's start tag. They may appear in any order.<p>
 * In this example, the id attribute is set for an H1 element:
 * <pre>
 * <code>
 *  &lt;H1 id="section1"&gt;
 *  This is an identified heading thanks to the id attribute
 *  &lt;/H1&gt;
 * </code>
 * </pre>
 * By default, SGML requires that all attribute values be delimited using
 * either double quotation marks (ASCII decimal 34) or single quotation
 * marks (ASCII decimal 39). Single quote marks can be included within the
 * attribute value when the value is delimited by double quote marks, and
 * vice versa. Authors may also use numeric character references to
 * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
 * For doublequotes authors can also use the character entity reference
 * &amp;quot;.<p>
 * In certain cases, authors may specify the value of an attribute without
 * any quotation marks. The attribute value may only contain letters
 * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
 * periods (ASCII decimal 46), underscores (ASCII decimal 95),
 * and colons (ASCII decimal 58). We recommend using quotation marks even
 * when it is possible to eliminate them.<p>
 * Attribute names are always case-insensitive.<p>
 * Attribute values are generally case-insensitive. The definition of each
 * attribute in the reference manual indicates whether its value is
 * case-insensitive.<p>
 * All the attributes defined by this specification are listed in the
 * <a href="http://www.w3.org/TR/html4/index/attributes.html">attribute
 * index</a>.<p>
 * </cite>
 * <p>
 */
public class Attribute
    implements
        Serializable
{
    /**
     * The name of this attribute.
     * The part before the equals sign, or the stand-alone attribute.
     * This will be <code>null</code> if the attribute is whitespace.
     */
    protected String mName;

    /**
     * The assignment string of the attribute.
     * The equals sign.
     * This will be <code>null</code> if the attribute is a
     * stand-alone attribute.
     */
    protected String mAssignment;

    /**
     * The value of the attribute.
     * The part after the equals sign.
     * This will be <code>null</code> if the attribute is an empty or
     * stand-alone attribute.
     */
    protected String mValue;

    /**
     * The quote, if any, surrounding the value of the attribute, if any.
     * This will be zero if there are no quotes around the value.
     */
    protected char mQuote;

    /**
     * Create an attribute with the name, assignment, value and quote given.
     * If the quote value is zero, assigns the value using {@link #setRawValue}
     * which sets the quote character to a proper value if necessary.
     * Otherwise the value and quote are stored exactly as given.
     * @param name The name of this attribute.
     * @param assignment The assignment string of this attribute.
     * @param value The value of this attribute.
     * @param quote The quote around the value of this attribute.
     */
    public Attribute (String name, String assignment, String value, char quote)
    {
        setName (name);
        setAssignment (assignment);
        if (0 == quote)
            setRawValue (value);
        else
        {
            setValue (value);
            setQuote (quote);
        }
    }

    /**
     * Create an attribute with the name, value and quote given.
     * Uses an equals sign as the assignment string if the value is not
     * <code>null</code>, and calls {@link #setRawValue} to get the
     * correct quoting if <code>quote</code> is zero.
     * <em>NOTE:</em> When the value is <code>null</code> the assignment
     * string is set to the empty string (not <code>null</code>), so
     * {@link #isStandAlone} reports <code>false</code> and {@link #isEmpty}
     * reports <code>true</code> for the resulting attribute.
     * @param name The name of this attribute.
     * @param value The value of this attribute.
     * @param quote The quote around the value of this attribute.
     */
    public Attribute (String name, String value, char quote)
    {
        this (name, (null == value ? "" : "="), value, quote);
    }

    /**
     * Create a whitespace attribute with the value given.
     * The name and assignment are set to <code>null</code> and the quote
     * to zero.
     * @param value The value of this attribute.
     * @exception IllegalArgumentException if the value contains other than
     * whitespace. To set a real value use {@link #Attribute(String,String)}.
     * @exception NullPointerException if the value is <code>null</code>.
     */
    public Attribute (String value)
        throws
            IllegalArgumentException
    {
        if (0 != value.trim ().length ())
            throw new IllegalArgumentException ("non whitespace value");
        else
        {
            setName (null);
            setAssignment (null);
            setValue (value);
            setQuote ((char)0);
        }
    }

    /**
     * Create an attribute with the name and value given.
     * Uses an equals sign as the assignment string if the value is not
     * <code>null</code>, and calls {@link #setRawValue} to get the
     * correct quoting.
     * <em>NOTE:</em> When the value is <code>null</code> the assignment
     * string is set to the empty string (not <code>null</code>), so
     * {@link #isStandAlone} reports <code>false</code> and {@link #isEmpty}
     * reports <code>true</code> for the resulting attribute.
     * @param name The name of this attribute.
     * @param value The value of this attribute.
     */
    public Attribute (String name, String value)
    {
        this (name, (null == value ? "" : "="), value, (char)0);
    }

    /**
     * Create an attribute with the name, assignment string and value given.
     * Calls {@link #setRawValue} to get the correct quoting.
     * @param name The name of this attribute.
     * @param assignment The assignment string of this attribute.
     * @param value The value of this attribute.
     */
    public Attribute (String name, String assignment, String value)
    {
        this (name, assignment, value, (char)0);
    }

    /**
     * Create an empty attribute.
     * This will provide "" from the {@link #toString} and
     * {@link #toString(StringBuffer)} methods.
     */
    public Attribute ()
    {
        this (null, null, null, (char)0);
    }

    /**
     * Get the name of this attribute.
     * The part before the equals sign, or the contents of the
     * stand-alone attribute.
     * @return The name, or <code>null</code> if it's just a whitespace
     * 'attribute'.
     * @see #setName
     */
    public String getName ()
    {
        return (mName);
    }

    /**
     * Get the name of this attribute.
     * Nothing is appended if the name is <code>null</code>.
     * @param buffer The buffer to place the name in.
     * @see #getName()
     * @see #setName
     */
    public void getName (StringBuffer buffer)
    {
        if (null != mName)
            buffer.append (mName);
    }

    /**
     * Set the name of this attribute.
     * Set the part before the equals sign, or the contents of the
     * stand-alone attribute.
     * <em>WARNING:</em> Setting this to <code>null</code> can result in
     * malformed HTML if the assignment string is not <code>null</code>.
     * @param name The new name.
     * @see #getName
     * @see #getName(StringBuffer)
     */
    public void setName (String name)
    {
        mName = name;
    }

    /**
     * Get the assignment string of this attribute.
     * This is usually just an equals sign, but in poorly formed attributes it
     * can include whitespace on either or both sides of an equals sign.
     * @return The assignment string.
     * @see #setAssignment
     */
    public String getAssignment ()
    {
        return (mAssignment);
    }

    /**
     * Get the assignment string of this attribute.
     * Nothing is appended if the assignment string is <code>null</code>.
     * @param buffer The buffer to place the assignment string in.
     * @see #getAssignment()
     * @see #setAssignment
     */
    public void getAssignment (StringBuffer buffer)
    {
        if (null != mAssignment)
            buffer.append (mAssignment);
    }

    /**
     * Set the assignment string of this attribute.
     * <em>WARNING:</em> Setting this property to other than an equals sign
     * or <code>null</code> will result in malformed HTML. In the case of a
     * <code>null</code>, the {@link #setValue value} should also be set to
     * <code>null</code>.
     * @param assignment The new assignment string.
     * @see #getAssignment
     * @see #getAssignment(StringBuffer)
     */
    public void setAssignment (String assignment)
    {
        mAssignment = assignment;
    }

    /**
     * Get the value of the attribute.
     * The part after the equals sign, or the text if it's just a whitespace
     * 'attribute'.
     * <em>NOTE:</em> This does not include any quotes that may have enclosed
     * the value when it was read. To get the un-stripped value use
     * {@link #getRawValue}.
     * @return The value, or <code>null</code> if it's a stand-alone or
     * empty attribute, or the text if it's just a whitespace 'attribute'.
     * @see #setValue
     */
    public String getValue ()
    {
        return (mValue);
    }

    /**
     * Get the value of the attribute.
     * Nothing is appended if the value is <code>null</code>.
     * @param buffer The buffer to place the value in.
     * @see #getValue()
     * @see #setValue
     */
    public void getValue (StringBuffer buffer)
    {
        if (null != mValue)
            buffer.append (mValue);
    }

    /**
     * Set the value of the attribute.
     * The part after the equals sign, or the text if it's a whitespace
     * 'attribute'.
     * <em>WARNING:</em> Setting this property to a value that needs to be
     * quoted without also setting the quote character will result in malformed
     * HTML.
     * @param value The new value.
     * @see #getValue
     * @see #getValue(StringBuffer)
     */
    public void setValue (String value)
    {
        mValue = value;
    }

    /**
     * Get the quote, if any, surrounding the value of the attribute, if any.
     * @return Either ' or " if the attribute value was quoted, or zero
     * if there are no quotes around it.
     * @see #setQuote
     */
    public char getQuote ()
    {
        return (mQuote);
    }

    /**
     * Get the quote, if any, surrounding the value of the attribute, if any.
     * Nothing is appended if the quote is zero.
     * @param buffer The buffer to place the quote in.
     * @see #getQuote()
     * @see #setQuote
     */
    public void getQuote (StringBuffer buffer)
    {
        if (0 != mQuote)
            buffer.append (mQuote);
    }

    /**
     * Set the quote surrounding the value of the attribute.
     * <em>WARNING:</em> Setting this property to zero will result in malformed
     * HTML if the {@link #getValue value} needs to be quoted (i.e. contains
     * whitespace).
     * @param quote The new quote value.
     * @see #getQuote
     * @see #getQuote(StringBuffer)
     */
    public void setQuote (char quote)
    {
        mQuote = quote;
    }

    /**
     * Get the raw value of the attribute.
     * The part after the equals sign, or the text if it's just a whitespace
     * 'attribute'. This includes the quotes around the value if any.
     * <em>NOTE:</em> When the value is <code>null</code> this returns
     * <code>null</code> even if a quote character is set, whereas
     * {@link #getRawValue(StringBuffer)} still appends the two quotes.
     * @return The value, or <code>null</code> if it's a stand-alone attribute,
     * or the text if it's just a whitespace 'attribute'.
     * @see #setRawValue
     */
    public String getRawValue ()
    {
        char quote;
        StringBuffer buffer;
        String ret;

        if (isValued ())
        {
            quote = getQuote ();
            if (0 != quote)
            {
                // wrap the value in its quote character on both sides
                buffer = new StringBuffer ();
                buffer.append (quote);
                getValue (buffer);
                buffer.append (quote);
                ret = buffer.toString ();
            }
            else
                ret = getValue ();
        }
        else
            ret = null;

        return (ret);
    }

    /**
     * Get the raw value of the attribute.
     * The part after the equals sign, or the text if it's just a whitespace
     * 'attribute'. This includes the quotes around the value if any.
     * The quotes are appended whenever the quote character is non-zero,
     * even if the value itself is <code>null</code>.
     * @param buffer The string buffer to append the attribute value to.
     * @see #getRawValue()
     * @see #setRawValue
     */
    public void getRawValue (StringBuffer buffer)
    {
        getQuote (buffer);
        getValue (buffer);
        getQuote (buffer);
    }

    /**
     * Set the value of the attribute and the quote character.
     * If the value is <code>null</code> or pure whitespace, assign it 'as is'
     * and reset the quote character. If not, check for leading and trailing
     * double or single quotes, and if found use this as the quote character
     * and the inner contents of <code>value</code> as the real value.
     * Otherwise, examine the string to determine if quotes are needed
     * and an appropriate quote character if so. This may involve changing
     * double quotes within the string to character references.
     * @param value The new value.
     * @see #getRawValue
     * @see #getRawValue(StringBuffer)
     */
    public void setRawValue (String value)
    {
        char ch;
        boolean needed;
        boolean singleq;
        boolean doubleq;
        String ref;
        StringBuffer buffer;
        char quote;

        quote = 0;
        if ((null != value) && (0 != value.trim ().length ()))
        {
            if (value.startsWith ("'") && value.endsWith ("'")
                && (2 <= value.length ()))
            {
                // already single quoted: strip the quotes and remember them
                quote = '\'';
                value = value.substring (1, value.length () - 1);
            }
            else if (value.startsWith ("\"") && value.endsWith ("\"")
                && (2 <= value.length ()))
            {
                // already double quoted: strip the quotes and remember them
                quote = '"';
                value = value.substring (1, value.length () - 1);
            }
            else
            {
                // first determine if there's whitespace in the value
                // and while we're at it find a suitable quote character
                needed = false;
                singleq = true;
                doubleq = true;
                for (int i = 0; i < value.length (); i++)
                {
                    ch = value.charAt (i);
                    if ('\'' == ch)
                    {
                        singleq = false;
                        needed = true;
                    }
                    else if ('"' == ch)
                    {
                        doubleq = false;
                        needed = true;
                    }
                    else if (!('-' == ch) && !('.' == ch) && !('_' == ch)
                       && !(':' == ch) && !Character.isLetterOrDigit (ch))
                    {
                        // anything outside the characters HTML 4.01 allows
                        // in an unquoted value forces quoting
                        needed = true;
                    }
                }

                // now apply quoting
                if (needed)
                {
                    if (doubleq)
                        quote = '"';
                    else if (singleq)
                        quote = '\'';
                    else
                    {
                        // uh-oh, we need to convert some quotes into character
                        // references, so convert all double quotes into &quot;
                        quote = '"';
                        ref = "&quot;"; // Translate.encode (quote);
                        // JDK 1.4: value = value.replaceAll ("\"", ref);
                        // initial capacity is only a guess,
                        // the buffer grows if it turns out to be too small
                        buffer = new StringBuffer (
                                value.length() * (ref.length () - 1));
                        for (int i = 0; i < value.length (); i++)
                        {
                            ch = value.charAt (i);
                            if (quote == ch)
                                buffer.append (ref);
                            else
                                buffer.append (ch);
                        }
                        value = buffer.toString ();
                    }
                }
            }
        }
        setValue (value);
        setQuote (quote);
    }

    /**
     * Predicate to determine if this attribute is whitespace.
     * @return <code>true</code> if this attribute is whitespace,
     * <code>false</code> if it is a real attribute.
     */
    public boolean isWhitespace ()
    {
        return (null == getName ());
    }

    /**
     * Predicate to determine if this attribute has no equals sign (or value).
     * @return <code>true</code> if this attribute is a standalone attribute.
     * <code>false</code> if has an equals sign.
     */
    public boolean isStandAlone ()
    {
        return ((null != getName ()) && (null == getAssignment ()));
    }

    /**
     * Predicate to determine if this attribute has an equals sign but no value.
     * @return <code>true</code> if this attribute is an empty attribute.
     * <code>false</code> if has an equals sign and a value.
     */
    public boolean isEmpty ()
    {
        return ((null != getAssignment ()) && (null == getValue ()));
    }

    /**
     * Predicate to determine if this attribute has a value.
     * @return <code>true</code> if this attribute has a value.
     * <code>false</code> if it is empty or standalone.
     */
    public boolean isValued ()
    {
        return (null != getValue ());
    }

    /**
     * Get the length of the string value of this attribute.
     * This is the sum of the lengths of the name, assignment string and
     * value (each counted only if not <code>null</code>), plus two if the
     * quote character is non-zero.
     * @return The number of characters required to express this attribute.
     */
    public int getLength ()
    {
        String name;
        String assignment;
        String value;
        char quote;
        int ret;

        ret = 0;
        name = getName ();
        if (null != name)
            ret += name.length ();
        assignment = getAssignment ();
        if (null != assignment)
            ret += assignment.length ();
        value = getValue ();
        if (null != value)
            ret += value.length ();
        quote = getQuote ();
        if (0 != quote)
            ret += 2; // one quote on each side of the value

        return (ret);
    }

    /**
     * Get a text representation of this attribute.
     * Suitable for insertion into a tag, the output is one of
     * the forms:
     * <code>
     * <pre>
     * value
     * name
     * name=
     * name=value
     * name='value'
     * name="value"
     * </pre>
     * </code>
     * @return A string that can be used within a tag.
     */
    public String toString ()
    {
        int length;
        StringBuffer ret;

        // get the size to avoid extra StringBuffer allocations
        length = getLength ();
        ret = new StringBuffer (length);
        toString (ret);

        return (ret.toString ());
    }

    /**
     * Get a text representation of this attribute.
     * Appends the name, the assignment string and the raw (quoted) value,
     * in that order.
     * @param buffer The accumulator for placing the text into.
     * @see #toString()
     */
    public void toString (StringBuffer buffer)
    {
        getName (buffer);
        getAssignment (buffer);
        getRawValue (buffer);
    }
}