// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 3.2 // Copyright (C) 2004-2009 Martin Jericho // http://jericho.htmlparser.net/ // // This library is free software; you can redistribute it and/or // modify it under the terms of either one of the following licences: // // 1. The Eclipse Public License (EPL) version 1.0, // included in this distribution in the file licence-epl-1.0.html // or available at http://www.eclipse.org/legal/epl-v10.html // // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, // included in this distribution in the file licence-lgpl-2.1.txt // or available at http://www.gnu.org/licenses/lgpl.txt // // This library is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the individual licence texts for more details. package net.htmlparser.jericho; import java.io.*; /** * Represents a single <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.2">attribute</a> * name/value segment within a {@link StartTag}. * <p> * An instance of this class is a representation of a single attribute in the source document and is not modifiable. * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}. * <p> * Obtained using the {@link Attributes#get(String key)} method. * <p> * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-attr">attributes</a>. * * @see Attributes */ public final class Attribute extends Segment { private final String key; private final Segment nameSegment; private final Segment valueSegment; private final Segment valueSegmentIncludingQuotes; StartTag startTag=StartTag.NOT_CACHED; static final String CHECKED="checked"; static final String CLASS="class"; static final String DISABLED="disabled"; static final String ID="id"; static final String MULTIPLE="multiple"; static final String NAME="name"; static final String SELECTED="selected"; static final String STYLE="style"; static final String TYPE="type"; static final String VALUE="value"; /** * Constructs a new Attribute with no value part, called from Attributes class. * <p> * Note that the resulting Attribute segment has the same span as the supplied nameSegment. * * @param source the {@link Source} document. * @param key the name of this attribute in lower case. * @param nameSegment the segment representing the name. */ Attribute(final Source source, final String key, final Segment nameSegment) { this(source,key,nameSegment,null,null); } /** * Constructs a new Attribute, called from Attributes class. * <p> * The resulting Attribute segment begins at the start of the nameSegment * and finishes at the end of the valueSegmentIncludingQuotes. If this attribute * has no value, it finishes at the end of the nameSegment. * <p> * If this attribute has no value, the <code>valueSegment</code> and <code>valueSegmentIncludingQuotes</code> must be null. * The <valueSegmentIncludingQuotes</code> parameter must not be null if the <code>valueSegment</code> is not null, and vice versa * * @param source the {@link Source} document. * @param key the name of this attribute in lower case. * @param nameSegment the segment spanning the name. * @param valueSegment the segment spanning the value. * @param valueSegmentIncludingQuotes the segment spanning the value, including quotation marks if any. */ Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes) { super(source,nameSegment.getBegin(),(valueSegmentIncludingQuotes==null ? nameSegment.getEnd() : valueSegmentIncludingQuotes.getEnd())); this.key=key; this.nameSegment=nameSegment; this.valueSegment=valueSegment; this.valueSegmentIncludingQuotes=valueSegmentIncludingQuotes; } /** * Returns the name of this attribute in lower case. * <p> * This package treats all attribute names as case insensitive, consistent with * <a target="_blank" href="http://www.w3.org/TR/html401/">HTML</a> but not consistent with * <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a>. * * @return the name of this attribute in lower case. * @see #getName() */ public String getKey() { return key; } /** * Returns the name of this attribute in original case. * <p> * This is exactly equivalent to {@link #getNameSegment()}<code>.toString()</code>. * * @return the name of this attribute in original case. * @see #getKey() */ public String getName() { return nameSegment.toString(); } /** * Returns the segment spanning the {@linkplain #getName() name} of this attribute. * @return the segment spanning the {@linkplain #getName() name} of this attribute. * @see #getName() */ public Segment getNameSegment() { return nameSegment; } /** * Indicates whether this attribute has a value. * <p> * This method also returns <code>true</code> if this attribute has been assigned a zero-length value. * <p> * It only returns <code>false</code> if this attribute appears in * <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-boolean_attribute-1">minimized form</a>. * * @return <code>true</code> if this attribute has a value, otherwise <code>false</code>. */ public boolean hasValue() { return valueSegment!=null; } /** * Returns the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, * or <code>null</code> if it {@linkplain #hasValue() has no value}. * <p> * This is equivalent to {@link CharacterReference}<code>.</code>{@link CharacterReference#decode(CharSequence,boolean) decode}<code>(</code>{@link #getValueSegment()}<code>,true)</code>. * <p> * Note that before version 1.4.1 this method returned the raw value of the attribute as it appears in the source document, * without {@linkplain CharacterReference#decode(CharSequence,boolean) decoding}. * <p> * To obtain the raw value without decoding, use {@link #getValueSegment()}<code>.toString()</code>. * <p> * Special attention should be given to attributes that contain URLs, such as the * <code><a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a></code> attribute. * When such an attribute contains a URL with parameters (as described in the * <a target="_blank" href="http://www.w3.org/MarkUp/html-spec/html-spec_8.html#SEC8.2.1">form-urlencoded media type</a>), * the ampersand (<code>&</code>) characters used to separate the parameters should be * {@linkplain CharacterReference#encode(CharSequence) encoded} to prevent the parameter names from being * unintentionally interpreted as {@linkplain CharacterEntityReference character entity references}. * This requirement is explicitly stated in the * <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>. * <p> * For example, take the following element in the source document: * <div style="margin: 0.5em"><code><a href="Report.jsp?chapt=2&sect=3">next</a></code></div> * By default, calling * {@link Element#getAttributes() getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue}<code>("href")</code> * on this element returns the string * "<code>Report.jsp?chapt=2§=3</code>", since the text "<code>&sect</code>" is interpreted as the rarely used * character entity reference {@link CharacterEntityReference#_sect &sect;} (U+00A7), despite the fact that it is * missing the {@linkplain CharacterReference#isTerminated() terminating semicolon} (<code>;</code>). * <p> * Most browsers recognise <a href="CharacterReference.html#Unterminated">unterminated</a> character entity references * in attribute values representing a codepoint of U+00FF or below, but ignore those representing codepoints above this value. * One relatively popular browser only recognises those representing a codepoint of U+003E or below, meaning it would * have interpreted the URL in the above example differently to most other browsers. * Most browsers also use different rules depending on whether the unterminated character reference is inside or outside * of an attribute value, with both of these possibilities further split into different rules for * {@linkplain CharacterEntityReference character entity references}, * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character references</a>, and * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character references</a>. * <p> * The behaviour of this library is determined by the current {@linkplain Config.CompatibilityMode compatibility mode} setting, * which is determined by the static {@link Config#CurrentCompatibilityMode} property. * * @return the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}. */ public String getValue() { return CharacterReference.decode(valueSegment,true); } /** * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}. * @return the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}. * @see #getValue() */ public Segment getValueSegment() { return valueSegment; } /** * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, * or <code>null</code> if it {@linkplain #hasValue() has no value}. * <p> * If the value is not enclosed by quotation marks, this is the same as the {@linkplain #getValueSegment() value segment} * * @return the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, or <code>null</code> if it {@linkplain #hasValue() has no value}. */ public Segment getValueSegmentIncludingQuotes() { return valueSegmentIncludingQuotes; } /** * Returns the character used to quote the value. * <p> * The return value is either a double-quote (<code>"</code>), a single-quote (<code>'</code>), or a space. * * @return the character used to quote the value, or a space if the value is not quoted or this attribute has no value. */ public char getQuoteChar() { if (valueSegment==valueSegmentIncludingQuotes) return ' '; // no quotes return source.charAt(valueSegmentIncludingQuotes.getBegin()); } /** * Returns the start tag to which this attribute belongs. * @return the start tag to which this attribute belongs, or <code>null</code> if it is not within a start tag. */ public StartTag getStartTag() { if (startTag==StartTag.NOT_CACHED) { final Tag tag=source.getEnclosingTag(begin); startTag=(tag==null || tag instanceof EndTag) ? null : (StartTag)tag; } return startTag; } /** * Returns a string representation of this object useful for debugging purposes. * @return a string representation of this object useful for debugging purposes. */ public String getDebugInfo() { final StringBuilder sb=new StringBuilder().append(key).append(super.getDebugInfo()).append(",name=").append(nameSegment.getDebugInfo()); if (hasValue()) sb.append(",value=").append(valueSegment.getDebugInfo()).append('"').append(valueSegment).append('"').append(Config.NewLine); else sb.append(",NO VALUE").append(Config.NewLine); return sb.toString(); } Tag appendTidy(final Appendable appendable, Tag nextTag) throws IOException { appendable.append(' ').append(nameSegment); if (valueSegment!=null) { appendable.append("=\""); while (nextTag!=null && nextTag.begin<valueSegment.begin) nextTag=nextTag.getNextTag(); if (nextTag==null || nextTag.begin>=valueSegment.end) { appendTidyValue(appendable,valueSegment); } else { int i=valueSegment.begin; while (nextTag!=null && nextTag.begin<valueSegment.end) { appendTidyValue(appendable,new Segment(source,i,nextTag.begin)); if (nextTag.end>valueSegment.end) { appendable.append(new Segment(source,nextTag.begin,i=valueSegment.end)); break; } appendable.append(nextTag); i=nextTag.end; nextTag=nextTag.getNextTag(); } if (i<valueSegment.end) appendTidyValue(appendable,new Segment(source,i,valueSegment.end)); } appendable.append('"'); } return nextTag; } private static void appendTidyValue(final Appendable appendable, final CharSequence unencodedValue) throws IOException { CharacterReference.appendEncode(appendable,CharacterReference.decode(unencodedValue,true),false); } static Appendable appendHTML(final Appendable appendable, final CharSequence name, final CharSequence value) throws IOException { appendable.append(' ').append(name); if (value!=null) { appendable.append("=\""); CharacterReference.appendEncode(appendable,value,false); appendable.append('"'); } return appendable; } }