// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import net.htmlparser.jericho.nodoc.*;
import java.util.*;
import java.io.*;

/**
 * Represents the list of {@link Attribute} objects present within a particular {@link StartTag}.
 * <p>
 * This segment starts at the end of the start tag's {@linkplain StartTag#getName() name}
 * and ends at the end of the last attribute.
 * <p>
 * The attributes in this list are a representation of those found in the source document and are not modifiable.
 * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods
 * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}.
 * <p>
 * Any {@linkplain TagType#isServerTag() server tags} encountered inside the attributes area of a non-server tag
 * do not interfere with the parsing of the attributes.
 * <p>
 * If too many syntax errors are encountered while parsing a start tag's attributes, the parser rejects the entire start tag
 * and generates a {@linkplain Source#getLogger() log} entry.
 * The threshold for the number of errors allowed can be set using the {@link #setDefaultMaxErrorCount(int)} static method.
 * <p>
 * Obtained using the {@link StartTag#getAttributes()} method, or explicitly using the {@link Source#parseAttributes(int pos, int maxEnd)} method.
 * <p>
 * It is common for instances of this class to contain no attributes.
 * <p>
 * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-attr">attributes</a>.
 *
 * @see StartTag
 * @see Attribute
 */
public final class Attributes extends SequentialListSegment<Attribute> {
	/** The parsed attributes in order of appearance; never null. */
	private final List<Attribute> attributeList;

	/** Set by the parser when a server tag was skipped while the parser was not inside an attribute value. */
	final boolean containsServerTagOutsideOfAttributeValue;

	/** The states of the attribute parsing state machine implemented in the private <code>construct</code> method. */
	private enum ParsingState {
		AFTER_TAG_NAME,
		BETWEEN_ATTRIBUTES,
		IN_NAME,
		AFTER_NAME, // this only happens if an attribute name is followed by whitespace
		START_VALUE,
		IN_VALUE,
		AFTER_VALUE_FINAL_QUOTE
	}

	/** Defines maximum number of minor errors that can be encountered in attributes before entire start tag is rejected. */
	private static int defaultMaxErrorCount=2;

	/**
	 * Creates the segment spanning <code>begin</code> to <code>end</code> and stores the already parsed attribute list.
	 * Instances are only created by the private <code>construct</code> method once parsing has succeeded.
	 */
	private Attributes(final Source source, final int begin, final int end, final List<Attribute> attributeList, final boolean containsServerTagOutsideOfAttributeValue) {
		super(source,begin,end);
		this.attributeList=attributeList;
		this.containsServerTagOutsideOfAttributeValue=containsServerTagOutsideOfAttributeValue;
	}

	/** called from StartTagType.parseAttributes(Source, int startTagBegin, String tagName) */
	static Attributes construct(final Source source, final int startTagBegin, final StartTagType startTagType, final String tagName) {
		return construct(source,"StartTag",ParsingState.AFTER_TAG_NAME,startTagBegin,-1,-1,startTagType,tagName,defaultMaxErrorCount);
	}

	/** called from StartTag.parseAttributes(int maxErrorCount) */
	static Attributes construct(final Source source, final int startTagBegin, final int attributesBegin, final int maxEnd, final StartTagType startTagType, final String tagName, final int maxErrorCount) {
		return construct(source,"Attributes for StartTag",ParsingState.BETWEEN_ATTRIBUTES,startTagBegin,attributesBegin,maxEnd,startTagType,tagName,maxErrorCount);
	}

	/** called from Source.parseAttributes(int pos, int maxEnd, int maxErrorCount) */
	static Attributes construct(final Source source, final int begin, final int maxEnd, final int maxErrorCount) {
		return construct(source,"Attributes",ParsingState.BETWEEN_ATTRIBUTES,begin,-1,maxEnd,StartTagType.NORMAL,null,maxErrorCount);
	}

	/**
	 * Parses an attribute list character by character using a state machine and returns the resulting
	 * <code>Attributes</code> object, or <code>null</code> if the text is rejected.
	 * <p>
	 * Any &lt; character found within the start tag is treated as though it is part of the attribute
	 * list, which is consistent with the way IE treats it.
	 * <p>
	 * Server tags found at the current position are skipped over without being interpreted as attribute text.
	 * Minor errors are counted and cause rejection once <code>maxErrorCount</code> is exceeded;
	 * a '&lt;' character that is not the start of a server tag (other than inside a quoted value)
	 * causes immediate rejection.
	 *
	 * @param source the source document being parsed.
	 * @param logType a description of the object being parsed, used as the first part of log messages.
	 * @param parsingState the initial state of the state machine.
	 * @param logBegin the position of the beginning of the object being searched (for logging)
	 * @param attributesBegin the position of the beginning of the attribute list, or -1 if it should be calculated automatically from logBegin.
	 * @param maxEnd the position at which the attributes must end if a terminating character is not found, or -1 if no maximum.
	 * @param startTagType the start tag type, which decides where the attribute list ends.
	 * @param tagName the name of the enclosing StartTag, or null if constructing attributes directly.
	 * @param maxErrorCount the number of minor errors tolerated before the attribute list is rejected.
	 * @return the parsed attributes, or <code>null</code> if the attribute list was rejected.
	 */
	private static Attributes construct(final Source source, final String logType, ParsingState parsingState, final int logBegin, int attributesBegin, final int maxEnd, final StartTagType startTagType, final String tagName, final int maxErrorCount) {
		boolean isClosingSlashIgnored=false;
		if (tagName!=null) {
			// 'logBegin' parameter is the start of the associated start tag
			if (attributesBegin==-1) attributesBegin=logBegin+1+tagName.length();
			if (startTagType==StartTagType.NORMAL && HTMLElements.isClosingSlashIgnored(tagName)) isClosingSlashIgnored=true;
		} else {
			attributesBegin=logBegin;
		}
		int attributesEnd=attributesBegin;
		final List<Attribute> attributeList=new LinkedList<Attribute>();
		boolean containsServerTagOutsideOfAttributeValue=false;
		final ParseText parseText=source.getParseText();
		int i=attributesBegin;
		char quote=' '; // the quote character of the current value, or a space if the value is unquoted
		Segment nameSegment=null;
		String key=null;
		int currentBegin=-1; // the position at which the current name or value began
		boolean isTerminatingCharacter=false;
		int errorCount=0;
		try {
			while (!isTerminatingCharacter) {
				if (i==maxEnd || startTagType.atEndOfAttributes(source,i,isClosingSlashIgnored)) isTerminatingCharacter=true;
				// Reading past the end of the text throws IndexOutOfBoundsException, which is handled below as a missing '>'.
				final char ch=parseText.charAt(i);
				// First check if there is a server tag in this position:
				if (ch=='<') {
					final Tag interlopingTag=Tag.getTagAt(source,i,true); // search for server tags only
					if (interlopingTag!=null) {
						// There is a server tag in this position. Skip over it:
						if (parsingState==ParsingState.START_VALUE) {
							// a server tag directly after '=' starts an unquoted value
							currentBegin=i;
							quote=' ';
							parsingState=ParsingState.IN_VALUE;
						}
						i=attributesEnd=interlopingTag.end;
						if (parsingState!=ParsingState.IN_VALUE) containsServerTagOutsideOfAttributeValue=true;
						continue;
					}
				}
				// There is no server tag in this position. Now we can parse the attributes:
				switch (parsingState) {
					case IN_VALUE:
						if (isTerminatingCharacter || ch==quote || (quote==' ' && isWhiteSpace(ch))) {
							Segment valueSegment;
							Segment valueSegmentIncludingQuotes;
							if (quote==' ') {
								valueSegment=valueSegmentIncludingQuotes=new Segment(source,currentBegin,i);
							} else {
								if (isTerminatingCharacter) {
									if (i==maxEnd) {
										if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"terminated in the middle of a quoted attribute value",i);
										if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
										valueSegment=new Segment(source,currentBegin,i);
										valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i); // this is missing the end quote
									} else {
										// don't want to terminate, only encountered a terminating character in the middle of a quoted value
										isTerminatingCharacter=false;
										break;
									}
								} else {
									valueSegment=new Segment(source,currentBegin,i);
									valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i+1);
								}
							}
							attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegmentIncludingQuotes));
							attributesEnd=valueSegmentIncludingQuotes.getEnd();
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
						} else if (ch=='<' && quote==' ') {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character in unquoted attribute value",i);
							return null;
						}
						break;
					case IN_NAME:
						if (isTerminatingCharacter || ch=='=' || isWhiteSpace(ch)) {
							nameSegment=new Segment(source,currentBegin,i);
							// Use a fixed locale so that the key does not depend on the default locale of the JVM
							// (for example, the Turkish locale converts 'I' to a dotless 'i').
							key=nameSegment.toString().toLowerCase(Locale.ENGLISH);
							if (isTerminatingCharacter) {
								attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value
								attributesEnd=i;
							} else {
								parsingState=(ch=='=' ? ParsingState.START_VALUE : ParsingState.AFTER_NAME);
							}
						} else if (!Tag.isXMLNameChar(ch)) {
							// invalid character detected in attribute name.
							if (ch=='<') {
								if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character in attribute name",i);
								return null;
							}
							if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break;
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"contains attribute name with invalid character",i);
							if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
						}
						break;
					case AFTER_NAME:
						// attribute name has been followed by whitespace, but may still be followed by an '=' character.
						if (isTerminatingCharacter || !(ch=='=' || isWhiteSpace(ch))) {
							attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value
							attributesEnd=nameSegment.getEnd();
							if (isTerminatingCharacter) break;
							// The current character is the first character of an attribute name
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
							i--; // want to reparse the same character again, so decrement i. Note we could instead just fall into the next case statement without a break, but such code is always discouraged.
						} else if (ch=='=') {
							parsingState=ParsingState.START_VALUE;
						} else if (ch=='<') {
							// NOTE(review): this branch cannot be reached, because a '<' character always satisfies the first condition above
							// and is then rejected when it is reparsed in the BETWEEN_ATTRIBUTES state.
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character after attribute name",i);
							return null;
						}
						break;
					case BETWEEN_ATTRIBUTES:
						if (!isTerminatingCharacter) {
							// the quote variable is used here to make sure whitespace has come after the last quoted attribute value
							if (isWhiteSpace(ch)) {
								quote=' ';
							} else {
								if (quote!=' ') {
									if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"has missing whitespace after quoted attribute value",i);
									// only count this as an error if there have already been other errors, otherwise allow unlimited errors of this type.
									if (errorCount>0 && reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
								}
								if (!Tag.isXMLNameStartChar(ch)) {
									// invalid character detected as first character of attribute name.
									if (ch=='<') {
										if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character",i);
										return null;
									}
									// An ignored "/>" is logged by the helper and is not counted as an error.
									if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break;
									if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"contains attribute name with invalid first character",i);
									if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
								}
								parsingState=ParsingState.IN_NAME;
								currentBegin=i;
							}
						}
						break;
					case START_VALUE:
						currentBegin=i;
						if (isTerminatingCharacter) {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"has missing attribute value after '=' sign",i);
							// only count this as an error if there have already been other errors, otherwise allow unlimited errors of this type.
							if (errorCount>0 && reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
							final Segment valueSegment=new Segment(source,i,i);
							attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegment));
							attributesEnd=i;
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
							break;
						}
						if (ch=='\'' || ch=='"') {
							quote=ch;
							currentBegin++; // the value itself starts after the opening quote
						} else if (isWhiteSpace(ch)) {
							break; // just ignore whitespace after the '=' sign as nearly all browsers do.
						} else if (ch=='<') {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character at the start of an attribute value",i);
							return null;
						} else {
							quote=' ';
						}
						parsingState=ParsingState.IN_VALUE;
						break;
					case AFTER_TAG_NAME:
						if (!isTerminatingCharacter) {
							if (!isWhiteSpace(ch)) {
								if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break;
								if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because the name contains an invalid character",i);
								return null;
							}
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
						}
						break;
				}
				i++;
			}
			return new Attributes(source,attributesBegin,attributesEnd,attributeList,containsServerTagOutsideOfAttributeValue);
		} catch (IndexOutOfBoundsException ex) {
			if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because it has no closing '>' character");
			return null;
		}
	}

	/**
	 * Indicates whether the specified error count exceeds the maximum allowed, logging the rejection if it does.
	 *
	 * @return <code>true</code> if <code>errorCount</code> is greater than <code>maxErrorCount</code>, otherwise <code>false</code>.
	 */
	private static boolean reachedMaxErrorCount(final int errorCount, final Source source, final String logType, final String tagName, final int logBegin, final int maxErrorCount) {
		if (errorCount<=maxErrorCount) return false;
		if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because it contains too many errors");
		return true;
	}

	/**
	 * Indicates whether position <code>i</code> is at an empty-element closing delimiter that was not treated as
	 * the end of the attributes, logging the fact if so.
	 *
	 * @return <code>true</code> if the start tag type is {@link StartTagType#NORMAL} and the attributes end at position <code>i</code>
	 *  when the closing slash is not ignored, otherwise <code>false</code>.
	 */
	private static boolean isInvalidEmptyElementTag(final StartTagType startTagType, final Source source, final int i, final String logType, final String tagName, final int logBegin) {
		// This checks whether we've found the characters "/>" but it wasn't recognised as the closing delimiter because isClosingSlashIgnored is true.
		if (startTagType!=StartTagType.NORMAL || !startTagType.atEndOfAttributes(source,i,false)) return false;
		if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"contains a '/' character before the closing '>', which is ignored because tags of this name cannot be empty-element tags");
		return true;
	}

	/**
	 * Returns the {@link Attribute} with the specified name (case insensitive).
	 * <p>
	 * If more than one attribute exists with the specified name (which is illegal HTML),
	 * the first is returned.
	 *
	 * @param name  the name of the attribute to get.
	 * @return the attribute with the specified name, or <code>null</code> if no attribute with the specified name exists.
	 * @see #getValue(String name)
	 */
	public Attribute get(final String name) {
		// Iterate over the list directly rather than by index, as indexed access into a linked list is linear.
		for (Attribute attribute : attributeList) {
			if (attribute.getKey().equalsIgnoreCase(name)) return attribute;
		}
		return null;
	}

	/**
	 * Returns the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name (case insensitive).
	 * <p>
	 * Returns <code>null</code> if no attribute with the specified name exists or
	 * the attribute {@linkplain Attribute#hasValue() has no value}.
	 * <p>
	 * This is equivalent to {@link #get(String) get(name)}<code>.</code>{@link Attribute#getValue() getValue()},
	 * except that it returns <code>null</code> if no attribute with the specified name exists instead of throwing a
	 * <code>NullPointerException</code>.
	 *
	 * @param name  the name of the attribute to get.
	 * @return the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name, or <code>null</code> if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}.
	 * @see Attribute#getValue()
	 */
	public String getValue(final String name) {
		final Attribute attribute=get(name);
		return attribute==null ? null : attribute.getValue();
	}

	/**
	 * Returns the raw (not {@linkplain CharacterReference#decode(CharSequence) decoded}) value of the attribute with the specified name,
	 * or null if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}.
	 * <p>
	 * This is an internal convenience method.
	 *
	 * @param name  the name of the attribute to get (case insensitive).
	 * @return the raw (not {@linkplain CharacterReference#decode(CharSequence) decoded}) value of the attribute, or null if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}.
	 */
	String getRawValue(final String name) {
		final Attribute attribute=get(name);
		return attribute==null || !attribute.hasValue() ? null : attribute.getValueSegment().toString();
	}

	/**
	 * Returns the number of attributes.
	 * <p>
	 * This is equivalent to calling the <code>size()</code> method specified in the <code>List</code> interface.
	 *
	 * @return the number of attributes.
	 */
	public int getCount() {
		return attributeList.size();
	}

	/**
	 * Returns an iterator over the {@link Attribute} objects in this list in order of appearance.
	 * @return an iterator over the {@link Attribute} objects in this list in order of appearance.
	 */
	public Iterator<Attribute> iterator() {
		return listIterator();
	}

	/**
	 * Returns a list iterator of the {@link Attribute} objects in this list in order of appearance,
	 * starting at the specified position in the list.
	 * <p>
	 * The specified index indicates the first item that would be returned by an initial call to the <code>next()</code> method.
	 * An initial call to the <code>previous()</code> method would return the item with the specified index minus one.
	 * <p>
	 * IMPLEMENTATION NOTE: For efficiency reasons this method does not return an immutable list iterator.
	 * Calling any of the <code>add(Object)</code>, <code>remove()</code> or <code>set(Object)</code> methods on the returned
	 * <code>ListIterator</code> does not throw an exception but could result in unexpected behaviour.
	 *
	 * @param index  the index of the first item to be returned from the list iterator (by a call to the <code>next()</code> method).
	 * @return a list iterator of the items in this list (in proper sequence), starting at the specified position in the list.
	 * @throws IndexOutOfBoundsException if the specified index is out of range (<code>index &lt; 0 || index &gt; size()</code>).
	 */
	public ListIterator<Attribute> listIterator(final int index) {
		return attributeList.listIterator(index);
	}

	/**
	 * Populates the specified <code>Map</code> with the name/value pairs from these attributes.
	 * <p>
	 * Both names and values are stored as <code>String</code> objects.
	 * <p>
	 * The entries are added in order of appearance in the source document.
	 * <p>
	 * An attribute with {@linkplain Attribute#hasValue() no value} is represented by a map entry with a <code>null</code> value.
	 * <p>
	 * Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded}
	 * before storage in the map.
	 *
	 * @param attributesMap  the map to populate, must not be <code>null</code>.
	 * @param convertNamesToLowerCase  specifies whether all attribute names are converted to lower case in the map.
	 * @return the same map specified as the argument to the <code>attributesMap</code> parameter, populated with the name/value pairs from these attributes.
	 * @see #generateHTML(Map attributesMap)
	 */
	public Map<String,String> populateMap(final Map<String,String> attributesMap, final boolean convertNamesToLowerCase) {
		for (Attribute attribute : this) {
			attributesMap.put(convertNamesToLowerCase ? attribute.getKey() : attribute.getName(),attribute.getValue());
		}
		return attributesMap;
	}

	/**
	 * Returns a string representation of this object useful for debugging purposes.
	 * @return a string representation of this object useful for debugging purposes.
	 */
	public String getDebugInfo() {
		final StringBuilder sb=new StringBuilder();
		sb.append("Attributes ").append(super.getDebugInfo()).append(": ");
		if (isEmpty()) {
			sb.append("EMPTY");
		} else {
			sb.append(Config.NewLine);
			for (Attribute attribute : this) {
				sb.append(" ").append(attribute.getDebugInfo());
			}
		}
		return sb.toString();
	}

	/**
	 * Returns the default maximum error count allowed when parsing attributes.
	 * <p>
	 * The system default value is 2.
	 * <p>
	 * When searching for start tags, the parser can find the end of the start tag only by
	 * {@linkplain StartTagType#parseAttributes(Source,int,String) parsing}
	 * the attributes, as it is valid HTML for attribute values to contain '&gt;' characters
	 * (see the <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>).
	 * <p>
	 * If the source text being parsed does not follow the syntax of an attribute list at all, the parser assumes
	 * that the text which was originally identified as the beginning of a start tag is in fact some other text,
	 * such as an invalid '&lt;' character in the middle of some text, or part of a script element.
	 * In this case the entire start tag is rejected.
	 * <p>
	 * On the other hand, it is quite common for attributes to contain minor syntactical errors,
	 * such as an invalid character in an attribute name.
	 * For this reason the parser allows a certain number of minor errors to occur while parsing an
	 * attribute list before the entire start tag or attribute list is rejected.
	 * This property indicates the number of minor errors allowed.
	 * <p>
	 * Major syntactical errors cause the start tag or attribute list to be rejected immediately, regardless
	 * of the maximum error count setting.
	 * <p>
	 * Some errors are considered too minor to count at all (ignorable), such as missing white space between the end
	 * of a quoted attribute value and the start of the next attribute name.
	 * <p>
	 * The classification of particular syntax errors in attribute lists into major, minor, and ignorable is
	 * not part of the specification and may change in future versions.
	 * <p>
	 * Errors are {@linkplain Source#getLogger() logged} as they occur.
	 * <p>
	 * The value of this property is set using the {@link #setDefaultMaxErrorCount(int)} method.
	 *
	 * @return the default maximum error count allowed when parsing attributes.
	 * @see Source#parseAttributes(int pos, int maxEnd, int maxErrorCount)
	 */
	public static int getDefaultMaxErrorCount() {
		return defaultMaxErrorCount;
	}

	/**
	 * Sets the default maximum error count allowed when parsing attributes.
	 * <p>
	 * See the {@link #getDefaultMaxErrorCount()} method for a full description of this property.
	 *
	 * @param value  the default maximum error count allowed when parsing attributes.
	 */
	public static void setDefaultMaxErrorCount(final int value) {
		defaultMaxErrorCount=value;
	}

	/**
	 * Returns the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs.
	 * <p>
	 * Each attribute (including the first) is preceded by a single space, and all values are
	 * {@linkplain CharacterReference#encode(CharSequence) encoded} and enclosed in double quotes.
	 * <p>
	 * The map keys must be of type <code>String</code> and values must be objects that implement the <code>CharSequence</code> interface.
	 * <p>
	 * A <code>null</code> value represents an attribute with no value.
	 *
	 * @param attributesMap  a map containing attribute name/value pairs.
	 * @return the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs.
	 * @see StartTag#generateHTML(String tagName, Map attributesMap, boolean emptyElementTag)
	 */
	public static String generateHTML(final Map<String,String> attributesMap) {
		final StringBuilder sb=new StringBuilder();
		try {
			appendHTML(sb,attributesMap);
		} catch (IOException ignored) {
			// Deliberately ignored: the target is a StringBuilder, whose append methods never throw IOException,
			// so this is not expected to occur.
		}
		return sb.toString();
	}

	/**
	 * Outputs the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs to the specified <code>Appendable</code> object.
	 * <p>
	 * Each attribute is preceded by a single space, and all values are
	 * {@linkplain CharacterReference#encode(CharSequence) encoded} and enclosed in double quotes.
	 *
	 * @param appendable  the <code>Appendable</code> object to which the output is to be sent.
	 * @param attributesMap  a map containing attribute name/value pairs.
	 * @throws IOException if an I/O exception occurs.
	 * @see #populateMap(Map attributesMap, boolean convertNamesToLowerCase)
	 */
	static void appendHTML(final Appendable appendable, final Map<String,String> attributesMap) throws IOException {
		for (Map.Entry<String,String> entry : attributesMap.entrySet()) {
			Attribute.appendHTML(appendable,entry.getKey(),entry.getValue());
		}
	}

	/**
	 * Calls {@link Attribute#appendTidy} on each attribute in order, passing the tag returned by each call on to the next.
	 *
	 * @return the <code>appendable</code> argument.
	 */
	Appendable appendTidy(final Appendable appendable, Tag nextTag) throws IOException {
		for (Attribute attribute : this) nextTag=attribute.appendTidy(appendable,nextTag);
		return appendable;
	}

	/**
	 * Returns a new <code>LinkedHashMap</code> {@linkplain #populateMap(Map,boolean) populated} with these attributes.
	 */
	Map<String,String> getMap(final boolean convertNamesToLowerCase) {
		return populateMap(new LinkedHashMap<String,String>(getCount()*2,1.0F),convertNamesToLowerCase);
	}

	void setStartTag(final StartTag startTag) {
		// this just preloads the startTag cache in each Attribute so we don't have to go looking for it if it is requested.
		for (Attribute attribute : attributeList) attribute.startTag=startTag;
	}

	/**
	 * Logs a message of the form "<i>part1</i> <i>part2</i> at <i>begin</i> <i>part3</i> at position <i>pos</i>" at info level,
	 * where <code>begin</code> and <code>pos</code> are rendered as row/column vectors.
	 */
	private static void log(final Source source, final String part1, final CharSequence part2, final int begin, final String part3, final int pos) {
		source.logger.info(source.getRowColumnVector(pos).appendTo(logPrefix(source,part1,part2,begin,part3).append(" at position ")).toString());
	}

	/**
	 * Logs a message of the form "<i>part1</i> <i>part2</i> at <i>begin</i> <i>part3</i>" at info level,
	 * where <code>begin</code> is rendered as a row/column vector.
	 */
	private static void log(final Source source, final String part1, final CharSequence part2, final int begin, final String part3) {
		source.logger.info(logPrefix(source,part1,part2,begin,part3).toString());
	}

	/**
	 * Builds the part of a log message that is common to both <code>log</code> methods.
	 * <p>
	 * <code>part2</code> (the tag name) is omitted when it is <code>null</code>, which is the case when attributes are
	 * constructed directly rather than for a start tag. Appending it regardless would output the text "null".
	 */
	private static StringBuilder logPrefix(final Source source, final String part1, final CharSequence part2, final int begin, final String part3) {
		final StringBuilder sb=new StringBuilder(200).append(part1);
		if (part2!=null) sb.append(' ').append(part2);
		return source.getRowColumnVector(begin).appendTo(sb.append(" at ")).append(' ').append(part3);
	}
}