// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 3.2 // Copyright (C) 2004-2009 Martin Jericho // http://jericho.htmlparser.net/ // // This library is free software; you can redistribute it and/or // modify it under the terms of either one of the following licences: // // 1. The Eclipse Public License (EPL) version 1.0, // included in this distribution in the file licence-epl-1.0.html // or available at http://www.eclipse.org/legal/epl-v10.html // // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, // included in this distribution in the file licence-lgpl-2.1.txt // or available at http://www.gnu.org/licenses/lgpl.txt // // This library is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the individual licence texts for more details. package net.htmlparser.jericho; import java.util.*; /** * Represents the <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-3">end tag</a> of an * {@linkplain Element element} in a specific {@linkplain Source source} document. * <p> * An end tag always has a {@linkplain #getTagType() type} that is a subclass of {@link EndTagType}, meaning it * always starts with the characters '<code></</code>'. * <p> * <code>EndTag</code> instances are obtained using one of the following methods: * <ul> * <li>{@link Element#getEndTag()} * <li>{@link Tag#getNextTag()} * <li>{@link Tag#getPreviousTag()} * <li>{@link Source#getPreviousEndTag(int pos)} * <li>{@link Source#getPreviousEndTag(int pos, String name)} * <li>{@link Source#getPreviousTag(int pos)} * <li>{@link Source#getPreviousTag(int pos, TagType)} * <li>{@link Source#getNextEndTag(int pos)} * <li>{@link Source#getNextEndTag(int pos, String name)} * <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)} * <li>{@link Source#getNextTag(int pos)} * <li>{@link Source#getNextTag(int pos, TagType)} * <li>{@link Source#getEnclosingTag(int pos)} * <li>{@link Source#getEnclosingTag(int pos, TagType)} * <li>{@link Source#getTagAt(int pos)} * <li>{@link Segment#getAllTags()} * <li>{@link Segment#getAllTags(TagType)} * </ul> * <p> * The {@link Tag} superclass defines the {@link Tag#getName() getName()} method used to get the name of this end tag. * <p> * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-etag">end tags</a>. * * @see Tag * @see StartTag * @see Element */ public final class EndTag extends Tag { private final EndTagType endTagType; /** * Constructs a new <code>EndTag</code>. * * @param source the {@link Source} document. * @param begin the character position in the source document where this tag {@linkplain Segment#getBegin() begins}. * @param end the character position in the source document where this tag {@linkplain Segment#getEnd() ends}. * @param endTagType the {@linkplain #getEndTagType() type} of the end tag. * @param name the {@linkplain Tag#getName() name} of the tag. */ EndTag(final Source source, final int begin, final int end, final EndTagType endTagType, final String name) { super(source,begin,end,name); this.endTagType=endTagType; } /** * Returns the {@linkplain Element element} that is ended by this end tag. * <p> * Returns <code>null</code> if this end tag is not properly matched to any {@linkplain StartTag start tag} in the source document. * <p> * This method is much less efficient than the {@link StartTag#getElement()} method. * <p> * IMPLEMENTATION NOTE: The explanation for why this method is relatively inefficient lies in the fact that more than one * {@linkplain StartTagType start tag type} can have the same * {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}, so it is not possible to know for certain * which type of start tag this end tag is matched to (see {@link EndTagType#getCorrespondingStartTagType()} for more explanation). * Because of this uncertainty, the implementation of this method must check every start tag preceding this end tag, calling its * {@link StartTag#getElement()} method to see whether it is terminated by this end tag. * * @return the {@linkplain Element element} that is ended by this end tag. */ public Element getElement() { if (element!=Element.NOT_CACHED) return element; int pos=begin; while (pos!=0) { StartTag startTag=source.getPreviousStartTag(pos-1); if (startTag==null) break; Element foundElement=startTag.getElement(); // this automatically sets foundElement.getEndTag().element cache if (foundElement.getEndTag()==this) return foundElement; // no need to set element as it was already done in previous statement pos=startTag.begin; } return element=null; } /** * Returns the {@linkplain EndTagType type} of this end tag. * <p> * This is equivalent to <code>(EndTagType)</code>{@link #getTagType()}. * * @return the {@linkplain EndTagType type} of this end tag. */ public EndTagType getEndTagType() { return endTagType; } // Documentation inherited from Tag public TagType getTagType() { return endTagType; } // Documentation inherited from Tag public boolean isUnregistered() { return endTagType==EndTagType.UNREGISTERED; } /** * Returns an XML representation of this end tag. * <p> * This method is included for symmetry with the {@link StartTag#tidy()} method and simply * returns the {@linkplain Segment#toString() source text} of the tag. * * @return an XML representation of this end tag. */ public String tidy() { return toString(); } /** * Generates the HTML text of a {@linkplain EndTagType#NORMAL normal} end tag with the specified tag {@linkplain #getName() name}. * <p> * <dl> * <dt>Example:</dt> * <dd> * <p> * The following method call: * <blockquote class="code"> * <code>EndTag.generateHTML("INPUT")</code> * </blockquote> * returns the following output: * <blockquote class="code"> * <code></INPUT></code> * </blockquote> * </dd> * </dl> * * @param tagName the {@linkplain #getName() name} of the end tag. * @return the HTML text of a {@linkplain EndTagType#NORMAL normal} end tag with the specified tag {@linkplain #getName() name}. * @see StartTag#generateHTML(String tagName, Map attributesMap, boolean emptyElementTag) */ public static String generateHTML(final String tagName) { return EndTagType.NORMAL.generateHTML(tagName); } public String getDebugInfo() { final StringBuilder sb=new StringBuilder(); sb.append(this).append(' '); if (endTagType!=EndTagType.NORMAL) sb.append('(').append(endTagType.getDescription()).append(") "); sb.append(super.getDebugInfo()); return sb.toString(); } /** * Returns the previous end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position. * <p> * Called from {@link Source#getPreviousEndTag(int pos, String name)}. * * @param source the {@link Source} document. * @param pos the position to search from. * @param name the {@linkplain #getName() name} of the tag including its {@linkplain TagType#getNamePrefix() prefix} (must be lower case, may be null). * @param endTagType the {@linkplain EndTagType type} of end tag to search for. * @return the previous end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position, or null if none is found. */ static EndTag getPrevious(final Source source, final int pos, final String name, final EndTagType endTagType) { if (name==null) return (EndTag)Tag.getPreviousTag(source,pos,endTagType); if (name.length()==0) throw new IllegalArgumentException("name argument must not be zero length"); final String searchString=endTagType.START_DELIMITER_PREFIX+name; try { final ParseText parseText=source.getParseText(); int begin=pos; do { begin=parseText.lastIndexOf(searchString,begin); if (begin==-1) return null; final EndTag endTag=(EndTag)source.getTagAt(begin); if (endTag!=null && endTag.getEndTagType()==endTagType && name.equals(endTag.getName())) return endTag; } while ((begin-=1)>=0); } catch (IndexOutOfBoundsException ex) { // this should never happen during a get previous operation so rethrow it: throw ex; } return null; } /** * Returns the next end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position. * <p> * Called from {@link Source#getNextEndTag(int pos, String name, EndTagType endTagType)}. * * @param source the {@link Source} document. * @param pos the position to search from. * @param name the {@linkplain #getName() name} of the tag including its {@linkplain TagType#getNamePrefix() prefix} (must be lower case, may be null). * @param endTagType the {@linkplain EndTagType type} of end tag to search for. * @return the next end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position, or null if none is found. */ static EndTag getNext(final Source source, final int pos, final String name, final EndTagType endTagType) { if (name==null) return (EndTag)Tag.getNextTag(source,pos,endTagType); if (name.length()==0) throw new IllegalArgumentException("name argument must not be zero length"); final String searchString=endTagType.START_DELIMITER_PREFIX+name; try { final ParseText parseText=source.getParseText(); int begin=pos; do { begin=parseText.indexOf(searchString,begin); if (begin==-1) return null; final EndTag endTag=(EndTag)source.getTagAt(begin); if (endTag!=null && endTag.getEndTagType()==endTagType && name.equals(endTag.getName())) return endTag; } while ((begin+=1)<source.end); } catch (IndexOutOfBoundsException ex) { // this should only happen when the end of file is reached in the middle of a tag. // we don't have to do anything to handle it as there will be no more tags anyway. } return null; } static EndTag getPrevious(final Source source, int pos) { while (true) { final Tag tag=Tag.getPreviousTag(source,pos); if (tag==null) return null; if (tag instanceof EndTag) return (EndTag)tag; pos-=1; } } static EndTag getNext(final Source source, int pos) { while (true) { final Tag tag=Tag.getNextTag(source,pos); if (tag==null) return null; if (tag instanceof EndTag) return (EndTag)tag; pos+=1; } } }