SpiderHtmlParser.java example

Explorer

zaproxy-master
- src
  - ch
    - csnc
      - extension
        httpclient
        AliasCertificate.java
        AliasKeyManager.java
        PKCS11Configuration.java
        SSLContextManager.java
        ui
        AliasTableModel.java
        CertificateView.java
        DriverTableModel.java
        DriversView.java
        util
        DriverConfiguration.java
        Encoding.java
        OptionsParamExperimentalSliSupport.java
  - org
- test
  - ch
    - csnc
      - extension
        httpclient
        AliasCertificateUnitTest.java
        AliasKeyManagerUnitTest.java
        PKCS11ConfigurationUnitTest.java
        SSLContextManagerUnitTest.java
        util
        EncodingUnitTest.java
  - org
    - apache
      - commons
        httpclient
        HttpMethodBaseUnitTest.java
    - parosproxy
      - paros
        CommandLineUnitTest.java
        common
        AbstractParamUnitTest.java
        core
        scanner
        AbstractPluginUnitTest.java
        KbUnitTest.java
        NameValuePairUnitTest.java
        PluginFactoryUnitTest.java
        PluginTestUtils.java
        UtilUnitTest.java
        VariantCookieUnitTest.java
        VariantHeaderUnitTest.java
        VariantODataUnitTest.java
        model
        FileCopierUnitTest.java
        network
        HttpBodyUnitTest.java
        HttpRequestHeaderUnitTest.java
        HttpResponseHeaderUnitTest.java
    - zaproxy
      - zap
        VersionUnitTest.java
        WithConfigsTest.java
        authentication
        AuthenticationMethodIndicatorsUnitTest.java
        UsernamePasswordAuthenticationCredentialsUnitTest.java
        control
        AddOnCollectionUnitTest.java
        AddOnUnitTest.java
        ZapReleaseComparitorUnitTest.java
        ZapReleaseUnitTest.java
        extension
        alert
        ExtensionAlertUnitTest.java
        api
        APIUnitTest.java
        ApiResponseConversionUtilsUnitTest.java
        OptionsParamApiUnitTest.java
        authorization
        BasicAuthorizationDetectionMethodUnitTest.java
        brk
        impl
        http
        HttpBreakpointManagementDaemonImplUnitTest.java
        dynssl
        SslCertificateUtilsUnitTest.java
        ext
        ExtensionParamUnitTest.java
        httppanel
        view
        hex
        HttpPanelHexModelUnitTest.java
        util
        HttpTextViewUtilsUnitTest.java
        lang
        LangImporterUnitTest.java
        pscan
        PluginPassiveScannerUnitTest.java
        ruleconfig
        RuleConfigParamUnitTest.java
        model
        ContextUnitTest.java
        SessionUtilsUnitTest.java
        StandardParameterParserUnitTest.java
        VulnerabilitiesLoaderUnitTest.java
        network
        HttpBodyTestUtils.java
        HttpResponseBodyUnitTest.java
        spider
        URLCanonicalizerUnitTest.java
        URLResolverRfc1808ExamplesUnitTest.java
        URLResolverUnitTest.java
        filters
        DefaultFetchFilterUnitTest.java
        HttpPrefixFetchFilterUnitTest.java
        parser
        SpiderHtmlFormParserUnitTest.java
        SpiderHtmlParserUnitTest.java
        SpiderParserTestUtils.java
        SpiderSitemapXMLParserUnitTest.java
        SpiderTextParserUnitTest.java
        users
        UserUnitTest.java
        UsersTableModelUnitTest.java
        utils
        ApiUtilsUnitTest.java
        BoyerMooreMatcherUnitTest.java
        ByteBuilderUnitTest.java
        HirshbergMatcherUnitTest.java
        LocaleUtilsUnitTest.java
        XMLStringUtilUnitTest.java
        view
        AbstractMultipleOptionsBaseTableModelUnitTest.java
        JCheckBoxTreeUnitTest.java
        LayoutHelperUnitTest.java
        ListModelTestUtils.java
        TableModelTestUtils.java
        widgets
        UsersListModelUnitTest.java

/*
 * Zed Attack Proxy (ZAP) and its related class files.
 * 
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0 
 *   
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 */
package org.zaproxy.zap.spider.parser;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;

import org.parosproxy.paros.network.HttpMessage;
import org.zaproxy.zap.spider.SpiderParam;
import org.zaproxy.zap.spider.URLCanonicalizer;

/**
 * The Class SpiderHtmlParser is used for parsing of HTML files, gathering resource urls from them.
 * <p>
 * URLs are gathered from the {@code href}/{@code src} attributes of A, AREA, FRAME, IFRAME, LINK, SCRIPT and IMG
 * elements, from META {@code refresh}/{@code location} declarations and, when {@link SpiderParam#isParseComments()}
 * is enabled, from the contents of HTML comments.
 * <p>
 * <strong>NOTE:</strong> Handling of HTML Forms is not done in this Parser. Instead see {@link SpiderHtmlFormParser}.
 */
public class SpiderHtmlParser extends SpiderParser {

	/** The Constant urlPattern defining the pattern for a meta url. */
	private static final Pattern urlPattern = Pattern.compile("url\\s*=\\s*([^;]+)", Pattern.CASE_INSENSITIVE);

	/**
	 * The pattern used to find URLs written as plain text inside comments: an optional {@code http:}/{@code https:}
	 * scheme followed by {@code //} and a run of characters that excludes control characters, quotes, whitespace,
	 * {@code #} and angle/round/square/curly brackets.
	 */
	private static final Pattern PLAIN_COMMENTS_URL_PATTERN = Pattern
			.compile("(?:http(?:s?):)?//[^\\x00-\\x1f\"'\\s<>#()\\[\\]{}]+", Pattern.CASE_INSENSITIVE);

	/**
	 * The elements whose attribute holds an URL, as {@code {element name, attribute name}} pairs, in the order in
	 * which they are processed.
	 */
	private static final String[][] URL_ATTRIBUTES = {
			{ HTMLElementName.A, "href" },
			{ HTMLElementName.AREA, "href" },
			{ HTMLElementName.FRAME, "src" },
			{ HTMLElementName.IFRAME, "src" },
			{ HTMLElementName.LINK, "href" },
			{ HTMLElementName.SCRIPT, "src" },
			{ HTMLElementName.IMG, "src" } };

	/** The params. */
	private final SpiderParam params;

	/**
	 * Instantiates a new spider html parser.
	 * 
	 * @param params the params
	 * @throws IllegalArgumentException if {@code params} is null.
	 */
	public SpiderHtmlParser(SpiderParam params) {
		super();
		if (params == null) {
			throw new IllegalArgumentException("Parameter params must not be null.");
		}
		this.params = params;
	}

	/**
	 * Parses the response of the given message for URLs, taking into account a BASE element (if present) when
	 * resolving them, and, if enabled in the params, also parses the HTML comments.
	 * 
	 * @param message the message whose response is parsed
	 * @param source the Jericho source of the response, if {@code null} it is created from the response body
	 * @param depth the depth
	 * @return always {@code false}
	 * @throws NullPointerException if {@code message} is null.
	 */
	@Override
	public boolean parseResource(HttpMessage message, Source source, int depth) {

		// Prepare the source, if not provided
		if (source == null) {
			source = new Source(message.getResponseBody().toString());
		}

		// Get the context (base url)
		String baseURL = message.getRequestHeader().getURI().toString();

		// Try to see if there's any BASE tag that could change the base URL
		Element base = source.getFirstElement(HTMLElementName.BASE);
		if (base != null) {
			if (log.isDebugEnabled()) {
				log.debug("Base tag was found in HTML: " + base.getDebugInfo());
			}
			String href = base.getAttributeValue("href");
			if (href != null && !href.isEmpty()) {
				// NOTE(review): getCanonicalURL is defined elsewhere; presumably it yields null when the href can
				// not be resolved - confirm. In that case keep the request URI as base, otherwise every relative
				// URL found afterwards would be resolved against a null base.
				String canonicalBase = URLCanonicalizer.getCanonicalURL(href, baseURL);
				if (canonicalBase != null) {
					baseURL = canonicalBase;
				} else if (log.isDebugEnabled()) {
					log.debug("Ignoring the href of the base tag, it could not be canonicalized: " + href);
				}
			}
		}

		// Parse the source
		parseSource(message, source, depth, baseURL);

		// Parse the comments
		if (params.isParseComments()) {
			parseComments(message, source, depth, baseURL);
		}

		return false;
	}

	/**
	 * Parses the contents of the HTML comments of the given source. Each comment is first parsed as HTML and, only if
	 * no URL was found that way, searched for URLs written as plain text.
	 * 
	 * @param message the message
	 * @param source the source
	 * @param depth the depth
	 * @param baseURL the base url
	 */
	private void parseComments(HttpMessage message, Source source, int depth, String baseURL) {
		List<StartTag> comments = source.getAllStartTags(StartTagType.COMMENT);
		for (StartTag comment : comments) {
			Source commentSource = new Source(comment.getTagContent());
			if (parseSource(message, commentSource, depth, baseURL)) {
				continue;
			}

			Matcher matcher = PLAIN_COMMENTS_URL_PATTERN.matcher(commentSource.toString());
			while (matcher.find()) {
				processURL(message, depth, matcher.group(), baseURL);
			}
		}
	}

	/**
	 * Parses the HTML Jericho source for the elements that contain references to other resources.
	 * 
	 * @param message the message
	 * @param source the source
	 * @param depth the depth
	 * @param baseURL the base url
	 * @return {@code true} if at least one URL was found, {@code false} otherwise.
	 */
	private boolean parseSource(HttpMessage message, Source source, int depth, String baseURL) {
		log.debug("Parsing an HTML message...");
		boolean resourcesfound = false;

		// Process the elements that have the URL directly in an attribute. The non short-circuit operator is
		// required, all the elements must be processed even after the first URL was found.
		for (String[] urlAttribute : URL_ATTRIBUTES) {
			resourcesfound |= processElements(message, source, depth, baseURL, urlAttribute[0], urlAttribute[1]);
		}

		// Process META elements
		resourcesfound |= processMetaElements(message, source, depth, baseURL);

		return resourcesfound;
	}

	/**
	 * Processes the given attribute of all the elements with the given name, for URLs.
	 * 
	 * @param message the message
	 * @param source the source
	 * @param depth the depth
	 * @param baseURL the base url
	 * @param elementName the name of the elements to process
	 * @param attributeName the name of the attribute that holds the URL
	 * @return {@code true} if at least one URL was processed, {@code false} otherwise.
	 */
	private boolean processElements(HttpMessage message, Source source, int depth, String baseURL,
			String elementName, String attributeName) {
		boolean resourcesfound = false;
		for (Element el : source.getAllElements(elementName)) {
			resourcesfound |= processAttributeElement(message, depth, baseURL, el, attributeName);
		}
		return resourcesfound;
	}

	/**
	 * Processes the META elements that declare a {@code refresh} or {@code location} through the {@code http-equiv}
	 * attribute, extracting the URL from the {@code content} attribute.
	 * 
	 * @param message the message
	 * @param source the source
	 * @param depth the depth
	 * @param baseURL the base url
	 * @return {@code true} if at least one URL was processed, {@code false} otherwise.
	 */
	private boolean processMetaElements(HttpMessage message, Source source, int depth, String baseURL) {
		boolean resourcesfound = false;
		for (Element el : source.getAllElements(HTMLElementName.META)) {
			// If we have http-equiv attribute, then urls can be found.
			String equiv = el.getAttributeValue("http-equiv");
			String content = el.getAttributeValue("content");
			if (equiv == null || content == null) {
				continue;
			}

			// For the following cases:
			// http-equiv="refresh" content="0;URL=http://foo.bar/..."
			// http-equiv="location" content="url=http://foo.bar/..."
			if (equiv.equalsIgnoreCase("refresh") || equiv.equalsIgnoreCase("location")) {
				Matcher matcher = urlPattern.matcher(content);
				if (matcher.find()) {
					processURL(message, depth, matcher.group(1), baseURL);
					resourcesfound = true;
				}
			}
		}
		return resourcesfound;
	}

	/**
	 * Processes the attribute with the given name of a Jericho element, for an URL. If the attribute has a value,
	 * it is handed to {@code processURL}.
	 * 
	 * @param message the message
	 * @param depth the depth
	 * @param baseURL the base url
	 * @param element the element
	 * @param attributeName the attribute name
	 * @return {@code true} if a URL was processed, {@code false} otherwise.
	 */
	private boolean processAttributeElement(HttpMessage message, int depth, String baseURL, Element element,
			String attributeName) {
		// The URL as written in the attribute (can be relative or absolute)
		String localURL = element.getAttributeValue(attributeName);
		if (localURL == null) {
			return false;
		}

		processURL(message, depth, localURL, baseURL);
		return true;
	}

	/**
	 * Tells whether the given message can be parsed by this parser, which is the case for HTML responses that were
	 * not already consumed.
	 * 
	 * @param message the message
	 * @param path the path, not used by this parser
	 * @param wasAlreadyConsumed whether the message was already consumed
	 * @return {@code true} if the response is HTML and was not already consumed, {@code false} otherwise.
	 * @throws NullPointerException if {@code message} is null.
	 */
	@Override
	public boolean canParseResource(HttpMessage message, String path, boolean wasAlreadyConsumed) {
		// Fallback parser - if it's a HTML message which has not already been processed
		return !wasAlreadyConsumed && message.getResponseHeader().isHtml();
	}

}