/* * Zed Attack Proxy (ZAP) and its related class files. * * ZAP is an HTTP/HTTPS proxy for assessing web application security. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.zaproxy.zap.spider.parser; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.Source; import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTagType; import org.parosproxy.paros.network.HttpMessage; import org.zaproxy.zap.spider.SpiderParam; import org.zaproxy.zap.spider.URLCanonicalizer; /** * The Class SpiderHtmlParser is used for parsing of HTML files, gathering resource urls from them. * <p> * <strong>NOTE:</strong> Handling of HTML Forms is not done in this Parser. Instead see {@link SpiderHtmlFormParser}. */ public class SpiderHtmlParser extends SpiderParser { /** The Constant urlPattern defining the pattern for a meta url. */ private static final Pattern urlPattern = Pattern.compile("url\\s*=\\s*([^;]+)", Pattern.CASE_INSENSITIVE); private static final Pattern PLAIN_COMMENTS_URL_PATTERN = Pattern .compile("(?:http(?:s?):)?//[^\\x00-\\x1f\"'\\s<>#()\\[\\]{}]+", Pattern.CASE_INSENSITIVE); /** The params. */ private SpiderParam params; /** * Instantiates a new spider html parser. * * @param params the params * @throws IllegalArgumentException if {@code params} is null. */ public SpiderHtmlParser(SpiderParam params) { super(); if (params == null) { throw new IllegalArgumentException("Parameter params must not be null."); } this.params = params; } /** * @throws NullPointerException if {@code message} is null. */ @Override public boolean parseResource(HttpMessage message, Source source, int depth) { // Prepare the source, if not provided if (source == null) { source = new Source(message.getResponseBody().toString()); } // Get the context (base url) String baseURL = message.getRequestHeader().getURI().toString(); // Try to see if there's any BASE tag that could change the base URL Element base = source.getFirstElement(HTMLElementName.BASE); if (base != null) { if (log.isDebugEnabled()) { log.debug("Base tag was found in HTML: " + base.getDebugInfo()); } String href = base.getAttributeValue("href"); if (href != null && !href.isEmpty()) { baseURL = URLCanonicalizer.getCanonicalURL(href, baseURL); } } // Parse the source parseSource(message, source, depth, baseURL); // Parse the comments if (params.isParseComments()) { List<StartTag> comments = source.getAllStartTags(StartTagType.COMMENT); for (StartTag comment : comments) { Source s = new Source(comment.getTagContent()); if (!parseSource(message, s, depth, baseURL)) { Matcher matcher = PLAIN_COMMENTS_URL_PATTERN.matcher(s.toString()); while (matcher.find()) { processURL(message, depth, matcher.group(), baseURL); } } } } return false; } /** * Parses the HTML Jericho source for the elements that contain references to other resources. * * @param message the message * @param source the source * @param depth the depth * @param baseURL the base url * @return {@code true} if at least one URL was found, {@code false} otherwise. */ private boolean parseSource(HttpMessage message, Source source, int depth, String baseURL) { log.debug("Parsing an HTML message..."); boolean resourcesfound = false; // Process A elements List<Element> elements = source.getAllElements(HTMLElementName.A); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "href"); } // Process AREA elements elements = source.getAllElements(HTMLElementName.AREA); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "href"); } // Process Frame Elements elements = source.getAllElements(HTMLElementName.FRAME); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "src"); } // Process IFrame Elements elements = source.getAllElements(HTMLElementName.IFRAME); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "src"); } // Process Link elements elements = source.getAllElements(HTMLElementName.LINK); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "href"); } // Process Script elements with src elements = source.getAllElements(HTMLElementName.SCRIPT); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "src"); } // Process Img elements elements = source.getAllElements(HTMLElementName.IMG); for (Element el : elements) { resourcesfound |= processAttributeElement(message, depth, baseURL, el, "src"); } // Process META elements elements = source.getAllElements(HTMLElementName.META); for (Element el : elements) { // If we have http-equiv attribute, then urls can be found. String equiv = el.getAttributeValue("http-equiv"); String content = el.getAttributeValue("content"); if (equiv != null && content != null) { // For the following cases: // http-equiv="refresh" content="0;URL=http://foo.bar/..." // http-equiv="location" content="url=http://foo.bar/..." if (equiv.equalsIgnoreCase("refresh") || equiv.equalsIgnoreCase("location")) { Matcher matcher = urlPattern.matcher(content); if (matcher.find()) { String url = matcher.group(1); processURL(message, depth, url, baseURL); resourcesfound = true; } } } } return resourcesfound; } /** * Processes the attribute with the given name of a Jericho element, for an URL. If an URL is * found, notifies the listeners. * * @param message the message * @param depth the depth * @param baseURL the base url * @param element the element * @param attributeName the attribute name * @return {@code true} if a URL was processed, {@code false} otherwise. */ private boolean processAttributeElement(HttpMessage message, int depth, String baseURL, Element element, String attributeName) { // The URL as written in the attribute (can be relative or absolute) String localURL = element.getAttributeValue(attributeName); if (localURL == null) { return false; } processURL(message, depth, localURL, baseURL); return true; } /** * @throws NullPointerException if {@code message} is null. */ @Override public boolean canParseResource(HttpMessage message, String path, boolean wasAlreadyConsumed) { // Fallback parser - if it's a HTML message which has not already been processed return !wasAlreadyConsumed && message.getResponseHeader().isHtml(); } }