/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.jmeter.protocol.http.parser; import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; import org.apache.commons.lang3.StringUtils; import org.apache.jmeter.protocol.http.util.ConversionUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; /** * Parser based on JSOUP * @since 2.10 * TODO Factor out common code between {@link LagartoBasedHtmlParser} and this one (adapter pattern) */ public class JsoupBasedHtmlParser extends HTMLParser { /* * A dummy class to pass the pointer of URL. */ private static class URLPointer { private URLPointer(URL newUrl) { url = newUrl; } private URL url; } private static final class JMeterNodeVisitor implements NodeVisitor { private URLCollection urls; private URLPointer baseUrl; /** * @param baseUrl base url to extract possibly missing information from urls found in <code>urls</code> * @param urls collection of urls to consider */ public JMeterNodeVisitor(final URLPointer baseUrl, URLCollection urls) { this.urls = urls; this.baseUrl = baseUrl; } private void extractAttribute(Element tag, String attributeName) { String url = tag.attr(attributeName); String normalizedUrl = normalizeUrlValue(url); if(normalizedUrl != null) { urls.addURL(normalizedUrl, baseUrl.url); } } @Override public void head(Node node, int depth) { if (!(node instanceof Element)) { return; } Element tag = (Element) node; String tagName = tag.tagName().toLowerCase(); if (tagName.equals(TAG_BODY)) { extractAttribute(tag, ATT_BACKGROUND); } else if (tagName.equals(TAG_SCRIPT)) { extractAttribute(tag, ATT_SRC); } else if (tagName.equals(TAG_BASE)) { String baseref = tag.attr(ATT_HREF); try { if (!StringUtils.isEmpty(baseref))// Bugzilla 30713 { baseUrl.url = ConversionUtils.makeRelativeURL(baseUrl.url, baseref); } } catch (MalformedURLException e1) { throw new RuntimeException(e1); } } else if (tagName.equals(TAG_IMAGE)) { extractAttribute(tag, ATT_SRC); } else if (tagName.equals(TAG_APPLET)) { extractAttribute(tag, ATT_CODE); } else if (tagName.equals(TAG_OBJECT)) { extractAttribute(tag, ATT_CODEBASE); extractAttribute(tag, ATT_DATA); } else if (tagName.equals(TAG_INPUT)) { // we check the input tag type for image if (ATT_IS_IMAGE.equalsIgnoreCase(tag.attr(ATT_TYPE))) { // then we need to download the binary extractAttribute(tag, ATT_SRC); } // Bug 51750 } else if (tagName.equals(TAG_FRAME) || tagName.equals(TAG_IFRAME)) { extractAttribute(tag, ATT_SRC); } else if (tagName.equals(TAG_EMBED)) { extractAttribute(tag, ATT_SRC); } else if (tagName.equals(TAG_BGSOUND)){ extractAttribute(tag, ATT_SRC); } else if (tagName.equals(TAG_LINK)) { // Putting the string first means it works even if the attribute is null if (STYLESHEET.equalsIgnoreCase(tag.attr(ATT_REL))) { extractAttribute(tag, ATT_HREF); } } else { extractAttribute(tag, ATT_BACKGROUND); } // Now look for URLs in the STYLE attribute String styleTagStr = tag.attr(ATT_STYLE); if(styleTagStr != null) { HtmlParsingUtils.extractStyleURLs(baseUrl.url, urls, styleTagStr); } } @Override public void tail(Node arg0, int arg1) { // Noop } } @Override public Iterator<URL> getEmbeddedResourceURLs(String userAgent, byte[] html, URL baseUrl, URLCollection coll, String encoding) throws HTMLParseException { try { // TODO Handle conditional comments for IE String contents = new String(html,encoding); Document doc = Jsoup.parse(contents); JMeterNodeVisitor nodeVisitor = new JMeterNodeVisitor(new URLPointer(baseUrl), coll); new NodeTraversor(nodeVisitor).traverse(doc); return coll.iterator(); } catch (Exception e) { throw new HTMLParseException(e); } } }