/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.jmeter.protocol.http.parser;
import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.jmeter.protocol.http.util.ConversionUtils;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import org.xml.sax.SAXException;
/**
 * HtmlParser implementation using JTidy.
 * <p>
 * The HTML bytes are parsed into a DOM tree by JTidy; the tree is then walked
 * recursively and the URLs of embedded resources found on the way are added
 * to the supplied {@link URLCollection}.
 */
class JTidyHTMLParser extends HTMLParser {
    private static final Logger log = LoggerFactory.getLogger(JTidyHTMLParser.class);

    protected JTidyHTMLParser() {
        super();
    }

    /**
     * {@inheritDoc}
     * <p>
     * This implementation does not use {@code userAgent}. The returned
     * iterator is the one provided by {@code urls} after the scan.
     *
     * @throws HTMLParseException
     *             if building the DOM reports a {@link SAXException}, or if a
     *             {@code base} element carries an href that cannot be turned
     *             into a URL
     */
    @Override
    public Iterator<URL> getEmbeddedResourceURLs(String userAgent, byte[] html, URL baseUrl, URLCollection urls, String encoding) throws HTMLParseException {
        Node dom;
        try {
            dom = getDOM(html, encoding);
        } catch (SAXException se) {
            throw new HTMLParseException(se);
        }
        // scanNodes dispatches on the node type itself, so no cast to Document is needed here.
        scanNodes(dom, urls, baseUrl);
        return urls.iterator();
    }

    /**
     * Scan nodes recursively, looking for embedded resources.
     *
     * @param node
     *            initial node; {@code null} is tolerated and leaves everything untouched
     * @param urls
     *            container for URLs
     * @param baseUrl
     *            used to create absolute URLs
     *
     * @return the base URL in effect after scanning {@code node}; it differs
     *         from {@code baseUrl} only when a {@code base} element with an
     *         href was encountered
     * @throws HTMLParseException
     *             if the href of a {@code base} element is malformed
     */
    private URL scanNodes(Node node, URLCollection urls, URL baseUrl) throws HTMLParseException {
        if (node == null) {
            return baseUrl;
        }
        switch (node.getNodeType()) {
        case Node.DOCUMENT_NODE:
            // Propagate the base URL computed below the root element so the
            // returned value honours the documented contract.
            return scanNodes(((Document) node).getDocumentElement(), urls, baseUrl);
        case Node.ELEMENT_NODE:
            return scanElement(node, urls, baseUrl);
        default:
            // text, comments, etc. carry no embedded resources
            return baseUrl;
        }
    }

    /**
     * Handles a single element: records the resource URL(s) it references and,
     * for elements that are not one of the specifically handled tags, descends
     * into its children. Tag names are compared case-insensitively.
     *
     * @param node
     *            an element node
     * @param urls
     *            container for URLs
     * @param baseUrl
     *            used to create absolute URLs
     * @return the base URL in effect after this element
     * @throws HTMLParseException
     *             if the href of a {@code base} element is malformed
     */
    private URL scanElement(Node node, URLCollection urls, URL baseUrl) throws HTMLParseException {
        String name = node.getNodeName();
        NamedNodeMap attrs = node.getAttributes();

        if (name.equalsIgnoreCase(TAG_BASE)) {
            String href = getValue(attrs, ATT_HREF);
            if (href == null) {
                return baseUrl;
            }
            try {
                // Everything scanned after this element resolves against the new base.
                return ConversionUtils.makeRelativeURL(baseUrl, href);
            } catch (MalformedURLException e) {
                throw new HTMLParseException(e);
            }
        }

        // For the tags below the attribute value is handed to addURL as-is;
        // it is null when the attribute is absent.
        if (name.equalsIgnoreCase(TAG_IMAGE) || name.equalsIgnoreCase(TAG_EMBED)) {
            urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
            return baseUrl;
        }

        if (name.equalsIgnoreCase(TAG_APPLET)) {
            urls.addURL(getValue(attrs, "code"), baseUrl);
            return baseUrl;
        }

        if (name.equalsIgnoreCase(TAG_OBJECT)) {
            // An object element may reference both a codebase and a data resource.
            String codebase = getValue(attrs, "codebase");
            if (!StringUtils.isEmpty(codebase)) {
                urls.addURL(codebase, baseUrl);
            }
            String data = getValue(attrs, "data");
            if (!StringUtils.isEmpty(data)) {
                urls.addURL(data, baseUrl);
            }
            return baseUrl;
        }

        if (name.equalsIgnoreCase(TAG_INPUT)) {
            // Only inputs whose type matches ATT_IS_IMAGE reference a resource.
            String src = getValue(attrs, ATT_SRC);
            String type = getValue(attrs, ATT_TYPE);
            if (src != null && ATT_IS_IMAGE.equalsIgnoreCase(type)) {
                urls.addURL(src, baseUrl);
            }
            return baseUrl;
        }

        // Link elements with any other rel value fall through to the generic handling below.
        if (TAG_LINK.equalsIgnoreCase(name) && STYLESHEET.equalsIgnoreCase(getValue(attrs, ATT_REL))) {
            urls.addURL(getValue(attrs, ATT_HREF), baseUrl);
            return baseUrl;
        }

        if (name.equalsIgnoreCase(TAG_SCRIPT)
                || name.equalsIgnoreCase(TAG_FRAME)
                || name.equalsIgnoreCase(TAG_IFRAME)) {
            urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
            return baseUrl;
        }

        // Generic handling: a background attribute is honoured on any remaining element.
        String background = getValue(attrs, ATT_BACKGROUND);
        if (background != null) {
            urls.addURL(background, baseUrl);
        }

        if (name.equalsIgnoreCase(TAG_BGSOUND)) {
            urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
            return baseUrl;
        }

        String style = getValue(attrs, ATT_STYLE);
        if (style != null) {
            HtmlParsingUtils.extractStyleURLs(baseUrl, urls, style);
        }

        URL currentBase = baseUrl;
        NodeList children = node.getChildNodes();
        if (children != null) {
            int len = children.getLength();
            for (int i = 0; i < len; i++) {
                // A base element among the children changes the base for the siblings that follow.
                currentBase = scanNodes(children.item(i), urls, currentBase);
            }
        }
        return currentBase;
    }

    /**
     * Helper method to get an attribute value, if it exists.
     *
     * @param attrs
     *            list of attributes
     * @param attname
     *            attribute name
     * @return the attribute value, or {@code null} if there is no attribute
     *         with that name
     */
    private String getValue(NamedNodeMap attrs, String attname) {
        Node attribute = attrs.getNamedItem(attname);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Returns <code>tidy</code> as HTML parser.
     *
     * @param encoding
     *            name of the input encoding to configure on the parser
     * @return a <code>tidy</code> HTML parser, set to quiet mode with warnings
     *         disabled and UTF-8 as output encoding
     */
    private static Tidy getTidyParser(String encoding) {
        log.debug("Start : getParser");
        Tidy tidy = new Tidy();
        tidy.setInputEncoding(encoding);
        tidy.setOutputEncoding(StandardCharsets.UTF_8.name());
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        // Parameterized logging: the argument is only formatted when debug is enabled.
        log.debug("getParser : tidy parser created - {}", tidy);
        log.debug("End : getParser");
        return tidy;
    }

    /**
     * Returns a node representing the whole document for the given HTML.
     *
     * @param text
     *            an HTML document (as a byte array)
     * @param encoding
     *            name of the encoding of {@code text}
     * @return a node representing the whole document
     *
     * @throws SAXException
     *             declared as part of this helper's contract; the caller
     *             wraps it in an {@link HTMLParseException}
     */
    private static Node getDOM(byte[] text, String encoding) throws SAXException {
        log.debug("Start : getDOM");
        // Only the DOM is wanted, so no output stream is passed to the parser.
        Node node = getTidyParser(encoding).parseDOM(new ByteArrayInputStream(text), null);
        log.debug("node : {}", node);
        log.debug("End : getDOM");
        return node;
    }
}