/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.jmeter.protocol.http.parser;
import java.net.URL;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
/**
 * {@link HTMLParser} subclasses can parse HTML content to obtain URLs.
 * <p>
 * Besides the abstract parsing entry point, this class offers helpers shared
 * by implementations: Internet Explorer version detection from a User-Agent
 * string (used to decide whether conditional comments apply) and URL
 * attribute value normalization.
 */
public abstract class HTMLParser extends BaseParser {

    private static final Logger log = LoggerFactory.getLogger(HTMLParser.class);

    // Attribute names
    protected static final String ATT_BACKGROUND    = "background";// $NON-NLS-1$
    protected static final String ATT_CODE          = "code";// $NON-NLS-1$
    protected static final String ATT_CODEBASE      = "codebase";// $NON-NLS-1$
    protected static final String ATT_DATA          = "data";// $NON-NLS-1$
    protected static final String ATT_HREF          = "href";// $NON-NLS-1$
    protected static final String ATT_REL           = "rel";// $NON-NLS-1$
    protected static final String ATT_SRC           = "src";// $NON-NLS-1$
    protected static final String ATT_STYLE         = "style";// $NON-NLS-1$
    protected static final String ATT_TYPE          = "type";// $NON-NLS-1$
    protected static final String ATT_IS_IMAGE      = "image";// $NON-NLS-1$

    // Tag names
    protected static final String TAG_APPLET        = "applet";// $NON-NLS-1$
    protected static final String TAG_BASE          = "base";// $NON-NLS-1$
    protected static final String TAG_BGSOUND       = "bgsound";// $NON-NLS-1$
    protected static final String TAG_BODY          = "body";// $NON-NLS-1$
    protected static final String TAG_EMBED         = "embed";// $NON-NLS-1$
    protected static final String TAG_FRAME         = "frame";// $NON-NLS-1$
    protected static final String TAG_IFRAME        = "iframe";// $NON-NLS-1$
    protected static final String TAG_IMAGE         = "img";// $NON-NLS-1$
    protected static final String TAG_INPUT         = "input";// $NON-NLS-1$
    protected static final String TAG_LINK          = "link";// $NON-NLS-1$
    protected static final String TAG_OBJECT        = "object";// $NON-NLS-1$
    protected static final String TAG_SCRIPT        = "script";// $NON-NLS-1$

    protected static final String STYLESHEET        = "stylesheet";// $NON-NLS-1$

    /**
     * Regular expression capturing the {@code major.minor} version that
     * follows {@code "MSIE "} in a User-Agent string. The dot is escaped so
     * that only a literal decimal point is accepted between the digit runs;
     * the captured group is therefore always parseable as a float.
     */
    protected static final String IE_UA             = "MSIE ([0-9]+\\.[0-9]+)";// $NON-NLS-1$
    protected static final Pattern IE_UA_PATTERN    = Pattern.compile(IE_UA);

    /** First IE version for which conditional comments are no longer enabled. */
    private static final float IE_10                = 10.0f;

    public static final String PARSER_CLASSNAME = "htmlParser.className"; // $NON-NLS-1$

    public static final String DEFAULT_PARSER =
        "org.apache.jmeter.protocol.http.parser.LagartoBasedHtmlParser"; // $NON-NLS-1$

    /** Matches runs of line-feed, carriage-return, backspace and form-feed characters. */
    private static final Pattern NORMALIZE_URL_PATTERN = Pattern.compile("[\n\r\b\f]+"); //$NON-NLS-1$

    /**
     * Protected constructor to prevent instantiation except from within
     * subclasses.
     */
    protected HTMLParser() {
    }

    /**
     * Get the URLs for all the resources that a browser would automatically
     * download following the download of the HTML content, that is: images,
     * stylesheets, javascript files, applets, etc...
     * <p>
     * URLs should not appear twice in the returned iterator.
     * <p>
     * Malformed URLs can be reported to the caller by having the Iterator
     * return the corresponding URL String. Overall problems parsing the html
     * should be reported by throwing an HTMLParseException.
     *
     * @param userAgent
     *            User Agent
     * @param html
     *            HTML code
     * @param baseUrl
     *            Base URL from which the HTML code was obtained
     * @param encoding Charset
     * @return an Iterator for the resource URLs
     * @throws HTMLParseException when parsing the <code>html</code> fails
     */
    @Override
    public Iterator<URL> getEmbeddedResourceURLs(
            String userAgent, byte[] html, URL baseUrl, String encoding) throws HTMLParseException {
        // The Set is used to ignore duplicated binary files.
        // Using a LinkedHashSet to avoid unnecessary overhead in iterating
        // the elements in the set later on. As a side-effect, this will keep
        // them roughly in order, which should be a better model of browser
        // behaviour.
        //
        // Note on why the set holds URLString rather than java.net.URL:
        // obtaining the hashCode of a java.net.URL implies a domain-name
        // resolution process, which can cause significant delays, even more
        // so if the domain name is not resolvable. Using a List and removing
        // duplicates via scan would not help either, since URL.equals
        // requires name resolution too. The URLString and URLCollection
        // classes address this problem.
        Collection<URLString> col = new LinkedHashSet<>();
        return getEmbeddedResourceURLs(userAgent, html, baseUrl, new URLCollection(col), encoding);
    }

    /**
     * Get the URLs for all the resources that a browser would automatically
     * download following the download of the HTML content, that is: images,
     * stylesheets, javascript files, applets, etc...
     * <p>
     * All URLs should be added to the Collection.
     * <p>
     * Malformed URLs can be reported to the caller by having the Iterator
     * return the corresponding URL String. Overall problems parsing the html
     * should be reported by throwing an HTMLParseException.
     * <p>
     * N.B. The Iterator returns URLs, but the Collection will contain objects
     * of class URLString.
     *
     * @param userAgent
     *            User Agent
     * @param html
     *            HTML code
     * @param baseUrl
     *            Base URL from which the HTML code was obtained
     * @param coll
     *            URLCollection
     * @param encoding Charset
     * @return an Iterator for the resource URLs
     * @throws HTMLParseException when parsing the <code>html</code> fails
     */
    public abstract Iterator<URL> getEmbeddedResourceURLs(
            String userAgent, byte[] html, URL baseUrl, URLCollection coll, String encoding)
            throws HTMLParseException;

    /**
     * Get the URLs for all the resources that a browser would automatically
     * download following the download of the HTML content, that is: images,
     * stylesheets, javascript files, applets, etc...
     * <p>
     * This overload wraps the supplied collection in a {@link URLCollection}
     * and delegates to the abstract variant.
     * <p>
     * N.B. The Iterator returns URLs, but the Collection will contain objects
     * of class URLString.
     *
     * @param userAgent
     *            User Agent
     * @param html
     *            HTML code
     * @param baseUrl
     *            Base URL from which the HTML code was obtained
     * @param coll
     *            Collection - will contain URLString objects, not URLs
     * @param encoding Charset
     * @return an Iterator for the resource URLs
     * @throws HTMLParseException when parsing the <code>html</code> fails
     */
    public Iterator<URL> getEmbeddedResourceURLs(
            String userAgent, byte[] html, URL baseUrl, Collection<URLString> coll, String encoding)
            throws HTMLParseException {
        return getEmbeddedResourceURLs(userAgent, html, baseUrl, new URLCollection(coll), encoding);
    }

    /**
     * Tells whether IE conditional comments should be honoured for the given
     * IE version.
     *
     * @param ieVersion Float IE version, may be {@code null}
     * @return {@code true} if the version is non-null and lower than IE 10
     */
    protected final boolean isEnableConditionalComments(Float ieVersion) {
        // Conditional comment have been dropped in IE10
        // http://msdn.microsoft.com/en-us/library/ie/hh801214%28v=vs.85%29.aspx
        return ieVersion != null && ieVersion.floatValue() < IE_10;
    }

    /**
     * Extracts the Internet Explorer version from a User-Agent string.
     *
     * @param userAgent User Agent, may be {@code null} or empty
     * @return the {@code major.minor} version found after {@code "MSIE "},
     *         or {@code null} if the user agent is null, empty, or does not
     *         contain such a version
     */
    protected Float extractIEVersion(String userAgent) {
        if (StringUtils.isEmpty(userAgent)) {
            log.info("userAgent is null or empty");
            return null;
        }
        Matcher matcher = IE_UA_PATTERN.matcher(userAgent);
        if (!matcher.find()) {
            return null;
        }
        // IE_UA declares exactly one capturing group and it only admits
        // "digits.digits", so group(1) is always present and parseable.
        return Float.valueOf(matcher.group(1));
    }

    /**
     * Normalizes URL as browsers do: leading and trailing whitespace is
     * trimmed, then every line-feed, carriage-return, backspace and form-feed
     * character is removed.
     *
     * @param url {@link CharSequence}, may be {@code null}
     * @return normalized url, or {@code null} if {@code url} is null, empty,
     *         or nothing is left after normalization
     */
    protected static String normalizeUrlValue(CharSequence url) {
        if (!StringUtils.isEmpty(url)) {
            String trimmed = NORMALIZE_URL_PATTERN.matcher(url.toString().trim()).replaceAll("");
            if (!trimmed.isEmpty()) {
                return trimmed;
            }
        }
        return null;
    }
}