HTMLParser.java example

Explorer

jmeter-master
- jmeter-trunk
  - src
  - test
    - src
      - org
        apache
        commons
        cli
        avalon
        ClutilTestCase.java
        jmeter
        JMeterVersionTest.java
        assertions
        DurationAssertionTest.java
        MD5HexAssertionTest.java
        ResponseAssertionTest.java
        SMIMEAssertionTest.java
        SizeAssertionTest.java
        XMLSchemaAssertionTest.java
        XPathAssertionTest.java
        XmlAssertionTest.java
        config
        TestCVSDataSet.java
        TestRandomVariableConfig.java
        gui
        TestArgumentsPanel.java
        control
        TestGenericController.java
        TestIfController.java
        TestInterleaveControl.java
        TestLoopController.java
        TestOnceOnlyController.java
        TestRandomController.java
        TestRandomOrderController.java
        TestRunTime.java
        TestSwitchController.java
        TestThroughputController.java
        TestTransactionController.java
        TestWhileController.java
        engine
        DistributedRunnerTest.java
        LocalHostTest.java
        TestTreeCloner.java
        util
        PackageTest.java
        TestValueReplacer.java
        extractor
        TestHtmlExtractorJSoup.java
        TestHtmlExtractorJodd.java
        TestJSONPostProcessor.java
        TestRegexExtractor.java
        TestXPathExtractor.java
        functions
        CSVReadFunctionTest.java
        ComponentReferenceFunctionTest.java
        EvalFunctionTest.java
        FunctionTestHelper.java
        PackageTest.java
        RandomFunctionTest.java
        SplitFunctionTest.java
        StringFromFileFunctionTest.java
        SumFunctionTest.java
        TestEscapeOroRegexpChars.java
        TestFileRowColContainer.java
        TestFileToString.java
        TestGroovyFunction.java
        TestJavascriptFunction.java
        TestJavascriptFunctionWithRhino.java
        TestJexl2Function.java
        TestJexl3Function.java
        TestMachineIPName.java
        TestRandomFromMultipleVars.java
        TestRegexFunction.java
        TestSamplerNameFunction.java
        TestSetProperty.java
        TestSimpleFunctions.java
        TestTimeFunction.java
        TestTimeShiftFunction.java
        TestUrlEncodeDecode.java
        VariableTest.java
        gui
        action
        TestLoad.java
        TestSave.java
        logging
        TestGuiLogEventAppender.java
        util
        JSyntaxTextAreaTest.java
        TestMenuFactory.java
        TristateCheckBoxTest.java
        junit
        JMeterTest.java
        JMeterTestCase.java
        JMeterTestCaseJUnit.java
        categories
        ExcludeCategoryFilter.java
        NeedGuiTests.java
        stubs
        TestSampler.java
        listeners
        TestResultAction.java
        protocol
        http
        config
        MultipartUrlConfigTest.java
        UrlConfigTest.java
        control
        TestAuthManager.java
        TestAuthorization.java
        TestCacheManagerBase.java
        TestCacheManagerHC4.java
        TestCacheManagerUrlConnection.java
        TestCacheManagerUrlConnectionBase.java
        TestDNSCacheManager.java
        TestHC4CookieManager.java
        TestHTTPMirrorThread.java
        gui
        TestHttpTestSampleGui.java
        modifier
        TestAnchorModifier.java
        TestURLRewritingModifier.java
        parser
        NotReusableParser.java
        ReusableParser.java
        TestBaseParser.java
        TestBug60842HtmlParser.java
        TestCssParser.java
        TestHTMLParser.java
        TestHtmlParsingUtils.java
        proxy
        NonGuiProxySample.java
        TestHttpRequestHdr.java
        TestProxyControl.java
        sampler
        HTTPNullSampler.java
        HTTPSampler3.java
        NullURLConnection.java
        PackageTest.java
        PostWriterTest.java
        PutWriterTest.java
        TestHTTPSamplers.java
        TestHTTPSamplersAgainstHttpMirrorServer.java
        TestHttpWebdav.java
        util
        TestHTTPArgument.java
        TestHTTPFileArg.java
        TestHTTPFileArgs.java
        TestHTTPUtils.java
        accesslog
        TestLogFilter.java
        TestSessionFilter.java
        TestTCLogParser.java
        visualizers
        RequestViewHTTPTest.java
        jms
        sampler
        PublisherSamplerTest.java
        render
        BinaryMessageRendererTest.java
        MessageRendererTest.java
        ObjectMessageRendererTest.java
        Person.java
        TextMessageRendererTest.java
        ldap
        config
        gui
        PackageTest.java
        tcp
        sampler
        BinaryTCPClientImplTest.java
        LengthPrefixedBinaryTCPClientImplTest.java
        TCPClientDecoratorTest.java
        report
        core
        CsvSampleReaderTest.java
        SampleMetadataTest.java
        TestCsvSampleWriter.java
        dashboard
        ApdexPerTransactionTest.java
        processor
        ErrorsSummaryConsumerTest.java
        FieldSampleComparatorTest.java
        reporters
        TestResultSaver.java
        resources
        PackageTest.java
        TestPropertiesFiles.java
        samplers
        NullSampler.java
        TestSampleResult.java
        TestSampleSaveConfiguration.java
        save
        TestCSVSaveService.java
        TestSaveService.java
        services
        TestFileServer.java
        test
        ResourceLocator.java
        testbeans
        gui
        PackageTest.java
        TestBooleanPropertyEditor.java
        TestComboStringEditor.java
        TestFieldStringEditor.java
        testelement
        PackageTest.java
        TestElementTest.java
        TestHeaderManager.java
        TestNumberProperty.java
        property
        AbstractPropertyTest.java
        PackageTest.java
        threads
        JMeterContextServiceHelper.java
        TestJMeterContextService.java
        TestTestCompiler.java
        timers
        ConstantThroughputTimerTest.java
        util
        JSR223TestElementTest.java
        LogRecord.java
        LogRecordingDelegatingLogger.java
        PackageTest.java
        StringUtilitiesTest.java
        TestJMeterUtils.java
        XPathUtilTest.java
        visualizers
        GenerateTreeGui.java
        TestRenderAsJson.java
        TestSampleCompareTo.java
        TestSamplingStatCalculator.java
        jorphan
        TestFunctorUsers.java
        TestXMLBuffer.java
        collections
        PackageTest.java
        exec
        TestKeyToolUtils.java
        gui
        ObjectTableModelTest.java
        ObjectTableSorterTest.java
        TableModelEventBacker.java
        math
        TestStatCalculator.java
        reflect
        TestClassFinder.java
        TestClassTools.java
        TestFunctor.java
        test
        AllTests.java
        util
        TestConverter.java
        TestJorphanUtils.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.apache.jmeter.protocol.http.parser;


import java.net.URL;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

/**
 * {@link HTMLParser} subclasses can parse HTML content to obtain URLs.
 * <p>
 * Besides the parsing entry points, this base class provides shared constants
 * for HTML tag/attribute names and helpers for detecting the Internet Explorer
 * version from a User-Agent string and for normalizing URL attribute values.
 */
public abstract class HTMLParser extends BaseParser {

    private static final Logger log = LoggerFactory.getLogger(HTMLParser.class);

    // HTML attribute names
    protected static final String ATT_BACKGROUND    = "background";// $NON-NLS-1$
    protected static final String ATT_CODE          = "code";// $NON-NLS-1$
    protected static final String ATT_CODEBASE      = "codebase";// $NON-NLS-1$
    protected static final String ATT_DATA          = "data";// $NON-NLS-1$
    protected static final String ATT_HREF          = "href";// $NON-NLS-1$
    protected static final String ATT_REL           = "rel";// $NON-NLS-1$
    protected static final String ATT_SRC           = "src";// $NON-NLS-1$
    protected static final String ATT_STYLE         = "style";// $NON-NLS-1$
    protected static final String ATT_TYPE          = "type";// $NON-NLS-1$
    protected static final String ATT_IS_IMAGE      = "image";// $NON-NLS-1$
    // HTML tag names
    protected static final String TAG_APPLET        = "applet";// $NON-NLS-1$
    protected static final String TAG_BASE          = "base";// $NON-NLS-1$
    protected static final String TAG_BGSOUND       = "bgsound";// $NON-NLS-1$
    protected static final String TAG_BODY          = "body";// $NON-NLS-1$
    protected static final String TAG_EMBED         = "embed";// $NON-NLS-1$
    protected static final String TAG_FRAME         = "frame";// $NON-NLS-1$
    protected static final String TAG_IFRAME        = "iframe";// $NON-NLS-1$
    protected static final String TAG_IMAGE         = "img";// $NON-NLS-1$
    protected static final String TAG_INPUT         = "input";// $NON-NLS-1$
    protected static final String TAG_LINK          = "link";// $NON-NLS-1$
    protected static final String TAG_OBJECT        = "object";// $NON-NLS-1$
    protected static final String TAG_SCRIPT        = "script";// $NON-NLS-1$
    protected static final String STYLESHEET        = "stylesheet";// $NON-NLS-1$

    /**
     * Regular expression matching the <code>MSIE major.minor</code> token of a
     * User-Agent string; capturing group 1 holds the version number.
     * The dot is escaped so that only a literal decimal point is accepted,
     * which guarantees that group 1 can always be parsed as a float.
     */
    protected static final String IE_UA             = "MSIE ([0-9]+\\.[0-9]+)";// $NON-NLS-1$
    protected static final Pattern IE_UA_PATTERN    = Pattern.compile(IE_UA);
    /** First IE version for which {@link #isEnableConditionalComments(Float)} returns false. */
    private   static final float IE_10                = 10.0f;

    public static final String PARSER_CLASSNAME = "htmlParser.className"; // $NON-NLS-1$

    public static final String DEFAULT_PARSER =
        "org.apache.jmeter.protocol.http.parser.LagartoBasedHtmlParser"; // $NON-NLS-1$

    /** Characters (line feed, carriage return, backspace, form feed) stripped from URL values. */
    private static final Pattern NORMALIZE_URL_PATTERN = Pattern.compile("[\n\r\b\f]+"); //$NON-NLS-1$

    /**
     * Protected constructor to prevent instantiation except from within
     * subclasses.
     */
    protected HTMLParser() {
    }

    /**
     * Get the URLs for all the resources that a browser would automatically
     * download following the download of the HTML content, that is: images,
     * stylesheets, javascript files, applets, etc...
     * <p>
     * URLs should not appear twice in the returned iterator.
     * <p>
     * Malformed URLs can be reported to the caller by having the Iterator
     * return the corresponding URL String. Overall problems parsing the html
     * should be reported by throwing an HTMLParseException.
     * @param userAgent
     *            User Agent
     *
     * @param html
     *            HTML code
     * @param baseUrl
     *            Base URL from which the HTML code was obtained
     * @param encoding Charset
     * @return an Iterator for the resource URLs
     * @throws HTMLParseException when parsing the <code>html</code> fails
     */
    @Override
    public Iterator<URL> getEmbeddedResourceURLs(
            String userAgent, byte[] html, URL baseUrl, String encoding) throws HTMLParseException {
        // The Set is used to ignore duplicated binary files.
        // Using a LinkedHashSet to avoid unnecessary overhead in iterating
        // the elements in the set later on. As a side-effect, this will keep
        // them roughly in order, which should be a better model of browser
        // behaviour.
        //
        // Why URLString rather than java.net.URL as the element type:
        // obtaining the hashCode of a java.net.URL implies a domain-name
        // resolution process, which can cause significant delays, even more
        // so if the domain name is not resolvable. Using a List and removing
        // duplicates via scan would not help, since URL.equals requires name
        // resolution too. This has been addressed with the URLString and
        // URLCollection classes.
        Collection<URLString> col = new LinkedHashSet<>();
        return getEmbeddedResourceURLs(userAgent, html, baseUrl, new URLCollection(col), encoding);
    }

    /**
     * Get the URLs for all the resources that a browser would automatically
     * download following the download of the HTML content, that is: images,
     * stylesheets, javascript files, applets, etc...
     * <p>
     * All URLs should be added to the Collection.
     * <p>
     * Malformed URLs can be reported to the caller by having the Iterator
     * return the corresponding URL String. Overall problems parsing the html
     * should be reported by throwing an HTMLParseException.
     * <p>
     * N.B. The Iterator returns URLs, but the Collection will contain objects
     * of class URLString.
     *
     * @param userAgent
     *            User Agent
     * @param html
     *            HTML code
     * @param baseUrl
     *            Base URL from which the HTML code was obtained
     * @param coll
     *            URLCollection
     * @param encoding Charset
     * @return an Iterator for the resource URLs
     * @throws HTMLParseException when parsing the <code>html</code> fails
     */
    public abstract Iterator<URL> getEmbeddedResourceURLs(
            String userAgent, byte[] html, URL baseUrl, URLCollection coll, String encoding)
            throws HTMLParseException;

    /**
     * Get the URLs for all the resources that a browser would automatically
     * download following the download of the HTML content, that is: images,
     * stylesheets, javascript files, applets, etc...
     * <p>
     * This overload wraps the supplied collection in a {@link URLCollection}
     * and delegates to the abstract variant.
     * <p>
     * N.B. The Iterator returns URLs, but the Collection will contain objects
     * of class URLString.
     *
     * @param userAgent
     *            User Agent
     * @param html
     *            HTML code
     * @param baseUrl
     *            Base URL from which the HTML code was obtained
     * @param coll
     *            Collection - will contain URLString objects, not URLs
     * @param encoding Charset
     * @return an Iterator for the resource URLs
     * @throws HTMLParseException when parsing the <code>html</code> fails
     */
    public Iterator<URL> getEmbeddedResourceURLs(
            String userAgent, byte[] html, URL baseUrl, Collection<URLString> coll, String encoding)
                    throws HTMLParseException {
        return getEmbeddedResourceURLs(userAgent, html, baseUrl, new URLCollection(coll), encoding);
    }

    /**
     * Tells whether conditional comments apply for the given IE version.
     *
     * @param ieVersion Float IE version, may be <code>null</code>
     * @return true if IE version &lt; IE v10; false if the version is
     *         <code>null</code> or 10 and above
     */
    protected final boolean isEnableConditionalComments(Float ieVersion) {
        // Conditional comment have been dropped in IE10
        // http://msdn.microsoft.com/en-us/library/ie/hh801214%28v=vs.85%29.aspx
        return ieVersion != null && ieVersion.floatValue() < IE_10;
    }

    /**
     * Extracts the Internet Explorer version from a User-Agent string.
     *
     * @param userAgent User Agent, may be <code>null</code> or empty
     * @return version null if not IE or the version after MSIE
     */
    protected Float extractIEVersion(String userAgent) {
        if (StringUtils.isEmpty(userAgent)) {
            log.info("userAgent is null or empty");
            return null;
        }
        Matcher matcher = IE_UA_PATTERN.matcher(userAgent);
        if (!matcher.find()) {
            return null;
        }
        // IE_UA declares exactly one capturing group made of digits, a literal
        // dot and digits, so group(1) is always present and parseable here.
        return Float.valueOf(matcher.group(1));
    }

    /**
     * Normalizes URL as browsers do: surrounding whitespace is trimmed and
     * any line feed, carriage return, backspace or form feed characters are
     * removed.
     *
     * @param url {@link CharSequence}, may be <code>null</code>
     * @return normalized url, or <code>null</code> if the input is
     *         <code>null</code>, empty, or empty once normalized
     */
    protected static String normalizeUrlValue(CharSequence url) {
        if (!StringUtils.isEmpty(url)) {
            String trimmed = NORMALIZE_URL_PATTERN.matcher(url.toString().trim()).replaceAll("");
            if (!trimmed.isEmpty()) {
                return trimmed;
            }
        }
        return null;
    }
}