// Copyright © 2002-2007 Canoo Engineering AG, Switzerland.
package com.canoo.webtest.steps.verify;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.xml.sax.SAXException;
import com.canoo.webtest.boundary.HtmlUnitBoundary;
import com.canoo.webtest.engine.Context;
import com.canoo.webtest.engine.RegExStringVerifier;
import com.canoo.webtest.engine.StepFailedException;
import com.canoo.webtest.steps.Step;
import com.canoo.webtest.util.ConversionUtil;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
* @author Dierk Koenig, Urs-Peter Häss
* @author Marc Guillemot, Paul King, Brian Hubbard
* @webtest.step category="Core"
* name="verifyLinks"
* alias="verifylinks"
* description="This step checks the validity of all links on the current page. Non-<key>HTML</key> pages (CSS, <key>javascript</key>, <key>XML</key> files) are not checked for internal links. Non-<key>HTTP</key> links (mail addresses, ftp etc.) are not checked or followed."
*/
public class VerifyLinks extends Step
{
    private static final Logger LOG = Logger.getLogger(VerifyLinks.class);

    private String fBaseHost;       // host of the page the step started from
    private int fMaxDepth;          // parsed value of fMaxDepthStr (see verifyProperties())
    private String fMaxDepthStr;    // raw "depth" parameter as supplied by the user
    private int fCurrentDepth;      // current recursion depth while following links
    private boolean fOnsiteonly;
    private String fExcludes;
    private String fIncludes;
    // ZFailedLink entries for every link that could not be fetched
    private final Set fFailedVisits = new HashSet();
    // External forms (String) of every URL already checked. Strings are stored
    // rather than java.net.URL instances because URL.equals()/hashCode() may
    // perform blocking DNS lookups.
    private final Set fVisitedUrls = new HashSet();
    // External forms (String) of visited URLs whose fetch failed
    private final Set fBrokenUrls = new HashSet();
    private int fValidLinks;
    private boolean fIgnoreForeignJSErrors;

    /**
     * Gets the links that could not be fetched.
     * @return a set of {@link ZFailedLink}
     */
    protected Set getFailedVisits() {
        return fFailedVisits;
    }

    public String getDepth() {
        return fMaxDepthStr;
    }

    /**
     * @webtest.parameter required="no"
     * default="0"
     * description="The <em>depth</em> parameter defines the depth of the recursive search for broken links on sub-pages."
     */
    public void setDepth(String depth) {
        fMaxDepthStr = depth;
    }

    /**
     * @webtest.parameter required="no"
     * default="<empty>"
     * description="If <em>excludes</em> is set then each link found is compared to the defined string (via regexp), if it matches then the link is not followed."
     */
    public void setExcludes(String regex) {
        fExcludes = regex;
    }

    public String getExcludes() {
        return fExcludes;
    }

    /**
     * @webtest.parameter required="no"
     * default="<all>"
     * description="If <em>includes</em> is set then each link found is compared to the defined string (via regexp), if it matches then the link is processed, others are ignored."
     */
    public void setIncludes(String regex) {
        fIncludes = regex;
    }

    public String getIncludes() {
        return fIncludes;
    }

    /**
     * @webtest.parameter required="no"
     * default="false"
     * description="If <em>onsiteonly</em> is set to <em>true</em>, the recursive search for invalid links is limited to the local host.
     * Only the initial link to a foreign host is checked, but no deeper search is performed."
     */
    public void setOnsiteonly(final boolean onsiteonly) {
        fOnsiteonly = onsiteonly;
    }

    /**
     *
     * @webtest.parameter required="no"
     * default="false"
     * description="Indicates if JavaScript errors should be ignored on visited pages from a different host
     * than the current page."
     */
    public void setIgnoreForeignJSErrors(final boolean b)
    {
        fIgnoreForeignJSErrors = b;
    }

    /**
     * Checks all links reachable from the current page and fails the step if
     * any of them is broken.
     *
     * @throws StepFailedException if at least one link could not be fetched
     */
    public void doExecute() throws SAXException, MalformedURLException {
        verifyProperties();
        nullResponseCheck();
        final Context context = getContext();
        final HtmlPage htmlPage = context.getCurrentHtmlResponse(this);
        LOG.info("Examining page with title=" + htmlPage.getTitleText());
        if (!StringUtils.isEmpty(getIncludes())) {
            LOG.info("Only including links which match '" + getIncludes() + "'");
        }
        if (!StringUtils.isEmpty(getExcludes())) {
            LOG.info("Excluding links which match '" + getExcludes() + "'");
        }
        fBaseHost = htmlPage.getUrl().getHost();
        final WebClient client = context.getWebClient();
        checkVisits(client, htmlPage);
        if (!fFailedVisits.isEmpty()) {
            throw new StepFailedException(fFailedVisits.size() + " broken link(s): " + brokenLinksToString(), this);
        }
    }

    /**
     * Adds the number of valid links found to the step's reported parameters.
     */
    protected void addComputedParameters(final Map map)
    {
        map.put("-> valid links", String.valueOf(fValidLinks));
    }

    /**
     * Visits every link of the given page that has not been visited yet and
     * that passes the includes/excludes filters.
     */
    protected void checkVisits(final WebClient webClient, final HtmlPage response) {
        final Set urls = getGoodLinks(response);
        final RegExStringVerifier verifier = new RegExStringVerifier();
        for (final Iterator iter = urls.iterator(); iter.hasNext();) {
            final URL url = (URL) iter.next();
            final String urlKey = url.toExternalForm();
            if (fVisitedUrls.contains(urlKey)) {
                LOG.debug("Skipped already visited: " + url);
                // count the revisit as valid only if the first visit succeeded
                if (!fBrokenUrls.contains(urlKey)) {
                    fValidLinks++;
                }
                continue;
            }
            if (!StringUtils.isEmpty(getIncludes()) && (!verifier.verifyStrings(getIncludes(), url.toString()))) {
                LOG.info("Skipped link as it doesn't match the includes list: " + url);
                continue;
            }
            if (!StringUtils.isEmpty(getExcludes()) && (verifier.verifyStrings(getExcludes(), url.toString()))) {
                LOG.info("Skipped link as matched the excludes list: " + url);
                continue;
            }
            visit(response, url, webClient);
        }
    }

    /**
     * Fetches a single link, records success or failure, and recurses into the
     * resulting page when it is HTML.
     */
    protected void visit(final HtmlPage referingPage, final URL url, final WebClient webClient) {
        // holds the client's original "throw on script error" setting
        final boolean throwOnScriptErrorOriginal = webClient.isThrowExceptionOnScriptError();
        if (fIgnoreForeignJSErrors && isForeignHost(url))
        {
            LOG.info("Ignore JS errors (if any) for " + url);
            webClient.setThrowExceptionOnScriptError(false);
        }
        final Page response;
        try {
            response = HtmlUnitBoundary.tryGetPageNoFail(url, webClient);
        }
        finally {
            // restore the shared client's configuration even if the fetch throws
            webClient.setThrowExceptionOnScriptError(throwOnScriptErrorOriginal);
        }
        fVisitedUrls.add(url.toExternalForm());
        if (response == null) {
            fBrokenUrls.add(url.toExternalForm());
            fFailedVisits.add(new ZFailedLink(url, referingPage.getUrl()));
        }
        else {
            fValidLinks++;
            if (response instanceof HtmlPage) {
                followRecursively((HtmlPage) response, webClient);
            }
        }
    }

    /**
     * Recurses into the given page unless the configured maximum depth has been
     * reached or hunting should stop (see {@link #stopHunting(HtmlPage)}).
     */
    protected void followRecursively(final HtmlPage htmlPage, final WebClient webClient) {
        LOG.debug("fMaxDepth = " + fMaxDepth);
        if (fCurrentDepth < fMaxDepth && !stopHunting(htmlPage)) {
            ++fCurrentDepth;
            checkVisits(webClient, htmlPage);
            --fCurrentDepth;
        }
    }

    /**
     * Builds a human readable list of the broken links for the failure message.
     */
    protected String brokenLinksToString() {
        final StringBuffer sb = new StringBuffer();
        for (final Iterator iter = fFailedVisits.iterator(); iter.hasNext();) {
            final ZFailedLink failedLink = (ZFailedLink) iter.next();
            sb.append(failedLink.getFailedUrl()).append(" on ").append(failedLink.getReferingUrl()).append("; ");
        }
        return sb.toString();
    }

    static int getLinkCount(final HtmlPage response) {
        return getGoodLinks(response).size();
    }

    /**
     * Gets all HTTP links in the response
     *
     * @param response
     * @return a set of {@link URL}
     */
    static Set getGoodLinks(final HtmlPage response) {
        LOG.info("Looking for links in " + response);
        final Set urls = new HashSet();
        for (final Iterator iter = response.getAnchors().iterator(); iter.hasNext();) {
            processLink(response, (HtmlAnchor) iter.next(), urls);
        }
        LOG.info(urls.size() + " different links found in page " + response.getUrl());
        return urls;
    }

    /**
     * Resolves an anchor against the page's URL and collects it when it uses
     * the http or https protocol; malformed or non-HTTP links are skipped.
     */
    private static void processLink(final HtmlPage response, final HtmlAnchor link, final Set urls) {
        try {
            final URL url = response.getFullyQualifiedUrl(link.getHrefAttribute());
            final String protocol = url.getProtocol();
            if ("http".equals(protocol) || "https".equals(protocol)) {
                LOG.info("Adding url to check: " + url);
                urls.add(url);
            }
            else {
                LOG.info("Skipped link due to protocol: " + url);
            }
        }
        catch (final MalformedURLException e) {
            LOG.info("Skipped link due to bad url: " + link.getHrefAttribute());
        }
    }

    /**
     * Indicates whether recursion should stop at the given page
     * (true when onsiteonly is set and the page lives on a foreign host).
     */
    protected boolean stopHunting(final HtmlPage htmlPage) {
        return fOnsiteonly && isForeignHost(htmlPage.getUrl());
    }

    /**
     * Compares the url's host (by name, no DNS resolution) with the host of the
     * page the step started from.
     */
    protected boolean isForeignHost(final URL url) {
        return !fBaseHost.equals(url.getHost());
    }

    /**
     * Parses and validates the "depth" parameter (defaults to 0).
     */
    protected void verifyProperties() {
        fMaxDepth = ConversionUtil.convertToInt(getDepth(), 0);
        optionalIntegerParamCheck(getDepth(), "depth", true);
    }

    public boolean isPerformingAction() {
        return false;
    }
}
/**
* Utility data holder
*/
class ZFailedLink
{
    // both fields are immutable: this class is a pure value holder
    private final URL fFailedUrl;
    private final URL fReferingUrl;

    /**
     * @param failedUrl the url that could not be fetched
     * @param referingUrl the url of the page containing the broken link
     */
    ZFailedLink(final URL failedUrl, final URL referingUrl) {
        fFailedUrl = failedUrl;
        fReferingUrl = referingUrl;
    }

    public URL getFailedUrl() {
        return fFailedUrl;
    }

    public URL getReferingUrl() {
        return fReferingUrl;
    }

    /**
     * Value equality based on the external forms of the two urls. URL.equals()
     * is deliberately avoided because it may perform blocking DNS lookups.
     * Needed because instances are stored in a HashSet.
     */
    public boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof ZFailedLink)) {
            return false;
        }
        final ZFailedLink other = (ZFailedLink) obj;
        return fFailedUrl.toExternalForm().equals(other.fFailedUrl.toExternalForm())
            && fReferingUrl.toExternalForm().equals(other.fReferingUrl.toExternalForm());
    }

    public int hashCode() {
        return fFailedUrl.toExternalForm().hashCode() * 31 + fReferingUrl.toExternalForm().hashCode();
    }

    public String toString() {
        return fFailedUrl + " on " + fReferingUrl;
    }
}