/* * Copyright (c) 2005 Canoo Engineering. All Rights Reserved. */ package com.canoo.webtest.extension.spider; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Properties; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import com.canoo.webtest.engine.Context; import com.canoo.webtest.engine.StepFailedException; import com.canoo.webtest.steps.StepUtil; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.WebResponse; import com.gargoylesoftware.htmlunit.html.HtmlAnchor; import com.gargoylesoftware.htmlunit.html.HtmlPage; /** * @author Denis N. Antonioli */ public class Spider { private static final Logger LOG = Logger.getLogger(Spider.class); public static final IVisitorStrategy ALWAYS_ACCEPT_VISITOR_STRATEGY = new AlwaysAcceptVisitorStrategy(); public static final IReporter NO_OP_REPORTER = new NoOpReporter(); public static final IValidator NO_OP_VALIDATOR = new NoOpValidator(); private final Map fVisitedLinks = new HashMap(); private IReporter fReporter; private IVisitorStrategy fVisitorStrategy; private IValidator fValidator; private String fFileName; private int fDepth; private boolean fFailOnError; private Context fContext; public void setFailOnError(boolean failOnError) { fFailOnError = failOnError; } public void setFileName(final String filename) { fFileName = filename; } public String getFileName() { return fFileName; } public void setDepth(final int depth) { fDepth = depth; } public void setReporter(final IReporter reporter) { fReporter = reporter; } public IReporter getReporter() { return fReporter; } public void setVisitorStrategy(final IVisitorStrategy visitorStrategy) { fVisitorStrategy = visitorStrategy; } public IVisitorStrategy getVisitorStrategy() { return fVisitorStrategy; } public void setValidator(final IValidator validator) { fValidator = validator; } public IValidator getValidator() { return fValidator; } Writer getWriter() throws IOException { final Writer writer; if (fFileName != null) { final File file = new File(fContext.getConfig().getWebTestResultDir(), fFileName); LOG.info("Writing in " + file); writer = new FileWriter(file); } else { LOG.info("Writing in standard output"); writer = new OutputStreamWriter(System.out); } return writer; } void setContext(Context context) { fContext = context; } public void execute(final Context context) { validate(); fVisitedLinks.clear(); setContext(context); doExecute(); } boolean doExecute() { Writer writer = null; boolean success = false; try { writer = getWriter(); fReporter.setWriter(writer); fReporter.writeHeader(); visit((HtmlPage) fContext.getCurrentResponse(), fDepth); fReporter.writeFooter(); success = true; } catch (final Throwable e) { LOG.error("Problems during write: " + e.getMessage(), e); } finally { // doing this to Stdout will cause interuption IOUtils.closeQuietly(writer); } return success; } void validate() { if (fDepth < 0) { throw new IllegalArgumentException("depth must be >= 0"); } if (fFileName == null) { LOG.info("No file name defined, will output to console"); } if (fReporter == null) { LOG.info("No reporter defined, using noop reporter"); fReporter = NO_OP_REPORTER; } if (fValidator == null) { LOG.info("No validator defined, using noop validator"); fValidator = NO_OP_VALIDATOR; } if (fVisitorStrategy == null) { LOG.info("No visitor strategy set, using noop strategy"); fVisitorStrategy = ALWAYS_ACCEPT_VISITOR_STRATEGY; } } void visit(final HtmlPage currentResponse, final int depth) throws IOException { LOG.debug("report depth " + depth); for (final Iterator iter = currentResponse.getAnchors().iterator(); iter.hasNext();) { final HtmlAnchor link = (HtmlAnchor) iter.next(); final Properties linkInfo = fValidator.validate(fDepth - depth, currentResponse, link); fReporter.write(linkInfo); if (depth > 0 && needsReport(link)) { processLink(link, depth); } } } void processLink(final HtmlAnchor link, final int depth) throws IOException { try { follow(link); final Page page = fContext.getCurrentResponse(); if (page instanceof HtmlPage) { visit((HtmlPage) page, depth - 1); } else { final WebResponse response = page.getWebResponse(); LOG.info("Don't going deeper in response for " + response.getWebRequest().getUrl() + " as it isn't an html page (content type: " + response.getContentType() + ", page" + page + ")"); } } catch (final StepFailedException e) { LOG.error(e.getMessage(), e); if (fFailOnError) { throw e; } } } void follow(final HtmlAnchor link) { LOG.debug("Clicking on link with href: " + link.getHrefAttribute()); try { link.click(); } catch (final Exception ex) { StepUtil.handleException(ex); } } boolean needsReport(final HtmlAnchor link) { if (fVisitedLinks.containsKey(link.getHrefAttribute())) { LOG.info(link.getHrefAttribute() + " skipped: already visited"); return false; } if (!fVisitorStrategy.accept(link)) { LOG.info(link.getHrefAttribute() + " skipped: rejected by visitor"); return false; } fVisitedLinks.put(link.getHrefAttribute(), Boolean.TRUE); return true; } private static class AlwaysAcceptVisitorStrategy implements IVisitorStrategy { public boolean accept(HtmlAnchor link) { return true; } } private static class NoOpReporter implements IReporter { public void writeHeader() { } public void write(Properties linkInfo) { } public void setWriter(Writer writer) { } public void writeFooter() { } } private static class NoOpValidator implements IValidator { private static final Properties EMPTY_PROPERTIES = new Properties(); public Properties validate(final int depth, final HtmlPage webResponse, final HtmlAnchor link) { return EMPTY_PROPERTIES; } } }