package ca.concordia.cssanalyser.crawler;
import java.io.File;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import com.crawljax.browser.EmbeddedBrowser.BrowserType;
import com.crawljax.core.CrawljaxRunner;
import com.crawljax.core.configuration.BrowserConfiguration;
import com.crawljax.core.configuration.CrawljaxConfiguration;
import com.crawljax.core.configuration.CrawljaxConfiguration.CrawljaxConfigurationBuilder;
import com.crawljax.plugins.crawloverview.CrawlOverview;
import ca.concordia.cssanalyser.crawler.plugin.CSSCatcher;
/**
* Uses Crawljax in order to crawl the web page
* @author Davood Mazinanian
*
*/
public class Crawler {
private final String websiteURI;
private final String outputFolder;
public Crawler(String URI, String outputFolderPath) {
websiteURI = URI;
outputFolder = outputFolderPath;
}
/**
* Starts crawling the given website using Crawljax and
* CSSCatcher plugin.
*/
public void start() {
CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(websiteURI);
configureCrawljax(builder);
CrawljaxRunner crawljax = new CrawljaxRunner(builder.build());
crawljax.call();
}
/**
* Set Crawljax configuration here
* @param builder
*/
private void configureCrawljax(CrawljaxConfigurationBuilder builder) {
CSSCatcher cssCatcher = new CSSCatcher();
cssCatcher.setOutputFolder(outputFolder + "css/");
builder.addPlugin(new CrawlOverview());
builder.addPlugin(cssCatcher);
//builder.addPlugin(new LoginPlugin());
//builder.crawlRules().clickDefaultElements();
//builder.crawlRules().dontClick("input").withAttribute("value", "I don't recognize");
//builder.crawlRules().click("input").withAttribute("type", "submit");
//builder.crawlRules().dontClick("a").underXPath("//*[@id='pageFooter']");
//builder.crawlRules().dontClick("a").underXPath("//*[@id='content']/div/div[2]");
//System.getProperties().setProperty("webdriver.chrome.driver", "chromedriver.exe");
//builder.setBrowserConfig(new BrowserConfiguration(BrowserType.CHROME, 2));
builder.setBrowserConfig(new BrowserConfiguration(BrowserType.FIREFOX, 1));
builder.crawlRules().insertRandomDataInInputForms(false);
builder.crawlRules().clickElementsInRandomOrder(false);
builder.crawlRules().crawlFrames(true);
builder.crawlRules().dontClick("*");
//com.crawljax.browser.WebDriverBackedEmbeddedBrowser s;
builder.setOutputDirectory(new File(outputFolder + "/crawljax"));
builder.setMaximumDepth(1);
builder.setMaximumStates(2);
builder.crawlRules().waitAfterReloadUrl(20, TimeUnit.MILLISECONDS);
builder.crawlRules().waitAfterEvent(200, TimeUnit.MILLISECONDS);
}
/**
* Returns hrefs of all initial CSSs of a web site
* @return
*/
public Collection<String> getInitialCSSHrefs() {
Set<String> allHrefs = new HashSet<>();
// Code adapted from seleniumhq
// Create a new instance of the Firefox driver
// Notice that the remainder of the code relies on the interface,
// not the implementation.
WebDriver driver = new FirefoxDriver();
// And now use this to visit Google
driver.get(websiteURI);
// Alternatively the same thing can be done like this
// driver.navigate().to("http://www.google.com");
// Check the title of the page
// System.out.println("Page title is: " + driver.getTitle());
// Google's search is rendered dynamically with JavaScript.
// Wait for the page to load, timeout after 10 seconds
// (new WebDriverWait(driver, 10)).until(new
// ExpectedCondition<Boolean>() {
// public Boolean apply(WebDriver d) {
// return d.getTitle().toLowerCase().startsWith("cheese!");
// }
// });
JavascriptExecutor js = (JavascriptExecutor) driver;
int styleSheetLength = Integer.valueOf(js.executeScript("return document.styleSheets.length").toString());
for (int i = 0; i < styleSheetLength; i++) {
Object hrefObj = js.executeScript("return document.styleSheets[" + i + "].href");
if (hrefObj == null)
continue;
String href = hrefObj.toString();
if (href != null) {
allHrefs.add(href);
}
}
// Close the browser
driver.quit();
return allHrefs;
}
}