package ca.concordia.cssanalyser.crawler.plugin; import java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.Collection; import java.util.HashSet; import java.util.Set; import org.slf4j.Logger; import com.crawljax.browser.EmbeddedBrowser; import com.crawljax.core.CrawlerContext; import com.crawljax.core.plugin.GeneratesOutput; import com.crawljax.core.plugin.OnNewStatePlugin; import com.crawljax.core.state.StateVertex; import ca.concordia.cssanalyser.app.FileLogger; import ca.concordia.cssanalyser.io.IOHelper; /** * This plugin, written for Crawljax, is responsible for * downloading all CSS files (including those which are added * dynamically at runtime) for analysing purposes. * @author Davood Mazinanian * */ public class CSSCatcher implements OnNewStatePlugin, GeneratesOutput { private static final Logger LOGGER = FileLogger.getLogger(CSSCatcher.class); private final Set<String> cssHrefs; private String outputPatch = ""; public CSSCatcher() { cssHrefs = new HashSet<>(); } @Override public void onNewState(CrawlerContext arg0, StateVertex arg1) { if ("".equals(getOutputFolder())) { LOGGER.warn("Output folder for CSSCather has not been set. " + "So there will be no output for CSSCatcher. " + "Use CSSCather.setOutputFolder() before CrawljaxRunner.call()"); } EmbeddedBrowser browser = arg0.getBrowser(); int styleSheetLength = Integer.valueOf(browser.executeJavaScript("return document.styleSheets.length").toString()); for (int i = 0; i < styleSheetLength; i++) { Object hrefObj = browser.executeJavaScript("return document.styleSheets[" + i + "].href"); if (hrefObj != null) { String href = hrefObj.toString(); cssHrefs.add(href); fetchAndWriteFile(href, arg1.getName(), arg1.getUrl()); } } } /** * Fetches the file, given by href, and saves it to the * folder initialized with setOutputFolder() * @param href * @param forWebSite */ private void fetchAndWriteFile(String href, String stateName, String forWebSite) { File rootFile = new File(getOutputFolder()); if (!rootFile.exists() || !rootFile.isDirectory()) rootFile.mkdir(); String folderPath = getOutputFolder() + "/" + stateName; // Create the desired folder. One folder for each state File outputFolder = new File(folderPath); if (!outputFolder.exists() || !outputFolder.isDirectory()) outputFolder.mkdir(); int questionMark = href.indexOf("?"); if (questionMark >= 0) href = href.substring(0, questionMark); int lastSlashPosition = href.lastIndexOf('/'); // Get the name of file and append it to the desired folder String cssFileName = href.substring(lastSlashPosition + 1).replaceAll("[\\\\\\/:\\*\\?\\\"\\<\\>\\|]", "_"); if (cssFileName.length() > 128) cssFileName = cssFileName.substring(0, 128); //If file name does not end with .css, add it if (!cssFileName.endsWith(".css")) cssFileName = cssFileName + ".css"; String cssFilePath = folderPath + "/" + cssFileName; while ((new File(cssFilePath)).exists()) cssFilePath += "_.css"; try { StringBuilder builder = new StringBuilder(); if (!href.startsWith("file://")) { getRemoteFileContents(href, builder); } else { String localFile = IOHelper.readFileToString(href.replaceFirst("file://[/]?", "")); builder.append(localFile); } if (builder.length() > 0) { final String EOL_CHAR = "\n"; // Lets add some information to the head of this CSS file String headerText = String.format("/* " + EOL_CHAR + " * Created by CSSCatcher plugin for Crawljax" + EOL_CHAR + " * CSS file is for Crawljax DOM state %s" + EOL_CHAR + " * CSS file was contained in %s" + EOL_CHAR + " * Downloaded from %s" + EOL_CHAR + " */" + EOL_CHAR + EOL_CHAR, forWebSite, stateName, href); IOHelper.writeStringToFile(headerText + builder.toString().replace("\r\n", EOL_CHAR), cssFilePath); } } catch (MalformedURLException e) { LOGGER.warn("Malformed url for file:" + href); } catch (IOException e) { LOGGER.warn("IOException for file:" + href); e.printStackTrace(); } } private void getRemoteFileContents(String href, StringBuilder builder) { try { URLConnection urlConnection = (new URL(href)).openConnection(); int contentLength = urlConnection.getContentLength(); if (!"text/css".equals(urlConnection.getContentType()) || contentLength == -1) { LOGGER.warn("{} is not a CSS file, skipping", href); } else { InputStream inputStream = new BufferedInputStream(urlConnection.getInputStream()); byte[] data = new byte[contentLength]; int bytesRead = 0; int offset = 0; while (offset < contentLength) { bytesRead = inputStream.read(data, offset, data.length - offset); if (bytesRead == -1) break; offset += bytesRead; } inputStream.close(); if (offset != contentLength) { throw new IOException("Only read " + offset + " bytes; Expected " + contentLength + " bytes"); } builder.append(new String(data)); } } catch(IOException ex) { ex.printStackTrace(); } } /** * Returns a collection of all CSS hrefs, * downloaded during crawling session. * @return */ public Collection<String> getAllCSSHrefs() { return cssHrefs; } /** * Gets the output folder in which * the caught CSS files would be written */ @Override public String getOutputFolder() { return outputPatch; } /** * Sets the output folder in which * the caught CSS files would be written. * Must be called before using CrawljaxRunner.call() to * let it write the CSS files in the desired folder. If not, * no files would be created, but still one can access the * URIs of the caught CSS files using <code>getAllCSSHrefs()</code> method. * If the specified folder does not exist, it will create the folder. * @param path */ @Override public void setOutputFolder(String path) { File folder = new File(path); if (folder.exists()) { LOGGER.warn(String.format("CSSCatcher: output folder %s is not empty. Existing files would be overwriten.", path)); } else { folder.mkdir(); LOGGER.info(String.format("Created folder %s", path)); } outputPatch = folder.getAbsolutePath(); } }