/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package org.polimi.zarathustra.experiment;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.net.URI;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.openqa.selenium.TimeoutException;
import org.polimi.zarathustra.DOMHelper;
import org.polimi.zarathustra.webdriver.LocalWebdriverWorker;
import org.w3c.dom.Document;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.io.Files;
/**
 * This experiment dumps the DOMs from a set of URLs to files in a directory. It
 * also creates a manifest.txt file in the directory.
 */
public class DOMsDumpExperiment {

  /**
   * Builds the dump file name for a URL, in the form
   * {@code <host>.<hex MD5 of the URL><DOMHelper.DOM_DUMP_SUFFIX>}.
   * <p>
   * The hex digest is not zero-padded (leading zeros are dropped by
   * {@link BigInteger#toString(int)}); this is kept so that names of already
   * existing dumps do not change. If the URL has no host component the name
   * starts with the literal text "null".
   *
   * @param url the URL being dumped
   * @return the file name (without directory) for the dump of {@code url}
   * @throws NoSuchAlgorithmException if the MD5 digest is not available
   * @throws IllegalArgumentException if {@code url} is not a valid URI
   */
  private static String getFileName(String url) throws NoSuchAlgorithmException {
    String domain = URI.create(url).getHost();
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    // Hash the complete UTF-8 encoding of the URL. Using url.length() as the
    // byte count would hash only a prefix when the URL contains multi-byte
    // characters, and the default charset would make names platform dependent.
    md5.update(url.getBytes(Charsets.UTF_8));
    String urlMd5 = new BigInteger(1, md5.digest()).toString(16);
    return String.format("%s.%s%s", domain, urlMd5, DOMHelper.DOM_DUMP_SUFFIX);
  }

  /**
   * Dumps the DOMs from a set of URLs to a directory. Sample invocation: java
   * -jar domdump.jar urllist.txt /tmp/output. urllist.txt is expected to have
   * one URL per line; blank lines and lines starting with '#' are skipped.
   * <p>
   * NOTE: for testing purposes, this can be run with an extra argument,
   * FIREFOX: if so, it runs a firefox webdriver instead of explorer.
   * <p>
   * A failure on a single URL is reported on standard output and the run
   * continues with the next URL.
   *
   * @param args {@code urlFileList outputDir [FIREFOX]}
   * @throws Exception if the URL list cannot be read or the manifest cannot be
   *         written
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Invoke this experiment with 2 parameters:" + "urlFileList and outputDir");
      System.exit(-1);
    }
    paramsAreOkOrDie(args);
    File inputFile = new File(args[0]);
    File outputDir = new File(args[1]);

    // NOTE(review): the boolean presumably selects Internet Explorer (true)
    // versus Firefox (false) - confirm against LocalWebdriverWorker.
    LocalWebdriverWorker worker;
    if ((args.length > 2) && (args[2].equals("FIREFOX"))) {
      System.out.println("Running in FIREFOX mode, for testing only");
      worker = new LocalWebdriverWorker(false);
    } else {
      worker = new LocalWebdriverWorker(true);
    }

    try {
      List<String> urls = removeComments(Files.readLines(inputFile, Charsets.UTF_8));
      File manifest = new File(outputDir, "manifest.txt");
      Files.append("Start capture. Version 1.0\n", manifest, Charsets.UTF_8);

      for (String url : urls) {
        try {
          storeDOM(url, outputDir, worker);
          // One timestamp per capture, so the per-dump manifest and the global
          // manifest agree on when this URL was captured.
          long captureTime = new Date().getTime();
          storeDOMManifest(url, outputDir, captureTime);
          updateManifest(url, captureTime, manifest);
        } catch (TimeoutException e2) {
          System.out.println("WebDriver Timeout: " + e2.toString());
        } catch (org.openqa.selenium.UnhandledAlertException e1) {
          System.out.println(url + " --- UnhandledAlertException caught" + e1.toString());
        } catch (Exception e) {
          // Deliberately broad: one bad URL must not abort the whole run.
          System.out.println("Generic exception caught: " + e.toString());
        }
      }
    } finally {
      // Always release the worker, even when reading the URL list or writing
      // the manifest header fails.
      worker.quit();
    }
  }

  /**
   * Validates the command line arguments and terminates the JVM with exit code
   * 1 (after printing a message on standard error) when the input file is not
   * readable, or the output directory does not exist or is not writable.
   *
   * @param args the command line arguments; {@code args[0]} is the input file
   *        and {@code args[1]} the output directory
   */
  private static void paramsAreOkOrDie(String[] args) {
    File inputFile = new File(args[0]);
    File outputDir = new File(args[1]);
    if (!inputFile.canRead()) {
      System.err.println("Cannot read input file");
      System.exit(1);
    }
    // Check for existence first: canWrite() is also false for a missing
    // directory, which would otherwise be reported as a permission problem.
    if (!outputDir.isDirectory()) {
      System.err.println("Cannot find output dir");
      System.exit(1);
    }
    if (!outputDir.canWrite()) {
      System.err.println("Cannot write to output dir");
      System.exit(1);
    }
  }

  /**
   * Filters the lines of the URL list, dropping comments and blank lines.
   *
   * @param urls the raw lines of the URL list file
   * @return a new list with every line that is neither blank nor starts with
   *         '#', in the original order; the input list is not modified
   */
  static List<String> removeComments(List<String> urls) {
    List<String> selectedLines = new ArrayList<String>();
    for (String url : urls) {
      // Blank lines (e.g. a trailing empty line) carry no URL; checking them
      // first also keeps charAt(0) from throwing on an empty string.
      if (url.trim().isEmpty()) {
        continue;
      }
      if (url.charAt(0) != '#') {
        selectedLines.add(url);
      }
    }
    return selectedLines;
  }

  /**
   * Saves the DOM at the provided URL to the given outputDir.
   *
   * @param url the URL to load
   * @param outputDir the directory where the dump is written
   * @param worker the webdriver worker used to fetch the document
   * @throws IOException declared for failures while writing the dump
   * @throws NoSuchAlgorithmException if the MD5 digest used for the file name
   *         is not available
   * @throws TimeoutException if the webdriver times out
   */
  @VisibleForTesting
  static void storeDOM(String url, File outputDir, LocalWebdriverWorker worker) throws IOException,
      NoSuchAlgorithmException, TimeoutException {
    String fileName = getFileName(url);
    // NOTE(review): judging by its name, this call also stores the page source
    // under outputDir - confirm against LocalWebdriverWorker.
    Document dom = worker.getDocumentAndStoreSource(url, outputDir, fileName);
    File output = new File(outputDir, fileName);
    DOMHelper.serializeDocument(dom, output.getAbsolutePath());
  }

  /**
   * Writes the per-dump manifest, a file named like the dump plus the
   * ".manifest" extension, containing {@code "<url>, <time>"}.
   *
   * @param url the URL that was dumped
   * @param outputDir the directory where the manifest is written
   * @param time the capture time, in milliseconds since the epoch
   * @throws NoSuchAlgorithmException if the MD5 digest used for the file name
   *         is not available
   * @throws IOException if the manifest file cannot be written
   */
  private static void storeDOMManifest(String url, File outputDir, long time)
      throws NoSuchAlgorithmException, IOException {
    String fileName = getFileName(url) + ".manifest";
    File output = new File(outputDir, fileName);
    Files.write(url + ", " + time, output, Charsets.UTF_8);
  }

  /**
   * Updates the manifest file, appending a {@code "<url>, <timestamp>"} line
   * that records the current request.
   *
   * @param url the URL that was dumped
   * @param timestamp the capture time, in milliseconds since the epoch
   * @param manifest the manifest file to append to
   * @throws IOException if the manifest cannot be written
   */
  private static void updateManifest(String url, Long timestamp, File manifest) throws IOException {
    String trace = String.format("%s, %s\n", url, timestamp.toString());
    Files.append(trace, manifest, Charsets.UTF_8);
  }
}