/** ========================================================================
* handytrowel: src/main/java/network/HTMLFetcher.java
* Retrieve HTML source code of pages whilst executing JavaScript payload
* ========================================================================
* Copyright (c) 2014, Asim Ihsan, All rights reserved.
* <http://www.asimihsan.com>
* https://github.com/asimihsan/handytrowel/blob/master/LICENSE
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ========================================================================
*/
package com.asimihsan.handytrowel.network;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
/**
* Retrieve the HTML source code of a web page after also executing its
* JavaScript payload.
*
* @author Asim Ihsan
*/
public class HTMLFetcher {

    /** Logger for this class's own diagnostics (e.g. page-load timeouts). */
    private static final Logger LOGGER = Logger.getLogger(HTMLFetcher.class.getName());

    /**
     * Logger used by the PhantomJS driver service. It is silenced before every
     * fetch. A strong, class-level reference is kept because java.util.logging
     * may hold loggers only weakly; without it the configured level could be
     * lost if the logger were garbage collected.
     */
    private static final Logger PHANTOMJS_LOGGER =
            Logger.getLogger(PhantomJSDriverService.class.getName());

    /**
     * How long to attempt to HTTP GET a page before timing out. This time
     * could be taken up both by HTTP latency and rendering and JavaScript
     * execution time.
     *
     * The default value is 30 seconds. A value of 0 means "wait without a
     * time limit", because that is how {@link Thread#join(long)} treats 0.
     */
    private final int timeoutMillis;

    /**
     * Builder for {@link HTMLFetcher} instances.
     */
    public static class HTMLFetcherBuilder {
        private int timeoutMillis = 30 * 1000;

        /**
         * Set the page-load timeout.
         *
         * @param timeoutMillis timeout in milliseconds; must not be negative.
         *                      0 disables the time limit.
         * @return this builder, to allow call chaining.
         * @throws IllegalArgumentException if {@code timeoutMillis} is negative.
         */
        public HTMLFetcherBuilder timeoutMillis(int timeoutMillis) {
            // Fail fast here: a negative value would otherwise only be rejected
            // by Thread.join() after a PhantomJS process had already been started.
            if (timeoutMillis < 0) {
                throw new IllegalArgumentException(
                        "timeoutMillis must not be negative: " + timeoutMillis);
            }
            this.timeoutMillis = timeoutMillis;
            return this;
        }

        /**
         * @return a new {@link HTMLFetcher} configured from this builder.
         */
        public HTMLFetcher build() {
            return new HTMLFetcher(this);
        }
    }

    private HTMLFetcher(HTMLFetcherBuilder builder) {
        this.timeoutMillis = builder.timeoutMillis;
    }

    /**
     * Load {@code url} in a fresh PhantomJS instance and return the page
     * source reported by the driver once the load has finished.
     *
     * The page load runs on a helper thread so that this method can give up
     * after {@code timeoutMillis}. The driver is always quit before this
     * method returns or throws.
     *
     * NOTE(review): an exception thrown by {@code driver.get(url)} on the
     * helper thread is not propagated to the caller; the method then returns
     * whatever page source the driver currently holds.
     *
     * @param url the address of the page to load.
     * @return the page source as returned by {@link WebDriver#getPageSource()}.
     * @throws TimeoutException if the page load has not finished within the
     *         configured timeout, or if the calling thread is interrupted
     *         while the load is still in progress.
     */
    public String getPageSource(final String url) throws TimeoutException {
        // Make the Selenium WebDriver logs be quiet
        PHANTOMJS_LOGGER.setLevel(Level.OFF);

        DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();

        // What other CLI args there are: http://phantomjs.org/api/command-line.html
        // Where the cache goes on Mac OS X: ~/Library/Application\ Support/Ofi\ Labs/PhantomJS/
        // Other cache locations: https://groups.google.com/forum/#!topic/phantomjs/8GYaXKmowj0
        desiredCapabilities.setCapability(
                PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
                new String[] {
                    "--ignore-ssl-errors=yes",
                    "--load-images=no",
                    "--disk-cache=true",
                    // The option takes a plain number of kilobytes.
                    "--max-disk-cache-size=51200"
                });

        final WebDriver driver = new PhantomJSDriver(desiredCapabilities);

        // doesn't work, keep as reference.
        //driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);

        // Remember an interrupt instead of swallowing it; the flag is restored
        // only after the driver has been quit so that it cannot disturb the
        // shutdown of the PhantomJS process.
        boolean interrupted = false;
        try {
            Thread worker = new Thread(new Runnable() {
                @Override
                public void run() {
                    driver.get(url);
                }
            });
            worker.start();

            try {
                worker.join(timeoutMillis);
            } catch (InterruptedException e) {
                interrupted = true;
            }

            if (worker.isAlive()) {
                String message = interrupted
                        ? "Interrupted while waiting for HTTP GET to: " + url
                        : "Timeout for HTTP GET to: " + url;
                LOGGER.warning(message);
                worker.interrupt();
                throw new TimeoutException(message);
            }

            return driver.getPageSource();
        } finally {
            try {
                driver.quit();
            } finally {
                if (interrupted) {
                    // Hand the interrupt back to the caller.
                    Thread.currentThread().interrupt();
                }
            }
        }
    }
}