/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.protocol.htmlunit; import java.lang.invoke.MethodHandles; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.OutputStream; import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.openqa.selenium.By; import org.openqa.selenium.JavascriptExecutor; import org.openqa.selenium.OutputType; import org.openqa.selenium.TakesScreenshot; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.htmlunit.HtmlUnitDriver; import org.openqa.selenium.io.TemporaryFilesystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.gargoylesoftware.htmlunit.WebClient; public class HtmlUnitWebDriver extends HtmlUnitDriver { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); private static boolean enableJavascript; private static boolean enableCss; private static boolean enableRedirect; private static long javascriptTimeout; private static int maxRedirects; public HtmlUnitWebDriver() { super(enableJavascript); } @Override protected WebClient modifyWebClient(WebClient client) { client.getOptions().setJavaScriptEnabled(enableJavascript); client.getOptions().setCssEnabled(enableCss); client.getOptions().setRedirectEnabled(enableRedirect); if(enableJavascript) client.setJavaScriptTimeout(javascriptTimeout); client.getOptions().setThrowExceptionOnScriptError(false); if(enableRedirect) client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects)); return client; } public static WebDriver getDriverForPage(String url, Configuration conf) { long pageLoadTimout = conf.getLong("page.load.delay", 3); enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true); enableCss = conf.getBoolean("htmlunit.enable.css", false); javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500); int redirects = Integer.parseInt(conf.get("http.redirect.max", "0")); enableRedirect = redirects <= 0 ? false : true; maxRedirects = redirects; WebDriver driver = null; try { driver = new HtmlUnitWebDriver(); driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS); driver.get(url); } catch(Exception e) { if(e instanceof TimeoutException) { LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far..."); return driver; } cleanUpDriver(driver); throw new RuntimeException(e); } return driver; } public static String getHTMLContent(WebDriver driver, Configuration conf) { try { if (conf.getBoolean("take.screenshot", false)) takeScreenshot(driver, conf); String innerHtml = ""; if(enableJavascript) { WebElement body = driver.findElement(By.tagName("body")); innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); } else innerHtml = driver.getPageSource().replaceAll("&", "&"); return innerHtml; } catch(Exception e) { TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); cleanUpDriver(driver); throw new RuntimeException(e); } } public static void cleanUpDriver(WebDriver driver) { if (driver != null) { try { driver.close(); driver.quit(); TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); } catch (Exception e) { throw new RuntimeException(e); } } } /** * Function for obtaining the HTML BODY using the selected * <a href='https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium webdriver</a> * There are a number of configuration properties within * <code>nutch-site.xml</code> which determine whether to * take screenshots of the rendered pages and persist them * as timestamped .png's into HDFS. * @param url the URL to fetch and render * @param conf the {@link org.apache.hadoop.conf.Configuration} * @return the rendered inner HTML page */ public static String getHtmlPage(String url, Configuration conf) { WebDriver driver = getDriverForPage(url, conf); try { if (conf.getBoolean("take.screenshot", false)) takeScreenshot(driver, conf); String innerHtml = ""; if(enableJavascript) { WebElement body = driver.findElement(By.tagName("body")); innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); } else innerHtml = driver.getPageSource().replaceAll("&", "&"); return innerHtml; } catch (Exception e) { TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); throw new RuntimeException(e); } finally { cleanUpDriver(driver); } } private static void takeScreenshot(WebDriver driver, Configuration conf) { try { String url = driver.getCurrentUrl(); File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); LOG.debug("In-memory screenshot taken of: {}", url); FileSystem fs = FileSystem.get(conf); if (conf.get("screenshot.location") != null) { Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName()); OutputStream os = null; if (!fs.exists(screenshotPath)) { LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName()); os = fs.create(screenshotPath); } InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); IOUtils.copyBytes(is, os, conf); LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); } else { LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " + "'screenshot.location' is absent from nutch-site.xml.", url); } } catch (Exception e) { cleanUpDriver(driver); throw new RuntimeException(e); } } }