package gov.nysenate.openleg.dao.scraping; import com.google.common.eventbus.EventBus; import gov.nysenate.openleg.config.Environment; import gov.nysenate.openleg.model.notification.Notification; import gov.nysenate.openleg.model.notification.NotificationType; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.springframework.beans.factory.annotation.Autowired; import java.io.File; import java.io.IOException; import java.net.SocketTimeoutException; import java.net.URL; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.regex.Pattern; /** * Created by kyle on 11/3/14. */ public abstract class LRSScraper { private static final Logger logger = Logger.getLogger(LRSScraper.class); @Autowired protected Environment environment; @Autowired private EventBus eventBus; protected final DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("'D'yyyyMMdd'.T'HHmmss"); protected final Pattern relativeBasePattern = Pattern.compile("(http://.+/).*"); protected final Pattern absoluteBasePattern = Pattern.compile("(http://.+?)/.*"); protected final Pattern linkPattern = Pattern.compile("<a href=\\\"(.*?)\\\">(.+?)</a>"); protected final Pattern bottomPattern = Pattern.compile("src=\\\"(frmload\\.cgi\\?BOT-([0-9]+))\\\">"); /** * Attempts to scrape LRS data. * any scraping exceptions encountered are logged * @return the number of scraped files * @throws IOException */ public int scrape() throws IOException { try { return doScrape(); } catch (ScrapingIOException ex) { handleScrapingTimeout(ex); return 0; } } /** * An abstract method that performs the scraping */ protected abstract int doScrape() throws IOException, ScrapingIOException; /** * Logs and sends a notification for a scraping exception */ protected void handleScrapingTimeout(ScrapingIOException ex) { logger.error("scraping exception: \n" + ExceptionUtils.getStackTrace(ex)); eventBus.post( new Notification( NotificationType.SCRAPING_EXCEPTION, LocalDateTime.now(), "Scraping exception: " + ExceptionUtils.getStackFrames(ex)[0], ExceptionUtils.getStackTrace(ex) )); } protected Document getJsoupDocument(String url) { try { return Jsoup.connect(url).timeout(10000).get(); } catch (IOException ex) { throw new ScrapingIOException(url, ex); } } protected String getUrlContents(URL url) { try { return IOUtils.toString(url); } catch (IOException ex) { throw new ScrapingIOException(url, ex); } } protected void copyUrlToFile(URL url, File file) { try { FileUtils.copyURLToFile(url, file, 10000, 10000); } catch (IOException ex) { throw new ScrapingIOException(url, ex); } } }