package gov.nysenate.openleg.service.scraping;

import com.google.common.collect.ImmutableMap;
import gov.nysenate.openleg.dao.bill.text.BillTextReferenceDao;
import gov.nysenate.openleg.dao.scraping.LRSScraper;
import gov.nysenate.openleg.dao.scraping.ScrapingIOException;
import gov.nysenate.openleg.model.bill.BaseBillId;
import gov.nysenate.openleg.util.DateUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.text.StrSubstitutor;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.dao.EmptyResultDataAccessException;
import org.springframework.stereotype.Repository;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import java.io.*;
import java.time.LocalDateTime;

/**
 * Scrapes bill text html pages from LRS and saves them into the "bill" sub-directory of the
 * scraped staging directory.
 *
 * Created by kyle on 1/29/15.
 */
@Repository
public class BillTextScraper extends LRSScraper {

    private static final Logger logger = Logger.getLogger(BillTextScraper.class);

    /** Url template for a bill's LRS page; ${printNo} and ${sessionYear} are substituted per bill. */
    private static final String BILL_URL_TEMPLATE =
            "http://public.leginfo.state.ny.us/navigate.cgi?NVDTO:=&" +
            "QUERYDATA=${printNo}&QUERYTYPE=BILLNO&SESSYR=${sessionYear}&CBTEXT=Y&CBSPONMEMO=Y";

    /** File name template for a saved bill page. */
    private static final String BILL_FILE_TEMPLATE = "${sessionYear}-${printNo}-${scrapedTime}.html";

    @Autowired
    BillTextReferenceDao btrDao;

    /** Directory that scraped bill html files are written to; assigned in {@link #init()}. */
    File billScrapedDir;

    /**
     * Shared {@link HttpClient} used for every request made by this scraper.
     * A single instance is reused (and closed in {@link #closeHttpClient()}) instead of building
     * a new, never-closed client for each request.
     */
    private final CloseableHttpClient httpClient = HttpClientBuilder.create().build();

    /**
     * Resolves the bill staging directory and attempts to create it.
     * A failure to create the directory is logged, not rethrown.
     */
    @PostConstruct
    public void init() {
        billScrapedDir = new File(environment.getScrapedStagingDir(), "bill");
        try {
            FileUtils.forceMkdir(billScrapedDir);
        } catch (IOException ex) {
            // Pass the exception along so the underlying cause ends up in the log.
            logger.error("could not create bill scraped staging dir " + billScrapedDir.getPath(), ex);
        }
    }

    /**
     * Closes the shared http client, releasing any connections it still holds.
     */
    @PreDestroy
    public void closeHttpClient() {
        try {
            httpClient.close();
        } catch (IOException ex) {
            logger.warn("could not close bill text scraper http client", ex);
        }
    }

    /**
     * Attempts to get the LRS html for the first bill in the scrape queue
     *
     * @return the number of bills scraped
     * @throws IOException If there is an error while downloading or saving the bill html file
     */
    @Override
    protected int doScrape() throws IOException {
        try {
            BaseBillId billId = btrDao.getScrapeQueueHead();
            HttpResponse response = makeRequest(constructUrl(billId));
            File file = getSaveFile(billScrapedDir, billId);
            try {
                saveResponseToFile(response, file);
            } catch (IOException ex) {
                // Do not leave a partially written file behind in the staging directory.
                // The bill stays in the scrape queue because the delete below is skipped.
                FileUtils.deleteQuietly(file);
                throw ex;
            }
            btrDao.deleteBillFromScrapeQueue(billId);
        } catch (EmptyResultDataAccessException ex) {
            // Presumably raised by getScrapeQueueHead() when the queue is empty: nothing scraped.
            return 0;
        }
        return 1;
    }

    /**
     * Writes the body of the given response to the given file.
     * The response content stream is always closed, which releases the underlying connection.
     *
     * @param response the response whose body should be saved
     * @param file the destination file
     * @throws IOException if the response has no body, or the body cannot be read or written
     */
    public void saveResponseToFile(HttpResponse response, File file) throws IOException {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            throw new IOException("Response contains no content to save to " + file);
        }
        try (InputStream content = entity.getContent()) {
            FileUtils.copyInputStreamToFile(content, file);
        }
    }

    /**
     * Returns the HttpResponse received when calling a GET request on the given url.
     * The caller must consume or close the response content so that the connection is released.
     *
     * @param url the url to request
     * @return the response, guaranteed to have status code 200
     * @throws ScrapingIOException If response status code != 200
     * @throws IOException if the request could not be executed
     */
    public HttpResponse makeRequest(String url) throws IOException {
        HttpGet request = new HttpGet(url);
        HttpResponse response = httpClient.execute(request);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != 200) {
            // The body of a failed response is never read by anyone, so free the connection now.
            request.abort();
            throw new ScrapingIOException("Cannot scrape url " + url +
                    ". Response status code was " + statusCode);
        }
        return response;
    }

    /**
     * Builds the LRS url for the given bill from its print number and session year.
     *
     * @param billId the bill to build a url for
     * @return the url of the bill's LRS page
     */
    public String constructUrl(BaseBillId billId) {
        return StrSubstitutor.replace(BILL_URL_TEMPLATE, ImmutableMap.of(
                "printNo", billId.getPrintNo(),
                "sessionYear", Integer.toString(billId.getSession().getYear())));
    }

    /**
     * Builds the file that a scraped page for the given bill should be saved to.
     * The file name contains the session year, the print number and the current time.
     *
     * @param dir the directory that will contain the file
     * @param billId the bill being scraped
     * @return the destination file (not created by this method)
     */
    public File getSaveFile(File dir, BaseBillId billId) {
        String file = StrSubstitutor.replace(BILL_FILE_TEMPLATE, ImmutableMap.<String, String>builder()
                .put("sessionYear", Integer.toString(billId.getSession().getYear()))
                .put("printNo", billId.getPrintNo())
                .put("scrapedTime", LocalDateTime.now().format(DateUtils.BASIC_ISO_DATE_TIME))
                .build());
        return new File(dir, file);
    }
}