package gov.nysenate.openleg.dao.scraping;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Repository;

import javax.annotation.PostConstruct;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.time.LocalDateTime;

/**
 * Scrapes the "All Committee Agendas" page for the Senate from LRS and writes the
 * raw HTML into the scraped staging directory.
 *
 * Created by kyle on 11/12/14.
 */
@Repository
public class SenateAgnScraper extends LRSScraper {

    private static final Logger logger = Logger.getLogger(SenateAgnScraper.class);
    private static final String senateAgendaLandingPage =
            "http://public.leginfo.state.ny.us/menugetf.cgi?COMMONQUERY=SENAGEN";

    private File senateAgendaDirectory;

    @PostConstruct
    public void init() throws IOException {
        this.senateAgendaDirectory = new File(environment.getScrapedStagingDir(), "sen-agenda");
        try {
            FileUtils.forceMkdir(senateAgendaDirectory);
        } catch (IOException ex) {
            logger.error("Could not create senate agenda scraped staging dir " + senateAgendaDirectory.getPath());
        }
    }

    @Override
    protected int doScrape() throws IOException {
        logger.info("Fetching landing page.");
        Document doc = getJsoupDocument(senateAgendaLandingPage);
        logger.debug(doc.text());

        // The landing page is a frameset; the second frame holds the agenda listing.
        Element frame = doc.select("frame").get(1);
        String url = frame.absUrl("src");
        logger.info("Searching for link to bottom half at " + url);
        Document agendaPage = getJsoupDocument(url);
        logger.info("Fetching bottom half");
        logger.debug(agendaPage.text());

        // Find the "All Committee Agendas" link and save its contents to the staging dir.
        Elements links = agendaPage.select("a");
        int scrapedCount = 0;
        for (Element link : links) {
            if (link.text().equalsIgnoreCase("All Committee Agendas")) {
                String absHref = link.attr("abs:href");
                URL contentURL = new URL(absHref);
                String filename = dateFormat.format(LocalDateTime.now()) + ".all_senate_agendas.html";
                File scrapedAgendaFile = new File(senateAgendaDirectory, filename);
                logger.info("Fetching all committee agendas from " + absHref);
                String contents = getUrlContents(contentURL);
                logger.info("Writing content to " + filename);
                FileUtils.write(scrapedAgendaFile, contents);
                scrapedCount++;
            }
        }
        return scrapedCount;
    }
}
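
/*
 * A minimal usage sketch, kept inside a comment so this source file stays valid Java.
 * It assumes LRSScraper (not shown here) exposes a public scrape() entry point that
 * delegates to doScrape(); that method name, and the runSenateAgendaScrape() wrapper,
 * are hypothetical illustrations, not APIs confirmed by this file.
 *
 *     @Autowired
 *     private SenateAgnScraper senateAgnScraper;
 *
 *     public void runSenateAgendaScrape() throws IOException {
 *         // scrape() is an assumed public wrapper around the protected doScrape() override above
 *         int scraped = senateAgnScraper.scrape();
 *         logger.info("Scraped " + scraped + " senate agenda file(s) into sen-agenda/");
 *     }
 */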