package gov.nysenate.openleg.dao.scraping;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Repository;
import javax.annotation.PostConstruct;
import java.io.File;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.time.LocalDateTime;
import java.util.ArrayList;
/**
 * Scraper that downloads the "All Committee Agendas" page reachable from the
 * Assembly agenda landing page ({@link #assemblyAgendas}) and stores the raw
 * content in the "ass-agenda" sub-directory of the scraped staging directory.
 *
 * Created by kyle on 11/10/14.
 */
@Repository
public class AssemblyAgnScraper extends LRSScraper {
    private static final Logger logger = Logger.getLogger(AssemblyAgnScraper.class);

    /** Anchor text (matched case-insensitively) of the link that is followed and saved. */
    private static final String ALL_AGENDAS_LINK_TEXT = "All Committee Agendas";

    /** Index of the landing page frame whose document contains the agenda links. */
    private static final int LINKS_FRAME_INDEX = 1;

    String assemblyAgendas = "http://public.leginfo.state.ny.us/menugetf.cgi?COMMONQUERY=ASMAGEN";
    String senateAgendas = "http://public.leginfo.state.ny.us/menugetf.cgi?COMMONQUERY=SENAGEN";
    protected URL agendaURL;
    private File assemblyAgendaDirectory;

    /**
     * Resolves the landing page URL and makes sure the output directory exists.
     * A failure to create the directory is logged (with its cause) but not
     * rethrown, so bean initialisation still completes.
     *
     * @throws IOException if {@link #assemblyAgendas} is not a valid URL
     */
    @PostConstruct
    public void init() throws IOException {
        agendaURL = new URL(assemblyAgendas);
        this.assemblyAgendaDirectory = new File(environment.getScrapedStagingDir(), "ass-agenda");
        try {
            FileUtils.forceMkdir(assemblyAgendaDirectory);
        } catch (IOException ex) {
            logger.error("could not create assembly agenda scraped staging dir "
                    + assemblyAgendaDirectory.getPath(), ex);
        }
    }

    /**
     * Fetches the landing page, follows the second frame to the page holding
     * the agenda links, and writes the target of every "All Committee Agendas"
     * link to a timestamped file in the assembly agenda directory.
     *
     * @return the number of distinct files written during this call; 0 when no
     *         matching link was found
     * @throws IOException if a page cannot be fetched, the landing page does not
     *         have the expected frames, or the output file cannot be written
     */
    @Override
    protected int doScrape() throws IOException {
        logger.debug("ASSEMBLY AGENDA DIRECTORY ::::::: " + assemblyAgendaDirectory);

        logger.info("SCRETCHING landing page.");
        Document landingPage = getJsoupDocument(agendaURL.toString());
        if (logger.isDebugEnabled()) {
            logger.debug(landingPage.text());
        }

        logger.info("Searching for link to bottom half");
        Elements frames = landingPage.select("frame");
        if (frames.size() <= LINKS_FRAME_INDEX) {
            // Fail with a descriptive checked exception instead of an IndexOutOfBoundsException.
            throw new IOException("Expected at least " + (LINKS_FRAME_INDEX + 1) + " frames on "
                    + agendaURL + " but found " + frames.size());
        }
        String frameUrl = frames.get(LINKS_FRAME_INDEX).absUrl("src");
        if (frameUrl.isEmpty()) {
            // absUrl yields an empty string when the attribute is missing or cannot be resolved.
            throw new IOException("Frame " + LINKS_FRAME_INDEX + " on " + agendaURL
                    + " has no resolvable src attribute");
        }
        logger.debug("THIS IS THE URL: ::::::::::::::::: " + frameUrl);

        logger.info("Fetching bottom half");
        Document agendaPage = getJsoupDocument(frameUrl);
        if (logger.isDebugEnabled()) {
            logger.debug(agendaPage.text());
        }

        // Tracks the distinct files produced by this invocation only, so the
        // returned count never includes a null or a file from an earlier scrape.
        ArrayList<File> writtenFiles = new ArrayList<>();
        for (Element link : agendaPage.select("a")) {
            if (!ALL_AGENDAS_LINK_TEXT.equalsIgnoreCase(link.text())) {
                continue;
            }
            String absHref = link.attr("abs:href");
            if (absHref.isEmpty()) {
                logger.warn("Skipping '" + ALL_AGENDAS_LINK_TEXT + "' link without a resolvable href");
                continue;
            }
            logger.debug(absHref);
            URL contentURL = new URL(absHref);

            // The name is derived from the current time only, so several matching
            // links handled within the same formatted instant share one file.
            String filename = dateFormat.format(LocalDateTime.now()) + ".all_assembly_agendas.html";
            File outfile = new File(assemblyAgendaDirectory, filename);

            logger.info("Fetching all committee agendas");
            String contents = getUrlContents(contentURL);
            logger.info("Writing content to " + filename);
            FileUtils.write(outfile, contents);

            if (!writtenFiles.contains(outfile)) {
                writtenFiles.add(outfile);
            }
        }
        return writtenFiles.size();
    }
}