package gov.nysenate.openleg.dao.scraping;
import gov.nysenate.openleg.util.DateUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Repository;
import javax.annotation.PostConstruct;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.time.LocalDateTime;
import java.util.ArrayList;
@Repository
public class CalendarScraper extends LRSScraper{
private static final Logger logger = Logger.getLogger(LRSScraper.class);
protected static final String allCalendars = "http://leginfo.state.ny.us/ASMSEN/menugetl.cgi?COMMONQUERY=CALENDAR";
private File outfile = null;
private File calendarDirectory;
@PostConstruct
public void init() throws IOException{
this.calendarDirectory = new File(environment.getScrapedStagingDir(), "calendar");
try {
FileUtils.forceMkdir(calendarDirectory);
} catch (IOException ex) {
logger.error("could not create assembly agenda scraped staging dir " + calendarDirectory.getPath());
}
}
//Active list sequence number get from parsing the page here with all the calendars
//ToDo Scraping doesn't handle going through the supplemental calendar intermediary page
@Override
protected int doScrape() throws IOException{
logger.info("SCRETCHING landing page.");
Document doc = getJsoupDocument(allCalendars);
System.out.println(doc.text());
Element image = doc.select("frame").get(1);
String url = image.absUrl("src");
System.out.println("THIS IS THE URL: ::::::::::::::::: " + url);
logger.info("Searching for link to bottom half");
Document calendarPage = getJsoupDocument(url);
logger.info("Fetching bottom half");
System.out.println(calendarPage.text());
Elements rows = calendarPage.select("tr");
rows.remove(0);
for (Element row : rows) {
Elements td = row.getElementsByTag("td");
int activeCount = 0;
if (!td.get(0).text().equals("\u00a0")){ // 
System.out.println(td.get(0).text());
Element link = td.get(0).select("a").first();
String absHref = link.attr("abs:href");
System.out.println(absHref);
URL contentURL = new URL(absHref);
String filename = null;
String activeInfo = null;
if (td.get(1).text().startsWith("Active List")) { //Create the file for parsing
System.out.println();
System.out.println();
LocalDateTime listDate = LocalDateTime. parse(td.get(2).text(), DateUtils.LRS_WEBSITE_DATETIME_FORMAT);
System.out.println("PARSE DATE with formatter:::: " + listDate);
filename = dateFormat.format(LocalDateTime.now()) + "." +
td.get(0).text().trim().replace(".", "").replace(" ", "_").toLowerCase() +
"_active_list_" + listDate + ".html";
activeInfo = "<h1>Active List</h1><h1>" + td.get(2).text() + "</h1><h1>" + activeCount +"</h1>\n";
activeCount++; //Oldest (highest) active list is the number 0
}else if (td.get(1).text().startsWith("Debate List")) {
filename = dateFormat.format(LocalDateTime.now()) + "." +
td.get(0).text().trim()
.replace(".", "")
.replace(" ", "_")
.toLowerCase() +
"_debate_List" + ".html";
}else{
filename = dateFormat.format(LocalDateTime.now()) + "." +
td.get(0).text().trim()
.replace(".", "")
.replace(" ", "_")
.replace("\u00a0", "")
.toLowerCase() +
".html";
}
outfile = new File(calendarDirectory, filename);
logger.info("Fetching " + td.get(1).text().trim());
String contents = activeInfo + getUrlContents(contentURL);
logger.info("Writing content to " + outfile);
FileUtils.write(outfile, contents);
}
}
ArrayList<File> list = new ArrayList<File>();
list.add(outfile);
return 1;
}
}