package gov.nysenate.openleg.script;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LRSScraper extends BaseScript
{
private static final Logger logger = Logger.getLogger(LRSScraper.class);
public static void main(String[] args) throws Exception
{
logger.info("running");
CommandLine cmd = getCommandLine(new Options(), args);
new LRSScraper().execute(cmd);
}
String allCalendars = "http://leginfo.state.ny.us/ASMSEN/menugetl.cgi?COMMONQUERY=CALENDAR";
String assemblyAgendas = "http://public.leginfo.state.ny.us/menugetf.cgi?COMMONQUERY=SENAGEN";
String senateAgendas = "http://public.leginfo.state.ny.us/menugetf.cgi?COMMONQUERY=ASMAGEN";
private final SimpleDateFormat dateFormat = new SimpleDateFormat("'D'yyyyMMdd'.T'HHmmss");
private final Pattern relativeBasePattern = Pattern.compile("(http://.+/).*");
private final Pattern absoluteBasePattern = Pattern.compile("(http://.+?)/.*");
private final Pattern linkPattern = Pattern.compile("<a href=\\\"(.*?)\\\">(.+?)</a>");
private final Pattern bottomPattern = Pattern.compile("src=\\\"(frmload\\.cgi\\?BOT-([0-9]+))\\\">");
public void scrapeCalendars(URL landingURL, File directory, Date currentTime) throws IOException
{
logger.info("Fetching landing page.");
String landingPage = IOUtils.toString(landingURL.openStream());
Matcher tokenMatcher = bottomPattern.matcher(landingPage);
logger.info("Searching for link to bottom half");
if (tokenMatcher.find()) {
String link = tokenMatcher.group(1);
URL contentURL = resolveLink(landingURL, link);
logger.info("Fetching bottom half");
String contentPage = IOUtils.toString(contentURL.openStream()).replace("\r\n", " ");
Matcher linkMatcher = linkPattern.matcher(contentPage);
while(linkMatcher.find()) {
URL linkURL = resolveLink(contentURL, linkMatcher.group(1));
String filename = dateFormat.format(currentTime)+"."+linkMatcher.group(2).trim().replace(".", "")
.replace(" ", "_").toLowerCase()+".html"; // add 0 for last active list and increment
File outfile = new File(directory, filename);
logger.info("Fetching "+linkMatcher.group(2).trim());
String contents = IOUtils.toString(linkURL);
logger.info("Writing content to "+filename);
FileUtils.write(outfile, contents);
}
}
}
public void scrapeAgendas(URL landingURL, File directory, Date currentTime) throws IOException
{
logger.info("Fetching landing page.");
String landingPage = IOUtils.toString(landingURL.openStream());
Matcher tokenMatcher = bottomPattern.matcher(landingPage);
logger.info("Searching for link to bottom half");
if (tokenMatcher.find()) {
String link = tokenMatcher.group(1);
URL contentURL = resolveLink(landingURL, link);
//calen
logger.info("Fetching bottom half");
String contentPage = IOUtils.toString(contentURL.openStream()).replace("\r\n", " ");
Matcher linkMatcher = linkPattern.matcher(contentPage);
logger.info("Searching for all committee agendas link");
while (linkMatcher.find()) {
if (linkMatcher.group(2).trim().equals("All Committee Agendas")) {
URL linkURL = resolveLink(contentURL, linkMatcher.group(1));
String filename = dateFormat.format(currentTime)+".all_agendas.html";
File outfile = new File(directory, filename);
logger.info("Fetching all committee agendas");
String contents = IOUtils.toString(linkURL);
logger.info("Writing content to "+filename);
FileUtils.write(outfile, contents);
}
}
}
else {
logger.error("NO MATCH on pattern: "+tokenMatcher.toString());
logger.error(landingPage);
}
}
public void execute(CommandLine opts) throws IOException
{
String[] args = opts.getArgs();
File directory = new File(args[0]);
Date currentTime = new Date();
scrapeCalendars(new URL(allCalendars), new File(directory, "CALENDAR"), currentTime);
scrapeAgendas(new URL(senateAgendas), new File(directory, "SENAGEN"), currentTime);
scrapeAgendas(new URL(assemblyAgendas), new File(directory, "ASMAGEN"), currentTime);
}
public URL resolveLink(URL url, String link) throws MalformedURLException
{
Pattern basePattern = link.startsWith("/") ? absoluteBasePattern : relativeBasePattern;
Matcher baseMatcher = basePattern.matcher(url.toString());
if (baseMatcher.find()) {
String base = baseMatcher.group(1);
return new URL(base+link);
}
else {
logger.error("Couldn't extract the link base");
return null;
}
}
}