package gov.nysenate.openleg.processor.agenda.reference; import gov.nysenate.openleg.model.agenda.AgendaInfoCommitteeItem; import gov.nysenate.openleg.model.spotcheck.agenda.AgendaAlertInfoCommittee; import gov.nysenate.openleg.model.base.SessionYear; import gov.nysenate.openleg.model.base.Version; import gov.nysenate.openleg.model.bill.BillId; import gov.nysenate.openleg.model.entity.Chamber; import gov.nysenate.openleg.model.entity.CommitteeId; import gov.nysenate.openleg.model.spotcheck.SpotCheckRefType; import gov.nysenate.openleg.model.spotcheck.SpotCheckReferenceId; import gov.nysenate.openleg.processor.base.ParseError; import gov.nysenate.openleg.util.DateUtils; import gov.nysenate.openleg.util.ScrapeUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class AgendaAlertParser { private static final Logger logger = LoggerFactory.getLogger(AgendaAlertParser.class); private static final Pattern agendaAlertFilenamePattern = Pattern.compile("^agenda_alert-(\\d{8})-[A-z\\._]+-([A-z]+)-(\\d{8}T\\d{6}).html$"); // This pattern parses both full and individual agenda alert filenames, but currently we can't reliably process full alerts // Pattern.compile("^agenda_alert-(\\d{8})-[A-z\\._-]+(\\d{8}T\\d{6}).html$"); private static final Pattern committeeNamePattern = Pattern.compile("^\\s*Senate\\s+Standing\\s+Committee\\s+on\\s+([A-z, ]+)\\s*$"); private static final Pattern chairPattern = Pattern.compile("^\\s*Senator\\s+([A-z\\.'-, ]*),\\s+Chair\\s*$"); private static final Pattern meetingTimePattern = Pattern.compile("^\\s*(?:(\\d{1,2}:\\d{2} (?:AM|PM)|12 Noon)\\s*,\\s+)?(?:[A-z]+day\\s*,\\s+)?([A-z]+ \\d+, \\d{4})\\s*$"); /** * Parses an agenda alert html file, yielding a list of committee meeting references * * @param agendaAlert File - The file containing the alert text * @return List<AgendaAlertInfoCommittee> * @throws IOException */ public static List<AgendaAlertInfoCommittee> parseAgendaAlert(File agendaAlert) throws IOException, ParseError { Matcher filenameMatcher = agendaAlertFilenamePattern.matcher(agendaAlert.getName()); if (!filenameMatcher.matches()) { throw new IllegalArgumentException("agenda alert filename does not match specification: " + agendaAlert.getName()); } LocalDate weekOf = LocalDate.parse(filenameMatcher.group(1), DateTimeFormatter.BASIC_ISO_DATE); LocalDateTime refDateTime = LocalDateTime.parse(filenameMatcher.group(3), DateUtils.BASIC_ISO_DATE_TIME); // Todo find a way to parse addenda from alert text String addendumString = filenameMatcher.group(2); Version addendum = Version.of(addendumString); List<AgendaAlertInfoCommittee> alertInfoCommittees = new ArrayList<>(); String fileContents = FileUtils.readFileToString(agendaAlert, "ISO-8859-1") .replaceAll("\u001A", ""); // Replace unknown characters with a blank Document document = Jsoup.parse(fileContents); Elements bodyElements = document.getElementsByTag("body").first().children(); Element headerElement = null, notesElement = null, billTableElement = null; // committee meetings consist of a header (<h3>) notes (<p>) and a bill listing(<table>) // iterate through all committee meeting elements, parsing each for (Element currentElement : bodyElements) { if ("p".equalsIgnoreCase(currentElement.tag().getName())) { notesElement = currentElement; } else if ("table".equalsIgnoreCase(currentElement.tag().getName())) { billTableElement = currentElement; } else { if (headerElement != null) { alertInfoCommittees.add(parseInfoCommittee(refDateTime, weekOf, headerElement, notesElement, billTableElement, addendum)); headerElement = notesElement = billTableElement = null; } if ("h3".equalsIgnoreCase(currentElement.tag().getName())) { headerElement = currentElement; } } } return alertInfoCommittees; } private static AgendaAlertInfoCommittee parseInfoCommittee(LocalDateTime refDateTime, LocalDate weekOf, Element headerElement, Element notesElement, Element billTableElement, Version addendum) throws ParseError { AgendaAlertInfoCommittee aaic = new AgendaAlertInfoCommittee(); aaic.setReferenceId(new SpotCheckReferenceId(SpotCheckRefType.LBDC_AGENDA_ALERT, refDateTime)); aaic.setWeekOf(weekOf); aaic.setAddendum(addendum); String[] headerLines = ScrapeUtils.getFormattedText(headerElement).split("\n"); aaic.setCommitteeId(getCommitteeId(headerLines[0])); aaic.setChair(getChair(headerLines[1])); aaic.setMeetingDateTime(getMeetingTime(headerLines[2])); aaic.setLocation(headerLines[3].trim()); aaic.setNotes(notesElement != null ? ScrapeUtils.getFormattedText(notesElement).trim() : ""); if (billTableElement != null) { getCommitteeItems(billTableElement, SessionYear.of(aaic.getWeekOf().getYear())) .forEach(aaic::addInfoCommitteeItem); } return aaic; } private static CommitteeId getCommitteeId(String committeeNameLine) throws ParseError { Matcher committeeNameMatcher = committeeNamePattern.matcher(committeeNameLine); if (committeeNameMatcher.matches()) { return new CommitteeId(Chamber.SENATE, committeeNameMatcher.group(1).trim()); } throw new ParseError("could not parse committee name from " + committeeNameLine + ""); } private static String getChair(String chairLine) throws ParseError { Matcher chairMatcher = chairPattern.matcher(chairLine); if (chairMatcher.matches()) { return chairMatcher.group(1); } throw new ParseError("could not parse chair " + chairLine + ""); } private static LocalDateTime getMeetingTime(String meetingTimeLine) throws ParseError { Matcher meetingTimeMatcher = meetingTimePattern.matcher(meetingTimeLine); if (meetingTimeMatcher.matches()) { LocalDate meetingDay = LocalDate.parse(meetingTimeMatcher.group(2), DateTimeFormatter.ofPattern("MMMM d, yyyy")); String timeString = meetingTimeMatcher.group(1); LocalTime meetingTime; if (StringUtils.isBlank(timeString)) { meetingTime = LocalTime.MIDNIGHT; } else if ("12 Noon".equals(timeString)) { meetingTime = LocalTime.NOON; } else { meetingTime = LocalTime.parse(timeString, DateTimeFormatter.ofPattern("h:mm a")); } return meetingDay.atTime(meetingTime); } throw new ParseError("could not parse meeting time from '" + meetingTimeLine + "'"); } private static List<AgendaInfoCommitteeItem> getCommitteeItems(Element billTableElement, SessionYear sessionYear) { Elements rows = billTableElement.getElementsByTag("tr"); List<AgendaInfoCommitteeItem> committeeItems = new ArrayList<>(); rows.stream().filter(row -> row.getElementsByTag("th").isEmpty()) .forEach(row -> { String[] billEntry = ScrapeUtils.getFormattedText(row.children().first()).split("\n"); committeeItems.add(new AgendaInfoCommitteeItem( new BillId(billEntry[0].replaceAll("^\\s*(\\d+[A-z]?)\\s*$", "S$1"), sessionYear), billEntry.length > 1 ? billEntry[1].trim() : "" )); }); return committeeItems; } }