package gov.nysenate.openleg.model.spotcheck; import gov.nysenate.openleg.model.base.SessionYear; import gov.nysenate.openleg.model.bill.BillId; import gov.nysenate.openleg.model.calendar.CalendarEntry; import gov.nysenate.openleg.model.calendar.CalendarId; import gov.nysenate.openleg.util.DateUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.File; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class ActiveListHTMLParser { public static ActiveListSpotcheckReference getSpotcheckReference(File html) throws Exception { List<CalendarEntry> entries = new ArrayList(); //Get sequence number from previous page //todo Get CalendarId from previous page, can use yearstring for year though String monthString = null, dayString = null, yearString = null; String calNo, billPrintNo; int sessionNumber = 0; int sequenceNo = 0; Document doc = Jsoup.parse(html, "UTF-8"); //get the additions to the HTML at top of file Elements a = doc.select("h1"); String releasedDT = a.get(1).text(); LocalDateTime releasedDateTime = LocalDateTime.parse(releasedDT, DateUtils.LRS_WEBSITE_DATETIME_FORMAT); sessionNumber = releasedDateTime.getYear(); //System.out.println("session number ::::::: " +sessionNumber); sequenceNo = Integer.parseInt(a.get(2).text()); //System.out.println("sequenceNo:::::::::::"+sequenceNo); Elements h3 = doc.getElementsByTag("h3"); String listTitle = h3.first().text(); Pattern datePattern = Pattern.compile("(Active List) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)" + " (January|February|March|April|May|June|July|August|September|October|November|December) (\\d{1,2}), (\\d{4})"); Matcher dateMatch = datePattern.matcher(listTitle); if (dateMatch.find()){ monthString = dateMatch.group(3); dayString = dateMatch.group(4); yearString = dateMatch.group(5); //System.out.println(monthString + " " + dayString + " " + yearString); } Element table = doc.getElementsByTag("table").get(0); //Gets table Elements rows = table.getElementsByTag("tr"); rows.remove(0); //Removes table legend for (Element row : rows) { //System.out.println(row.getElementsByTag("td").text()); Elements rowElements = row.getElementsByTag("td"); calNo = rowElements.get(0).text(); //gets the calendar number as a string from table for a bill billPrintNo = rowElements.get(1).text(); //gets the bill printNumber CalendarEntry listEntry = new CalendarEntry(Integer.parseInt(calNo), new BillId(billPrintNo, SessionYear.of(sessionNumber)));// BillId billId); entries.add(listEntry); //add current entry to the list } DateTimeFormatter formatter = DateTimeFormatter.ofPattern("MMMM-dd-yyyy"); LocalDate calDate = LocalDate.parse(monthString + "-" + dayString + "-" + yearString, formatter); LocalDateTime reportDate = LocalDateTime.now(); CalendarId calendarId = new CalendarId(sessionNumber, Integer.parseInt(yearString)); return new ActiveListSpotcheckReference(sequenceNo, calendarId, calDate, releasedDateTime, reportDate, entries); } }