package gov.nysenate.openleg.processor.daybreak;

import gov.nysenate.openleg.model.base.SessionYear;
import gov.nysenate.openleg.model.bill.BillId;
import gov.nysenate.openleg.model.spotcheck.daybreak.DaybreakDocType;
import gov.nysenate.openleg.model.spotcheck.daybreak.DaybreakFile;
import gov.nysenate.openleg.model.spotcheck.daybreak.PageFileEntry;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.Assert;

import java.io.IOException;
import java.text.ParseException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses daybreak "page files" into {@link PageFileEntry} objects.
 *
 * Each data line of a page file is a comma separated record of the form
 * SESSYR,SEN_HSE,SEN_NO,SEN_AMD,ASM_HSE,ASM_NO,ASM_AMD,OUT_DATE,PAGES
 * and the first line of the file is a header that is skipped.
 */
public class DaybreakPageFileParser {

    private static final Logger logger = LoggerFactory.getLogger(DaybreakPageFileParser.class);

    /** Date pattern of the OUT_DATE column. */
    private static final String publishDateMatchPattern = "MM/dd/yyyy";

    /**
     * Parses a page daybreak file into PageFileEntries.
     *
     * The file is read as latin1 text. The first line is treated as a header and skipped,
     * blank lines are ignored, and every remaining line is converted into one entry.
     * An empty file produces an empty list.
     *
     * @param daybreakFile the daybreak file to parse; its doc type must be
     *                     {@link DaybreakDocType#PAGE_FILE}
     * @return the entries in file order, never null
     * @throws IOException propagated from reading the underlying file
     * @throws IllegalArgumentException if the file is not a page file
     * @throws ArrayIndexOutOfBoundsException if a non-blank data line has fewer than 9 fields
     * @throws NumberFormatException if the PAGES field of a line is not an integer
     */
    public static List<PageFileEntry> extractPageFileEntries(DaybreakFile daybreakFile) throws IOException {
        Assert.isTrue(daybreakFile.getDaybreakDocType() == DaybreakDocType.PAGE_FILE,
                "this method only parses page files");

        // Get file full text
        List<String> lines = FileUtils.readLines(daybreakFile.getFile(), "latin1");

        List<PageFileEntry> pageFileEntries = new ArrayList<>();
        // Index 0 is the header line. Starting at 1 also means an empty file simply
        // yields no entries instead of failing while removing a non-existent header.
        for (int i = 1; i < lines.size(); i++) {
            String line = lines.get(i);
            if (!line.trim().isEmpty()) {
                pageFileEntries.add(getPageFileEntryFromLine(line, daybreakFile));
            }
        }
        return pageFileEntries;
    }

    /**
     * Converts a line of text into a PageFileEntry object.
     *
     * Page file line format:
     * SESSYR,SEN_HSE,SEN_NO,SEN_AMD,ASM_HSE,ASM_NO,ASM_AMD,OUT_DATE,PAGES
     *
     * A session year or publish date that cannot be parsed is logged and left as null
     * (best effort); a missing field or a non-numeric page count is not caught here.
     *
     * @param line a single non-blank data line of the page file
     * @param daybreakFile the file the line came from, stored on the resulting entry
     * @return the entry described by the line; the senate or assembly bill id is null
     *         when the corresponding columns are empty
     */
    private static PageFileEntry getPageFileEntryFromLine(String line, DaybreakFile daybreakFile) {
        String[] parts = line.split(",");

        SessionYear sessionYear = null;
        try {
            sessionYear = SessionYear.of(Integer.parseInt(parts[0]));
        } catch (NumberFormatException ex) {
            // Keep going with a null session year, but record which value failed and why.
            logger.error("Could not parse PageFileEntry session year " + parts[0], ex);
        }

        LocalDate publishDate = null;
        try {
            // parseDateStrictly yields a java.util.Date; it is converted to a LocalDate
            // using the system default time zone.
            publishDate = LocalDateTime.ofInstant(
                    DateUtils.parseDateStrictly(parts[7], publishDateMatchPattern).toInstant(),
                    ZoneId.systemDefault()
            ).toLocalDate();
        } catch (ParseException ex) {
            logger.error("Could not parse PageFileEntry publish date " + parts[7], ex);
        }

        int pages = Integer.parseInt(parts[8]);

        String senatePrintNo = buildPrintNo(parts[1], parts[2], parts[3]);
        String assemblyPrintNo = buildPrintNo(parts[4], parts[5], parts[6]);

        BillId senateBillId = senatePrintNo.isEmpty() ? null : new BillId(senatePrintNo, sessionYear);
        BillId assemblyBillId = assemblyPrintNo.isEmpty() ? null : new BillId(assemblyPrintNo, sessionYear);

        return new PageFileEntry(senateBillId, assemblyBillId, daybreakFile, publishDate, pages);
    }

    /**
     * Joins the house, number and amendment columns into a single print number string,
     * stripping leading zeros from the number and trimming the combined result.
     */
    private static String buildPrintNo(String house, String number, String amendment) {
        return (house + number.replaceAll("^0*", "") + amendment).trim();
    }
}