package gov.nysenate.openleg.processor.bill; import gov.nysenate.openleg.model.bill.BillTextType; import gov.nysenate.openleg.processor.base.ParseError; import java.time.LocalDate; import java.time.LocalDateTime; import java.util.regex.Matcher; import java.util.regex.Pattern; public class BillTextParser { private static final Pattern textHeaderPattern = Pattern.compile("00000\\.SO DOC.{17}([A-Z* ]{9})[A-Z0-9 ]{16}([A-Z ]{20}) ([0-9]{4}).*"); private String data; private BillTextType billTextType; private LocalDateTime dateTime; /** This is set as true when the parser has received a valid text header * and has not yet received a closing header */ private boolean insideTextHeader; /** A flag that is set to true when the parser detects a delete statement */ private boolean deleted = false; public BillTextParser(String data, BillTextType billTextType, LocalDateTime dateTime) { this.data = data; this.billTextType = billTextType; this.dateTime = dateTime; this.insideTextHeader = false; } /** * Applies information to bill text or memo; replaces any existing information. * Header lines start with 00000.SO DOC and contain one of three actions: * * '' - Start of the bill text</li> * *END* - End of the bill text</li> * *DELETE* - Deletes existing bill text</li> * * Examples * ----------------------------------------------------------------------------------------------------- * Resolution Text | R00000.SO DOC A R22 RESO TEXT 2013 * | R00001LEGISLATIVE RESOLUTION congratulating the Maine-Endwell Football Team * | R00000.SO DOC A R22 *END* RESO TEXT 2013 * ----------------------------------------------------------------------------------------------------- * Bill Text | T00000.SO DOC S 53 BTXT 2013 * | T00002 S T A T E O F N E W Y O R K * | T00000.SO DOC S 53 *END* BTXT 2013 * ----------------------------------------------------------------------------------------------------- * Memo Text | M00000.SO DOC S 1626 MTXT 2013 * | M00006PURPOSE OR GENERAL IDEA OF BILL: The purpose of this bill is to * | M00000.SO DOC S 1625 *END* MTXT 2013 * ----------------------------------------------------------------------------------------------------- * Delete | T00000.SO DOC A 8396 *DELETE* BTXT 2013 * ----------------------------------------------------------------------------------------------------- * * @throws ParseError */ public String extractText() throws ParseError { // BillText, ResolutionText, and MemoText can be handled the same way. // Since Text Blocks can be back to back we constantly look for headers // with actions that tell us to start over, end, or delete. StringBuilder text = new StringBuilder(); text.ensureCapacity(data.length()); String fullText = ""; for (String line : data.split("\n")) { fullText = parseLine(line, text, fullText); } if (insideTextHeader) { // This is a known issue that was resolved on 03/23/2011 if (dateTime.isAfter(LocalDate.of(2011, 3, 23).atStartOfDay())) { throw new ParseError("Finished text data without a footer"); } else { // Commit what we have and move on fullText = text.toString(); } } return fullText; } /** * Performs parsing actions for a line of the bill text * @param line */ protected String parseLine(String line, StringBuilder text, String fullText) throws ParseError { Matcher header = textHeaderPattern.matcher(line); if (line.startsWith("00000") && header.find()) { String action = header.group(1).trim(); String type = header.group(2).trim(); if (!type.matches(billTextType.getTypeString())) { throw new ParseError("Unknown text type found: " + type); } switch (action) { case "*DELETE*": text.setLength(0); insideTextHeader = false; this.deleted = true; break; case "*END*": if (insideTextHeader) { this.deleted = false; fullText = text.toString(); text.setLength(0); insideTextHeader = false; } else { throw new ParseError("Text END Found before a body: " + line); } break; case "": // No action indicates the start of a text header // This header repeats every 100 lines insideTextHeader = true; break; default: throw new ParseError("Unrecognized action type: " + line); } } else if (insideTextHeader) { // Remove the leading numbers text.append((line.length() > 5) ? line.substring(5) : line.substring(line.length())); text.append("\n"); } else { throw new ParseError("Text Body found before header: "+line); } return fullText; } /** Basic Getters / Setters **/ public boolean isDeleted() { return deleted; } }