package gov.nysenate.openleg.processor.daybreak; import gov.nysenate.openleg.model.base.SessionYear; import gov.nysenate.openleg.model.bill.BillId; import gov.nysenate.openleg.model.spotcheck.daybreak.DaybreakDocType; import gov.nysenate.openleg.model.spotcheck.daybreak.DaybreakFile; import gov.nysenate.openleg.model.spotcheck.daybreak.DaybreakFragment; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.Assert; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class DaybreakFileParser { private static final Logger logger = LoggerFactory.getLogger(DaybreakFileParser.class); /** Matches a single row in the bill table */ public static Pattern rowPattern = Pattern.compile("<tr.*?>(.+?)</tr>"); /** Used to remove undesired html and unwanted sections */ public static Pattern stripParts = Pattern.compile( "<b>(.*?)</b>|" + // Remove bold text "<(a|/a|td).*?>|" + // Remove a, /a, and td tags. Leave /td for later "<br>\\s*Criminal Sanction Impact." // Remove criminal impact text if present ); /** * Parses through an html daybreak file and extracts any found daybreak fragments * @param daybreakFile * @return */ public static List<DaybreakFragment> extractDaybreakFragments(DaybreakFile daybreakFile) throws IOException{ Assert.isTrue(daybreakFile.getDaybreakDocType() != DaybreakDocType.PAGE_FILE, "This parser is not for page files"); List<DaybreakFragment> daybreakFragments = new ArrayList<>(); String fullText = FileUtils.readFileToString(daybreakFile.getFile(), "UTF-8").replaceAll("\\r?\\n", " "); Matcher rowMatcher = rowPattern.matcher(fullText); rowMatcher.find(); // Throw the first two rows away rowMatcher.find(); // They are just headers for the table while(rowMatcher.find()){ // Each row contains 1 bill String text = stripParts.matcher(rowMatcher.group(1)) // Match all non <br> and </td> tags .replaceAll("") // Remove them .replace("</td>", "\n") // convert </td> and <br> to newlines .replace("<br>", "\n") .replace("�", " ") // Replace all instances of � with space ; // Here we are going through each line and trimming excess whitespace String[] lines = text.split("\\n"); String fragmentPrintNo = null; StringBuilder fragmentText = new StringBuilder(); fragmentText.ensureCapacity(text.length()); for(int i=0; i<lines.length; i++){ if(i==0){ // The first line should be the bill print number fragmentPrintNo = lines[i].trim(); } fragmentText.append(lines[i].trim()); fragmentText.append('\n'); } // TODO: it is assumed that the daybreak only contains bills from the current session year // todo: perhaps there is another way of getting the session year? BillId fragmentBillId = new BillId(fragmentPrintNo, SessionYear.of(daybreakFile.getReportDate().getYear())); daybreakFragments.add(new DaybreakFragment(fragmentBillId, daybreakFile, fragmentText.toString())); } return daybreakFragments; } }