package gov.nysenate.openleg.util;

import com.google.common.base.Splitter;
import com.google.common.base.Strings;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for splitting bill and resolution full text into pages,
 * counting pages and applying basic formatting to the text.
 *
 * Text is always split into lines on the newline character.
 */
public class BillTextUtils
{
    /** Start-of-page pattern. Not referenced by any method in this class. */
    protected static Pattern startPagePattern =
        Pattern.compile("(^\\s+\\w\\.\\s\\d+(--\\w)?\\s+\\d+(\\s+\\w\\.\\s\\d+(--\\w)?)?$|^\\s+\\d+\\s+\\d+\\-\\d+\\-\\d$|^\\s+\\d{1,4}$)");

    /** End-of-page pattern. Not referenced by any method in this class. */
    protected static Pattern endPagePattern = Pattern.compile("^\\s*(EXPLANATION--Matter|LBD[0-9-]+$)");

    /**
     * Matches a line that begins with one to five spaces, a number and a space.
     * Not referenced by any method in this class.
     */
    protected static Pattern textLinePattern = Pattern.compile("^ {1,5}[0-9]+ ");

    /**
     * Matches the first line of a new page of bill text (e.g. A. 7461--A  2 ...).
     * Capture group 3 holds the page number.
     *
     * NOTE(review): the '.' in "\\w.\\s" is unescaped and therefore matches any
     * character rather than only a literal period. It is left untouched here so
     * that page detection results do not change - confirm whether "\\." was intended.
     */
    protected static Pattern billTextPageStartPattern =
        Pattern.compile("^(\\s+\\w.\\s\\d+(--\\w)?)?\\s{10,}(\\d+)(\\s{10,}(\\w.\\s\\d+(--\\w)?)?(\\d+-\\d+-\\d(--\\w)?)?)?$");

    /** Maximum number of lines placed on a single page of resolution text. */
    protected static Integer MAX_LINES_RES_PAGE = 60;

    /**
     * Extracts a list of numbers which represent the line indices in which a
     * page break occurs. The indices start at 0 since they are extracted
     * from an array of lines.
     *
     * @param fullText String - Bill full text, may be null
     * @return List<Integer> - Indices of the lines that start a new page; empty if
     *                         fullText is null or contains no page break
     */
    public static List<Integer> getNewPageLines(String fullText) {
        if (fullText == null) {
            // The splitter rejects null input, so report "no page breaks" instead of failing.
            return new ArrayList<>();
        }
        return getNewPageLines(Splitter.on("\n").splitToList(fullText));
    }

    /**
     * Uses the new page lines to generate a list of pages from the bill text.
     * Every page is a view over the lines between two consecutive page breaks; the
     * last page runs to the end of the text.
     *
     * @param fullText String - Bill full text, may be null
     * @return List<List<String>> - The lines of each page; empty if fullText is null
     */
    public static List<List<String>> getBillPages(String fullText) {
        List<List<String>> pages = new ArrayList<>();
        if (fullText == null) {
            return pages;
        }
        List<String> lines = Splitter.on("\n").splitToList(fullText);
        int pageStart = 0;
        for (int pageBreak : getNewPageLines(lines)) {
            pages.add(lines.subList(pageStart, pageBreak));
            pageStart = pageBreak;
        }
        // Whatever follows the final page break makes up the last page.
        pages.add(lines.subList(pageStart, lines.size()));
        return pages;
    }

    /**
     * Returns the pages for resolution full text. Since resolutions don't have the same
     * formatting cues as bills, we just cap the pages to a certain number of lines.
     *
     * @param fullText String - Resolution full text, may be null
     * @return List<List<String>> - The lines of each page, at most MAX_LINES_RES_PAGE
     *                              lines per page; empty if fullText is null
     */
    public static List<List<String>> getResolutionPages(String fullText) {
        List<List<String>> pages = new ArrayList<>();
        if (fullText == null) {
            return pages;
        }
        List<String> lines = Splitter.on("\n").splitToList(fullText);
        int lineCount = lines.size();
        // Unbox once; stepping by the page size yields ceil(lineCount / linesPerPage) pages
        // without going through the deprecated Double constructor.
        int linesPerPage = MAX_LINES_RES_PAGE;
        for (int pageStart = 0; pageStart < lineCount; pageStart += linesPerPage) {
            int pageEnd = Math.min(pageStart + linesPerPage, lineCount);
            pages.add(lines.subList(pageStart, pageEnd));
        }
        return pages;
    }

    /**
     * Collects the indices of all lines that start a new page.
     *
     * @param lines List<String> - Lines of the bill text
     * @return List<Integer> - Zero-based indices of the lines that start a new page
     */
    private static List<Integer> getNewPageLines(List<String> lines) {
        List<Integer> pageLines = new ArrayList<>();
        for (int i = 0; i < lines.size(); i++) {
            if (isFirstLineOfNextPage(lines.get(i), i)) {
                pageLines.add(i);
            }
        }
        return pageLines;
    }

    /**
     * Returns the number of pages contained within the supplied bill text. Although
     * we could have just used the {@link #getNewPageLines(String)} method, iterating
     * through the lines in reverse looking for the page number in the pattern is a few
     * times more efficient.
     *
     * @param fullText String - Bill full text
     * @return int - 0 if the text is null or empty, the page number found on the last
     *               page start line, or 1 if no page start line is found
     */
    public static int getPageCount(String fullText) {
        // Short circuit
        if (Strings.isNullOrEmpty(fullText)) {
            return 0;
        }
        // Iterate through the lines in reverse order (until 10 to prevent errors)
        // looking for the last page number (e.g. A. 7461--A     2 ...)
        String[] lines = fullText.split("\n");
        for (int i = lines.length - 1; i > 10; i--) {
            Matcher pageStartMatcher = billTextPageStartPattern.matcher(lines[i]);
            if (pageStartMatcher.find()) {
                return Integer.parseInt(pageStartMatcher.group(3));
            }
        }
        // Since there are no page indicators, just assume it's a single page bill
        return 1;
    }

    /**
     * WIP
     *
     * Drops the first 7 characters (presumably the line number gutter - confirm) from
     * every line of bill text that is longer than 7 characters. Shorter lines are kept
     * as they are. Every line of the result, including the last one, is terminated
     * with a newline.
     *
     * @param isResolution boolean - If true the text is returned untouched
     * @param fullText String - Full text, may be null
     * @return String - The formatted text, or fullText itself if it is null, empty or
     *                  belongs to a resolution
     */
    public static String formatBillText(boolean isResolution, String fullText) {
        if (isResolution || fullText == null || fullText.isEmpty()) {
            return fullText;
        }
        StringBuilder formattedFullText = new StringBuilder();
        for (String line : Splitter.on("\n").splitToList(fullText)) {
            String formattedLine = (line.length() > 7) ? line.substring(7) : line;
            formattedFullText.append(formattedLine).append("\n");
        }
        return formattedFullText.toString();
    }

    /**
     * Checks if the given line matches the new page pattern.
     *
     * @param line String - A single line of bill text
     * @param lineNum int - Zero-based index of the line within the text
     * @return boolean - true if the line is past the first 10 lines and starts a new page
     */
    private static boolean isFirstLineOfNextPage(String line, int lineNum) {
        // Ignore erroneous result in first 10 lines.
        if (lineNum <= 10) {
            return false;
        }
        return billTextPageStartPattern.matcher(line).find();
    }
}