package gov.nysenate.openleg.processor.law; import gov.nysenate.openleg.model.law.LawChapterCode; import gov.nysenate.openleg.model.law.LawDocInfo; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.text.WordUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; public class LawTitleParser { private static final Logger logger = LoggerFactory.getLogger(LawTitleParser.class); protected static String sectionTitlePattern = "(?i)((?:Section|§)\\s*%s).?\\s(.+?)\\.(.*)"; protected static Pattern tocStartPattern = Pattern.compile("(Section|Article)\\s+\\n?[0-9a-zA-Z-.]+"); protected static Pattern nonSectionPrefixPattern = Pattern.compile("((\\*\\s*)?(SUB)?(ARTICLE|TITLE|PART)(.+?)(\\\\n|--))"); protected static Pattern contiguousUppercasePattern = Pattern.compile("([^a-z]+(\\b))"); protected static Pattern contiguousUppercaseExcludeTocPattern = Pattern.compile("(~\\s+\\d+\\.\\s*)$"); /** --- Methods --- */ public static String extractTitle(LawDocInfo lawDocInfo, String bodyText) { String title = ""; if (lawDocInfo != null) { switch (lawDocInfo.getDocType()) { case CHAPTER: title = extractTitleFromChapter(lawDocInfo); break; case SUBTITLE: case PART: case SUB_PART: case ARTICLE: case TITLE: title = extractTitleFromNonSection(lawDocInfo, bodyText); break; case SECTION: title = extractTitleFromSection(lawDocInfo, bodyText); break; case INDEX: break; case CONTENTS: break; default: break; } } return title; } /** * Extract the chapter title using the mapping of law id to LawChapterType if possible. */ protected static String extractTitleFromChapter(LawDocInfo docInfo) { try { LawChapterCode chapterType = LawChapterCode.valueOf(docInfo.getLawId()); return chapterType.getName(); } catch (IllegalArgumentException ex) { return docInfo.getLawId() + " Law"; } } /** * Parses the title for an article by assuming that most article titles are presented in all caps. */ protected static String extractTitleFromNonSection(LawDocInfo lawDocInfo, String bodyText) { String title = bodyText; // Remove the location designator Matcher prefixMatcher = nonSectionPrefixPattern.matcher(bodyText); if (prefixMatcher.find()) { title = title.substring(prefixMatcher.end()); } // Check if there is a 'Section X' in the body, it is usually a good indicator of where the title ends. Matcher sectionStartMatcher = tocStartPattern.matcher(title); if (sectionStartMatcher.find()) { title = title.substring(0, sectionStartMatcher.start()); } // Otherwise try to find the first contiguous sequence of uppercase characters. else { // Replace new lines with an easier to detect symbol that doesn't break the uppercase. title = title.replaceAll("\\\\n", "~"); Matcher uppercaseMatcher = contiguousUppercasePattern.matcher(title); if (uppercaseMatcher.find()) { title = title.substring(0, uppercaseMatcher.end()); Matcher removeLastNumberMatcher = contiguousUppercaseExcludeTocPattern.matcher(title); if (removeLastNumberMatcher.find()) { title = title.substring(0, removeLastNumberMatcher.start()); } title = title.replaceAll("~", " "); } // Otherwise just grab the whole thing (truncate it to say 240 characters) and call it a day. else { title = title.substring(0, Integer.min(240, title.length())); } } return capitalizeTitle(title.replaceAll("(\\\\n|\\s{2,})", " ").trim()); } /** * Extract the title from the section document using a common pattern if applicable or just getting the * first line or so. */ protected static String extractTitleFromSection(LawDocInfo docInfo, String text) { String title = ""; if (text != null && !text.isEmpty()) { int asteriskLoc = docInfo.getLocationId().indexOf("*"); String locationId = (asteriskLoc != -1) ? docInfo.getLocationId().substring(0, asteriskLoc) : docInfo.getLocationId(); Pattern titlePattern = Pattern.compile(String.format(sectionTitlePattern, locationId.toLowerCase())); int sectionIdx = text.indexOf("§"); String trimText = (sectionIdx != -1) ? text.substring(sectionIdx).trim() : text.trim(); Matcher titleMatcher = titlePattern.matcher(trimText); if (titleMatcher.matches()) { title = titleMatcher.group(2).replaceAll("-\\\\n\\s*", "").replaceAll("\\\\n?\\s*", " "); } else { logger.warn("Section title pattern mismatch for document id {}", docInfo.getDocumentId()); title = trimText; } } return StringUtils.abbreviate(title, 140); } protected static String capitalizeTitle(String title) { if (title != null && !title.isEmpty()) { String capStr = WordUtils.capitalizeFully(title); return capStr.substring(0, 1) + Stream.of(capStr.substring(1).split(" ")) .map(s -> (s.matches("(Of|Or|The|For|A|And|An)")) ? s.toLowerCase() : s) .collect(Collectors.joining(" ")); } return title; } }