package gov.nysenate.openleg.processor.hearing; import gov.nysenate.openleg.util.PublicHearingTextUtils; import org.springframework.stereotype.Service; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @Service public class PublicHearingTitleParser { private static final Pattern TITLE = Pattern.compile( "(?<title>" + "((NEW YORK STATE )?FORUM/TOWN HALL" + "|PUBLIC (HEARING|FORUM)" + "|ROUNDTABLE DISCUSSION" + "|A NEW YORK STATE SENATE HEARING" + "|NEW YORK STATE \\d{4})" + ".+?) " + // Title body "*(?=-{10,})"); // Marks the end of title. /** * Extracts the PublicHearing title from the first page of the PublicHearingFile. * @param firstPage * @return */ public String parse(List<String> firstPage) { String pageText = turnPageIntoString(firstPage); Matcher matchTitle = TITLE.matcher(pageText); if (!matchTitle.find()) { return null; } return matchTitle.group("title"); } /** * Turns a list of String's into a single String with * whitespace and line numbers removed. * @param firstPage * @return */ private String turnPageIntoString(List<String> firstPage) { String pageText = ""; for (String line : firstPage) { if (PublicHearingTextUtils.hasContent(line)) { pageText += " " + PublicHearingTextUtils.stripLineNumber(line); } } return pageText; } }