package gov.nysenate.openleg.processor.transcript; import org.apache.commons.lang3.text.WordUtils; import java.time.LocalDate; import java.time.LocalTime; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; /** * Set of methods that function on individual transcript lines to help with parsing logic. */ public class TranscriptLine { /** Regex to match any non alphanumeric or whitespace characters. */ private static final String invalidCharactersRegex = "[^a-zA-Z0-9 ]+"; /** All page numbers occur in the first 10 characters of a line. */ private static final int MAXIMUM_PAGE_LINE_INDEX = 10; /** The maximum number of lines on a page. A number greater than this * cannot be a line number. */ private static final int MAXIMUM_PAGE_LINE_NUMBER = 27; /** The actual text of the line. */ private final String text; public TranscriptLine(String text) { this.text = text; } public String fullText() { return text; } /** * Page number is usually right aligned at the top of each page. * However, sometimes it's left aligned on the next line instead. * e.g. 082895.v1, 011299.v1 * @return <code>true</code> if line contains a page number; * <code>false</code> otherwise. */ public boolean isPageNumber() { String validText = stripInvalidCharacters().trim(); if (isNumber(validText)) { if (isRightAligned(validText) || greaterThanMaxPageLineNum(validText)) { return true; } } return false; } /** * Determines if this TranscriptLine's text contains a line number. * @return <code>true</code> if this TranscriptLine contains a line number; * <code>false</code> otherwise. */ public boolean hasLineNumber() { // split on two spaces so time typo's don't get treated as line numbers. return isNumber(text.trim().split(" ")[0]) && !isPageNumber(); } /** * Attempts to remove the line number from this line. * @return Returns line text with the line number removed * or the text unaltered if it doesn't have a line number. */ public String removeLineNumber() { if (hasLineNumber()) { if (text.trim().length() < 2) { return text.trim().substring(1); } return text.trim().substring(2); } return text; } /** * Determine if this TranscriptLine's text contains the transcripts location. * @return */ public boolean isLocation() { if (text.toUpperCase().contains("ALBANY") && text.toUpperCase().contains("NEW") && text.toUpperCase().contains("YORK")) { return true; } return false; } /** * Extracts and returns the location data from a TranscriptLine. * Only use this if you know via {@link #isLocation()} that this line contains the location. * @return */ public String getLocation() { return removeLineNumber().toUpperCase().trim().replaceAll("\\s+", " "); } /** * Determines if this TranscriptLine's text contains date information. * @return */ public boolean isDate() { DateTimeFormatter dtf = DateTimeFormatter.ofPattern("MMMM d yyyy"); try { LocalDate.parse(getDateString(), dtf); } catch (DateTimeParseException ex) { return false; } return true; } /** * Extracts the date information from lines which containd the date. * Only use if the line contains date information via {@link #isDate()}. * @return */ public String getDateString() { return WordUtils.capitalizeFully(removeLineNumber().replace(" , ", " ").replace(", ", " ") .replace(",", " ").replace(".", "").replace(" ", " ").trim()); } /** * Determines if this TranscriptLine contains the time of the transcript. * @return */ public boolean isTime() { DateTimeFormatter dtf = DateTimeFormatter.ofPattern("hmma"); try { LocalTime.parse(getTimeString(), dtf); } catch (DateTimeParseException ex) { return false; } return true; } /** * Returns a string with time information parsed from this TranscriptLine. * Only use if {@link #isTime()} determines this line has time information. * @return */ public String getTimeString() { // remove all erroneous characters including spaces. String date = removeLineNumber().replace(":", "").replace(".", "").replace(" ", "").trim(); if (date.contains("Noon")) date = date.replace("Noon", "pm"); return date.toUpperCase(); } /** * Determines if this TranscriptLine contains the Transcript's session type info. * @return */ public boolean isSession() { if (text.contains("SESSION")) return true; return false; } public boolean isEmpty() { return text.replaceAll(invalidCharactersRegex,"").trim().isEmpty(); } /** * Determines if this TranscriptLine contains the stenographer information. * @return */ public boolean isStenographer() { return text.contains("Candyco Transcription Service, Inc.") || text.contains("(518) 371-8910"); } /** * Removes invalid characters from a line of text, such as broken pipe or binary. * @return The line with invalid characters removed. */ public String stripInvalidCharacters() { return text.replaceAll(invalidCharactersRegex,""); } /** --- Internal Methods --- */ private boolean isNumber(String text) { try { Integer.parseInt(text.trim()); } catch (NumberFormatException e) { return false; } return true; } private boolean greaterThanMaxPageLineNum(String validText) { if (Integer.valueOf(validText) > MAXIMUM_PAGE_LINE_NUMBER) { return true; } return false; } private boolean isRightAligned(String validText) { int startIndex = text.indexOf(validText); if (startIndex > MAXIMUM_PAGE_LINE_INDEX) { return true; } return false; } }