package gov.nysenate.openleg.util;
import gov.nysenate.openleg.processor.transcript.TranscriptLine;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helpers that split raw transcript text into pages and normalize
 * the lines of each page for PDF rendering.
 */
public class TranscriptTextUtils
{
    /**
     * Lines whose index is at or below this value never end a page because of a
     * following page number. This keeps the page number that belongs to the first
     * page from being treated as the start of a new page.
     */
    private static final int FIRST_PAGE_NUMBER_LINE_LIMIT = 10;

    /**
     * Generates pages from transcript text in a common format.
     *
     * <p>Transcript text of all formats is converted into a common single spaced
     * format, and minor errors on the first page are corrected.</p>
     *
     * <p>Empty lines and stenographer lines are dropped from every page. The first
     * page additionally gets blank lines inserted after certain header lines when
     * it does not carry line numbers.</p>
     *
     * @param fullText the complete transcript text, with lines separated by {@code "\n"}.
     * @return one list of lines per page, in document order; empty if the text
     *         contains no lines at all (e.g. it consists only of newlines).
     * @throws NullPointerException if {@code fullText} is null.
     */
    public static List<List<String>> getPdfFormattedPages(String fullText) {
        List<List<String>> rawPages = getPages(fullText);
        fixErrorsOnFirstPage(rawPages);
        List<List<String>> formattedPages = new ArrayList<>(rawPages.size());
        // Identify the first page by position, not by content equality, so a later
        // page that happens to equal the first page is not given first-page spacing.
        for (int pageIndex = 0; pageIndex < rawPages.size(); pageIndex++) {
            List<String> pageLines = rawPages.get(pageIndex);
            boolean manualSpacing = pageIndex == 0 && !pageHasLineNumbers(pageLines);
            formattedPages.add(formatPage(pageLines, manualSpacing));
        }
        return formattedPages;
    }

    /**
     * Parse individual transcript text pages by their page numbers.
     * This solution works for all transcript text formats.
     *
     * @param fullText the complete transcript text, with lines separated by {@code "\n"}.
     * @return the raw lines of each page; empty if splitting the text yields no lines.
     */
    private static List<List<String>> getPages(String fullText) {
        List<List<String>> pages = new ArrayList<>();
        List<String> page = new ArrayList<>();
        // Note: String.split drops trailing empty strings, so text made up solely
        // of newlines produces zero lines and therefore zero pages.
        String[] lines = fullText.split("\n");
        for (int lineNum = 0; lineNum < lines.length; lineNum++) {
            page.add(lines[lineNum]);
            if (endOfPage(lines, lineNum)) {
                pages.add(page);
                page = new ArrayList<>();
            }
        }
        return pages;
    }

    /**
     * Determines whether the line at {@code lineNum} is the last line of its page.
     *
     * @param pageLines all lines of the transcript.
     * @param lineNum index of the line being examined.
     * @return true if this is the final line of the transcript, or if it lies past
     *         the first-page-number limit and the next line is a page number.
     */
    private static boolean endOfPage(String[] pageLines, int lineNum) {
        int nextLineNum = lineNum + 1;
        if (nextLineNum >= pageLines.length) {
            // The final line always closes the current page.
            return true;
        }
        if (lineNum <= FIRST_PAGE_NUMBER_LINE_LIMIT) {
            // Ignore the first page number.
            return false;
        }
        return new TranscriptLine(pageLines[nextLineNum]).isPageNumber();
    }

    /**
     * Converts the raw lines of one page into their output form.
     *
     * <p>Page number lines are added with invalid characters stripped; empty lines
     * and stenographer lines are dropped; all other lines are added as is.</p>
     *
     * @param pageLines the raw lines of a single page.
     * @param manualSpacing if true, blank lines are inserted after certain header
     *                      lines. Transcripts without line numbers must have their
     *                      spacing done manually.
     * @return the formatted lines of the page.
     */
    private static List<String> formatPage(List<String> pageLines, boolean manualSpacing) {
        List<String> page = new ArrayList<>();
        for (String pageLine : pageLines) {
            TranscriptLine line = new TranscriptLine(pageLine);
            if (line.isPageNumber()) {
                page.add(line.stripInvalidCharacters());
            }
            else if (!line.isEmpty() && !line.isStenographer()) {
                page.add(line.fullText());
                if (manualSpacing) {
                    addBlankLines(page, blankLinesAfter(line));
                }
            }
        }
        return page;
    }

    /**
     * Number of blank lines to insert after the given line when spacing manually.
     *
     * @param line a non-empty content line.
     * @return 2 after the "NEW YORK STATE SENATE" line, a line containing
     *         "STENOGRAPHIC RECORD", or a time line; 3 after a session line;
     *         0 otherwise.
     */
    private static int blankLinesAfter(TranscriptLine line) {
        String text = line.fullText().trim();
        if (text.equals("NEW YORK STATE SENATE")
                || text.contains("STENOGRAPHIC RECORD")
                || line.isTime()) {
            return 2;
        }
        if (line.isSession()) {
            return 3;
        }
        return 0;
    }

    /**
     * Fixes a variety of formatting errors that occur on the first page of the original documents.
     *
     * <p>Empty lines are removed, and a line ending in "," or ", Acting" is joined
     * with a following "President" / "Acting President" line. The first entry of
     * {@code pages} is replaced with the corrected page. Does nothing if there are
     * no pages.</p>
     *
     * @param pages all raw pages of the transcript; modified in place.
     */
    private static void fixErrorsOnFirstPage(List<List<String>> pages) {
        if (pages.isEmpty()) {
            return;
        }
        List<String> firstPage = pages.get(0);
        List<String> correctedFirstPage = new ArrayList<>();
        for (int i = 0; i < firstPage.size(); i++) {
            TranscriptLine line = new TranscriptLine(firstPage.get(i));
            if (line.isEmpty()) {
                continue;
            }
            if (line.fullText().endsWith(",") || line.fullText().endsWith(", Acting")) {
                // Combine two lines into one; corrects formatting. i.e. 123096.v1
                TranscriptLine nextLine = getNextLine(firstPage, i);
                // nextLine is null when the current line is the last one on the page.
                if (nextLine != null) {
                    String nextText = nextLine.fullText().trim();
                    if (nextText.equals("President") || nextText.equals("Acting President")) {
                        line = new TranscriptLine(line.fullText() + " " + nextText);
                        // Skip next line since we combined it with the previous line.
                        i++;
                    }
                }
            }
            correctedFirstPage.add(line.fullText());
        }
        pages.set(0, correctedFirstPage);
    }

    /**
     * Appends {@code numLines} empty strings to {@code page}.
     */
    private static void addBlankLines(List<String> page, int numLines) {
        for (int i = 0; i < numLines; i++) {
            page.add("");
        }
    }

    /**
     * Reports whether a page uses line numbers, judged by the first line that is
     * neither empty nor a page number.
     *
     * @param pageLines the raw lines of a single page.
     * @return whether that first content line has a line number; false if the page
     *         has no such line.
     */
    private static boolean pageHasLineNumbers(List<String> pageLines) {
        for (String pageLine : pageLines) {
            TranscriptLine line = new TranscriptLine(pageLine);
            if (!line.isEmpty() && !line.isPageNumber()) {
                return line.hasLineNumber();
            }
        }
        return false;
    }

    /**
     * Returns the line following index {@code i}, or null if {@code i} is the last line.
     */
    private static TranscriptLine getNextLine(List<String> pageLines, int i) {
        if (i + 1 < pageLines.size()) {
            return new TranscriptLine(pageLines.get(i + 1));
        }
        return null;
    }
}