package gov.nysenate.openleg.service.scraping;
import gov.nysenate.openleg.model.base.SessionYear;
import gov.nysenate.openleg.model.bill.BaseBillId;
import gov.nysenate.openleg.model.bill.BillId;
import gov.nysenate.openleg.model.spotcheck.billtext.BillTextReference;
import gov.nysenate.openleg.processor.base.ParseError;
import gov.nysenate.openleg.util.DateUtils;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import java.io.File;
import java.io.IOException;
import java.time.LocalDateTime;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses scraped bill HTML files into {@link BillTextReference} objects.
 * The session year, print number and reference date/time are read from the file name;
 * the active amendment, bill text and sponsor memo are read from the page itself.
 *
 * Created by kyle on 3/10/15.
 */
@Service
public class ScrapedBillTextParser {

    /** Character set used to read scraped files, both for Jsoup and for the raw file contents. */
    private static final String SCRAPED_FILE_CHARSET = "UTF-8";

    /** File name layout: {4 digit year}-{print no}-{8 digit date}T{6 digit time}.html */
    private static final Pattern scrapedBillFilePattern =
            Pattern.compile("^(\\d{4})-([A-Za-z]\\d+)-(\\d{8}T\\d{6})\\.html$");

    /** Print number with an optional "-X" amendment version suffix. */
    private static final Pattern billIdPattern = Pattern.compile("^([A-Za-z]\\d+)(?:-([A-Za-z]))?$");

    /** Leading whitespace followed by the first word (two or more letters) of a resolution. */
    private static final Pattern resolutionStartPattern = Pattern.compile("^\\s+([A-Za-z]{2,})");

    /**
     * Parses a scraped bill file into a bill text reference containing an active amendment,
     * full text, and a sponsor memo.
     *
     * @param file File
     * @return BillTextReference
     * @throws IOException if there are troubles reading the file
     * @throws ParseError if the file name does not match the expected scraped bill file name format
     */
    public BillTextReference parseReference(File file) throws IOException, ParseError {
        Matcher filenameMatcher = scrapedBillFilePattern.matcher(file.getName());
        if (!filenameMatcher.matches()) {
            throw new ParseError("Could not parse scraped bill filename: " + file.getName());
        }

        // Parse metadata from the file name
        BaseBillId baseBillId =
                new BaseBillId(filenameMatcher.group(2), Integer.parseInt(filenameMatcher.group(1)));
        LocalDateTime referenceDateTime =
                LocalDateTime.parse(filenameMatcher.group(3), DateUtils.BASIC_ISO_DATE_TIME);

        Document document = Jsoup.parse(file, SCRAPED_FILE_CHARSET);

        // If the scraped page indicates the bill was not found, return a "not found" bill text reference
        // that carries the raw file contents. The file is read with the same charset Jsoup used so the
        // stored text does not depend on the platform default encoding.
        if (billNotFound(document)) {
            String rawContents = FileUtils.readFileToString(file, SCRAPED_FILE_CHARSET);
            return new BillTextReference(baseBillId, referenceDateTime, rawContents, "", true);
        }

        try {
            // Get the active amendment id, full text and memo
            BillId billId = getBillId(document, baseBillId.getSession());
            String text = getText(document, baseBillId);
            String memo = getMemo(document, baseBillId);
            return new BillTextReference(billId, referenceDateTime, text, memo, false);
        } catch (ParseError ignored) {
            // NOTE(review): a page that cannot be parsed is reported as an empty "not found" reference
            // instead of being rethrown, which hides the cause from the caller - confirm this is intended.
            return new BillTextReference(baseBillId, referenceDateTime, "", "", true);
        }
    }

    /** --- Internal Methods --- */

    /**
     * Parses the amendment bill id from the first {@code span.nv_bot_info > strong} element.
     *
     * @param document the parsed scraped page
     * @param sessionYear session year to attach to the resulting bill id
     * @return the bill id, including the amendment version when one is present
     * @throws ParseError if the element is missing or its text is not a valid print number
     */
    private BillId getBillId(Document document, SessionYear sessionYear) throws ParseError {
        Element printNoEle = document.select("span.nv_bot_info > strong").first();
        if (printNoEle == null) {
            throw new ParseError("could not get scraped bill print no:");
        }
        Matcher printNoMatcher = billIdPattern.matcher(printNoEle.text());
        if (!printNoMatcher.matches()) {
            throw new ParseError("could not parse scraped bill print no: " + printNoEle.text());
        }
        String basePrintNo = printNoMatcher.group(1);
        String version = printNoMatcher.group(2);
        return new BillId(basePrintNo + (version != null ? version : ""), sessionYear);
    }

    /**
     * Parses the full bill text and formats it to account for standard differences between LRS and sobi data.
     *
     * @param document the parsed scraped page
     * @param baseBillId bill id used to pick resolution or bill formatting
     * @return the formatted bill text
     * @throws ParseError if the contents element cannot be located
     */
    private String getText(Document document, BaseBillId baseBillId) throws ParseError {
        Element contents = document.getElementById("nv_bot_contents");
        if (contents == null) {
            throw new ParseError("Could not locate scraped bill contents");
        }
        StringBuilder textBuilder = new StringBuilder();
        // Bill text is found in all pre tags contained in <div id="nv_bot_contents">
        // before the first <hr class="noprint">
        for (Element element : contents.children()) {
            if ("pre".equalsIgnoreCase(element.tagName())) {
                processTextNode(element, textBuilder);
            } else if ("hr".equalsIgnoreCase(element.tagName()) && element.classNames().contains("noprint")) {
                break;
            }
        }
        return formatBillText(textBuilder.toString(), baseBillId);
    }

    /**
     * Alters the raw bill text to match the standard formatting of sobi bill text.
     *
     * @param billText raw text extracted from the scraped page
     * @param billId bill id; its bill type selects resolution or bill formatting
     * @return the reformatted text
     */
    private String formatBillText(String billText, BaseBillId billId) {
        // Drop carriage returns, characters in the U+FEFF..U+FFFF range, and one space after each newline
        billText = billText.replaceAll("[\r\\uFEFF-\\uFFFF]|(?<=\n) ", "");
        // The section sign (U+00A7) is represented as a plain "S"
        billText = billText.replace("\u00A7", "S");
        if (billId.getBillType().isResolution()) {
            // Strip the two header lines at the top of the resolution
            billText = billText.replaceFirst("^\n\n[\\w \\.-]+\n\n[\\w '\\.\\-:]+\n", "");
            billText = billText.replaceFirst("^\\s+PROVIDING",
                    String.format("\n%s RESOLUTION providing", billId.getChamber()));
            // NOTE(review): when the replacement above applied, the text now starts with the chamber name,
            // which this pattern also matches - confirm that rewriting it a second time is intended.
            Matcher resoStartMatcher = resolutionStartPattern.matcher(billText);
            if (resoStartMatcher.find()) {
                String resolutionStart =
                        "\nLEGISLATIVE RESOLUTION " + resoStartMatcher.group(1).toLowerCase(Locale.ROOT);
                billText = resoStartMatcher.replaceFirst(Matcher.quoteReplacement(resolutionStart));
            }
        } else {
            billText = billText.replaceFirst("^\n\n[ ]{12}STATE OF NEW YORK(?=\n)",
                    "\n S T A T E O F N E W Y O R K");
            billText = billText.replaceFirst("(?<=\\n)[ ]{16}IN SENATE(?=\\n)",
                    " I N S E N A T E");
            billText = billText.replaceFirst("(?<=\\n)[ ]{15}IN ASSEMBLY(?=\\n)",
                    " I N A S S E M B L Y");
            billText = billText.replaceFirst("(?<=\\n)[ ]{12}SENATE - ASSEMBLY(?=\\n)",
                    " S E N A T E - A S S E M B L Y");
        }
        return billText;
    }

    /**
     * Parses and returns the sponsor memo.
     *
     * @param document the parsed scraped page
     * @param baseBillId bill id; resolutions never get a memo
     * @return the memo text, or an empty string for resolutions or when no matching element exists
     */
    private String getMemo(Document document, BaseBillId baseBillId) {
        // Do not get memo if bill is a resolution
        if (baseBillId.getBillType().isResolution()) {
            return "";
        }
        // NOTE(review): this takes the first <pre> in the document that is the last <pre> among its
        // siblings; on a page without a memo that may be a bill text element - verify against real pages.
        Element memoEle = document.select("pre:last-of-type").first();
        if (memoEle == null) {
            return "";
        }
        StringBuilder memoBuilder = new StringBuilder();
        processTextNode(memoEle, memoBuilder);
        // todo format text
        return memoBuilder.toString();
    }

    /**
     * Extracts bill/memo text from an element recursively, appending it to the given builder.
     *
     * @param ele element whose child nodes are walked in document order
     * @param stringBuilder receives the extracted text
     */
    private void processTextNode(Element ele, StringBuilder stringBuilder) {
        for (Node child : ele.childNodes()) {
            if (child instanceof Element) {
                Element childEle = (Element) child;
                // TEXT IN <U> TAGS IS REPRESENTED IN CAPS FOR SOBI BILL TEXT
                if ("u".equalsIgnoreCase(childEle.tagName())) {
                    // Locale.ROOT keeps the casing independent of the server's default locale
                    stringBuilder.append(childEle.text().toUpperCase(Locale.ROOT));
                } else {
                    processTextNode(childEle, stringBuilder);
                }
            } else if (child instanceof TextNode) {
                stringBuilder.append(((TextNode) child).getWholeText());
            }
        }
    }

    /**
     * Returns true if a "Bill Status Information Not Found" tag is located in the document indicating that
     * the bill is not on LRS. A document without the contents element is also treated as not found.
     *
     * @param document the parsed scraped page
     * @return true if the page does not contain a bill
     */
    private boolean billNotFound(Document document) {
        Element botContents = document.getElementById("nv_bot_contents");
        if (botContents == null) {
            return true;
        }
        Elements redFonts = botContents.select("font[color=\"red\"]");
        Element notFoundText = redFonts.first();
        return notFoundText != null && "Bill Status Information Not Found".equals(notFoundText.text());
    }
}