package gov.nysenate.openleg.processor.law;
import com.google.common.collect.Sets;
import gov.nysenate.openleg.model.law.LawDocumentType;
import gov.nysenate.openleg.model.law.LawFile;
import gov.nysenate.openleg.model.law.LawTree;
import gov.nysenate.openleg.model.law.LawVersionId;
import gov.nysenate.openleg.model.process.DataProcessUnit;
import gov.nysenate.openleg.processor.base.AbstractDataProcessor;
import gov.nysenate.openleg.service.law.data.LawDataService;
import gov.nysenate.openleg.service.law.data.LawTreeNotFoundEx;
import gov.nysenate.openleg.service.law.event.BulkLawUpdateEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static gov.nysenate.openleg.model.law.LawDocumentType.*;
/**
 * Processes the initial/update law dumps and persists the data.
 *
 * <p>A {@link LawFile} is split into {@code LawBlock}s (one per {@code ..SO DOC} header), the blocks
 * are fed into a per-law {@code LawBuilder}, and the resulting documents and tree of every builder
 * are saved through the {@link LawDataService}.
 */
@Service
public class LawProcessor extends AbstractDataProcessor
{
    private static final Logger logger = LoggerFactory.getLogger(LawProcessor.class);

    /** The law files are most likely sent in CP850 encoding. */
    protected static Charset LAWFILE_CHARSET = Charset.forName("CP850");

    /**
     * Pattern for law doc headers. Capture groups as used by this class:
     * (1) the 16 character document id, made of (2) the 3 character law id and (3) 13 further characters,
     * (4) the 8 character method, (5) a 15 character field that is not read here, and
     * (6) either CONSOLIDATED or UNCONSOLIDATED.
     */
    protected static Pattern lawHeader =
        Pattern.compile("\\.\\.SO DOC ((\\w{3})(.{13}))(.{8}) (.{15}) (?:LAWS\\(((?:UN)?CONSOLIDATED)\\))");

    /** Hints about the law hierarchy for certain laws that have inconsistent doc id naming. */
    protected static Map<String, List<LawDocumentType>> expectedLawOrdering = new HashMap<>();
    static {
        expectedLawOrdering.put("EDN", Arrays.asList(TITLE, ARTICLE, SUBARTICLE, PART, SUB_PART));
        expectedLawOrdering.put("CPL", Arrays.asList(PART, TITLE, ARTICLE));
    }

    /** Set of law ids to ignore during processing. */
    protected Set<String> ignoreLaws = Sets.newHashSet("CNS");

    /** Set of law ids to only allow processing of. Overrides 'ignoreLaws'. */
    protected Set<String> onlyLaws = Sets.newHashSet();

    @Autowired private LawDataService lawDataService;

    @Override
    public void init() {
        initBase();
    }

    /**
     * Performs all the steps required to process and persist the supplied LawFile.
     * Any IOException or LawParseException raised while processing is recorded on the data process
     * unit instead of being propagated, and the unit event is posted in either case.
     *
     * @param lawFile LawFile
     */
    public void process(final LawFile lawFile) {
        boolean isInitial = lawFile.isInitialDump();
        DataProcessUnit unit = createDataProcessUnit(lawFile);
        try {
            logger.info("Processing law file {}", lawFile);
            List<LawBlock> lawBlocks = getLawBlocks(lawFile);
            if (isInitial) {
                processInitialLaws(lawFile, lawBlocks, unit);
            }
            else {
                processLawUpdates(lawFile, lawBlocks, unit);
            }
        }
        catch (IOException ex) {
            logger.error("Unexpected IOException during LawFile processing", ex);
            unit.addException("Unexpected IOException: " + ex.getMessage());
        }
        catch (LawParseException ex) {
            unit.addException("Fatal law parsing error, processing has been halted! " + ex.getMessage(), logger);
        }
        postDataUnitEvent(unit);
    }

    /** --- Basic Getters/Setters --- */

    public Set<String> getIgnoreLaws() {
        return ignoreLaws;
    }

    public void setIgnoreLaws(String... ignoreLaws) {
        this.ignoreLaws = Sets.newHashSet(ignoreLaws);
    }

    public Set<String> getOnlyLaws() {
        return onlyLaws;
    }

    public void setOnlyLaws(String... lawIds) {
        this.onlyLaws = Sets.newHashSet(lawIds);
    }

    /** --- Internal Methods --- */

    /**
     * The initial laws are parsed such that the order of the documents indicates the structure of the laws
     * (i.e. there are no master documents).
     *
     * @param lawFile LawFile
     * @param lawBlocks {@code List<LawBlock>}
     * @param unit DataProcessUnit - Receives a message for each law id encountered.
     */
    protected void processInitialLaws(LawFile lawFile, List<LawBlock> lawBlocks, DataProcessUnit unit) {
        Map<String, LawBuilder> lawBuilders = new HashMap<>();
        for (LawBlock block : lawBlocks) {
            if (!shouldProcessLaw(block)) continue;
            String lawId = block.getLawId();
            // Create the law builder for the law id if it doesn't already exist.
            // Initial dumps have no previous tree, hence the null.
            if (!lawBuilders.containsKey(lawId)) {
                LawBuilder lawBuilder = createLawBuilder(new LawVersionId(lawId, block.getPublishedDate()), null);
                lawBuilders.put(lawId, lawBuilder);
                unit.addMessage("Processing initial docs for " + lawId);
            }
            // Process the initial block
            lawBuilders.get(lawId).addInitialBlock(block, true);
        }
        // Persist the results
        persist(lawFile, lawBuilders);
    }

    /**
     * The update files will either contain a document for a new or changed law block, or a MASTER document
     * to indicate that the organization of the law has changed. The other types of actions include AMENDED
     * and REPEALED but we have not encountered those as of yet.
     *
     * If no existing tree can be found for a law, an exception is recorded on the unit and the builder
     * for that law is created with a null previous tree.
     *
     * @param lawFile LawFile
     * @param lawBlocks {@code List<LawBlock>}
     * @param unit DataProcessUnit - Receives an exception entry for each law without an existing tree.
     */
    protected void processLawUpdates(LawFile lawFile, List<LawBlock> lawBlocks, DataProcessUnit unit) {
        Map<String, LawBuilder> lawBuilders = new HashMap<>();
        Map<String, LawTree> lawTrees = new HashMap<>();
        for (LawBlock block : lawBlocks) {
            if (!shouldProcessLaw(block)) continue;
            String lawId = block.getLawId();
            LawVersionId lawVersionId = new LawVersionId(lawId, block.getPublishedDate());
            logger.debug("Processing law version id: {}", lawVersionId);
            // Retrieve the existing law tree if it exists. A null entry is stored on a miss so the
            // lookup (and the resulting error) only happens once per law id.
            if (!lawTrees.containsKey(lawId)) {
                try {
                    LawTree lawTree = lawDataService.getLawTree(lawId, block.getPublishedDate());
                    lawTrees.put(lawId, lawTree);
                }
                catch (LawTreeNotFoundEx ex) {
                    lawTrees.put(lawId, null);
                    unit.addException("Update received for a law " + lawId + " without an existing tree!", logger);
                }
            }
            // Create the law builder for the law id if it doesn't already exist.
            if (!lawBuilders.containsKey(lawId)) {
                LawBuilder lawBuilder = createLawBuilder(lawVersionId, lawTrees.get(lawId));
                lawBuilders.put(lawId, lawBuilder);
            }
            // Process the update block
            lawBuilders.get(lawId).addUpdateBlock(block);
        }
        persist(lawFile, lawBuilders);
    }

    /**
     * Iterates over the law builders and persists the processed output. For each builder a
     * BulkLawUpdateEvent is posted, then every processed document is saved, then the processed tree.
     *
     * @param lawFile LawFile - Used to keep track of the source
     * @param lawBuilders {@code Map<String, LawBuilder>}
     */
    private void persist(LawFile lawFile, Map<String, LawBuilder> lawBuilders) {
        // Persist the results
        lawBuilders.forEach((lawId, lawBuilder) -> {
            logger.info("Persisting law documents for {}", lawId);
            eventBus.post(new BulkLawUpdateEvent(lawBuilder.getProcessedLawDocuments()));
            lawBuilder.getProcessedLawDocuments().forEach(d -> lawDataService.saveLawDocument(lawFile, d));
            logger.info("Persisting law tree for {}", lawId);
            lawDataService.saveLawTree(lawFile, lawBuilder.getProcessedLawTree());
        });
    }

    /**
     * Extracts a collection of LawBlocks from the given LawFile. Each block represents all the meta data and
     * text for each document section in the law file (delineated by the ..SO DOC header). The LawBlock is just
     * a helper object that should be used to construct LawDocuments.
     *
     * The file is read line by line using {@link #LAWFILE_CHARSET} and is always closed before this method
     * returns or throws. Blocks for which {@code LawDocIdFixer.ignoreDocument} returns true are left out.
     *
     * @param lawFile LawFile - The LawFile to extract the blocks from.
     * @return {@code List<LawBlock>}
     * @throws IOException If the file cannot be opened or read.
     * @throws LawParseException If text is encountered before the first doc header.
     */
    protected List<LawBlock> getLawBlocks(LawFile lawFile) throws IOException {
        List<LawBlock> rawDocList = new ArrayList<>();
        logger.debug("Extracting law blocks...");
        File file = lawFile.getFile();
        LawBlock block = null;
        // try-with-resources guarantees the file handle is released, and readLine() reports read
        // failures as a plain IOException which process() already handles.
        try (BufferedReader reader = Files.newBufferedReader(file.toPath(), LAWFILE_CHARSET)) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher headerMatcher = lawHeader.matcher(line);
                if (headerMatcher.matches()) {
                    // A new header terminates the block that was being accumulated.
                    addBlockUnlessIgnored(rawDocList, block);
                    block = createBlockFromHeader(headerMatcher, line, lawFile);
                }
                else {
                    if (block == null) throw new LawParseException("No doc header received prior to line: " + line);
                    // The appended separator is the two character sequence backslash + n, not a line feed.
                    block.getText().append(line).append("\\n");
                }
            }
        }
        // The last block in the file has no following header to flush it.
        addBlockUnlessIgnored(rawDocList, block);
        return rawDocList;
    }

    /**
     * Appends the block to the list unless it is null or LawDocIdFixer flags its document id as ignored.
     *
     * @param blocks {@code List<LawBlock>} - The list to append to.
     * @param block LawBlock - The completed block, may be null.
     */
    private void addBlockUnlessIgnored(List<LawBlock> blocks, LawBlock block) {
        if (block != null && !LawDocIdFixer.ignoreDocument(block.getDocumentId(), block.getPublishedDate())) {
            blocks.add(block);
        }
    }

    /**
     * Creates a new LawBlock populated from a matched doc header line.
     *
     * @param headerMatcher Matcher - A successful match of {@link #lawHeader} against the line.
     * @param line String - The raw header line.
     * @param lawFile LawFile - Supplies the published date.
     * @return LawBlock
     */
    private LawBlock createBlockFromHeader(Matcher headerMatcher, String line, LawFile lawFile) {
        LawBlock block = new LawBlock();
        block.setHeader(line);
        block.setLawId(headerMatcher.group(2).trim());
        block.setPublishedDate(lawFile.getPublishedDate());
        block.setDocumentId(
            LawDocIdFixer.applyReplacement(headerMatcher.group(1).trim(), lawFile.getPublishedDate()));
        // The location id is the document id with its first 3 characters removed.
        block.setLocationId(block.getDocumentId().substring(3));
        block.setMethod(headerMatcher.group(4).trim());
        block.setConsolidated(headerMatcher.group(6).equals("CONSOLIDATED"));
        return block;
    }

    /**
     * Determines if the given block should be processed. If 'onlyLaws' is non-empty, only blocks whose
     * law id is in that set are processed. Otherwise a block is processed unless its law id is in 'ignoreLaws'.
     *
     * @param block LawBlock
     * @return boolean - true if the block should be processed.
     */
    protected boolean shouldProcessLaw(LawBlock block) {
        return (onlyLaws.contains(block.getLawId())) ||
               (onlyLaws.isEmpty() && !ignoreLaws.contains(block.getLawId()));
    }

    /**
     * Creates the LawBuilder for the given law version. Laws with an entry in 'expectedLawOrdering' get a
     * HintBasedLawBuilder constructed with that ordering, all others get an IdBasedLawBuilder.
     *
     * @param lawVersionId LawVersionId
     * @param previousTree LawTree - The existing tree for the law, may be null.
     * @return LawBuilder
     */
    protected LawBuilder createLawBuilder(LawVersionId lawVersionId, LawTree previousTree) {
        if (expectedLawOrdering.containsKey(lawVersionId.getLawId())) {
            return new HintBasedLawBuilder(lawVersionId, previousTree, expectedLawOrdering.get(lawVersionId.getLawId()));
        }
        else {
            return new IdBasedLawBuilder(lawVersionId, previousTree);
        }
    }
}