package gov.nysenate.openleg.processor.sobi; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.eventbus.EventBus; import gov.nysenate.openleg.config.Environment; import gov.nysenate.openleg.dao.base.LimitOffset; import gov.nysenate.openleg.dao.base.SortOrder; import gov.nysenate.openleg.dao.sobi.SobiDao; import gov.nysenate.openleg.model.process.DataProcessAction; import gov.nysenate.openleg.model.process.DataProcessUnit; import gov.nysenate.openleg.model.process.DataProcessUnitEvent; import gov.nysenate.openleg.model.sobi.*; import gov.nysenate.openleg.processor.agenda.AgendaProcessor; import gov.nysenate.openleg.processor.agenda.AgendaVoteProcessor; import gov.nysenate.openleg.processor.bill.BillSobiProcessor; import gov.nysenate.openleg.processor.bill.BillXMLBillDigestProcessor; import gov.nysenate.openleg.processor.bill.BillXMLBillTextProcessor; import gov.nysenate.openleg.processor.calendar.ActiveListProcessor; import gov.nysenate.openleg.processor.calendar.CalendarProcessor; import gov.nysenate.openleg.processor.entity.CommitteeProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.dao.DataAccessException; import org.springframework.dao.DataIntegrityViolationException; import org.springframework.stereotype.Service; import javax.annotation.PostConstruct; import java.io.IOException; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * This SobiProcessService implementation processes every type of sobi fragment. */ @Service public class ManagedSobiProcessService implements SobiProcessService { private static final Logger logger = LoggerFactory.getLogger(ManagedSobiProcessService.class); private static final Pattern patchTagPattern = Pattern.compile("^\\s*</?PATCH>\\s*$"); @Autowired private SobiDao sobiDao; @Autowired private EventBus eventBus; @Autowired private Environment env; /** --- Processor Dependencies --- */ @Autowired private AgendaProcessor agendaProcessor; @Autowired private AgendaVoteProcessor agendaVoteProcessor; @Autowired private BillSobiProcessor billSobiProcessor; @Autowired private CalendarProcessor calendarProcessor; @Autowired private ActiveListProcessor activeListProcessor; @Autowired private CommitteeProcessor committeeProcessor; @Autowired private BillXMLBillTextProcessor billXMLBillTextProcessor; @Autowired private BillXMLBillDigestProcessor billXMLBillDigestProcessor; /** Register processors to handle a specific SobiFragment via this mapping. */ private ImmutableMap<SobiFragmentType, SobiProcessor> processorMap; @PostConstruct protected void init() { eventBus.register(this); processorMap = ImmutableMap.<SobiFragmentType, SobiProcessor>builder() .put(SobiFragmentType.AGENDA, agendaProcessor) .put(SobiFragmentType.AGENDA_VOTE, agendaVoteProcessor) .put(SobiFragmentType.BILL, billSobiProcessor) .put(SobiFragmentType.CALENDAR, calendarProcessor) .put(SobiFragmentType.CALENDAR_ACTIVE, activeListProcessor) .put(SobiFragmentType.COMMITTEE, committeeProcessor) .put(SobiFragmentType.BILLTEXT,billXMLBillTextProcessor) .put(SobiFragmentType.LDSUMM,billXMLBillDigestProcessor) .build(); } /** --- Implemented Methods --- */ /** {@inheritDoc} */ @Override public int collate() { return collateSobiFiles(); } /** {@inheritDoc} */ @Override public int ingest() { return processPendingFragments(SobiProcessOptions.builder().build()); } @Override public String getCollateType() { return "sobi file"; } @Override public String getIngestType() { return "sobi fragment"; } /** {@inheritDoc} */ @Override public int collateSobiFiles() { try { int totalCollated = 0; List<SobiFile> newSobis; do { // Iterate through all the new sobi files in small batches to avoid saturating memory. newSobis = sobiDao.getIncomingSobiFiles(SortOrder.ASC, new LimitOffset(env.getSobiBatchSize())); logger.debug((newSobis.isEmpty()) ? "No more sobi files to collate." : "Collating {} sobi files.", newSobis.size()); for (SobiFile sobiFile : newSobis) { DataProcessUnit unit = new DataProcessUnit("SOBI-FILE", sobiFile.getFileName(), LocalDateTime.now(), DataProcessAction.COLLATE); List<SobiFragment> fragments = createFragments(sobiFile); // Record the sobi file in the backing store. sobiDao.updateSobiFile(sobiFile); // Save the extracted fragments. They will be marked as pending processing. for (SobiFragment fragment : fragments) { logger.info("Saving fragment {}", fragment.getFragmentId()); fragment.setPendingProcessing(true); sobiDao.updateSobiFragment(fragment); unit.addMessage("Saved " + fragment.getFragmentId()); } // Done with this sobi file so let's archive it. sobiDao.archiveAndUpdateSobiFile(sobiFile); totalCollated++; unit.setEndDateTime(LocalDateTime.now()); eventBus.post(new DataProcessUnitEvent(unit)); } } while (!newSobis.isEmpty() && env.isProcessingEnabled()); return totalCollated; } catch (IOException ex) { String errMessage = "Error encountered during collation of sobi files."; throw new DataIntegrityViolationException(errMessage, ex); } } /** {@inheritDoc} */ @Override public List<SobiFragment> getPendingFragments(SortOrder sortByPubDate, LimitOffset limitOffset) { return sobiDao.getPendingSobiFragments(sortByPubDate, limitOffset); } /** {@inheritDoc} */ @Override public int processFragments(List<SobiFragment> fragments, SobiProcessOptions options) { logger.debug((fragments.isEmpty()) ? "No more fragments to process" : "Iterating through {} fragments", fragments.size()); for (SobiFragment fragment : fragments) { // Hand off processing to specific implementations based on fragment type. if (processorMap.containsKey(fragment.getType())) { processorMap.get(fragment.getType()).process(fragment); } else { logger.error("No processors have been registered to handle: " + fragment); } fragment.setProcessedCount(fragment.getProcessedCount() + 1); fragment.setProcessedDateTime(LocalDateTime.now()); } // Perform any necessary post-processing/cleanup processorMap.values().forEach(p -> p.postProcess()); // Set the fragments as processed and update fragments.forEach(f -> { f.setPendingProcessing(false); sobiDao.updateSobiFragment(f); }); return fragments.size(); } /** {@inheritDoc} * * Perform the operation in small batches so memory is not saturated. */ @Override public int processPendingFragments(SobiProcessOptions options) { List<SobiFragment> fragments; int processCount = 0; do { ImmutableSet<SobiFragmentType> allowedTypes = options.getAllowedFragmentTypes(); LimitOffset limOff = (env.isSobiBatchEnabled()) ? new LimitOffset(env.getSobiBatchSize()) : LimitOffset.ONE; fragments = sobiDao.getPendingSobiFragments(allowedTypes, SortOrder.ASC, limOff); processCount += processFragments(fragments, options); } while (!fragments.isEmpty() && env.isProcessingEnabled()); return processCount; } /** {@inheritDoc} */ @Override public void updatePendingProcessing(String fragmentId, boolean pendingProcessing) throws SobiFragmentNotFoundEx { try { SobiFragment fragment = sobiDao.getSobiFragment(fragmentId); fragment.setPendingProcessing(pendingProcessing); sobiDao.updateSobiFragment(fragment); } catch (DataAccessException ex) { throw new SobiFragmentNotFoundEx(); } } /** --- Internal Methods --- */ /** * Extracts a list of SobiFragments from the given SobiFile. */ private List<SobiFragment> createFragments(SobiFile sobiFile) throws IOException { List<SobiFragment> sobiFragments = new ArrayList<>(); StringBuilder billBuffer = new StringBuilder(); boolean isPatch = false; StringBuilder patchMessage = new StringBuilder(); // Incrementing sequenceNo maintains the order in which the sobi fragments were // found in the source sobiFile. However the sequence number for the bill fragment // is always set to 0 to ensure that they are always processed first. int sequenceNo = 1; // Replace the null characters with spaces and split by newline. List<String> lines = Arrays.asList(sobiFile.getText().replace('\0', ' ').split("\\r?\\n")); Iterator<String> lineIterator = lines.iterator(); while (lineIterator.hasNext()) { String line = lineIterator.next(); // Check for a patch tag indicating a manual fix if (patchTagPattern.matcher(line).matches()) { isPatch = true; extractPatchMessage(lineIterator, patchMessage); } SobiFragmentType fragmentType = getFragmentTypeFromLine(line); if (fragmentType != null) { // Bill fragments are in the sobi format and appended into a single buffer if (fragmentType.equals(SobiFragmentType.BILL)) { // Memos need to be converted to latin1 encoding if (line.charAt(11) == SobiLineType.SPONSOR_MEMO.getTypeCode()) { line = new String(line.getBytes(sobiFile.getEncoding()), "latin1"); } line = line.replace((char)193, '°'); billBuffer.append(line).append("\n"); } // Other fragment types are in XML format. The iterator moves past the closing xml // tag and the xml text is stored in the fragment. else { String xmlText = extractXmlText(fragmentType, line, lineIterator); SobiFragment fragment = new SobiFragment(sobiFile, fragmentType, xmlText, sequenceNo++); sobiFragments.add(fragment); } } } // Convert the billBuffer into a single bill fragment (if applicable) with sequence no set to 0. if (billBuffer.length() > 0) { SobiFragment billFragment = new SobiFragment(sobiFile, SobiFragmentType.BILL, billBuffer.toString(), 0); sobiFragments.add(billFragment); } // Set manual fix flag and add notes if this file was a patch if (isPatch) { String notes = patchMessage.toString(); sobiFragments.forEach(fragment -> { fragment.setManualFix(true); fragment.setManualFixNotes(notes); }); } return sobiFragments; } /** * Check the given SOBI line to determine if it matches the start of a SOBI Fragment type. * * @param line String * @return SobiFragmentType or null if no match */ private SobiFragmentType getFragmentTypeFromLine(String line) { for (SobiFragmentType fragmentType : SobiFragmentType.values()) { if (line.matches(fragmentType.getStartPattern())) { return fragmentType; } } return null; } /** * Gets a patch sobi message from within a set of patch tags, appending it to the given string builder * @param lineIterator Iterator<String> * @param patchMessage StringBuilder */ private void extractPatchMessage(Iterator<String> lineIterator, StringBuilder patchMessage) { while(lineIterator.hasNext()) { String line = lineIterator.next(); if (patchTagPattern.matcher(line).matches()) { return; } if (patchMessage.length() > 0) { patchMessage.append("\n"); } patchMessage.append(line.trim()); } } /** * Extracts a well formed XML document from the lines and writes it to the given * file. This depends strongly on escape sequences being on their own line; otherwise * we'll get malformed XML docs. * * @param fragmentType SobiFragmentType * @param line String - The starting line of the document * @param iterator Iterator<String> - Current iterator from the sobi file's text body * * @return String - The resulting XML string. * @throws java.io.IOException */ private String extractXmlText(SobiFragmentType fragmentType, String line, Iterator<String> iterator) throws IOException { String endPattern = fragmentType.getEndPattern(); StringBuffer xmlBuffer = new StringBuffer( "<?xml version='1.0' encoding='UTF-8'?>&newl;" + "<SENATEDATA>&newl;" + line + "&newl;" ); String in = null; while (iterator.hasNext()) { in = iterator.next(); xmlBuffer.append(in.replaceAll("\\xb9", "§")).append("&newl;"); if (in.matches(endPattern)) { break; } } if (in == null) { // This is bad, but don't throw an exception. If the resulting XML document // is malformed we'll throw the exception during ingest. logger.error("Unterminated XML document: " + line); } String xmlString = xmlBuffer.append("</SENATEDATA>").toString(); // TODO: Figure out this magic. xmlBuffer = new StringBuffer(); Matcher m = Pattern.compile("<\\!\\[CDATA\\[(.*?)\\]\\]>").matcher(xmlString); while(m.find()) { m.appendReplacement(xmlBuffer, Matcher.quoteReplacement(m.group(0).replaceAll("&newl;", "").replaceAll("\\\\n","\n"))); } m.appendTail(xmlBuffer); // TODO: Figure out this magic as well. xmlString = xmlBuffer.toString().replaceAll("&newl;", "\n").replaceAll("(?!\n)\\p{Cntrl}","").replaceAll("(?!\\.{2})[ ]{2,}"," "); return xmlString; } }