package com.thinkbiganalytics.metadata.jobrepo.nifi.provenance; /*- * #%L * thinkbig-operational-metadata-integration-service * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.thinkbiganalytics.activemq.config.ActiveMqConstants; import com.thinkbiganalytics.metadata.api.MetadataAccess; import com.thinkbiganalytics.metadata.api.event.MetadataEventService; import com.thinkbiganalytics.metadata.api.event.feed.FeedOperationStatusEvent; import com.thinkbiganalytics.metadata.api.event.feed.OperationStatus; import com.thinkbiganalytics.metadata.api.feed.DeleteFeedListener; import com.thinkbiganalytics.metadata.api.feed.OpsManagerFeed; import com.thinkbiganalytics.metadata.api.feed.OpsManagerFeedProvider; import com.thinkbiganalytics.metadata.api.jobrepo.job.BatchJobExecution; import com.thinkbiganalytics.metadata.api.jobrepo.job.BatchJobExecutionProvider; import com.thinkbiganalytics.metadata.api.jobrepo.nifi.NifiEvent; import com.thinkbiganalytics.metadata.api.jobrepo.step.BatchStepExecution; import com.thinkbiganalytics.metadata.api.jobrepo.step.BatchStepExecutionProvider; import com.thinkbiganalytics.metadata.api.jobrepo.step.FailedStepExecutionListener; import com.thinkbiganalytics.metadata.api.op.FeedOperation; import com.thinkbiganalytics.metadata.jpa.jobrepo.nifi.NifiEventProvider; import com.thinkbiganalytics.nifi.activemq.Queues; import com.thinkbiganalytics.nifi.provenance.model.ProvenanceEventRecordDTO; import com.thinkbiganalytics.nifi.provenance.model.ProvenanceEventRecordDTOHolder; import com.thinkbiganalytics.nifi.rest.client.LegacyNifiRestClient; import org.apache.commons.lang3.StringUtils; import org.apache.nifi.web.api.dto.BulletinDTO; import org.hibernate.exception.LockAcquisitionException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.jms.annotation.JmsListener; import org.springframework.stereotype.Component; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import javax.annotation.PostConstruct; import javax.inject.Inject; /** * JMS Listener for NiFi Provenance Events. */ @Component public class ProvenanceEventReceiver implements FailedStepExecutionListener, DeleteFeedListener { private static final Logger log = LoggerFactory.getLogger(ProvenanceEventReceiver.class); /** * Empty feed object for Loading Cache */ static OpsManagerFeed NULL_FEED = new OpsManagerFeed() { @Override public ID getId() { return null; } @Override public String getName() { return null; } @Override protected Object clone() throws CloneNotSupportedException { return super.clone(); } @Override public int hashCode() { return super.hashCode(); } @Override public FeedType getFeedType() { return null; } }; @Inject OpsManagerFeedProvider opsManagerFeedProvider; @Inject NifiBulletinExceptionExtractor nifiBulletinExceptionExtractor; /** * Temporary cache of completed events in to check against to ensure we trigger the same event twice */ Cache<String, String> completedJobEvents = CacheBuilder.newBuilder().expireAfterWrite(20, TimeUnit.MINUTES).build(); /** * Cache of the Ops Manager Feed Object to ensure that we only process and create Job Executions for feeds that have been registered in Feed Manager */ LoadingCache<String, OpsManagerFeed> opsManagerFeedCache = null; @Value("${kylo.ops.mgr.query.nifi.bulletins:false}") private boolean queryForNiFiBulletins; @Inject private NifiEventProvider nifiEventProvider; @Inject private BatchJobExecutionProvider batchJobExecutionProvider; @Inject private BatchStepExecutionProvider batchStepExecutionProvider; @Inject private LegacyNifiRestClient nifiRestClient; @Inject private MetadataAccess metadataAccess; @Inject private MetadataEventService eventService; /** * The amount of retry attempts the system will do if it gets a LockAcquisitionException * MySQL may fail to lock the table when performing inserts into the database resulting in a deadlock exception. * When processing each event the LockAcquisitionException is caught and a retry attempt is done, retrying to process the event this amount of times before giving up. */ private int lockAcquisitionRetryAmount = 4; /** * default constructor creates the feed cache */ public ProvenanceEventReceiver() { // create the loading Cache to get the Feed Manager Feeds. If its not in the cache, query the JCR store for the Feed object otherwise return the NULL_FEED object opsManagerFeedCache = CacheBuilder.newBuilder().build(new CacheLoader<String, OpsManagerFeed>() { @Override public OpsManagerFeed load(String feedName) throws Exception { OpsManagerFeed feed = null; try { feed = metadataAccess.commit(() -> opsManagerFeedProvider.findByName(feedName), MetadataAccess.SERVICE); } catch (Exception e) { } return feed == null ? NULL_FEED : feed; } } ); } @PostConstruct private void init() { batchStepExecutionProvider.subscribeToFailedSteps(this); opsManagerFeedProvider.subscribeFeedDeletion(this); } /** * Unique key for the Event in relation to the Job * * @param event a provenance event * @return a unique key representing the event */ private String triggeredEventsKey(ProvenanceEventRecordDTO event) { return event.getJobFlowFileId() + "_" + event.getEventId(); } /** * Process the Events from Nifi * If it is a batch job, write the records to Ops manager. * if it is a stream just write to the Nifi_event table. * When either are marked as the last event Notify the event bus for the trigger feed mechanism to work. * * @param events The events obtained from JMS */ @JmsListener(destination = Queues.FEED_MANAGER_QUEUE, containerFactory = ActiveMqConstants.JMS_CONTAINER_FACTORY, concurrency = "3-10") public void receiveEvents(ProvenanceEventRecordDTOHolder events) { log.info("About to process batch: {}, {} events from the {} queue ", events.getBatchId(),events.getEvents().size(), Queues.FEED_MANAGER_QUEUE); events.getEvents().stream() .filter(this::isRegisteredWithFeedManager) .filter(this::ensureNewEvent) .forEach(event -> processEvent(event, 0)); } /** * process the event and persist it along with creating the Job and Step. If there is a lock error it will retry until it hits the {@link this#lockAcquisitionRetryAmount} * * @param event a provenance event * @param retryAttempt the retry number. If there is a lock error it will retry until it hits the {@link this#lockAcquisitionRetryAmount} */ private void processEvent(ProvenanceEventRecordDTO event, int retryAttempt) { try { if (event.isBatchJob()) { //ensure the job is there BatchJobExecution jobExecution = metadataAccess.commit(() -> batchJobExecutionProvider.getOrCreateJobExecution(event), MetadataAccess.SERVICE); NifiEvent nifiEvent = metadataAccess.commit(() -> receiveBatchEvent(jobExecution, event), MetadataAccess.SERVICE); } else { NifiEvent nifiEvent = metadataAccess.commit(() -> nifiEventProvider.create(event), MetadataAccess.SERVICE); } if (event.isFinalJobEvent()) { notifyJobFinished(event); } } catch (LockAcquisitionException lae) { //safeguard against LockAcquisitionException if MySQL has a problem locking the table during its processing of the Event. if (retryAttempt < lockAcquisitionRetryAmount) { retryAttempt++; log.error("LockAcquisitionException found trying to process Event: {} . Retry attempt # {} ", event, retryAttempt, lae); //wait and re attempt try { Thread.sleep(300L); } catch (InterruptedException var10) { } processEvent(event, retryAttempt); } else { log.error("LockAcquisitionException found. Unsuccessful after retrying {} times. This event {} will not be processed. ", retryAttempt, event, lae); } } catch (Exception e) { log.error("Error processing Event ", event, e); } } /** * Process this record and record the Job and steps * * @param jobExecution the job execution * @param event a provenance event * @return a persisted nifi event object */ private NifiEvent receiveBatchEvent(BatchJobExecution jobExecution, ProvenanceEventRecordDTO event) { NifiEvent nifiEvent = null; log.debug("Received ProvenanceEvent {}. is end of Job: {}. is ending flowfile:{}, isBatch: {}", event, event.isEndOfJob(), event.isEndingFlowFileEvent(), event.isBatchJob()); nifiEvent = nifiEventProvider.create(event); //query it again jobExecution = batchJobExecutionProvider.findByJobExecutionId(jobExecution.getJobExecutionId()); BatchJobExecution job = batchJobExecutionProvider.save(jobExecution, event, nifiEvent); if (job == null) { log.error(" Detected a Batch event, but could not find related Job record. for event: {} is end of Job: {}. is ending flowfile:{}, isBatch: {}", event, event.isEndOfJob(), event.isEndingFlowFileEvent(), event.isBatchJob()); } return nifiEvent; } /** * Check to see if the event has a relationship to Feed Manager * In cases where a user is experimenting in NiFi and not using Feed Manager the event would not be registered * * @param event a provenance event * @return {@code true} if the event has a feed associaetd with it {@code false} if there is no feed associated with it */ private boolean isRegisteredWithFeedManager(ProvenanceEventRecordDTO event) { String feedName = event.getFeedName(); if (StringUtils.isNotBlank(feedName)) { OpsManagerFeed feed = opsManagerFeedCache.getUnchecked(feedName); if (feed == null || NULL_FEED.equals(feed)) { log.debug("Not processing operational metadata for feed {} , event {} because it is not registered in feed manager ", feedName, event); opsManagerFeedCache.invalidate(feedName); return false; } else { return true; } } return false; } /** * Notify that the Job is complete either as a successful job or failed Job * * @param event a provenance event */ private void notifyJobFinished(ProvenanceEventRecordDTO event) { if (event.isFinalJobEvent()) { String mapKey = triggeredEventsKey(event); String alreadyTriggered = completedJobEvents.getIfPresent(mapKey); if (alreadyTriggered == null) { completedJobEvents.put(mapKey, mapKey); /// TRIGGER JOB COMPLETE!!! if (event.isHasFailedEvents()) { failedJob(event); } else { successfulJob(event); } } } } /** * Triggered for both Batch and Streaming Feed Jobs when the Job and any related Jobs (as a result of a Merge of other Jobs are complete but have a failure in the flow<br/> Example: <br/> Job * (FlowFile) 1,2,3 are all running<br/> Job 1,2,3 get Merged<br/> Job 1,2 finish<br/> Job 3 finishes <br/> * * This will fire when Job3 finishes indicating this entire flow is complete<br/> * * @param event a provenance event */ private void failedJob(ProvenanceEventRecordDTO event) { if (queryForNiFiBulletins && event.isBatchJob()) { queryForNiFiErrorBulletins(event); } FeedOperation.State state = FeedOperation.State.FAILURE; log.debug("FAILED JOB for Event {} ", event); this.eventService.notify(new FeedOperationStatusEvent(new OperationStatus(event.getFeedName(), null, state, "Failed Job"))); } /** * Triggered for both Batch and Streaming Feed Jobs when the Job and any related Jobs (as a result of a Merge of other Jobs are complete<br/> Example: <br/> Job (FlowFile) 1,2,3 are all * running<br/> Job 1,2,3 get Merged<br/> Job 1,2 finish<br/> Job 3 finishes <br/> * * This will fire when Job3 finishes indicating this entire flow is complete<br/> * * @param event a provenance event */ private void successfulJob(ProvenanceEventRecordDTO event) { FeedOperation.State state = FeedOperation.State.SUCCESS; log.debug("Success JOB for Event {} ", event); this.eventService.notify(new FeedOperationStatusEvent(new OperationStatus(event.getFeedName(), null, state, "Job Succeeded for feed: " + event.getFeedName()))); } /** * Make a REST call to NiFi and query for the NiFi Bulletins that have a flowfile id matching for this job execution and write the bulletin message to the {@link * BatchJobExecution#setExitMessage(String)} * * @param event a provenance event */ private void queryForNiFiErrorBulletins(ProvenanceEventRecordDTO event) { try { metadataAccess.commit(() -> { //query for nifi logs List<String> relatedFlowFiles = batchJobExecutionProvider.findRelatedFlowFiles(event.getFlowFileUuid()); if (relatedFlowFiles == null) { relatedFlowFiles = new ArrayList<>(); } if (relatedFlowFiles.isEmpty()) { relatedFlowFiles.add(event.getFlowFileUuid()); } log.info("Failed Job {}/{}. Found {} related flow files. ", event.getEventId(), event.getFlowFileUuid(), relatedFlowFiles.size()); List<BulletinDTO> bulletinDTOS = nifiBulletinExceptionExtractor.getErrorBulletinsForFlowFiles(relatedFlowFiles); if (bulletinDTOS != null && !bulletinDTOS.isEmpty()) { //write them back to the job BatchJobExecution jobExecution = batchJobExecutionProvider.findJobExecution(event); if (jobExecution != null) { String msg = jobExecution.getExitMessage() != null ? jobExecution.getExitMessage() + "\n" : ""; msg += "NiFi exceptions: \n" + bulletinDTOS.stream().map(bulletinDTO -> bulletinDTO.getMessage()).collect(Collectors.joining("\n")); jobExecution.setExitMessage(msg); this.batchJobExecutionProvider.save(jobExecution); } } }, MetadataAccess.SERVICE); } catch (Exception e) { log.error("Unable to query NiFi and save exception bulletins for job failure eventid/flowfile : {} / {}. Exception Message: {}", event.getEventId(), event.getFlowFileUuid(), e.getMessage(), e); } } /** * Fails the step identified by the parameters given * * @param jobExecution the job execution * @param stepExecution the step execution * @param flowFileId the id of the flow file * @param componentId the id of the component */ @Override public void failedStep(BatchJobExecution jobExecution, BatchStepExecution stepExecution, String flowFileId, String componentId) { nifiBulletinExceptionExtractor.addErrorMessagesToStep(stepExecution, flowFileId, componentId); } /* * Indicates if the specified event hasn't already been processed. * * @param event the event to check * @return {@code true} if the event is new, or {@code false otherwise} */ private boolean ensureNewEvent(ProvenanceEventRecordDTO event) { return metadataAccess.read(() -> !nifiEventProvider.exists(event), MetadataAccess.SERVICE); } /** * When a feed is deleted remove it from the cache of feed names * * @param feed a delete feed */ @Override public void onFeedDelete(OpsManagerFeed feed) { log.info("Notified that feed {} has been deleted. Removing this feed from the ProvenanceEventReceiver cache. ", feed.getName()); opsManagerFeedCache.invalidate(feed.getName()); } }