package com.thinkbiganalytics.nifi.provenance;
/*-
* #%L
* thinkbig-nifi-provenance-repo
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.google.common.collect.Lists;
import com.thinkbiganalytics.nifi.provenance.cache.FeedFlowFileCacheUtil;
import com.thinkbiganalytics.nifi.provenance.jms.ProvenanceEventActiveMqWriter;
import com.thinkbiganalytics.nifi.provenance.model.BatchFeedProcessorEvents;
import com.thinkbiganalytics.nifi.provenance.model.FeedFlowFile;
import com.thinkbiganalytics.nifi.provenance.model.ProvenanceEventRecordDTO;
import com.thinkbiganalytics.nifi.provenance.model.ProvenanceEventRecordDTOHolder;
import com.thinkbiganalytics.nifi.provenance.model.util.ProvenanceEventUtil;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
/**
* Process a Kylo managed ProvenanceEvent. If indicated as a Stream ({@link ProvenanceEventRecordDTO#isStream()}) the system will just generate Statistics {@link
* com.thinkbiganalytics.nifi.provenance.model.stats.AggregatedFeedProcessorStatistics} grouping the events by Feed and Processor Id Otherwise if not indicated a stream, it will be processed as a
* Batch job and send the full Event to JMS
*/
public class ProvenanceEventCollector {
private static final Logger log = LoggerFactory.getLogger(ProvenanceEventCollector.class);
@Autowired
ProvenanceFeedLookup provenanceFeedLookup;
@Autowired
ProvenanceStatsCalculator statsCalculator;
@Autowired
FeedFlowFileCacheUtil cacheUtil;
/**
* The Map of Objects that will be grouped and sent over to Kylo as Batch Jobs/Steps for Operations Manager
*/
Map<String, BatchFeedProcessorEvents> groupedBatchEventsByFeed = new ConcurrentHashMap<>();
@Autowired
private ProvenanceEventActiveMqWriter provenanceEventActiveMqWriter;
/**
* Safeguard against the system sending too many batch feed events through to Kylo
* This is the max events per second allowed for a feed/processor combo
* if a given batch exceeds this threshold the remaining jobs will be suppressed
* All jobs will calculate statistics about the feeds
*/
private Integer maxBatchFeedJobEventsPerSecond = 10;
/**
* Size of the group of events that will be batched and sent to Kylo
*/
private Integer jmsEventGroupSize = 50;
@Autowired
public ProvenanceEventCollector(@Qualifier("provenanceEventActiveMqWriter") ProvenanceEventActiveMqWriter provenanceEventActiveMqWriter) {
super();
this.provenanceEventActiveMqWriter = provenanceEventActiveMqWriter;
}
/**
* The key to use to batch up the events by Feed and Processor.
*
* @param event the event to process
* @return the key based upon the feed name and the component id
*/
private String mapKey(ProvenanceEventRecordDTO event) {
return event.getFeedName() + ":" + event.getComponentId();
}
/**
* determine if the event has Feed
*
* @param event the event to check
* @return true if feed name is set, false if not
*/
private boolean hasFeedName(ProvenanceEventRecordDTO event) {
return StringUtils.isNotBlank(event.getFeedName());
}
/**
* Process the event, adding it to the running {@link com.thinkbiganalytics.nifi.provenance.model.FeedFlowFile} , calculating statistics on the event, and if a Batch feed, grouped by Feed and
* Processor, process the entire event for processing.
*
* @param event the event to process
*/
public void process(ProvenanceEventRecordDTO event) {
try {
if (event != null) {
try {
cacheUtil.cacheAndBuildFlowFileGraph(event);
//if the Flow gets an "Empty Queue" message it means a user emptied the queue that was stuck in a connection.
// this means the flow cannot complete and will be treated as a failed flow and failed job
if (ProvenanceEventUtil.isFlowFileQueueEmptied(event)) {
// a Drop event component id will be the connection, not the processor id. we will set the name of the component
event.setComponentName("FlowFile Queue emptied");
event.setIsFailure(true);
event.setHasFailedEvents(true);
FeedFlowFile feedFlowFile = event.getFeedFlowFile();
if (feedFlowFile != null) {
feedFlowFile.checkAndMarkComplete(event);
}
event.getFeedFlowFile().incrementFailedEvents();
}
//only process if we can get the feed name, otherwise its no use
if (hasFeedName(event)) {
//send the event off for stats processing
statsCalculator.calculateStats(event);
//batch up the data to send to kylo if this feed is marked as a batch or if the parent flow file is marked as a batch
if (!event.isStream()) {
batchEvent(event);
}
} else {
log.error("Provenance: Cant find Feed for {} ", event);
}
} catch (FeedFlowFileNotFoundException e) {
log.debug("Unable to find Root flowfile.", event, event.getFlowFileUuid());
}
}
} catch (Exception e) {
log.error("ERROR PROCESSING EVENT! {}. ERROR: {} ", event, e.getMessage(), e);
}
}
/**
* Group the Event by Feed and then by Processor
*
* @param event the event to process
* @return true if added, false if suppressed
*/
private boolean batchEvent(ProvenanceEventRecordDTO event) {
if (event != null) {
return groupedBatchEventsByFeed.computeIfAbsent(mapKey(event), mapKey -> new BatchFeedProcessorEvents(event.getFeedName(),
event
.getComponentId(), getMaxBatchFeedJobEventsPerSecond())).setMaxEventsPerSecond(
getMaxBatchFeedJobEventsPerSecond()).add(event);
}
return false;
}
/**
* Send both the Statistics {@link com.thinkbiganalytics.nifi.provenance.model.stats.AggregatedFeedProcessorStatisticsHolder} and the Batched Provenance Events {@link
* ProvenanceEventRecordDTOHolder } to JMS for Kylo Operations Manager to process
*/
public void sendToJms() {
//update the collection time
List<ProvenanceEventRecordDTO> eventsSentToJms = groupedBatchEventsByFeed.values().stream()
.flatMap(feedProcessorEventAggregate -> feedProcessorEventAggregate.collectEventsToBeSentToJmsQueue().stream())
.collect(Collectors.toList());
sendBatchFeedEvents(eventsSentToJms);
statsCalculator.sendStats();
}
/**
* Send the Batched events over to JMS
*
* @param elements The events to send to JMS
*/
private void sendBatchFeedEvents(List<ProvenanceEventRecordDTO> elements) {
if (elements != null && !elements.isEmpty()) {
Lists.partition(elements, getJmsEventGroupSize()).forEach(eventsSubList -> {
ProvenanceEventRecordDTOHolder eventRecordDTOHolder = new ProvenanceEventRecordDTOHolder();
eventRecordDTOHolder.setEvents(Lists.newArrayList(eventsSubList));
provenanceEventActiveMqWriter.writeBatchEvents(eventRecordDTOHolder);
});
}
}
/**
* The Max number of events allowed per feed per second. This is passed in from the {@link com.thinkbiganalytics.nifi.provenance.reporting.KyloProvenanceEventReportingTask} configuration and used
* to safeguard against processing too many Jobs/Event records as Batch Jobs.
*
* @return the maximum number of jobs/sec allowed for this feed to be considered a batch job
*/
public Integer getMaxBatchFeedJobEventsPerSecond() {
return maxBatchFeedJobEventsPerSecond;
}
public void setMaxBatchFeedJobEventsPerSecond(Integer maxBatchFeedJobEventsPerSecond) {
this.maxBatchFeedJobEventsPerSecond = maxBatchFeedJobEventsPerSecond;
}
/**
* Returns the sub group size of events to group together before sending to JMS.
*/
public Integer getJmsEventGroupSize() {
return jmsEventGroupSize == null ? 50 : jmsEventGroupSize;
}
public void setJmsEventGroupSize(Integer jmsEventGroupSize) {
this.jmsEventGroupSize = jmsEventGroupSize;
}
}