package com.thinkbiganalytics.nifi.provenance.cache; /*- * #%L * thinkbig-nifi-provenance-repo * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.thinkbiganalytics.nifi.provenance.AggregationEventProcessingStats; import com.thinkbiganalytics.nifi.provenance.model.FeedFlowFile; import com.thinkbiganalytics.nifi.provenance.reporting.KyloProvenanceEventReportingTask; import org.apache.nifi.controller.ConfigurationContext; import org.joda.time.DateTime; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; /** * As a feed runs through NiFi the root {@link FeedFlowFile} keeps track of its progress and the status of its child flow files {@link FeedFlowFile#activeChildFlowFiles} and last processed * ProvenanceEvent {@link FeedFlowFile#flowFileLastEventTime} When a {@link FeedFlowFile} is marked as the complete {@link FeedFlowFile#isFeedComplete()} it will be removed from this cache via the * {@link this#expire()} thread When NiFi shuts down the cache is persisted to disk via the {@link FeedFlowFileMapDbCache#persistFlowFiles()} called by the {@link * com.thinkbiganalytics.nifi.provenance.reporting.KyloProvenanceEventReportingTask#onShutdown(ConfigurationContext)} This is to ensure that on startup of NiFi the tracking of the running flow files * is kept in tact When NiFi starts the persisted disk cache is checked and loaded back into this cache via the {@link KyloProvenanceEventReportingTask#onConfigurationRestored()} */ public class FeedFlowFileGuavaCache { private static final Logger log = LoggerFactory.getLogger(FeedFlowFileGuavaCache.class); /** * The cache of FeedFlowFiles */ private final Cache<String, FeedFlowFile> cache; /** * The amount of time the expire thread should run to check and expire the feed flow files */ private Integer expireTimerCheckSeconds = 10; /** * Listeners that can get notified with a FeedFlowFile is invalidated and removed from the cache */ private List<FeedFlowFileCacheListener> listeners = new ArrayList<>(); /** * The last time the summary was printed */ private DateTime lastPrintLogTime = null; /** * How often should the summary of whats in the cache be logged * Every 5 minutes */ private Long PRINT_LOG_MILLIS = 60 * 5000L; public FeedFlowFileGuavaCache() { cache = CacheBuilder.newBuilder().build(); log.info("Created new FlowFileGuavaCache running timer every {} seconds to check and expire finished flow files", expireTimerCheckSeconds); initTimerThread(); } /** * A listener can subscribe to the invalidate calls on the cache. * the {@link FeedFlowFileMapDbCache} subscribes to this cache to get messages and invalidate the files persisted on disk when they are completed. */ public void subscribe(FeedFlowFileCacheListener listener) { listeners.add(listener); } /** * Check to see if a given flowfile is in the cache * * @return true if in the cache, false if not */ public boolean isCached(String flowFileId) { return cache.getIfPresent(flowFileId) != null; } /** * Get a FeedFlowFile from the cache. * If the FeedFlowFile is not there it will return null * * @return the FeedFlowFile, or null if not present */ public FeedFlowFile getEntry(String id) { return cache.getIfPresent(id); } /** * Return all the FeedFlowFiles in the cache */ public Collection<FeedFlowFile> getFlowFiles() { return new HashSet<>(cache.asMap().values()); } /** * Add a FeedFlowFile to the cache * * @param flowFileId the id of the flowfile * @param feedFlowFile the FeedFlowFile to relate/add to the cache */ public void add(String flowFileId, FeedFlowFile feedFlowFile) { cache.put(flowFileId, feedFlowFile); } /** * Return all the FeedFlowFiles in the cache that are complete and Done. * * @return the flow files that are completed */ public List<FeedFlowFile> getCompletedFeedFlowFiles() { return getFlowFiles().stream().filter(flowFile -> (flowFile.isFeedComplete())).collect(Collectors.toList()); } /** * Invalidate and remove the given FeedFlowFile from the cache * * @param flowFile the flow file to invalidate/remove */ public void invalidate(FeedFlowFile flowFile) { if (flowFile != null && flowFile.isFeedComplete()) { invalidate(flowFile.getId()); if (flowFile.getChildFlowFiles() != null) { flowFile.getChildFlowFiles().stream().forEach(flowFileId -> invalidate(flowFileId)); } listeners.stream().forEach(flowFileCacheListener -> flowFileCacheListener.onInvalidate(flowFile)); } } /** * Invalidate and remove the flowfile from the cache */ public void invalidate(String flowFileId) { cache.invalidate(flowFileId); } /** * Expire any completed FeedFlowFiles checking the {@link FeedFlowFile#isFeedComplete()} to determine if the FeedFlowFile is complete */ public void expire() { try { long start = System.currentTimeMillis(); List<FeedFlowFile> rootFiles = getCompletedFeedFlowFiles(); if (!rootFiles.isEmpty()) { for (FeedFlowFile root : rootFiles) { invalidate(root); } long stop = System.currentTimeMillis(); if (rootFiles.size() > 0) { log.info("Time to expire {} flowfile and all references {} ms. FeedFlowFile and references left in cache: {} ", rootFiles.size(), (stop - start), getFlowFiles().size()); } } if (lastPrintLogTime == null || (lastPrintLogTime != null && DateTime.now().getMillis() - lastPrintLogTime.getMillis() > (PRINT_LOG_MILLIS))) { printSummary(); lastPrintLogTime = DateTime.now(); } } catch (Exception e) { log.error("Error attempting to invalidate FlowFileGuava cache {}, {}", e.getMessage(), e); } } /** * Log some summary data about the cache and JMS activity */ public void printSummary() { Map<String, FeedFlowFile> map = cache.asMap(); log.info("FeedFlowFile Cache Size: {} ", map.size()); log.info("ProvenanceEvent JMS Stats: Sent {} statistics events to JMS. Sent {} batch events to JMS ", AggregationEventProcessingStats.getStreamingEventsSent(), AggregationEventProcessingStats.getBatchEventsSent()); } /** * Start the timer thread using the {@link this#expireTimerCheckSeconds} as the schedule interval in SECONDS */ private void initTimerThread() { ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor(); service.scheduleAtFixedRate(() -> { expire(); }, expireTimerCheckSeconds, expireTimerCheckSeconds, TimeUnit.SECONDS); } }