/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi.provenance.index.lucene; import java.io.File; import java.io.FileFilter; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.SortedMap; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.nifi.provenance.RepositoryConfiguration; import org.apache.nifi.provenance.util.DirectoryUtils; import org.apache.nifi.util.Tuple; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class IndexDirectoryManager { private static final Logger logger = LoggerFactory.getLogger(IndexDirectoryManager.class); private static final FileFilter INDEX_DIRECTORY_FILTER = f -> f.getName().startsWith("index-"); private static final Pattern INDEX_FILENAME_PATTERN = Pattern.compile("index-(\\d+)"); private final RepositoryConfiguration repoConfig; // guarded by synchronizing on 'this' private final SortedMap<Long, List<IndexLocation>> indexLocationByTimestamp = new TreeMap<>(); private final Map<String, IndexLocation> activeIndices = new HashMap<>(); public IndexDirectoryManager(final RepositoryConfiguration repoConfig) { this.repoConfig = repoConfig; } public synchronized void initialize() { final Map<File, Tuple<Long, IndexLocation>> latestIndexByStorageDir = new HashMap<>(); for (final Map.Entry<String, File> entry : repoConfig.getStorageDirectories().entrySet()) { final String partitionName = entry.getKey(); final File storageDir = entry.getValue(); final File[] indexDirs = storageDir.listFiles(INDEX_DIRECTORY_FILTER); if (indexDirs == null) { logger.warn("Unable to access Provenance Repository storage directory {}", storageDir); continue; } for (final File indexDir : indexDirs) { final Matcher matcher = INDEX_FILENAME_PATTERN.matcher(indexDir.getName()); if (!matcher.matches()) { continue; } final long startTime = DirectoryUtils.getIndexTimestamp(indexDir); final List<IndexLocation> dirsForTimestamp = indexLocationByTimestamp.computeIfAbsent(startTime, t -> new ArrayList<>()); final IndexLocation indexLoc = new IndexLocation(indexDir, startTime, partitionName); dirsForTimestamp.add(indexLoc); final Tuple<Long, IndexLocation> tuple = latestIndexByStorageDir.get(storageDir); if (tuple == null || startTime > tuple.getKey()) { latestIndexByStorageDir.put(storageDir, new Tuple<>(startTime, indexLoc)); } } } // Restore the activeIndices to point at the newest index in each storage location. for (final Tuple<Long, IndexLocation> tuple : latestIndexByStorageDir.values()) { final IndexLocation indexLoc = tuple.getValue(); activeIndices.put(indexLoc.getPartitionName(), indexLoc); } } public synchronized void deleteDirectory(final File directory) { final Iterator<Map.Entry<Long, List<IndexLocation>>> itr = indexLocationByTimestamp.entrySet().iterator(); while (itr.hasNext()) { final Map.Entry<Long, List<IndexLocation>> entry = itr.next(); final List<IndexLocation> locations = entry.getValue(); final IndexLocation locToRemove = new IndexLocation(directory, DirectoryUtils.getIndexTimestamp(directory), directory.getName()); locations.remove(locToRemove); if (locations.isEmpty()) { itr.remove(); } } } /** * Returns a List of all indexes where the latest event in the index has an event time before the given timestamp * * @param timestamp the cutoff * @return all Files that belong to an index, where the index has no events later than the given time */ public synchronized List<File> getDirectoriesBefore(final long timestamp) { final List<File> selected = new ArrayList<>(); // An index cannot be expired if it is the latest index in the storage directory. As a result, we need to // separate the indexes by Storage Directory so that we can easily determine if this is the case. final Map<String, List<IndexLocation>> startTimeWithFileByStorageDirectory = flattenDirectoriesByTimestamp().stream() .collect(Collectors.groupingBy(indexLoc -> indexLoc.getPartitionName())); // Scan through the index directories and the associated index event start time. // If looking at index N, we can determine the index end time by assuming that it is the same as the // start time of index N+1. So we determine the time range of each index and select an index only if // its start time is before the given timestamp and its end time is <= the given timestamp. for (final List<IndexLocation> startTimeWithFile : startTimeWithFileByStorageDirectory.values()) { for (int i = 0; i < startTimeWithFile.size(); i++) { final IndexLocation indexLoc = startTimeWithFile.get(i); final String partition = indexLoc.getPartitionName(); final IndexLocation activeLocation = activeIndices.get(partition); if (indexLoc.equals(activeLocation)) { continue; } final Long indexStartTime = indexLoc.getIndexStartTimestamp(); if (indexStartTime > timestamp) { // If the first timestamp in the index is later than the desired timestamp, // then we are done. We can do this because the list is ordered by monotonically // increasing timestamp as the Tuple key. break; } if (i < startTimeWithFile.size() - 1) { final IndexLocation nextLocation = startTimeWithFile.get(i + 1); final Long indexEndTime = nextLocation.getIndexStartTimestamp(); if (indexEndTime <= timestamp) { logger.debug("Considering Index Location {} older than {} ({}) because its events have an EventTime " + "ranging from {} ({}) to {} ({}) based on the following IndexLocations: {}", nextLocation, timestamp, new Date(timestamp), indexStartTime, new Date(indexStartTime), indexEndTime, new Date(indexEndTime), startTimeWithFile); selected.add(nextLocation.getIndexDirectory()); } } } } logger.debug("Returning the following list of index locations because they were finished being written to before {}: {}", timestamp, selected); return selected; } /** * Convert directoriesByTimestamp to a List of IndexLocations. * This allows us to easily get the 'next' value when iterating over the elements. * This is useful because we know that the 'next' value will have a timestamp that is when that * file started being written to - which is the same as when this index stopped being written to. * * @return a List of all IndexLocations known */ private List<IndexLocation> flattenDirectoriesByTimestamp() { final List<IndexLocation> startTimeWithFile = new ArrayList<>(); for (final Map.Entry<Long, List<IndexLocation>> entry : indexLocationByTimestamp.entrySet()) { for (final IndexLocation indexLoc : entry.getValue()) { startTimeWithFile.add(indexLoc); } } return startTimeWithFile; } public synchronized List<File> getDirectories(final Long startTime, final Long endTime) { final List<File> selected = new ArrayList<>(); // An index cannot be expired if it is the latest index in the partition. As a result, we need to // separate the indexes by partition so that we can easily determine if this is the case. final Map<String, List<IndexLocation>> startTimeWithFileByStorageDirectory = flattenDirectoriesByTimestamp().stream() .collect(Collectors.groupingBy(indexLoc -> indexLoc.getPartitionName())); for (final List<IndexLocation> locationList : startTimeWithFileByStorageDirectory.values()) { selected.addAll(getDirectories(startTime, endTime, locationList)); } return selected; } public synchronized List<File> getDirectories(final Long startTime, final Long endTime, final String partitionName) { // An index cannot be expired if it is the latest index in the partition. As a result, we need to // separate the indexes by partition so that we can easily determine if this is the case. final Map<String, List<IndexLocation>> startTimeWithFileByStorageDirectory = flattenDirectoriesByTimestamp().stream() .collect(Collectors.groupingBy(indexLoc -> indexLoc.getPartitionName())); final List<IndexLocation> indexLocations = startTimeWithFileByStorageDirectory.get(partitionName); if (indexLocations == null) { return Collections.emptyList(); } return getDirectories(startTime, endTime, indexLocations); } protected static List<File> getDirectories(final Long startTime, final Long endTime, final List<IndexLocation> locations) { final List<File> selected = new ArrayList<>(); int overlapCount = 0; for (int i = 0; i < locations.size(); i++) { final IndexLocation indexLoc = locations.get(i); final Long indexStartTimestamp = indexLoc.getIndexStartTimestamp(); if (endTime != null && indexStartTimestamp > endTime) { if (overlapCount == 0) { // Because of how we handle index timestamps and the multi-threading, it is possible // the we could have some overlap where Thread T1 gets an Event with start time 1,000 // for instance. Then T2 gets and Event with start time 1,002 and ends up creating a // new index directory with a start time of 1,002. Then T1 could end up writing events // with timestamp 1,000 to an index with a 'start time' of 1,002. Because of this, // the index start times are approximate. To address this, we include one extra Index // Directory based on start time, so that if we want index directories for Time Range // 1,000 - 1,001 and have indexes 999 and 1,002 we will include the 999 and the 'overlapping' // directory of 1,002 since it could potentially have an event with overlapping timestamp. overlapCount++; } else { continue; } } if (startTime != null) { final Long indexEndTimestamp; if (i < locations.size() - 1) { final IndexLocation nextIndexLoc = locations.get(i + 1); indexEndTimestamp = nextIndexLoc.getIndexStartTimestamp(); if (indexEndTimestamp < startTime) { continue; } } } selected.add(indexLoc.getIndexDirectory()); } return selected; } /** * Notifies the Index Directory Manager that an Index Writer has been committed for the * given index directory. This allows the Directory Manager to know that it needs to check * the size of the index directory and not return this directory as a writable directory * any more if the size has reached the configured threshold. * * @param indexDir the directory that was written to * @return <code>true</code> if the index directory has reached its max threshold and should no * longer be written to, <code>false</code> if the index directory is not full. */ public boolean onIndexCommitted(final File indexDir) { final long indexSize = getSize(indexDir); synchronized (this) { String partitionName = null; for (final Map.Entry<String, IndexLocation> entry : activeIndices.entrySet()) { if (indexDir.equals(entry.getValue().getIndexDirectory())) { partitionName = entry.getKey(); break; } } // If the index is not the active index directory, it should no longer be written to. if (partitionName == null) { logger.debug("Size of Provenance Index at {} is now {}. However, was unable to find the appropriate Active Index to roll over.", indexDir, indexSize); return true; } // If the index size >= desired index size, it should no longer be written to. if (indexSize >= repoConfig.getDesiredIndexSize()) { logger.info("Size of Provenance Index at {} is now {}. Will close this index and roll over to a new one.", indexDir, indexSize); activeIndices.remove(partitionName); return true; } // Index directory is the active index directory and has not yet exceeded the desired size. return false; } } public synchronized Optional<File> getActiveIndexDirectory(final String partitionName) { final IndexLocation indexLocation = activeIndices.get(partitionName); if (indexLocation == null) { return Optional.empty(); } return Optional.of(indexLocation.getIndexDirectory()); } private long getSize(final File indexDir) { if (!indexDir.exists()) { return 0L; } if (!indexDir.isDirectory()) { throw new IllegalArgumentException("Must specify a directory but specified " + indexDir); } // List all files in the Index Directory. final File[] files = indexDir.listFiles(); if (files == null) { return 0L; } long sum = 0L; for (final File file : files) { sum += file.length(); } return sum; } /** * Provides the File that is the directory for the index that should be written to. If there is no index yet * to be written to, or if the index has reached its max size, a new one will be created. The given {@code earliestTimestamp} * should represent the event time of the first event that will go into the index. This is used for file naming purposes so * that the appropriate directories can be looked up quickly later. * * @param earliestTimestamp the event time of the first event that will go into a new index, if a new index is created by this call. * @param partitionName the name of the partition to write to * @return the directory that should be written to */ public synchronized File getWritableIndexingDirectory(final long earliestTimestamp, final String partitionName) { IndexLocation indexLoc = activeIndices.get(partitionName); if (indexLoc == null) { indexLoc = new IndexLocation(createIndex(earliestTimestamp, partitionName), earliestTimestamp, partitionName); logger.debug("Created new Index Directory {}", indexLoc); indexLocationByTimestamp.computeIfAbsent(earliestTimestamp, t -> new ArrayList<>()).add(indexLoc); activeIndices.put(partitionName, indexLoc); } return indexLoc.getIndexDirectory(); } private File createIndex(final long earliestTimestamp, final String partitionName) { final File storageDir = repoConfig.getStorageDirectories().entrySet().stream() .filter(e -> e.getKey().equals(partitionName)) .map(Map.Entry::getValue) .findFirst() .orElseThrow(() -> new IllegalArgumentException("Invalid Partition: " + partitionName)); final File indexDir = new File(storageDir, "index-" + earliestTimestamp); return indexDir; } }