/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.provenance;

import java.io.EOFException;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.nifi.provenance.serialization.RecordReader;
import org.apache.nifi.provenance.serialization.RecordReaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Keeps track of the index directories (directories named {@code index-<number>}) that
 * exist beneath each storage directory of the repository, together with the minimum and
 * maximum event ids that have been indexed.
 *
 * <p>
 * The directory map and the indexed-id bounds are only read or modified by the public
 * methods while holding an internal {@link ReentrantLock}.
 * </p>
 */
public class IndexConfiguration {

    private static final Logger logger = LoggerFactory.getLogger(IndexConfiguration.class);

    // Compiled once for the class: the pattern is immutable, so there is no reason to
    // recompile it for every instance. Group 1 captures the numeric suffix of the name.
    private static final Pattern INDEX_NAME_PATTERN = Pattern.compile("index-(\\d+)");

    private final RepositoryConfiguration repoConfig;

    // storage directory -> index directories known to live beneath it
    private final Map<File, List<File>> indexDirectoryMap = new HashMap<>();
    private final Lock lock = new ReentrantLock();

    private Long maxIndexedId = null;
    private Long minIndexedId = null;

    /**
     * Creates the configuration and immediately scans every configured storage directory
     * for existing index directories.
     *
     * @param repoConfig the repository configuration that supplies the storage directories
     *            and the desired index size
     */
    public IndexConfiguration(final RepositoryConfiguration repoConfig) {
        this.repoConfig = repoConfig;
        indexDirectoryMap.putAll(recoverIndexDirectories());
    }

    /**
     * Scans each storage directory for sub-directories whose name matches
     * {@code index-<digits>}.
     *
     * @return a map from each storage directory to the (possibly empty) list of index
     *         directories found directly beneath it
     */
    private Map<File, List<File>> recoverIndexDirectories() {
        final Map<File, List<File>> recovered = new HashMap<>();

        for (final File storageDirectory : repoConfig.getStorageDirectories().values()) {
            final List<File> indexDirectories = new ArrayList<>();
            final File[] matching = storageDirectory.listFiles(new FileFilter() {
                @Override
                public boolean accept(final File pathname) {
                    return pathname.isDirectory() && INDEX_NAME_PATTERN.matcher(pathname.getName()).matches();
                }
            });

            // listFiles returns null when the directory cannot be listed
            if (matching != null) {
                for (final File matchingFile : matching) {
                    indexDirectories.add(matchingFile);
                }
            }

            recovered.put(storageDirectory, indexDirectories);
        }

        return recovered;
    }

    /**
     * Determines the event time of the first record in the given provenance log file.
     *
     * @param provenanceLogFile the log file to inspect; may be {@code null}
     * @return the event time of the first record; the file's last-modified time if the
     *         reader yields no record; or {@code null} if the file is {@code null}, cannot
     *         be found, ends prematurely, or cannot be read
     */
    private Long getFirstEntryTime(final File provenanceLogFile) {
        if (provenanceLogFile == null) {
            return null;
        }

        try (final RecordReader reader = RecordReaders.newRecordReader(provenanceLogFile, null, Integer.MAX_VALUE)) {
            final StandardProvenanceEventRecord firstRecord = reader.nextRecord();
            if (firstRecord == null) {
                return provenanceLogFile.lastModified();
            }
            return firstRecord.getEventTime();
        } catch (final FileNotFoundException | EOFException fnf) {
            return null; // file no longer exists or there's no record in this file
        } catch (final IOException ioe) {
            logger.warn("Failed to read first entry in file {} due to {}", provenanceLogFile, ioe.toString());
            logger.warn("", ioe);
            return null;
        }
    }

    /**
     * Forgets the given index directory. Any storage directory whose list of index
     * directories becomes (or already is) empty is removed from the map as well.
     *
     * @param indexDirectory the index directory to stop tracking
     */
    public void removeIndexDirectory(final File indexDirectory) {
        lock.lock();
        try {
            // Keys are collected first and removed afterwards so that the map is not
            // modified while its entry set is being iterated.
            final Set<File> keysToRemove = new HashSet<>();
            for (final Map.Entry<File, List<File>> entry : indexDirectoryMap.entrySet()) {
                final List<File> list = entry.getValue();
                list.remove(indexDirectory);

                if (list.isEmpty()) {
                    keysToRemove.add(entry.getKey());
                }
            }

            for (final File keyToRemove : keysToRemove) {
                indexDirectoryMap.remove(keyToRemove);
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * Returns the index directory that should be written to for the given provenance log
     * file, using the log file's parent directory as the storage directory.
     *
     * @param provenanceLogFile the log file that is about to be indexed
     * @param newIndexTimestamp the timestamp to use in a new index's name when the first
     *            entry time of the log file cannot be determined
     * @return the index directory to write to
     */
    public File getWritableIndexDirectory(final File provenanceLogFile, final long newIndexTimestamp) {
        return getWritableIndexDirectoryForStorageDirectory(provenanceLogFile.getParentFile(), provenanceLogFile, newIndexTimestamp);
    }

    /**
     * Returns the index directory that should be written to for the given storage
     * directory. The most recently added index directory is reused unless none is known
     * or its size exceeds the configured desired index size, in which case a new index
     * directory is registered and returned.
     *
     * @param storageDirectory the storage directory that holds (or will hold) the index
     * @param provenanceLogFile the log file that is about to be indexed
     * @param newIndexTimestamp the timestamp to use in a new index's name when the first
     *            entry time of the log file cannot be determined
     * @return the index directory to write to
     */
    public File getWritableIndexDirectoryForStorageDirectory(final File storageDirectory, final File provenanceLogFile, final long newIndexTimestamp) {
        lock.lock();
        try {
            final List<File> known = indexDirectoryMap.get(storageDirectory);

            if (known != null && !known.isEmpty()) {
                final File lastDir = known.get(known.size() - 1);
                final long size = getSize(lastDir);
                final boolean exceedsDesiredSize = size > repoConfig.getDesiredIndexSize();
                if (!exceedsDesiredSize) {
                    return lastDir;
                }
            }

            // Either nothing is known for this storage directory yet or the newest index
            // has outgrown the desired size: roll over to a new index directory.
            final File newDir = addNewIndex(storageDirectory, provenanceLogFile, newIndexTimestamp);
            if (known == null) {
                final List<File> created = new ArrayList<>();
                created.add(newDir);
                indexDirectoryMap.put(storageDirectory, created);
            } else {
                known.add(newDir);
            }
            return newDir;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Builds the {@link File} for a new index directory beneath the given storage
     * directory. This only constructs the path object; it does not register it in the map.
     *
     * @param storageDirectory the directory that will contain the index
     * @param provenanceLogFile the log file whose first entry time names the index
     * @param newIndexTimestamp fallback timestamp used when the first entry time is unknown
     * @return the new index directory, named {@code index-<timestamp>}
     */
    private File addNewIndex(final File storageDirectory, final File provenanceLogFile, final long newIndexTimestamp) {
        // Build the event time of the first record into the index's filename so that we can determine
        // which index files to look at when we perform a search. We use the timestamp of the first record
        // in the Provenance Log file, rather than the current time, because we may perform the Indexing
        // retroactively.
        Long firstEntryTime = getFirstEntryTime(provenanceLogFile);
        if (firstEntryTime == null) {
            firstEntryTime = newIndexTimestamp;
        }
        return new File(storageDirectory, "index-" + firstEntryTime);
    }

    /**
     * @return a new list containing every known index directory across all storage
     *         directories
     */
    public List<File> getIndexDirectories() {
        lock.lock();
        try {
            final List<File> files = new ArrayList<>();
            for (final List<File> list : indexDirectoryMap.values()) {
                files.addAll(list);
            }
            return files;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Extracts the numeric suffix from an index directory's name.
     *
     * @param indexDir the index directory; may be {@code null}
     * @return the number encoded in the directory name, or {@code -1} if the directory is
     *         {@code null}, its name does not match {@code index-<digits>}, or the digits
     *         do not fit in a {@code long}
     */
    private long getIndexStartTime(final File indexDir) {
        if (indexDir == null) {
            return -1L;
        }

        final Matcher matcher = INDEX_NAME_PATTERN.matcher(indexDir.getName());
        if (!matcher.matches()) {
            return -1L;
        }

        try {
            return Long.parseLong(matcher.group(1));
        } catch (final NumberFormatException nfe) {
            // The pattern accepts any number of digits, so a directory such as
            // "index-99999999999999999999" matches but overflows a long. Treat it as
            // "start time unknown" rather than letting the exception escape from the
            // comparators and filters that call this method.
            return -1L;
        }
    }

    /**
     * Returns the index directories that are applicable only for the given time
     * span (times inclusive).
     *
     * @param startTime the start time of the query for which the indices are
     *            desired; {@code null} means no lower bound
     * @param endTime the end time of the query for which the indices are
     *            desired; {@code null} means no upper bound
     * @return the index directories that are applicable only for the given time
     *         span (times inclusive). When at least one bound is given, the result is
     *         ordered by the start time encoded in the directory name, newest first.
     */
    public List<File> getIndexDirectories(final Long startTime, final Long endTime) {
        if (startTime == null && endTime == null) {
            return getIndexDirectories();
        }

        final List<File> dirs = new ArrayList<>();
        lock.lock();
        try {
            // Sort directories so that we return the newest index first
            final List<File> sortedIndexDirectories = getIndexDirectories();
            Collections.sort(sortedIndexDirectories, new Comparator<File>() {
                @Override
                public int compare(final File o1, final File o2) {
                    final long epochTimestamp1 = getIndexStartTime(o1);
                    final long epochTimestamp2 = getIndexStartTime(o2);
                    return Long.compare(epochTimestamp2, epochTimestamp1);
                }
            });

            for (final File indexDir : sortedIndexDirectories) {
                // If the index was last modified before the start time, we know that it doesn't
                // contain any data for us to query.
                if (startTime != null && indexDir.lastModified() < startTime) {
                    continue;
                }

                // If the index was created after the given end time, we know it doesn't contain any
                // data for us to query.
                if (endTime != null) {
                    final long indexStartTime = getIndexStartTime(indexDir);
                    if (indexStartTime > endTime) {
                        continue;
                    }
                }

                dirs.add(indexDir);
            }

            return dirs;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Returns the index directories that are applicable only for the given
     * event log
     *
     * @param provenanceLogFile the provenance log file for which the index
     *            directories are desired
     * @return the index directories that are applicable only for the given
     *         event log, ordered by the start time encoded in the directory name, oldest
     *         first; an empty list if the log file's storage directory is unknown or no
     *         first entry time can be determined for the log file
     */
    public List<File> getIndexDirectories(final File provenanceLogFile) {
        final List<File> dirs = new ArrayList<>();
        lock.lock();
        try {
            final List<File> indices = indexDirectoryMap.get(provenanceLogFile.getParentFile());
            if (indices == null) {
                return Collections.<File>emptyList();
            }

            // Sort a copy so that the list held in the map keeps its insertion order
            final List<File> sortedIndexDirectories = new ArrayList<>(indices);
            Collections.sort(sortedIndexDirectories, new Comparator<File>() {
                @Override
                public int compare(final File o1, final File o2) {
                    final long epochTimestamp1 = getIndexStartTime(o1);
                    final long epochTimestamp2 = getIndexStartTime(o2);
                    return Long.compare(epochTimestamp1, epochTimestamp2);
                }
            });

            final Long firstEntryTime = getFirstEntryTime(provenanceLogFile);
            if (firstEntryTime == null) {
                logger.debug("Found no records in {} so returning no Indices for it", provenanceLogFile);
                return Collections.<File>emptyList();
            }

            boolean foundIndexCreatedLater = false;
            for (final File indexDir : sortedIndexDirectories) {
                // If the index was last modified before the log file was created, we know the index doesn't include
                // any data for the provenance log.
                if (indexDir.lastModified() < firstEntryTime) {
                    continue;
                }

                final long indexStartTime = getIndexStartTime(indexDir);
                if (indexStartTime > provenanceLogFile.lastModified()) {
                    // the index was created after the provenance log file was finished being modified.
                    // Either this index doesn't contain info for this provenance log OR this provenance log
                    // file triggered the index to be created. If we've already seen another index that was created
                    // after this log file was finished being modified, we can rest assured that this index wasn't
                    // created for the log file (because the previous one was or the one before that or the one before
                    // that, etc.)
                    if (foundIndexCreatedLater) {
                        continue;
                    } else {
                        foundIndexCreatedLater = true;
                    }
                }

                dirs.add(indexDir);
            }

            return dirs;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Sums the lengths of the files directly inside the given index directory.
     *
     * @param indexDirectory the index directory to measure
     * @return the total length in bytes of the directory's immediate children, or
     *         {@code 0} if the directory does not exist or cannot be listed
     * @throws IllegalArgumentException if the given file exists but is not a directory
     */
    private long getSize(final File indexDirectory) {
        if (!indexDirectory.exists()) {
            return 0L;
        }
        if (!indexDirectory.isDirectory()) {
            throw new IllegalArgumentException("Must specify a directory but specified " + indexDirectory);
        }

        // List all files in the Index Directory.
        final File[] files = indexDirectory.listFiles();
        if (files == null) {
            return 0L;
        }

        long sum = 0L;
        for (final File file : files) {
            sum += file.length();
        }

        return sum;
    }

    /**
     * @return the amount of disk space in bytes used by all of the indices
     */
    public long getIndexSize() {
        lock.lock();
        try {
            long sum = 0L;
            for (final File indexDir : getIndexDirectories()) {
                sum += getSize(indexDir);
            }

            return sum;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Records the given id as the maximum indexed id if it is larger than the currently
     * recorded maximum (or if no maximum has been recorded yet).
     *
     * @param id the id of an event that has been indexed
     */
    public void setMaxIdIndexed(final long id) {
        lock.lock();
        try {
            if (maxIndexedId == null || id > maxIndexedId) {
                maxIndexedId = id;
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * @return the largest id recorded via {@link #setMaxIdIndexed(long)}, or {@code null}
     *         if none has been recorded
     */
    public Long getMaxIdIndexed() {
        lock.lock();
        try {
            return maxIndexedId;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Raises the minimum indexed id. The call has no effect unless the given id is larger
     * than the currently recorded minimum (or no minimum has been recorded yet). When the
     * id is larger than the recorded maximum, or no maximum has been recorded, the minimum
     * is set to the recorded maximum instead of the given id.
     *
     * @param id the candidate minimum indexed id
     */
    public void setMinIdIndexed(final long id) {
        lock.lock();
        try {
            if (minIndexedId == null || id > minIndexedId) {
                if (maxIndexedId == null || id > maxIndexedId) { // id will be > maxIndexedId if all records were expired
                    minIndexedId = maxIndexedId;
                } else {
                    minIndexedId = id;
                }
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * @return the minimum indexed id as maintained by {@link #setMinIdIndexed(long)}, or
     *         {@code null} if none has been recorded
     */
    public Long getMinIdIndexed() {
        lock.lock();
        try {
            return minIndexedId;
        } finally {
            lock.unlock();
        }
    }
}