/**
 * Copyright 2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
package com.github.ambry.store;

import com.codahale.metrics.Timer;
import com.github.ambry.config.StoreConfig;
import com.github.ambry.utils.FileLock;
import com.github.ambry.utils.Time;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ScheduledExecutorService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * The blob store that controls the log and index.
 */
class BlobStore implements Store {
  static final String SEPARATOR = "_";
  private final static String LockFile = ".lock";

  private final String storeId;
  private final String dataDir;
  private final ScheduledExecutorService taskScheduler;
  private final DiskIOScheduler diskIOScheduler;
  private final Logger logger = LoggerFactory.getLogger(getClass());
  /* A lock that prevents concurrent writes to the log */
  private final Object lock = new Object();
  private final StoreConfig config;
  private final long capacityInBytes;
  private final StoreKeyFactory factory;
  private final MessageStoreRecovery recovery;
  private final MessageStoreHardDelete hardDelete;
  private final StoreMetrics metrics;
  private final Time time;
  private final UUID sessionId = UUID.randomUUID();

  private Log log;
  private BlobStoreCompactor compactor;
  private PersistentIndex index;
  private BlobStoreStats blobStoreStats;
  private boolean started;
  private FileLock fileLock;

  /**
   * States representing the different scenarios that can occur when a set of messages is to be written to the store.
   * Nomenclature:
   * Absent: a key that does not exist in the store.
   * Duplicate: a key that exists in the store and has the same CRC (meaning the message is the same).
   * Colliding: a key that exists in the store but has a different CRC (meaning the message is different).
   */
  private enum MessageWriteSetStateInStore {
    ALL_ABSENT, // The messages are all absent from the store.
    COLLIDING, // At least one message in the write set has the same key as another, different message in the store.
    ALL_DUPLICATE, // The messages are all duplicates - every one of them already exists in the store.
    SOME_NOT_ALL_DUPLICATE, // At least one message in the write set is a duplicate, but not all of them.
  }
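
  // An illustrative walk-through of the states above (hypothetical keys and CRCs, not part of the
  // store API): writing the set {K1, K2} when the store already holds K1 with a matching CRC and
  // K2 with a different CRC yields COLLIDING; if both CRCs matched, the result would be
  // ALL_DUPLICATE; if only K1 were present (with a matching CRC), SOME_NOT_ALL_DUPLICATE; and if
  // neither key were present, ALL_ABSENT.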

  /**
   * Constructs a {@link BlobStore} for the given data directory. The store cannot serve requests
   * until {@link #start()} has been called.
   */
  BlobStore(String storeId, StoreConfig config, ScheduledExecutorService taskScheduler,
      DiskIOScheduler diskIOScheduler, StorageManagerMetrics storageManagerMetrics, String dataDir,
      long capacityInBytes, StoreKeyFactory factory, MessageStoreRecovery recovery,
      MessageStoreHardDelete hardDelete, Time time) {
    this.metrics = storageManagerMetrics.createStoreMetrics(storeId);
    this.storeId = storeId;
    this.dataDir = dataDir;
    this.taskScheduler = taskScheduler;
    this.diskIOScheduler = diskIOScheduler;
    this.config = config;
    this.capacityInBytes = capacityInBytes;
    this.factory = factory;
    this.recovery = recovery;
    this.hardDelete = hardDelete;
    this.time = time;
    blobStoreStats = new BlobStoreStats(index, time, diskIOScheduler);
  }

  @Override
  public void start() throws StoreException {
    synchronized (lock) {
      if (started) {
        throw new StoreException("Store already started", StoreErrorCodes.Store_Already_Started);
      }
      final Timer.Context context = metrics.storeStartTime.time();
      try {
        // Check if the data dir exists. If it does not exist, create it.
        File dataFile = new File(dataDir);
        if (!dataFile.exists()) {
          logger.info("Store : {} data directory not found. creating it", dataDir);
          boolean created = dataFile.mkdir();
          if (!created) {
            throw new StoreException("Failed to create directory for data dir " + dataDir,
                StoreErrorCodes.Initialization_Error);
          }
        }
        if (!dataFile.isDirectory() || !dataFile.canRead()) {
          throw new StoreException(dataFile.getAbsolutePath() + " is either not a directory or is not readable",
              StoreErrorCodes.Initialization_Error);
        }
        // lock the directory
        fileLock = new FileLock(new File(dataDir, LockFile));
        if (!fileLock.tryLock()) {
          throw new StoreException(
              "Failed to acquire lock on file " + dataDir + ". Another process or thread is using this directory.",
              StoreErrorCodes.Initialization_Error);
        }
        StoreDescriptor storeDescriptor = new StoreDescriptor(dataDir);
        log = new Log(dataDir, capacityInBytes, config.storeSegmentSizeInBytes, metrics);
        compactor = new BlobStoreCompactor(dataDir, storeId, factory, config, metrics, diskIOScheduler, log, time,
            sessionId, storeDescriptor.getIncarnationId());
        index = new PersistentIndex(dataDir, taskScheduler, log, config, factory, recovery, hardDelete, metrics, time,
            sessionId, storeDescriptor.getIncarnationId());
        compactor.initialize(index);
        metrics.initializeIndexGauges(index, capacityInBytes);
        blobStoreStats = new BlobStoreStats(index, time, diskIOScheduler);
        started = true;
      } catch (Exception e) {
        metrics.storeStartFailure.inc();
        throw new StoreException("Error while starting store for dir " + dataDir, e,
            StoreErrorCodes.Initialization_Error);
      } finally {
        context.stop();
      }
    }
  }
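
  // A minimal lifecycle sketch, assuming the collaborators passed to the constructor are already
  // built (the store id and path below are illustrative only):
  //
  //   BlobStore store = new BlobStore("store1", config, taskScheduler, diskIOScheduler,
  //       storageManagerMetrics, "/mnt/u001/store1", capacityInBytes, factory, recovery, hardDelete, time);
  //   store.start();    // creates/locks the data dir and recovers the log and index
  //   StoreInfo info = store.get(keys, EnumSet.noneOf(StoreGetOptions.class));
  //   store.shutdown(); // closes the compactor, index and log, and releases the file lock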

  @Override
  public StoreInfo get(List<? extends StoreKey> ids, EnumSet<StoreGetOptions> storeGetOptions) throws StoreException {
    checkStarted();
    // allows concurrent gets
    final Timer.Context context = metrics.getResponse.time();
    try {
      List<BlobReadOptions> readOptions = new ArrayList<BlobReadOptions>(ids.size());
      Map<StoreKey, MessageInfo> indexMessages = new HashMap<StoreKey, MessageInfo>(ids.size());
      for (StoreKey key : ids) {
        BlobReadOptions readInfo = index.getBlobReadInfo(key, storeGetOptions);
        readOptions.add(readInfo);
        indexMessages.put(key, readInfo.getMessageInfo());
      }
      MessageReadSet readSet = new StoreMessageReadSet(readOptions);
      // We ensure that the metadata list is ordered with the order of the message read set view that the
      // log provides. This ensures ordering of all messages across the log and metadata from the index.
      List<MessageInfo> messageInfoList = new ArrayList<MessageInfo>(readSet.count());
      for (int i = 0; i < readSet.count(); i++) {
        messageInfoList.add(indexMessages.get(readSet.getKeyAt(i)));
      }
      return new StoreInfo(readSet, messageInfoList);
    } catch (StoreException e) {
      throw e;
    } catch (Exception e) {
      throw new StoreException("Unknown exception while trying to fetch blobs from store " + dataDir, e,
          StoreErrorCodes.Unknown_Error);
    } finally {
      context.stop();
    }
  }

  /**
   * Checks the state of the messages in the given {@link MessageWriteSet} in the given {@link FileSpan}.
   * @param messageSetToWrite Non-empty set of messages to write to the store.
   * @param fileSpan The fileSpan in which the check for existence of the messages has to be made.
   * @return {@link MessageWriteSetStateInStore} representing the outcome of the state check.
   * @throws StoreException relays those encountered from {@link PersistentIndex#findKey(StoreKey, FileSpan)}.
   */
  private MessageWriteSetStateInStore checkWriteSetStateInStore(MessageWriteSet messageSetToWrite, FileSpan fileSpan)
      throws StoreException {
    int existingIdenticalEntries = 0;
    for (MessageInfo info : messageSetToWrite.getMessageSetInfo()) {
      if (index.findKey(info.getStoreKey(), fileSpan) != null) {
        if (index.wasRecentlySeen(info)) {
          existingIdenticalEntries++;
          metrics.identicalPutAttemptCount.inc();
        } else {
          return MessageWriteSetStateInStore.COLLIDING;
        }
      }
    }
    if (existingIdenticalEntries == messageSetToWrite.getMessageSetInfo().size()) {
      return MessageWriteSetStateInStore.ALL_DUPLICATE;
    } else if (existingIdenticalEntries > 0) {
      return MessageWriteSetStateInStore.SOME_NOT_ALL_DUPLICATE;
    } else {
      return MessageWriteSetStateInStore.ALL_ABSENT;
    }
  }
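
  // put() below pairs the check above with an optimistic check-lock-recheck pattern: the first
  // duplicate check runs outside the write lock and, if the index end offset moved before the lock
  // was acquired, only the newly added span is re-checked. In sketch form:
  //
  //   Offset before = index.getCurrentEndOffset();
  //   state = checkWriteSetStateInStore(writeSet, null);      // unlocked, whole index
  //   synchronized (lock) {
  //     if (!index.getCurrentEndOffset().equals(before)) {
  //       state = checkWriteSetStateInStore(writeSet, span);  // re-check only the delta
  //     }
  //     if (state == ALL_ABSENT) { /* write to log, then add to index */ }
  //   }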

  @Override
  public void put(MessageWriteSet messageSetToWrite) throws StoreException {
    checkStarted();
    final Timer.Context context = metrics.putResponse.time();
    try {
      if (messageSetToWrite.getMessageSetInfo().isEmpty()) {
        throw new IllegalArgumentException("Message write set cannot be empty");
      }
      Offset indexEndOffsetBeforeCheck = index.getCurrentEndOffset();
      MessageWriteSetStateInStore state = checkWriteSetStateInStore(messageSetToWrite, null);
      if (state == MessageWriteSetStateInStore.ALL_ABSENT) {
        synchronized (lock) {
          // Validate that the log end offset was not changed. If it changed, check once again for existing
          // keys in the store.
          Offset currentIndexEndOffset = index.getCurrentEndOffset();
          if (!currentIndexEndOffset.equals(indexEndOffsetBeforeCheck)) {
            FileSpan fileSpan = new FileSpan(indexEndOffsetBeforeCheck, currentIndexEndOffset);
            state = checkWriteSetStateInStore(messageSetToWrite, fileSpan);
          }
          if (state == MessageWriteSetStateInStore.ALL_ABSENT) {
            Offset endOffsetOfLastMessage = log.getEndOffset();
            messageSetToWrite.writeTo(log);
            logger.trace("Store : {} message set written to log", dataDir);
            List<MessageInfo> messageInfo = messageSetToWrite.getMessageSetInfo();
            ArrayList<IndexEntry> indexEntries = new ArrayList<>(messageInfo.size());
            for (MessageInfo info : messageInfo) {
              FileSpan fileSpan = log.getFileSpanForMessage(endOffsetOfLastMessage, info.getSize());
              IndexValue value =
                  new IndexValue(info.getSize(), fileSpan.getStartOffset(), info.getExpirationTimeInMs());
              IndexEntry entry = new IndexEntry(info.getStoreKey(), value, info.getCrc());
              indexEntries.add(entry);
              endOffsetOfLastMessage = fileSpan.getEndOffset();
            }
            FileSpan fileSpan = new FileSpan(indexEntries.get(0).getValue().getOffset(), endOffsetOfLastMessage);
            index.addToIndex(indexEntries, fileSpan);
            logger.trace("Store : {} message set written to index", dataDir);
          }
        }
      }
      switch (state) {
        case COLLIDING:
          throw new StoreException(
              "For at least one message in the write set, another blob with the same key exists in the store",
              StoreErrorCodes.Already_Exist);
        case SOME_NOT_ALL_DUPLICATE:
          throw new StoreException(
              "At least one message in the write set, but not all, is identical to an existing entry",
              StoreErrorCodes.Already_Exist);
        case ALL_DUPLICATE:
          logger.trace("All entries to put already exist in the store, marking operation as successful");
          break;
        case ALL_ABSENT:
          logger.trace("All entries were absent, and were written to the store successfully");
          break;
      }
    } catch (StoreException e) {
      throw e;
    } catch (IOException e) {
      throw new StoreException("IO error while trying to put blobs to store " + dataDir, e, StoreErrorCodes.IOError);
    } catch (Exception e) {
      throw new StoreException("Unknown error while trying to put blobs to store " + dataDir, e,
          StoreErrorCodes.Unknown_Error);
    } finally {
      context.stop();
    }
  }
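
  // Usage note: a caller that receives StoreErrorCodes.Already_Exist cannot distinguish COLLIDING
  // from SOME_NOT_ALL_DUPLICATE by the error code alone; only a fully duplicate write set
  // (ALL_DUPLICATE) is treated as a successful, idempotent put.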

  @Override
  public void delete(MessageWriteSet messageSetToDelete) throws StoreException {
    checkStarted();
    final Timer.Context context = metrics.deleteResponse.time();
    try {
      List<MessageInfo> infoList = messageSetToDelete.getMessageSetInfo();
      Offset indexEndOffsetBeforeCheck = index.getCurrentEndOffset();
      for (MessageInfo info : infoList) {
        IndexValue value = index.findKey(info.getStoreKey());
        if (value == null) {
          throw new StoreException("Cannot delete id " + info.getStoreKey() + " since it is not present in the index.",
              StoreErrorCodes.ID_Not_Found);
        } else if (value.isFlagSet(IndexValue.Flags.Delete_Index)) {
          throw new StoreException(
              "Cannot delete id " + info.getStoreKey() + " since it is already deleted in the index.",
              StoreErrorCodes.ID_Deleted);
        }
      }
      synchronized (lock) {
        Offset currentIndexEndOffset = index.getCurrentEndOffset();
        if (!currentIndexEndOffset.equals(indexEndOffsetBeforeCheck)) {
          FileSpan fileSpan = new FileSpan(indexEndOffsetBeforeCheck, currentIndexEndOffset);
          for (MessageInfo info : infoList) {
            IndexValue value = index.findKey(info.getStoreKey(), fileSpan);
            if (value != null && value.isFlagSet(IndexValue.Flags.Delete_Index)) {
              throw new StoreException(
                  "Cannot delete id " + info.getStoreKey() + " since it is already deleted in the index.",
                  StoreErrorCodes.ID_Deleted);
            }
          }
        }
        Offset endOffsetOfLastMessage = log.getEndOffset();
        messageSetToDelete.writeTo(log);
        logger.trace("Store : {} delete mark written to log", dataDir);
        for (MessageInfo info : infoList) {
          FileSpan fileSpan = log.getFileSpanForMessage(endOffsetOfLastMessage, info.getSize());
          index.markAsDeleted(info.getStoreKey(), fileSpan);
          endOffsetOfLastMessage = fileSpan.getEndOffset();
        }
        logger.trace("Store : {} delete has been marked in the index", dataDir);
      }
    } catch (StoreException e) {
      throw e;
    } catch (IOException e) {
      throw new StoreException("IO error while trying to delete blobs from store " + dataDir, e,
          StoreErrorCodes.IOError);
    } catch (Exception e) {
      throw new StoreException("Unknown error while trying to delete blobs from store " + dataDir, e,
          StoreErrorCodes.Unknown_Error);
    } finally {
      context.stop();
    }
  }

  @Override
  public FindInfo findEntriesSince(FindToken token, long maxTotalSizeOfEntries) throws StoreException {
    checkStarted();
    final Timer.Context context = metrics.findEntriesSinceResponse.time();
    try {
      return index.findEntriesSince(token, maxTotalSizeOfEntries);
    } finally {
      context.stop();
    }
  }

  @Override
  public Set<StoreKey> findMissingKeys(List<StoreKey> keys) throws StoreException {
    checkStarted();
    final Timer.Context context = metrics.findMissingKeysResponse.time();
    try {
      return index.findMissingKeys(keys);
    } finally {
      context.stop();
    }
  }

  @Override
  public StoreStats getStoreStats() {
    return blobStoreStats;
  }

  @Override
  public boolean isKeyDeleted(StoreKey key) throws StoreException {
    checkStarted();
    final Timer.Context context = metrics.isKeyDeletedResponse.time();
    try {
      IndexValue value = index.findKey(key);
      if (value == null) {
        throw new StoreException("Key " + key + " not found in store. Cannot check if it is deleted",
            StoreErrorCodes.ID_Not_Found);
      }
      return value.isFlagSet(IndexValue.Flags.Delete_Index);
    } finally {
      context.stop();
    }
  }

  @Override
  public long getSizeInBytes() {
    return index.getLogUsedCapacity();
  }

  /**
   * Fetches the {@link CompactionDetails} based on the {@link CompactionPolicy} for this {@link BlobStore},
   * containing information about the log segments to be compacted.
   * @param compactionPolicy the {@link CompactionPolicy} to use to determine the {@link CompactionDetails}
   * @return the {@link CompactionDetails} containing information about the log segments to be compacted. Could be
   * {@code null} if there isn't anything to compact.
   * @throws StoreException on any issues while reading entries from the index
   */
  CompactionDetails getCompactionDetails(CompactionPolicy compactionPolicy) throws StoreException {
    return compactionPolicy.getCompactionDetails(capacityInBytes, index.getLogUsedCapacity(), log.getSegmentCapacity(),
        LogSegment.HEADER_SIZE, index.getLogSegmentsNotInJournal(), blobStoreStats);
  }

  @Override
  public void shutdown() throws StoreException {
    long startTimeInMs = time.milliseconds();
    synchronized (lock) {
      checkStarted();
      try {
        logger.info("Store : " + dataDir + " shutting down");
        compactor.close(30);
        index.close();
        log.close();
        started = false;
      } catch (Exception e) {
        logger.error("Store : " + dataDir + " shutdown of store failed", e);
      } finally {
        try {
          fileLock.destroy();
        } catch (IOException e) {
          logger.error("Store : " + dataDir + " IO Exception while trying to close the file lock", e);
        }
        metrics.storeShutdownTimeInMs.update(time.milliseconds() - startTimeInMs);
      }
    }
  }

  /**
   * @return {@code true} if this store has been started successfully.
   */
  boolean isStarted() {
    return started;
  }
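
  // A compaction usage sketch, using only methods defined in this class (the policy instance is
  // assumed to come from configuration):
  //
  //   CompactionDetails details = store.getCompactionDetails(compactionPolicy);
  //   if (details != null) {
  //     store.compact(details);        // see compact(CompactionDetails) below
  //   }
  //   store.maybeResumeCompaction();   // on restart, finishes any interrupted compaction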

  /**
   * Compacts the store data based on {@code details}.
   * @param details the {@link CompactionDetails} describing what needs to be compacted.
   * @throws IllegalArgumentException if any of the provided segments doesn't exist in the log or if one or more
   * offsets in the segments to compact are in the journal.
   * @throws IOException if there is any error creating the {@link CompactionLog}.
   * @throws StoreException if there are any errors during the compaction.
   */
  void compact(CompactionDetails details) throws IOException, StoreException {
    checkStarted();
    compactor.compact(details);
  }

  /**
   * Resumes a compaction if one is in progress.
   * @throws StoreException if there are any errors during the compaction.
   */
  void maybeResumeCompaction() throws StoreException {
    checkStarted();
    if (CompactionLog.isCompactionInProgress(dataDir, storeId)) {
      logger.info("Resuming compaction of {}", this);
      compactor.resumeCompaction();
    }
  }

  private void checkStarted() throws StoreException {
    if (!started) {
      throw new StoreException("Store not started", StoreErrorCodes.Store_Not_Started);
    }
  }

  @Override
  public String toString() {
    return "StoreId: " + storeId + ". DataDir: " + dataDir + ". Capacity: " + capacityInBytes;
  }
}