/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Charsets; import com.google.common.collect.Lists; import io.hops.common.IDsGeneratorFactory; import io.hops.common.IDsMonitor; import io.hops.common.INodeUtil; import io.hops.erasure_coding.Codec; import io.hops.erasure_coding.ErasureCodingManager; import io.hops.exception.LockUpgradeException; import io.hops.exception.StorageCallPreventedException; import io.hops.exception.StorageException; import io.hops.exception.TransactionContextException; import io.hops.leader_election.node.ActiveNode; import io.hops.metadata.HdfsStorageFactory; import io.hops.metadata.HdfsVariables; import io.hops.metadata.hdfs.dal.BlockChecksumDataAccess; import io.hops.metadata.hdfs.dal.EncodingStatusDataAccess; import io.hops.metadata.hdfs.dal.INodeAttributesDataAccess; import io.hops.metadata.hdfs.dal.INodeDataAccess; import io.hops.metadata.hdfs.dal.MetadataLogDataAccess; import io.hops.metadata.hdfs.dal.SafeBlocksDataAccess; import io.hops.metadata.hdfs.dal.SizeLogDataAccess; import io.hops.metadata.hdfs.entity.BlockChecksum; import io.hops.metadata.hdfs.entity.EncodingPolicy; import io.hops.metadata.hdfs.entity.EncodingStatus; import io.hops.metadata.hdfs.entity.INodeIdentifier; import io.hops.metadata.hdfs.entity.LeasePath; import io.hops.metadata.hdfs.entity.MetadataLogEntry; import io.hops.metadata.hdfs.entity.ProjectedINode; import io.hops.metadata.hdfs.entity.SizeLogEntry; import io.hops.metadata.hdfs.entity.SubTreeOperation; import io.hops.resolvingcache.Cache; import io.hops.security.Users; import io.hops.transaction.EntityManager; import io.hops.transaction.context.RootINodeCache; import io.hops.transaction.handler.EncodingStatusOperationType; import io.hops.transaction.handler.HDFSOperationType; import io.hops.transaction.handler.HopsTransactionalRequestHandler; import io.hops.transaction.handler.LightWeightRequestHandler; import io.hops.transaction.lock.LockFactory; import io.hops.transaction.lock.TransactionLockTypes.INodeLockType; import io.hops.transaction.lock.TransactionLockTypes.INodeResolveType; import io.hops.transaction.lock.TransactionLockTypes.LockType; import io.hops.transaction.lock.TransactionLocks; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FsServerDefaults; import 
org.apache.hadoop.fs.InvalidPathException; import org.apache.hadoop.fs.Options; import org.apache.hadoop.fs.Options.Rename; import org.apache.hadoop.fs.ParentNotDirectoryException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnresolvedLinkException; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.PermissionStatus; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DirectoryListing; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.protocol.QuotaExceededException; import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException; import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager; import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics; import org.apache.hadoop.hdfs.server.blockmanagement.MutableBlockCollection; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.StorageInfo; import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean; import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.ipc.Server; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.util.MBeans; import org.apache.hadoop.net.NetworkTopology; 
import org.apache.hadoop.net.Node; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; import org.apache.hadoop.security.token.SecretManager.InvalidToken; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.delegation.DelegationKey; import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.DataChecksum; import org.apache.hadoop.util.Time; import org.apache.hadoop.util.VersionInfo; import org.mortbay.util.ajax.JSON; import javax.management.NotCompliantMBeanException; import javax.management.ObjectName; import javax.management.StandardMBean; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.StringWriter; import java.lang.management.ManagementFactory; import java.net.InetAddress; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import static io.hops.transaction.lock.LockFactory.BLK; import static io.hops.transaction.lock.LockFactory.getInstance; import io.hops.transaction.lock.SubtreeLockedException; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import static org.apache.hadoop.util.Time.now; /** * ************************************************ * FSNamesystem does the actual bookkeeping work for the * DataNode. * <p/> * It tracks several important tables. * <p/> * 1) valid fsname --> blocklist (kept on disk, logged) * 2) Set of all valid blocks (inverted #1) * 3) block --> machinelist (kept in memory, rebuilt dynamically from reports) * 4) machine --> blocklist (inverted #2) * 5) LRU cache of updated-heartbeat machines * ************************************************* */ @InterfaceAudience.Private @Metrics(context = "dfs") public class FSNamesystem implements Namesystem, FSClusterStats, FSNamesystemMBean, NameNodeMXBean { public static final Log LOG = LogFactory.getLog(FSNamesystem.class); private static final ThreadLocal<StringBuilder> auditBuffer = new ThreadLocal<StringBuilder>() { @Override protected StringBuilder initialValue() { return new StringBuilder(); } }; private boolean isAuditEnabled() { return !isDefaultAuditLogger || auditLog.isInfoEnabled(); } private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink) throws IOException, StorageException { return (isAuditEnabled() && isExternalInvocation()) ? 
dir.getFileInfo(path, resolveSymlink) : null; } private void logAuditEvent(boolean succeeded, String cmd, String src) throws IOException { logAuditEvent(succeeded, cmd, src, null, null); } private void logAuditEvent(boolean succeeded, String cmd, String src, String dst, HdfsFileStatus stat) throws IOException { if (isAuditEnabled() && isExternalInvocation()) { logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(), cmd, src, dst, stat); } } private void logAuditEvent(boolean succeeded, UserGroupInformation ugi, InetAddress addr, String cmd, String src, String dst, HdfsFileStatus stat) { FileStatus status = null; if (stat != null) { Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null; Path path = dst != null ? new Path(dst) : new Path(src); status = new FileStatus(stat.getLen(), stat.isDir(), stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(), stat.getAccessTime(), stat.getPermission(), stat.getOwner(), stat.getGroup(), symlink, path); } for (AuditLogger logger : auditLoggers) { logger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst, status); } } /** * Logger for audit events, noting successful FSNamesystem operations. Emits * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated * <code>key=value</code> pairs to be written for the following properties: * <code> * ugi=<ugi in RPC> * ip=<remote IP> * cmd=<command> * src=<src path> * dst=<dst path (optional)> * perm=<permissions (optional)> * </code> */ public static final Log auditLog = LogFactory.getLog(FSNamesystem.class.getName() + ".audit"); static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100; static int BLOCK_DELETION_INCREMENT = 1000; private final boolean isPermissionEnabled; private final boolean persistBlocks; private final UserGroupInformation fsOwner; private final String fsOwnerShortUserName; private final String supergroup; // Scan interval is not configurable. private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL = TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS); final DelegationTokenSecretManager dtSecretManager; private final boolean alwaysUseDelegationTokensForTests; // Tracks whether the default audit logger is the only configured audit // logger; this allows isAuditEnabled() to return false in case the // underlying logger is disabled, and avoid some unnecessary work. private final boolean isDefaultAuditLogger; private final List<AuditLogger> auditLoggers; /** * The namespace tree. */ FSDirectory dir; private final BlockManager blockManager; private final DatanodeStatistics datanodeStatistics; // Block pool ID used by this namenode //HOP made it final and its value is now read from the config file. All namenodes should have the same block pool id private final String blockPoolId; final LeaseManager leaseManager = new LeaseManager(this); Daemon smmthread = null; // SafeModeMonitor thread Daemon nnrmthread = null; // NamenodeResourceMonitor thread private volatile boolean hasResourcesAvailable = true; //HOP: yes, we have a huge namespace private volatile boolean fsRunning = true; /** * The start time of the namesystem. 
*/ private final long startTime = now(); /** * The interval of namenode checking for the disk space availability */ private final long resourceRecheckInterval; private final FsServerDefaults serverDefaults; private final boolean supportAppends; private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure; private volatile SafeModeInfo safeMode; // safe mode information private final long maxFsObjects; // maximum number of fs objects // precision of access times. private final long accessTimePrecision; private NameNode nameNode; private final Configuration conf; private final QuotaUpdateManager quotaUpdateManager; private final boolean legacyDeleteEnabled; private final boolean legacyRenameEnabled; private final boolean legacyContentSummaryEnabled; private final boolean legacySetQuotaEnabled; private final ExecutorService subtreeOperationsExecutor; private final boolean erasureCodingEnabled; private final ErasureCodingManager erasureCodingManager; private final long BIGGEST_DELETEABLE_DIR; /** * Clear all loaded data */ void clear() throws IOException { dir.reset(); dtSecretManager.reset(); leaseManager.removeAllLeases(); } @VisibleForTesting LeaseManager getLeaseManager() { return leaseManager; } /** * Instantiates an FSNamesystem loaded from the image and edits * directories specified in the passed Configuration. * * @param conf * the Configuration which specifies the storage directories * from which to load * @return an FSNamesystem which contains the loaded namespace * @throws IOException * if loading fails */ public static FSNamesystem loadFromDisk(Configuration conf, NameNode namenode) throws IOException { FSNamesystem namesystem = new FSNamesystem(conf, namenode); StartupOption startOpt = NameNode.getStartupOption(conf); if (startOpt == StartupOption.RECOVER) { namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER); } long loadStart = now(); namesystem.dir .imageLoadComplete(); //HOP: this function was called inside the namesystem.loadFSImage(...) which is commented out long timeTakenToLoadFSImage = now() - loadStart; LOG.info( "Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs"); NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics(); if (nnMetrics != null) { nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage); } return namesystem; } /** * Create an FSNamesystem. 
* * @param conf * configuration * @param namenode * the namenode * @throws IOException * on bad configuration */ FSNamesystem(Configuration conf, NameNode namenode) throws IOException { try { this.conf = conf; this.nameNode = namenode; resourceRecheckInterval = conf.getLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT); this.blockManager = new BlockManager(this, this, conf); this.erasureCodingEnabled = ErasureCodingManager.isErasureCodingEnabled(conf); this.erasureCodingManager = new ErasureCodingManager(this, conf); this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics(); this.fsOwner = UserGroupInformation.getCurrentUser(); this.fsOwnerShortUserName = fsOwner.getShortUserName(); this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT); this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY, DFS_PERMISSIONS_ENABLED_DEFAULT); blockPoolId = StorageInfo.getStorageInfoFromDB().getBlockPoolId(); blockManager.setBlockPoolId(blockPoolId); hopSpecificInitialization(conf); this.quotaUpdateManager = new QuotaUpdateManager(this, conf); legacyDeleteEnabled = conf.getBoolean(DFS_LEGACY_DELETE_ENABLE_KEY, DFS_LEGACY_DELETE_ENABLE_DEFAULT); legacyRenameEnabled = conf.getBoolean(DFS_LEGACY_RENAME_ENABLE_KEY, DFS_LEGACY_RENAME_ENABLE_DEFAULT); legacyContentSummaryEnabled = conf.getBoolean(DFS_LEGACY_CONTENT_SUMMARY_ENABLE_KEY, DFS_LEGACY_CONTENT_SUMMARY_ENABLE_DEFAULT); legacySetQuotaEnabled = conf.getBoolean(DFS_LEGACY_SET_QUOTA_ENABLE_KEY, DFS_LEGACY_SET_QUOTA_ENABLE_DEFAULT); subtreeOperationsExecutor = Executors.newFixedThreadPool( conf.getInt(DFS_SUBTREE_EXECUTOR_LIMIT_KEY, DFS_SUBTREE_EXECUTOR_LIMIT_DEFAULT)); BIGGEST_DELETEABLE_DIR = conf.getLong(DFS_DIR_DELETE_BATCH_SIZE, DFS_DIR_DELETE_BATCH_SIZE_DEFAULT); LOG.info("fsOwner = " + fsOwner); LOG.info("supergroup = " + supergroup); LOG.info("isPermissionEnabled = " + isPermissionEnabled); final boolean persistBlocks = conf.getBoolean(DFS_PERSIST_BLOCKS_KEY, DFS_PERSIST_BLOCKS_DEFAULT); this.persistBlocks = persistBlocks; // Get the checksum type from config String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT); DataChecksum.Type checksumType; try { checksumType = DataChecksum.Type.valueOf(checksumTypeStr); } catch (IllegalArgumentException iae) { throw new IOException( "Invalid checksum type in " + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr); } this.serverDefaults = new FsServerDefaults( conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT), conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT), conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT), (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT), conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT), conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT), conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT), checksumType); this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, DFS_NAMENODE_MAX_OBJECTS_DEFAULT); this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT); this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT); LOG.info("Append Enabled: " + supportAppends); this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf); // For testing purposes, allow the DT secret manager to be started 
regardless // of whether security is enabled. alwaysUseDelegationTokensForTests = conf.getBoolean(DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT); this.dtSecretManager = createDelegationTokenSecretManager(conf); this.dir = new FSDirectory(this, conf); this.safeMode = new SafeModeInfo(conf); this.auditLoggers = initAuditLoggers(conf); this.isDefaultAuditLogger = auditLoggers.size() == 1 && auditLoggers.get(0) instanceof DefaultAuditLogger; } catch (IOException e) { LOG.error(getClass().getSimpleName() + " initialization failed.", e); close(); throw e; } catch (RuntimeException re) { LOG.error(getClass().getSimpleName() + " initialization failed.", re); close(); throw re; } } private List<AuditLogger> initAuditLoggers(Configuration conf) { // Initialize the custom access loggers if configured. Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY); List<AuditLogger> auditLoggers = Lists.newArrayList(); if (alClasses != null && !alClasses.isEmpty()) { for (String className : alClasses) { try { AuditLogger logger; if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) { logger = new DefaultAuditLogger(); } else { logger = (AuditLogger) Class.forName(className).newInstance(); } logger.initialize(conf); auditLoggers.add(logger); } catch (RuntimeException re) { throw re; } catch (Exception e) { throw new RuntimeException(e); } } } // Make sure there is at least one logger installed. if (auditLoggers.isEmpty()) { auditLoggers.add(new DefaultAuditLogger()); } return auditLoggers; } private void startSecretManager() { if (dtSecretManager != null) { try { dtSecretManager.startThreads(); } catch (IOException e) { // Inability to start secret manager // can't be recovered from. 
throw new RuntimeException(e); } } } private void startSecretManagerIfNecessary() throws IOException { boolean shouldRun = shouldUseDelegationTokens() && !isInSafeMode(); boolean running = dtSecretManager.isRunning(); if (shouldRun && !running) { startSecretManager(); } } private void stopSecretManager() { if (dtSecretManager != null) { dtSecretManager.stopThreads(); } } /** * Start services common * configuration * * @param conf * @throws IOException */ void startCommonServices(Configuration conf) throws IOException { this.registerMBean(); // register the MBean for the FSNamesystemState IDsMonitor.getInstance().start(); if (isClusterInSafeMode()) { assert safeMode != null && !safeMode.isPopulatingReplQueues(); setBlockTotal(); performPendingSafeModeOperation(); } blockManager.activate(conf); RootINodeCache.start(); if (dir.isQuotaEnabled()) { quotaUpdateManager.activate(); } registerMXBean(); DefaultMetricsSystem.instance().register(this); } /** * Stop services common * */ void stopCommonServices() { if (blockManager != null) { blockManager.close(); } if (quotaUpdateManager != null) { quotaUpdateManager.close(); } RootINodeCache.stop(); } /** * Start services required in active state * * @throws IOException */ void startActiveServices() throws IOException { LOG.info("Starting services required for active state"); LOG.info("Catching up to latest edits from old active before " + "taking over writer role in edits logs"); blockManager.getDatanodeManager().markAllDatanodesStale(); if (isClusterInSafeMode()) { if (!isInSafeMode() || (isInSafeMode() && safeMode.isPopulatingReplQueues())) { LOG.info("Reprocessing replication and invalidation queues"); blockManager.processMisReplicatedBlocks(); } } leaseManager.startMonitor(); startSecretManagerIfNecessary(); //ResourceMonitor required only at ActiveNN. See HDFS-2914 this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); nnrmthread.start(); if (erasureCodingEnabled) { erasureCodingManager.activate(); } } private boolean shouldUseDelegationTokens() { return UserGroupInformation.isSecurityEnabled() || alwaysUseDelegationTokensForTests; } /** * Stop services required in active state * * @throws InterruptedException */ void stopActiveServices() { LOG.info("Stopping services started for active state"); stopSecretManager(); if (leaseManager != null) { leaseManager.stopMonitor(); } if (nnrmthread != null) { ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor(); nnrmthread.interrupt(); } if (erasureCodingManager != null) { erasureCodingManager.close(); } } NamespaceInfo getNamespaceInfo() throws IOException { return unprotectedGetNamespaceInfo(); } /** * Version of @see #getNamespaceInfo() that is not protected by a lock. */ NamespaceInfo unprotectedGetNamespaceInfo() throws IOException { StorageInfo storageInfo = StorageInfo.getStorageInfoFromDB(); return new NamespaceInfo(storageInfo.getNamespaceID(), getClusterId(), getBlockPoolId(), storageInfo.getCTime()); } /** * Close down this file system manager. * Causes heartbeat and lease daemons to stop; waits briefly for * them to finish, but a short timeout returns control back to caller. 
*/ void close() { fsRunning = false; try { stopCommonServices(); if (smmthread != null) { smmthread.interrupt(); } } finally { // using finally to ensure we also wait for lease daemon try { stopActiveServices(); if (dir != null) { dir.close(); } } catch (IOException ie) { LOG.error("Error closing FSDirectory", ie); IOUtils.cleanup(LOG, dir); } } } @Override public boolean isRunning() { return fsRunning; } long getDefaultBlockSize() { return serverDefaults.getBlockSize(); } FsServerDefaults getServerDefaults() { return serverDefaults; } long getAccessTimePrecision() { return accessTimePrecision; } private boolean isAccessTimeSupported() { return accessTimePrecision > 0; } ///////////////////////////////////////////////////////// // // These methods are called by HadoopFS clients // ///////////////////////////////////////////////////////// /** * Set permissions for an existing file. * * @throws IOException */ void setPermissionSTO(final String src, final FsPermission permission) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException { boolean txFailed = true; INodeIdentifier inode = null; try { inode = lockSubtreeAndCheckPathPermission(src, true, null, null, null, null, SubTreeOperation.StoOperationType.SET_PERMISSION_STO); final boolean isSto = inode != null; new HopsTransactionalRequestHandler(HDFSOperationType.SUBTREE_SETPERMISSION, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/, nameNode, INodeLockType.WRITE, INodeResolveType.PATH,false, true, src)).add(lf.getBlockLock()); } @Override public Object performTask() throws StorageException, IOException { try { setPermissionSTOInt(src, permission, isSto); } catch (AccessControlException e) { logAuditEvent(false, "setPermission", src); throw e; } return null; } }.handle(this); txFailed = false; } finally { if(txFailed){ if(inode!=null){ unlockSubtree(src); } } } } private void setPermissionSTOInt(String src, FsPermission permission,boolean isSTO) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException, StorageException { HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot set permission for " + src, safeMode); } checkOwner(pc, src); dir.setPermission(src, permission); resultingStat = getAuditFileInfo(src, false); logAuditEvent(true, "setPermission", src, null, resultingStat); //remove sto from if(isSTO){ INode[] nodes = dir.getRootDir().getExistingPathINodes(src, false); INode inode = nodes[nodes.length - 1]; if (inode != null && inode.isSubtreeLocked()) { inode.setSubtreeLocked(false); EntityManager.update(inode); } EntityManager.remove(new SubTreeOperation(getSubTreeLockPathPrefix(src))); } } /** * Set permissions for an existing file. 
* * @throws IOException */ void setPermission(final String src, final FsPermission permission) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException { new HopsTransactionalRequestHandler(HDFSOperationType.SET_PERMISSION, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)).add(lf.getBlockLock()); } @Override public Object performTask() throws StorageException, IOException { try { setPermissionInt(src, permission); } catch (AccessControlException e) { logAuditEvent(false, "setPermission", src); throw e; } return null; } }.handle(this); } private void setPermissionInt(String src, FsPermission permission) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException, StorageException { HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot set permission for " + src, safeMode); } checkOwner(pc, src); dir.setPermission(src, permission); resultingStat = getAuditFileInfo(src, false); logAuditEvent(true, "setPermission", src, null, resultingStat); } /** * Set owner for an existing file. * * @throws IOException */ void setOwnerSTO(final String src, final String username, final String group) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException { boolean txFailed = true; INodeIdentifier inode = null;; try{ inode = lockSubtreeAndCheckPathPermission(src, true, null, null, null, null, SubTreeOperation.StoOperationType.SET_OWNER_STO); final boolean isSto = inode != null; new HopsTransactionalRequestHandler(HDFSOperationType.SET_OWNER_SUBTREE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/,nameNode, INodeLockType.WRITE, INodeResolveType.PATH, false, true, src)).add(lf.getBlockLock()); } @Override public Object performTask() throws StorageException, IOException { try { setOwnerSTOInt(src, username, group,isSto); } catch (AccessControlException e) { logAuditEvent(false, "setOwner", src); throw e; } return null; } }.handle(this); txFailed = false; }finally{ if(txFailed){ if(inode!=null){ unlockSubtree(src); } } } } private void setOwnerSTOInt(String src, String username, String group, boolean isSTO) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException, StorageException { HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot set owner for " + src, safeMode); } checkOwner(pc, src); if (!pc.isSuperUser()) { if (username != null && !pc.getUser().equals(username)) { throw new AccessControlException("Non-super user cannot change owner"); } if (group != null && !pc.containsGroup(group)) { throw new AccessControlException("User does not belong to " + group); } } dir.setOwner(src, username, group); resultingStat = getAuditFileInfo(src, false); logAuditEvent(true, "setOwner", src, null, resultingStat); if(isSTO){ INode[] nodes = dir.getRootDir().getExistingPathINodes(src, false); INode inode = nodes[nodes.length - 1]; if (inode != null && inode.isSubtreeLocked()) { inode.setSubtreeLocked(false); EntityManager.update(inode); } 
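// the persisted SubTreeOperation entry is no longer needed once the inode's subtree-lock flag has been cleared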
EntityManager.remove(new SubTreeOperation(getSubTreeLockPathPrefix(src))); } } /** * Set owner for an existing file. * * @throws IOException */ void setOwner(final String src, final String username, final String group) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException { new HopsTransactionalRequestHandler(HDFSOperationType.SET_OWNER, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)).add(lf.getBlockLock()); } @Override public Object performTask() throws StorageException, IOException { try { setOwnerInt(src, username, group); } catch (AccessControlException e) { logAuditEvent(false, "setOwner", src); throw e; } return null; } }.handle(this); } private void setOwnerInt(String src, String username, String group) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException, StorageException { HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot set owner for " + src, safeMode); } checkOwner(pc, src); if (!pc.isSuperUser()) { if (username != null && !pc.getUser().equals(username)) { throw new AccessControlException("Non-super user cannot change owner"); } if (group != null && !pc.containsGroup(group)) { throw new AccessControlException("User does not belong to " + group); } } dir.setOwner(src, username, group); resultingStat = getAuditFileInfo(src, false); logAuditEvent(true, "setOwner", src, null, resultingStat); } /** * Get block locations within the specified range. * * @see ClientProtocol#getBlockLocations(String, long, long) */ public LocatedBlocks getBlockLocations(final String clientMachine, final String src, final long offset, final long length) throws IOException { // First try the operation using shared lock. // Upgrade the lock to exclusive lock if LockUpgradeException is encountered. // This operation tries to update the inode access time once every hr. // The lock upgrade exception is thrown when the inode access time stamp is // updated while holding shared lock on the inode. In this case retry the operation // using an exclusive lock. try{ return getBlockLocationsWithLock(clientMachine, src, offset, length, INodeLockType.READ); }catch(LockUpgradeException e){ LOG.debug("Encountered LockUpgradeException while reading "+src+". 
Retrying the operation using exclusive locks"); return getBlockLocationsWithLock(clientMachine, src, offset, length, INodeLockType.WRITE); } } LocatedBlocks getBlockLocationsWithLock(final String clientMachine, final String src, final long offset, final long length, final INodeLockType lockType) throws IOException { HopsTransactionalRequestHandler getBlockLocationsHandler = new HopsTransactionalRequestHandler( HDFSOperationType.GET_BLOCK_LOCATIONS, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false,nameNode, lockType, INodeResolveType.PATH, src)).add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UC)); } @Override public Object performTask() throws StorageException, IOException { LocatedBlocks blocks = getBlockLocationsInternal(src, offset, length, true, true, true); if (blocks != null) { blockManager.getDatanodeManager() .sortLocatedBlocks(clientMachine, blocks.getLocatedBlocks()); LocatedBlock lastBlock = blocks.getLastLocatedBlock(); if (lastBlock != null) { ArrayList<LocatedBlock> lastBlockList = new ArrayList<LocatedBlock>(); lastBlockList.add(lastBlock); blockManager.getDatanodeManager() .sortLocatedBlocks(clientMachine, lastBlockList); } } return blocks; } }; LocatedBlocks locatedBlocks = (LocatedBlocks) getBlockLocationsHandler.handle(this); logAuditEvent(true, "open", src); return locatedBlocks; } /** * Get block locations within the specified range. * * @throws FileNotFoundException, * UnresolvedLinkException, IOException * @see ClientProtocol#getBlockLocations(String, long, long) */ public LocatedBlocks getBlockLocations(final String src, final long offset, final long length, final boolean doAccessTime, final boolean needBlockToken, final boolean checkSafeMode) throws IOException { HopsTransactionalRequestHandler getBlockLocationsHandler = new HopsTransactionalRequestHandler( HDFSOperationType.GET_BLOCK_LOCATIONS, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.READ, INodeResolveType.PATH, src)).add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UC)); } @Override public Object performTask() throws IOException { return getBlockLocationsInternal(src, offset, length, doAccessTime, needBlockToken, checkSafeMode); } }; return (LocatedBlocks) getBlockLocationsHandler.handle(this); } /** * Get block locations within the specified range. 
* * @throws FileNotFoundException, * UnresolvedLinkException, IOException * @see ClientProtocol#getBlockLocations(String, long, long) */ LocatedBlocks getBlockLocationsInternal(String src, long offset, long length, boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode) throws FileNotFoundException, UnresolvedLinkException, IOException, StorageException { FSPermissionChecker pc = getPermissionChecker(); try { return getBlockLocationsInt(pc, src, offset, length, doAccessTime, needBlockToken, checkSafeMode); } catch (AccessControlException e) { logAuditEvent(false, "open", src); throw e; } } public boolean isFileCorrupt(final String filePath) throws IOException { LocatedBlocks blocks = getBlockLocationsInternal(filePath, 0, Long.MAX_VALUE, true, true, true); for (LocatedBlock b : blocks.getLocatedBlocks()) { if (b.isCorrupt() || (b.getLocations().length == 0 && b.getBlockSize() > 0)) { return true; } } return false; } private LocatedBlocks getBlockLocationsInt(FSPermissionChecker pc, String src, long offset, long length, boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode) throws FileNotFoundException, UnresolvedLinkException, IOException, StorageException { if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.READ); } if (offset < 0) { throw new HadoopIllegalArgumentException( "Negative offset is not supported. File: " + src); } if (length < 0) { throw new HadoopIllegalArgumentException( "Negative length is not supported. File: " + src); } final LocatedBlocks ret = getBlockLocationsUpdateTimes(src, offset, length, doAccessTime, needBlockToken); if (checkSafeMode && isInSafeMode()) { for (LocatedBlock b : ret.getLocatedBlocks()) { // if safemode & no block locations yet then throw safemodeException if ((b.getLocations() == null) || (b.getLocations().length == 0)) { throw new SafeModeException("Zero blocklocations for " + src, safeMode); } } } return ret; } /* * Get block locations within the specified range, updating the * access times if necessary. */ private LocatedBlocks getBlockLocationsUpdateTimes(String src, long offset, long length, boolean doAccessTime, boolean needBlockToken) throws FileNotFoundException, UnresolvedLinkException, IOException, StorageException { for (int attempt = 0; attempt < 2; attempt++) { // if the namenode is in safemode, then do not update access time if (isInSafeMode()) { doAccessTime = false; } long now = now(); final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src); if (doAccessTime && isAccessTimeSupported()) { dir.setTimes(src, inode, -1, now, false); } return blockManager .createLocatedBlocks(inode.getBlocks(), inode.computeFileSize(false), inode.isUnderConstruction(), offset, length, needBlockToken); } return null; // can never reach here } /** * Moves all the blocks from srcs and appends them to trg * To avoid rollbacks we will verify validitity of ALL of the args * before we start actual move. 
* * @param target * @param srcs * @throws IOException */ void concat(final String target, final String[] srcs) throws IOException { final String[] paths = new String[srcs.length + 1]; System.arraycopy(srcs, 0, paths, 0, srcs.length); paths[srcs.length] = target; new HopsTransactionalRequestHandler(HDFSOperationType.CONCAT) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add( lf.getINodeLock(nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, paths)).add(lf.getBlockLock()).add( lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.PE, BLK.UC, BLK.IV)); if (erasureCodingEnabled) { locks.add(lf.getEncodingStatusLock(LockType.WRITE, srcs)); } } @Override public Object performTask() throws IOException { try { concatInt(target, srcs); } catch (AccessControlException e) { logAuditEvent(false, "concat", Arrays.toString(srcs), target, null); throw e; } return null; } }.handle(this); } private void concatInt(String target, String[] srcs) throws IOException, UnresolvedLinkException, StorageException { if (FSNamesystem.LOG.isDebugEnabled()) { FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) + " to " + target); } // verify args if (target.isEmpty()) { throw new IllegalArgumentException("Target file name is empty"); } if (srcs == null || srcs.length == 0) { throw new IllegalArgumentException("No sources given"); } // We require all files be in the same directory String trgParent = target.substring(0, target.lastIndexOf(Path.SEPARATOR_CHAR)); for (String s : srcs) { String srcParent = s.substring(0, s.lastIndexOf(Path.SEPARATOR_CHAR)); if (!srcParent.equals(trgParent)) { throw new IllegalArgumentException( "Sources and target are not in the same directory"); } } HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot concat " + target, safeMode); } concatInternal(pc, target, srcs); resultingStat = getAuditFileInfo(target, false); logAuditEvent(true, "concat", Arrays.toString(srcs), target, resultingStat); } /** * See {@link #concat(String, String[])} */ private void concatInternal(FSPermissionChecker pc, String target, String[] srcs) throws IOException, UnresolvedLinkException, StorageException { // write permission for the target if (isPermissionEnabled) { checkPathAccess(pc, target, FsAction.WRITE); // and srcs for (String aSrc : srcs) { checkPathAccess(pc, aSrc, FsAction.READ); // read the file checkParentAccess(pc, aSrc, FsAction.WRITE); // for delete } } // to make sure no two files are the same Set<INode> si = new HashSet<INode>(); // we put the following prerequisite for the operation // replication and blocks sizes should be the same for ALL the blocks // check the target final INodeFile trgInode = INodeFile.valueOf(dir.getINode(target), target); if (trgInode.isUnderConstruction()) { throw new HadoopIllegalArgumentException( "concat: target file " + target + " is under construction"); } // per design target shouldn't be empty and all the blocks same size if (trgInode.numBlocks() == 0) { throw new HadoopIllegalArgumentException( "concat: target file " + target + " is empty"); } long blockSize = trgInode.getPreferredBlockSize(); // check the end block to be full final BlockInfo last = trgInode.getLastBlock(); if (blockSize != last.getNumBytes()) { throw new HadoopIllegalArgumentException( "The last block in " + target + " is not full; last block size = " + last.getNumBytes() + " but file block size = " + blockSize); } 
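// remember the target inode so that a source path that duplicates the target is caught by the duplicate check below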
si.add(trgInode); short repl = trgInode.getBlockReplication(); // now check the srcs boolean endSrc = false; // final src file doesn't have to have full end block for (int i = 0; i < srcs.length; i++) { String src = srcs[i]; if (i == srcs.length - 1) { endSrc = true; } final INodeFile srcInode = INodeFile.valueOf(dir.getINode(src), src); if (src.isEmpty() || srcInode.isUnderConstruction() || srcInode.numBlocks() == 0) { throw new HadoopIllegalArgumentException("concat: source file " + src + " is invalid or empty or underConstruction"); } // check replication and block size if (repl != srcInode.getBlockReplication()) { throw new HadoopIllegalArgumentException( "concat: the source file " + src + " and the target file " + target + " should have the same replication: source replication is " + srcInode.getBlockReplication() + " but target replication is " + repl); } // verify that all the blocks are of the same length as target // should be enough to check the end blocks final BlockInfo[] srcBlocks = srcInode.getBlocks(); int idx = srcBlocks.length - 1; if (endSrc) { idx = srcBlocks.length - 2; // end block of endSrc is OK not to be full } if (idx >= 0 && srcBlocks[idx].getNumBytes() != blockSize) { throw new HadoopIllegalArgumentException( "concat: the source file " + src + " and the target file " + target + " should have the same block sizes: target block size is " + blockSize + " but the size of source block " + idx + " is " + srcBlocks[idx].getNumBytes()); } si.add(srcInode); } // make sure no two files are the same if (si.size() < srcs.length + 1) { // trg + srcs // it means at least two files are the same throw new HadoopIllegalArgumentException( "concat: at least two of the source files are the same"); } if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.concat: " + Arrays.toString(srcs) + " to " + target); } dir.concat(target, srcs); } /** * Stores the modification and access time for this inode. * The access time is precise up to an hour. The transaction, if needed, is * written to the edits log but is not flushed. */ void setTimes(final String src, final long mtime, final long atime) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.SET_TIMES, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)).add(lf.getBlockLock()); } @Override public Object performTask() throws StorageException, IOException { try { setTimesInt(src, mtime, atime); } catch (AccessControlException e) { logAuditEvent(false, "setTimes", src); throw e; } return null; } }.handle(this); } private void setTimesInt(String src, long mtime, long atime) throws IOException, UnresolvedLinkException, StorageException { if (!isAccessTimeSupported() && atime != -1) { throw new IOException("Access time for hdfs is not configured. 
" + " Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter."); } HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); // Write access is required to set access and modification times if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.WRITE); } INode inode = dir.getINode(src); if (inode != null) { dir.setTimes(src, inode, mtime, atime, true); resultingStat = getAuditFileInfo(src, false); } else { throw new FileNotFoundException( "File/Directory " + src + " does not exist."); } logAuditEvent(true, "setTimes", src, null, resultingStat); } /** * Create a symbolic link. */ void createSymlink(final String target, final String link, final PermissionStatus dirPerms, final boolean createParent) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.CREATE_SYM_LINK, link) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, false, link)); } @Override public Object performTask() throws IOException { try { createSymlinkInt(target, link, dirPerms, createParent); } catch (AccessControlException e) { logAuditEvent(false, "createSymlink", link, target, null); throw e; } return null; } }.handle(this); } private void createSymlinkInt(String target, String link, PermissionStatus dirPerms, boolean createParent) throws IOException, UnresolvedLinkException, StorageException { HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); if (!createParent) { verifyParentDir(link); } createSymlinkInternal(pc, target, link, dirPerms, createParent); resultingStat = getAuditFileInfo(link, false); logAuditEvent(true, "createSymlink", link, target, resultingStat); } /** * Create a symbolic link. */ private void createSymlinkInternal(FSPermissionChecker pc, String target, String link, PermissionStatus dirPerms, boolean createParent) throws IOException, UnresolvedLinkException, StorageException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target=" + target + " link=" + link); } if (isInSafeMode()) { throw new SafeModeException("Cannot create symlink " + link, safeMode); } if (!DFSUtil.isValidName(link)) { throw new InvalidPathException("Invalid file name: " + link); } if (!dir.isValidToCreate(link)) { throw new IOException("failed to create link " + link + " either because the filename is invalid or the file exists"); } if (isPermissionEnabled) { checkAncestorAccess(pc, link, FsAction.WRITE); } // validate that we have enough inodes. checkFsObjectLimit(); // add symbolic link to namespace dir.addSymlink(link, target, dirPerms, createParent); } /** * Set replication for an existing file. * <p/> * The NameNode sets new replication and schedules either replication of * under-replicated data blocks or removal of the excessive block copies * if the blocks are over-replicated. 
* * @param src * file name * @param replication * new replication * @return true if successful; * false if file does not exist or is a directory * @see ClientProtocol#setReplication(String, short) */ boolean setReplication(final String src, final short replication) throws IOException { HopsTransactionalRequestHandler setReplicationHandler = new HopsTransactionalRequestHandler(HDFSOperationType.SET_REPLICATION, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/,nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, src)).add(lf.getBlockLock()).add( lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UC, BLK.UR, BLK.IV)); } @Override public Object performTask() throws IOException { try { return setReplicationInt(src, replication); } catch (AccessControlException e) { logAuditEvent(false, "setReplication", src); throw e; } } }; return (Boolean) setReplicationHandler.handle(this); } private boolean setReplicationInt(final String src, final short replication) throws IOException, StorageException { blockManager.verifyReplication(src, replication, null); final boolean isFile; FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot set replication for " + src, safeMode); } if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.WRITE); } final short[] oldReplication = new short[1]; final Block[] blocks = dir.setReplication(src, replication, oldReplication); isFile = blocks != null; if (isFile) { blockManager.setReplication(oldReplication[0], replication, src, blocks); } if (isFile) { logAuditEvent(true, "setReplication", src); } return isFile; } void setMetaEnabled(final String src, final boolean metaEnabled) throws IOException { try { INodeIdentifier inode = lockSubtree(src, SubTreeOperation.StoOperationType.META_ENABLE); final AbstractFileTree.FileTree fileTree = new AbstractFileTree.FileTree( FSNamesystem.this, inode); fileTree.buildUp(); new HopsTransactionalRequestHandler(HDFSOperationType.SET_META_ENABLED, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, true, true, src)); } @Override public Object performTask() throws IOException { try { logMetadataEvents(fileTree, MetadataLogEntry.Operation.ADD); setMetaEnabledInt(src, metaEnabled); } catch (AccessControlException e) { logAuditEvent(false, "setMetaEnabled", src); throw e; } return null; } }.handle(this); } finally { unlockSubtree(src); } } private void setMetaEnabledInt(final String src, final boolean metaEnabled) throws IOException { FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot set metaEnabled for " + src, safeMode); } if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.WRITE); } INode targetNode = getINode(src); if (!targetNode.isDirectory()) { throw new FileNotFoundException(src + ": Is not a directory"); } else { INodeDirectory dirNode = (INodeDirectory) targetNode; dirNode.setMetaEnabled(metaEnabled); EntityManager.update(dirNode); } } private void logMetadataEvents(AbstractFileTree.FileTree fileTree, MetadataLogEntry.Operation operation) throws TransactionContextException, StorageException { ProjectedINode datasetDir = fileTree.getSubtreeRoot(); for (ProjectedINode node : fileTree.getAllChildren()) { 
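// write one MetadataLogEntry per inode in the subtree, recorded against the dataset directory's id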
MetadataLogEntry logEntry = new MetadataLogEntry(datasetDir.getId(), node.getId(), node.getPartitionId(), node.getParentId(), node .getName(), operation); EntityManager.add(logEntry); } } long getPreferredBlockSize(final String filename) throws IOException { HopsTransactionalRequestHandler getPreferredBlockSizeHandler = new HopsTransactionalRequestHandler( HDFSOperationType.GET_PREFERRED_BLOCK_SIZE, filename) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.READ_COMMITTED, INodeResolveType.PATH, filename)); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); if (isPermissionEnabled) { checkTraverse(pc, filename); } return dir.getPreferredBlockSize(filename); } }; return (Long) getPreferredBlockSizeHandler.handle(this); } /* * Verify that parent directory of src exists. */ private void verifyParentDir(String src) throws FileNotFoundException, ParentNotDirectoryException, UnresolvedLinkException, StorageException, TransactionContextException { Path parent = new Path(src).getParent(); if (parent != null) { INode parentNode = getINode(parent.toString()); if (parentNode == null) { throw new FileNotFoundException( "Parent directory doesn't exist: " + parent.toString()); } else if (!parentNode.isDirectory() && !parentNode.isSymlink()) { throw new ParentNotDirectoryException( "Parent path is not a directory: " + parent.toString()); } } } /** * Create a new file entry in the namespace. * <p/> * For description of parameters and exceptions thrown see * {@link ClientProtocol#create} */ HdfsFileStatus startFile(final String src, final PermissionStatus permissions, final String holder, final String clientMachine, final EnumSet<CreateFlag> flag, final boolean createParent, final short replication, final long blockSize) throws IOException { return (HdfsFileStatus) new HopsTransactionalRequestHandler( HDFSOperationType.START_FILE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add( //if quota is disabled then do not read the INode Attributes table lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/,nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, false, src)).add(lf.getBlockLock()) .add(lf.getLeaseLock(LockType.WRITE, holder)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add( lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.UC, BLK.UR, BLK.PE, BLK.IV)); if (flag.contains(CreateFlag.OVERWRITE) && dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(src)); } if (flag.contains(CreateFlag.OVERWRITE) && erasureCodingEnabled) { locks.add(lf.getEncodingStatusLock(LockType.WRITE, src)); } } @Override public Object performTask() throws IOException { try { return startFileInt(src, permissions, holder, clientMachine, flag, createParent, replication, blockSize); } catch (AccessControlException e) { logAuditEvent(false, "create", src); throw e; } } }.handle(this); } private HdfsFileStatus startFileInt(String src, PermissionStatus permissions, String holder, String clientMachine, EnumSet<CreateFlag> flag, boolean createParent, short replication, long blockSize) throws AccessControlException, SafeModeException, FileAlreadyExistsException, UnresolvedLinkException, FileNotFoundException, ParentNotDirectoryException, IOException, StorageException { FSPermissionChecker pc = getPermissionChecker(); 
startFileInternal(pc, src, permissions, holder, clientMachine, flag, createParent, replication, blockSize); final HdfsFileStatus stat = dir.getFileInfoForCreate(src, false); logAuditEvent(true, "create", src, null, (isAuditEnabled() && isExternalInvocation()) ? stat : null); return stat; } /** * Create new or open an existing file for append.<p> * <p/> * In case of opening the file for append, the method returns the last * block of the file if this is a partial block, which can still be used * for writing more data. The client uses the returned block locations * to form the data pipeline for this block.<br> * The method returns null if the last block is full or if this is a * new file. The client then allocates a new block with the next call * using {@link NameNodeRpcServer#addBlock}.<p> * <p/> * For description of parameters and exceptions thrown see * {@link ClientProtocol#create} * * @return the last block locations if the block is partial or null otherwise */ private LocatedBlock startFileInternal(FSPermissionChecker pc, String src, PermissionStatus permissions, String holder, String clientMachine, EnumSet<CreateFlag> flag, boolean createParent, short replication, long blockSize) throws SafeModeException, FileAlreadyExistsException, AccessControlException, UnresolvedLinkException, FileNotFoundException, ParentNotDirectoryException, IOException, StorageException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.startFile: src=" + src + ", holder=" + holder + ", clientMachine=" + clientMachine + ", createParent=" + createParent + ", replication=" + replication + ", createFlag=" + flag.toString()); } if (isInSafeMode()) { throw new SafeModeException("Cannot create file" + src, safeMode); } if (!DFSUtil.isValidName(src)) { throw new InvalidPathException(src); } // Verify that the destination does not exist as a directory already. boolean pathExists = dir.exists(src); if (pathExists && dir.isDir(src)) { throw new FileAlreadyExistsException( "Cannot create file " + src + "; already exists as a directory."); } boolean overwrite = flag.contains(CreateFlag.OVERWRITE); boolean append = flag.contains(CreateFlag.APPEND); if (isPermissionEnabled) { if (append || (overwrite && pathExists)) { checkPathAccess(pc, src, FsAction.WRITE); } else { checkAncestorAccess(pc, src, FsAction.WRITE); } } if (!createParent) { verifyParentDir(src); } try { blockManager.verifyReplication(src, replication, clientMachine); boolean create = flag.contains(CreateFlag.CREATE); final INode myFile = dir.getINode(src); if (myFile == null) { if (!create) { throw new FileNotFoundException( "failed to overwrite or append to non-existent file " + src + " on client " + clientMachine); } } else { // File exists - must be one of append or overwrite if (overwrite) { delete(src, true); } else { // Opening an existing file for write - may need to recover lease. recoverLeaseInternal(myFile, src, holder, clientMachine, false); if (!append) { throw new FileAlreadyExistsException( "failed to create file " + src + " on client " + clientMachine + " because the file exists"); } } } final DatanodeDescriptor clientNode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine); if (append && myFile != null) { final INodeFile f = INodeFile.valueOf(myFile, src); return prepareFileForWrite(src, f, holder, clientMachine, clientNode); } else { // Now we can add the name to the filesystem. This file has no // blocks associated with it. 
// checkFsObjectLimit(); // increment global generation stamp //HOP[M] generationstamp is not used for inodes long genstamp = 0; INodeFileUnderConstruction newNode = dir.addFile(src, permissions, replication, blockSize, holder, clientMachine, clientNode, genstamp); if (newNode == null) { throw new IOException("DIR* NameSystem.startFile: " + "Unable to add file to namespace."); } leaseManager.addLease(newNode.getClientName(), src); if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.startFile: " + "add " + src + " to namespace for " + holder); } } } catch (IOException ie) { NameNode.stateChangeLog .warn("DIR* NameSystem.startFile: " + ie.getMessage()); throw ie; } return null; } /** * Replace current node with a INodeUnderConstruction. * Recreate lease record. * * @param src * path to the file * @param file * existing file object * @param leaseHolder * identifier of the lease holder on this file * @param clientMachine * identifier of the client machine * @param clientNode * if the client is collocated with a DN, that DN's descriptor * @return the last block locations if the block is partial or null otherwise * @throws UnresolvedLinkException * @throws IOException */ LocatedBlock prepareFileForWrite(String src, INodeFile file, String leaseHolder, String clientMachine, DatanodeDescriptor clientNode) throws IOException, StorageException { INodeFileUnderConstruction cons = file.convertToUnderConstruction(leaseHolder, clientMachine, clientNode); Lease lease = leaseManager.addLease(cons.getClientName(), src); LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons); lease.updateLastTwoBlocksInLeasePath(src, file.getLastBlock(), file .getPenultimateBlock()); return ret; } /** * Recover lease; * Immediately revoke the lease of the current lease holder and start lease * recovery so that the file can be forced to be closed. 
* * @param src * the path of the file to start lease recovery * @param holder * the lease holder's name * @param clientMachine * the client machine's name * @return true if the file is already closed * @throws IOException */ boolean recoverLease(final String src, final String holder, final String clientMachine) throws IOException { HopsTransactionalRequestHandler recoverLeaseHandler = new HopsTransactionalRequestHandler(HDFSOperationType.RECOVER_LEASE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)) .add(lf.getLeaseLock(LockType.WRITE, holder)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add(lf.getBlockLock()) .add( lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.UC, BLK.UR)); } @Override public Object performTask() throws StorageException, IOException { FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot recover the lease of " + src, safeMode); } if (!DFSUtil.isValidName(src)) { throw new IOException("Invalid file name: " + src); } final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src); if (!inode.isUnderConstruction()) { return true; } if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.WRITE); } recoverLeaseInternal(inode, src, holder, clientMachine, true); return false; } }; return (Boolean) recoverLeaseHandler.handle(this); } private void recoverLeaseInternal(INode fileInode, String src, String holder, String clientMachine, boolean force) throws IOException, StorageException { if (fileInode != null && fileInode.isUnderConstruction()) { INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) fileInode; // // If the file is under construction , then it must be in our // leases. Find the appropriate lease record. // Lease lease = leaseManager.getLease(holder); // // We found the lease for this file. And surprisingly the original // holder is trying to recreate this file. This should never occur. // if (!force && lease != null) { Lease leaseFile = leaseManager.getLeaseByPath(src); if ((leaseFile != null && leaseFile.equals(lease)) || lease.getHolder().equals(holder)) { throw new AlreadyBeingCreatedException( "failed to create file " + src + " for " + holder + " on client " + clientMachine + " because current leaseholder is trying to recreate file."); } } // // Find the original holder. // lease = leaseManager.getLease(pendingFile.getClientName()); if (lease == null) { throw new AlreadyBeingCreatedException( "failed to create file " + src + " for " + holder + " on client " + clientMachine + " because pendingCreates is non-null but no leases found."); } if (force) { // close now: no need to wait for soft lease expiration and // close only the file src LOG.info("recoverLease: " + lease + ", src=" + src + " from client " + pendingFile.getClientName()); internalReleaseLease(lease, src, holder); } else { assert lease.getHolder().equals(pendingFile.getClientName()) : "Current lease holder " + lease.getHolder() + " does not match file creator " + pendingFile.getClientName(); // // If the original holder has not renewed in the last SOFTLIMIT // period, then start lease recovery. 
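// NOTE (editorial, illustrative only): the force==true path above is the one taken by the public
// recoverLease() RPC. A client that finds an abandoned, still-open file typically polls that RPC
// until the NameNode reports the file closed; a minimal sketch, assuming a Configuration `conf`
// and a hypothetical abandoned file "/logs/app.log":
//
//   DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);
//   Path p = new Path("/logs/app.log");
//   while (!dfs.recoverLease(p)) {   // false => block recovery still in progress
//     Thread.sleep(1000);            // retry; true is returned once the file has been closed
//   }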
// if (leaseManager.expiredSoftLimit(lease)) { LOG.info("startFile: recover " + lease + ", src=" + src + " client " + pendingFile.getClientName()); boolean isClosed = internalReleaseLease(lease, src, null); if (!isClosed) { throw new RecoveryInProgressException.NonAbortingRecoveryInProgressException( "Failed to close file " + src + ". Lease recovery is in progress. Try again later."); } } else { final BlockInfo lastBlock = pendingFile.getLastBlock(); if (lastBlock != null && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) { throw new RecoveryInProgressException( "Recovery in progress, file [" + src + "], " + "lease owner [" + lease.getHolder() + "]"); } else { throw new AlreadyBeingCreatedException( "Failed to create file [" + src + "] for [" + holder + "] on client [" + clientMachine + "], because this file is already being created by [" + pendingFile.getClientName() + "] on [" + pendingFile.getClientMachine() + "]"); } } } } } /** * Append to an existing file in the namespace. */ LocatedBlock appendFile(final String src, final String holder, final String clientMachine) throws IOException { HopsTransactionalRequestHandler appendFileHandler = new HopsTransactionalRequestHandler(HDFSOperationType.APPEND_FILE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false,nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, src)).add(lf.getBlockLock()) .add(lf.getLeaseLock(LockType.WRITE, holder)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add( lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.UC, BLK.UR, BLK.IV, BLK.PE)); // Always needs to be read. Erasure coding might have been // enabled earlier and we don't want to end up in an inconsistent // state. locks.add(lf.getEncodingStatusLock(LockType.READ_COMMITTED, src)); } @Override public Object performTask() throws IOException { try { INode target = getINode(src); if (target != null) { EncodingStatus status = EntityManager.find( EncodingStatus.Finder.ByInodeId, target.getId()); if (status != null) { throw new IOException("Cannot append to erasure-coded file"); } } return appendFileInt(src, holder, clientMachine); } catch (AccessControlException e) { logAuditEvent(false, "append", src); throw e; } } }; return (LocatedBlock) appendFileHandler.handle(this); } private LocatedBlock appendFileInt(String src, String holder, String clientMachine) throws AccessControlException, SafeModeException, FileAlreadyExistsException, FileNotFoundException, ParentNotDirectoryException, IOException, UnresolvedLinkException, StorageException { if (!supportAppends) { throw new UnsupportedOperationException( "Append is not enabled on this NameNode. 
Use the " + DFS_SUPPORT_APPEND_KEY + " configuration option to enable it."); } LocatedBlock lb = null; FSPermissionChecker pc = getPermissionChecker(); lb = startFileInternal(pc, src, null, holder, clientMachine, EnumSet.of(CreateFlag.APPEND), false, blockManager.maxReplication, 0); if (lb != null) { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.appendFile: file " + src + " for " + holder + " at " + clientMachine + " block " + lb.getBlock() + " block size " + lb.getBlock().getNumBytes()); } } logAuditEvent(true, "append", src); return lb; } ExtendedBlock getExtendedBlock(Block blk) { return new ExtendedBlock(blockPoolId, blk); } /** * The client would like to obtain an additional block for the indicated * filename (which is being written-to). Return an array that consists * of the block, plus a set of machines. The first on this list should * be where the client writes data. Subsequent items in the list must * be provided in the connection to the first datanode. * <p/> * Make sure the previous blocks have been reported by datanodes and * are replicated. Will return an empty 2-elt array if we want the * client to "try again later". */ LocatedBlock getAdditionalBlock(final String src, final String clientName, final ExtendedBlock previous, final HashMap<Node, Node> excludedNodes) throws IOException { HopsTransactionalRequestHandler additionalBlockHandler = new HopsTransactionalRequestHandler( HDFSOperationType.GET_ADDITIONAL_BLOCK, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)) .add(lf.getLeaseLock(LockType.READ, clientName)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)) .add(lf.getLastTwoBlocksLock(src)) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.UC)); } @Override public Object performTask() throws IOException { long blockSize; int replication; DatanodeDescriptor clientNode = null; if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* NameSystem.getAdditionalBlock: file " + src + " for " + clientName); } // Part I. Analyze the state of the file with respect to the input data. LocatedBlock[] onRetryBlock = new LocatedBlock[1]; final INode[] inodes = analyzeFileState(src, clientName, previous, onRetryBlock); final INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) inodes[inodes.length - 1]; if (onRetryBlock[0] != null) { // This is a retry. Just return the last block. return onRetryBlock[0]; } blockSize = pendingFile.getPreferredBlockSize(); //clientNode = pendingFile.getClientNode(); HOP clientNode = pendingFile.getClientNode() == null ? null : getBlockManager().getDatanodeManager() .getDatanode(pendingFile.getClientNode()); replication = pendingFile.getBlockReplication(); // choose targets for the new block to be allocated. final DatanodeDescriptor targets[] = getBlockManager() .chooseTarget(src, replication, clientNode, excludedNodes, blockSize); // Part II. // Allocate a new block, add it to the INode and the BlocksMap. Block newBlock = null; long offset; // Run the full analysis again, since things could have changed // while chooseTarget() was executing. 
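// NOTE (editorial, illustrative only): this method backs the client's addBlock() RPC (cf. the
// NameNodeRpcServer#addBlock reference in the startFile javadoc earlier in this class). Part I ran
// before chooseTarget() and did not modify the namespace, so a duplicate RPC that raced with target
// selection can only be detected by the second analyzeFileState() pass below; if that pass fills
// onRetryBlock2, the previously allocated block is returned unchanged and the retry is effectively
// idempotent.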
LocatedBlock[] onRetryBlock2 = new LocatedBlock[1]; INode[] inodes2 = analyzeFileState(src, clientName, previous, onRetryBlock2); final INodeFileUnderConstruction pendingFile2 = (INodeFileUnderConstruction) inodes2[inodes2.length - 1]; if (onRetryBlock2[0] != null) { // This is a retry. Just return the last block. return onRetryBlock2[0]; } // commit the last block and complete it if it has minimum replicas commitOrCompleteLastBlock(pendingFile2, ExtendedBlock.getLocalBlock(previous)); // allocate new block, record block locations in INode. newBlock = createNewBlock(pendingFile2); saveAllocatedBlock(src, inodes2, newBlock, targets); dir.persistBlocks(src, pendingFile2); offset = pendingFile2.computeFileSize(true); Lease lease = leaseManager.getLease(clientName); lease.updateLastTwoBlocksInLeasePath(src, newBlock, ExtendedBlock.getLocalBlock(previous)); // Return located block return makeLocatedBlock(newBlock, targets, offset); } }; return (LocatedBlock) additionalBlockHandler.handle(this); } INode[] analyzeFileState(String src, String clientName, ExtendedBlock previous, LocatedBlock[] onRetryBlock) throws IOException, LeaseExpiredException, StorageException { checkBlock(previous); onRetryBlock[0] = null; if (isInSafeMode()) { throw new SafeModeException("Cannot add block to " + src, safeMode); } // have we exceeded the configured limit of fs objects. checkFsObjectLimit(); Block previousBlock = ExtendedBlock.getLocalBlock(previous); final INode[] inodes = dir.getRootDir().getExistingPathINodes(src, true); final INodeFileUnderConstruction pendingFile = checkLease(src, clientName, inodes[inodes.length - 1]); BlockInfo lastBlockInFile = pendingFile.getLastBlock(); if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) { // The block that the client claims is the current last block // doesn't match up with what we think is the last block. There are // four possibilities: // 1) This is the first block allocation of an append() pipeline // which started appending exactly at a block boundary. // In this case, the client isn't passed the previous block, // so it makes the allocateBlock() call with previous=null. // We can distinguish this since the last block of the file // will be exactly a full block. // 2) This is a retry from a client that missed the response of a // prior getAdditionalBlock() call, perhaps because of a network // timeout, or because of an HA failover. In that case, we know // by the fact that the client is re-issuing the RPC that it // never began to write to the old block. Hence it is safe to // to return the existing block. // 3) This is an entirely bogus request/bug -- we should error out // rather than potentially appending a new block with an empty // one in the middle, etc // 4) This is a retry from a client that timed out while // the prior getAdditionalBlock() is still being processed, // currently working on chooseTarget(). // There are no means to distinguish between the first and // the second attempts in Part I, because the first one hasn't // changed the namesystem state yet. // We run this analysis again in Part II where case 4 is impossible. 
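// NOTE (editorial, illustrative only): the checks below map onto the cases listed above; for
// example, the retry in case 2 is recognized purely from block identity and size:
//
//   boolean looksLikeRetry = Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)
//       && lastBlockInFile.getNumBytes() == 0;   // the re-requested block was never written to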
BlockInfo penultimateBlock = pendingFile.getPenultimateBlock(); if (previous == null && lastBlockInFile != null && lastBlockInFile.getNumBytes() == pendingFile.getPreferredBlockSize() && lastBlockInFile.isComplete()) { // Case 1 if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* NameSystem.allocateBlock: handling block allocation" + " writing to a file with a complete previous block: src=" + src + " lastBlock=" + lastBlockInFile); } } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) { if (lastBlockInFile.getNumBytes() != 0) { throw new IOException( "Request looked like a retry to allocate block " + lastBlockInFile + " but it already contains " + lastBlockInFile.getNumBytes() + " bytes"); } // Case 2 // Return the last block. NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + "caught retry for allocation of a new block in " + src + ". Returning previously allocated block " + lastBlockInFile); long offset = pendingFile.computeFileSize(true); onRetryBlock[0] = makeLocatedBlock(lastBlockInFile, ((BlockInfoUnderConstruction) lastBlockInFile) .getExpectedLocations(getBlockManager().getDatanodeManager()), offset); return inodes; } else { // Case 3 throw new IOException("Cannot allocate block in " + src + ": " + "passed 'previous' block " + previous + " does not match actual " + "last block in file " + lastBlockInFile); } } // Check if the penultimate block is minimally replicated if (!checkFileProgress(pendingFile, false)) { throw new NotReplicatedYetException("Not replicated yet: " + src); } return inodes; } LocatedBlock makeLocatedBlock(Block blk, DatanodeInfo[] locs, long offset) throws IOException { LocatedBlock lBlk = new LocatedBlock(getExtendedBlock(blk), locs, offset); getBlockManager() .setBlockToken(lBlk, BlockTokenSecretManager.AccessMode.WRITE); return lBlk; } /** * @see NameNodeRpcServer#getAdditionalDatanode(String, ExtendedBlock, * DatanodeInfo[], * DatanodeInfo[], int, String) */ LocatedBlock getAdditionalDatanode(final String src, final ExtendedBlock blk, final DatanodeInfo[] existings, final HashMap<Node, Node> excludes, final int numAdditionalNodes, final String clientName) throws IOException { HopsTransactionalRequestHandler getAdditionalDatanodeHandler = new HopsTransactionalRequestHandler( HDFSOperationType.GET_ADDITIONAL_DATANODE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.READ, INodeResolveType.PATH, src)) .add(lf.getLeaseLock(LockType.READ, clientName)); } @Override public Object performTask() throws IOException { //check if the feature is enabled dtpReplaceDatanodeOnFailure.checkEnabled(); final DatanodeDescriptor clientnode; final long preferredblocksize; final List<DatanodeDescriptor> chosen; //check safe mode if (isInSafeMode()) { throw new SafeModeException( "Cannot add datanode; src=" + src + ", blk=" + blk, safeMode); } //check lease final INodeFileUnderConstruction file = checkLease(src, clientName, false); //clientnode = file.getClientNode(); HOP clientnode = getBlockManager().getDatanodeManager() .getDatanode(file.getClientNode()); preferredblocksize = file.getPreferredBlockSize(); //find datanode descriptors chosen = new ArrayList<DatanodeDescriptor>(); for (DatanodeInfo d : existings) { final DatanodeDescriptor descriptor = blockManager.getDatanodeManager().getDatanode(d); if (descriptor != null) { chosen.add(descriptor); } } // choose new datanodes. 
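// NOTE (editorial, illustrative only): this path is only exercised when the client-side
// "replace datanode on failure" feature is enabled (see dtpReplaceDatanodeOnFailure.checkEnabled()
// above); the client-side switches are typically set along these lines:
//
//   conf.setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable", true);
//   conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "DEFAULT");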
final DatanodeInfo[] targets = blockManager.getBlockPlacementPolicy() .chooseTarget(src, numAdditionalNodes, clientnode, chosen, true, excludes, preferredblocksize); final LocatedBlock lb = new LocatedBlock(blk, targets); blockManager.setBlockToken(lb, AccessMode.COPY); return lb; } }; return (LocatedBlock) getAdditionalDatanodeHandler.handle(this); } /** * The client would like to let go of the given block */ boolean abandonBlock(final ExtendedBlock b, final String src, final String holder) throws IOException { HopsTransactionalRequestHandler abandonBlockHandler = new HopsTransactionalRequestHandler(HDFSOperationType.ABANDON_BLOCK, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, src)).add(lf.getLeaseLock(LockType.READ)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED, src)) .add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.UC, BLK.UR)); } @Override public Object performTask() throws IOException { // // Remove the block from the pending creates list // if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* NameSystem.abandonBlock: " + b + "of file " + src); } if (isInSafeMode()) { throw new SafeModeException( "Cannot abandon block " + b + " for fle" + src, safeMode); } INodeFileUnderConstruction file = checkLease(src, holder, false); dir.removeBlock(src, file, ExtendedBlock.getLocalBlock(b)); leaseManager.getLease(holder).updateLastTwoBlocksInLeasePath(src, file.getLastBlock(), file.getPenultimateBlock()); if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* NameSystem.abandonBlock: " + b + " is removed from pendingCreates"); } dir.persistBlocks(src, file); return true; } }; return (Boolean) abandonBlockHandler.handle(this); } // make sure that we still have the lease on this file. private INodeFileUnderConstruction checkLease(String src, String holder) throws LeaseExpiredException, UnresolvedLinkException, StorageException, TransactionContextException { return checkLease(src, holder, true); } private INodeFileUnderConstruction checkLease(String src, String holder, boolean updateLastTwoBlocksInFile) throws LeaseExpiredException, UnresolvedLinkException, StorageException, TransactionContextException { return checkLease(src, holder, dir.getINode(src), updateLastTwoBlocksInFile); } private INodeFileUnderConstruction checkLease(String src, String holder, INode file) throws LeaseExpiredException, StorageException, TransactionContextException { return checkLease(src, holder, file, true); } private INodeFileUnderConstruction checkLease(String src, String holder, INode file, boolean updateLastTwoBlocksInFile) throws LeaseExpiredException, StorageException, TransactionContextException { if (file == null || !(file instanceof INodeFile)) { Lease lease = leaseManager.getLease(holder); throw new LeaseExpiredException( "No lease on " + src + ": File does not exist. " + (lease != null ? lease.toString() : "Holder " + holder + " does not have any open files.")); } if (!file.isUnderConstruction()) { Lease lease = leaseManager.getLease(holder); throw new LeaseExpiredException( "No lease on " + src + ": File is not open for writing. " + (lease != null ? 
lease.toString() : "Holder " + holder + " does not have any open files.")); } INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) file; if (holder != null && !pendingFile.getClientName().equals(holder)) { throw new LeaseExpiredException( "Lease mismatch on " + src + " owned by " + pendingFile.getClientName() + " but is accessed by " + holder); } if(updateLastTwoBlocksInFile) { pendingFile.updateLastTwoBlocks(leaseManager.getLease(holder), src); } return pendingFile; } /** * Complete in-progress write to the given file. * * @return true if successful, false if the client should continue to retry * (e.g if not all blocks have reached minimum replication yet) * @throws IOException * on error (eg lease mismatch, file not open, file deleted) */ boolean completeFile(final String src, final String holder, final ExtendedBlock last) throws IOException { HopsTransactionalRequestHandler completeFileHandler = new HopsTransactionalRequestHandler(HDFSOperationType.COMPLETE_FILE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip Inode Atrr*/,nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)) .add(lf.getLeaseLock(LockType.WRITE, holder)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.UC, BLK.UR, BLK.IV)); } @Override public Object performTask() throws IOException { checkBlock(last); return completeFileInternal(src, holder, ExtendedBlock.getLocalBlock(last)); } }; return (Boolean) completeFileHandler.handle(this); } private boolean completeFileInternal(String src, String holder, Block last) throws SafeModeException, UnresolvedLinkException, IOException, StorageException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " + src + " for " + holder); } if (isInSafeMode()) { throw new SafeModeException("Cannot complete file " + src, safeMode); } INodeFileUnderConstruction pendingFile; try { pendingFile = checkLease(src, holder); } catch (LeaseExpiredException lee) { final INode inode = dir.getINode(src); if (inode != null && inode instanceof INodeFile && !inode.isUnderConstruction()) { // This could be a retry RPC - i.e the client tried to close // the file, but missed the RPC response. Thus, it is trying // again to close the file. If the file still exists and // the client's view of the last block matches the actual // last block, then we'll treat it as a successful close. // See HDFS-3031. final Block realLastBlock = ((INodeFile) inode).getLastBlock(); if (Block.matchingIdAndGenStamp(last, realLastBlock)) { NameNode.stateChangeLog.info("DIR* completeFile: " + "request from " + holder + " to complete " + src + " which is already closed. But, it appears to be an RPC " + "retry. Returning success"); return true; } } throw lee; } // commit the last block and complete it if it has minimum replicas commitOrCompleteLastBlock(pendingFile, last); if (!checkFileProgress(pendingFile, true)) { return false; } finalizeINodeFileUnderConstruction(src, pendingFile); NameNode.stateChangeLog .info("DIR* completeFile: " + src + " is closed by " + holder); return true; } /** * Save allocated block at the given pending filename * * @param src * path to the file * @param inodes * representing each of the components of src. * The last INode is the INode for the file. 
* @throws QuotaExceededException * If addition of block exceeds space quota */ BlockInfo saveAllocatedBlock(String src, INode[] inodes, Block newBlock, DatanodeDescriptor targets[]) throws IOException, StorageException { BlockInfo b = dir.addBlock(src, inodes, newBlock, targets); NameNode.stateChangeLog.info( "BLOCK* allocateBlock: " + src + ". " + getBlockPoolId() + " " + b); for (DatanodeDescriptor dn : targets) { dn.incBlocksScheduled(); } return b; } /** * Create new block with a unique block id and a new generation stamp. */ Block createNewBlock(INodeFile pendingFile) throws IOException, StorageException { Block b = new Block(IDsGeneratorFactory.getInstance().getUniqueBlockID() , 0, 0); // HOP. previous code was getFSImage().getUniqueBlockId() // Increment the generation stamp for every new block. b.setGenerationStampNoPersistance(pendingFile.nextGenerationStamp()); return b; } /** * Check that the indicated file's blocks are present and * replicated. If not, return false. If checkall is true, then check * all blocks, otherwise check only penultimate block. */ boolean checkFileProgress(INodeFile v, boolean checkall) throws StorageException, IOException { if (checkall) { // // check all blocks of the file. // for (BlockInfo block : v.getBlocks()) { if (!block.isComplete()) { BlockInfo cBlock = blockManager .tryToCompleteBlock((MutableBlockCollection) v, block.getBlockIndex()); if (cBlock != null) { block = cBlock; } if (!block.isComplete()) { LOG.info("BLOCK* checkFileProgress: " + block + " has not reached minimal replication " + blockManager.minReplication); return false; } } } } else { // // check the penultimate block of this file // BlockInfo b = v.getPenultimateBlock(); if (b != null && !b.isComplete()) { blockManager .tryToCompleteBlock((MutableBlockCollection) v, b.getBlockIndex()); b = v.getPenultimateBlock(); if (!b.isComplete()) { LOG.info("BLOCK* checkFileProgress: " + b + " has not reached minimal replication " + blockManager.minReplication); return false; } } } return true; } //////////////////////////////////////////////////////////////// // Here's how to handle block-copy failure during client write: // -- As usual, the client's write should result in a streaming // backup write to a k-machine sequence. // -- If one of the backup machines fails, no worries. Fail silently. // -- Before client is allowed to close and finalize file, make sure // that the blocks are backed up. Namenode may have to issue specific backup // commands to make up for earlier datanode failures. Once all copies // are made, edit namespace and return to client. //////////////////////////////////////////////////////////////// /** * Change the indicated filename. * * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} * instead. 
*/ @Deprecated boolean renameTo(final String src, final String dst) throws IOException { HopsTransactionalRequestHandler renameToHandler = new HopsTransactionalRequestHandler(HDFSOperationType.DEPRICATED_RENAME, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getLegacyRenameINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/, nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, src, dst)) .add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.UC, BLK.IV, BLK.CR, BLK.ER, BLK.PE, BLK.UR)); if (dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(true, src, dst)); } } @Override public Object performTask() throws IOException { try { return renameToInt(src, dst); } catch (AccessControlException e) { logAuditEvent(false, "rename", src, dst, null); throw e; } } }; return (Boolean) renameToHandler.handle(this); } private boolean renameToInt(String src, String dst) throws IOException, UnresolvedLinkException, StorageException { boolean status = false; HdfsFileStatus resultingStat = null; if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src + " to " + dst); } FSPermissionChecker pc = getPermissionChecker(); status = renameToInternal(pc, src, dst); if (status) { resultingStat = getAuditFileInfo(dst, false); } if (status) { logAuditEvent(true, "rename", src, dst, resultingStat); } return status; } /** * @deprecated See {@link #renameTo(String, String)} */ @Deprecated private boolean renameToInternal(FSPermissionChecker pc, String src, String dst) throws IOException, UnresolvedLinkException, StorageException { if (isInSafeMode()) { throw new SafeModeException("Cannot rename " + src, safeMode); } if (!DFSUtil.isValidName(dst)) { throw new IOException("Invalid name: " + dst); } if (isPermissionEnabled) { //We should not be doing this. This is move() not renameTo(). //but for now, //NOTE: yes, this is bad! it's assuming much lower level behavior // of rewriting the dst String actualdst = dir.isDir(dst) ? dst + Path.SEPARATOR + new Path(src).getName() : dst; checkParentAccess(pc, src, FsAction.WRITE); checkAncestorAccess(pc, actualdst, FsAction.WRITE); } if (dir.renameTo(src, dst)) { return true; } return false; } /** * Rename src to dst */ void renameTo(final String src, final String dst, final Options.Rename... 
options) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.RENAME, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getRenameINodeLock(nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, src, dst)) .add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.UC, BLK.UR, BLK.IV, BLK.PE, BLK.ER)); if (dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(true, src, dst)); } } @Override public Object performTask() throws IOException { HdfsFileStatus resultingStat = null; if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.renameTo: with options - " + src + " to " + dst); } FSPermissionChecker pc = getPermissionChecker(); renameToInternal(pc, src, dst, options); resultingStat = getAuditFileInfo(dst, false); if (resultingStat != null) { StringBuilder cmd = new StringBuilder("rename options="); for (Rename option : options) { cmd.append(option.value()).append(" "); } logAuditEvent(true, cmd.toString(), src, dst, resultingStat); } return null; } }.handle(this); } private void renameToInternal(FSPermissionChecker pc, String src, String dst, Options.Rename... options) throws IOException, StorageException { if (isInSafeMode()) { throw new SafeModeException("Cannot rename " + src, safeMode); } if (!DFSUtil.isValidName(dst)) { throw new InvalidPathException("Invalid name: " + dst); } if (isPermissionEnabled) { checkParentAccess(pc, src, FsAction.WRITE); checkAncestorAccess(pc, dst, FsAction.WRITE); } dir.renameTo(src, dst, options); } /** * Remove the indicated file from namespace. * * @see ClientProtocol#delete(String, boolean) for detailed descriptoin and * description of exceptions */ public boolean deleteWithTransaction(final String src, final boolean recursive) throws IOException { HopsTransactionalRequestHandler deleteHandler = new HopsTransactionalRequestHandler(HDFSOperationType.DELETE, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/, nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH_AND_IMMEDIATE_CHILDREN, false, src)) .add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)).add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.UC, BLK.UR, BLK.PE, BLK.IV)); if (dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(true, src)); } if (erasureCodingEnabled) { locks.add(lf.getEncodingStatusLock(LockType.WRITE, src)); } } @Override public Object performTask() throws IOException { return delete(src, recursive); } }; return (Boolean) deleteHandler.handle(this); } boolean delete(String src, boolean recursive) throws AccessControlException, SafeModeException, UnresolvedLinkException, IOException, StorageException { try { return deleteInt(src, recursive); } catch (AccessControlException e) { logAuditEvent(false, "delete", src); throw e; } } private boolean deleteInt(String src, boolean recursive) throws AccessControlException, SafeModeException, UnresolvedLinkException, IOException, StorageException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src); } boolean status = deleteInternal(src, recursive, true); if (status) { logAuditEvent(true, "delete", src); 
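// NOTE (editorial, illustrative only): from the client, this delete path is reached via
// FileSystem#delete; a minimal sketch, assuming a Configuration `conf` and a hypothetical directory:
//
//   FileSystem fs = FileSystem.get(conf);
//   fs.delete(new Path("/tmp/old-data"), true /* recursive */);
//   // recursive=false fails on a non-empty directory, matching deleteInternal() below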
} return status; } FSPermissionChecker getPermissionChecker() throws AccessControlException { return new FSPermissionChecker(fsOwnerShortUserName, supergroup); } /** * Remove a file/directory from the namespace. * <p/> * For large directories, deletion is incremental. The blocks under * the directory are collected and deleted a small number at a time. * <p/> * For small directory or file the deletion is done in one shot. * * @see ClientProtocol#delete(String, boolean) for description of exceptions */ private boolean deleteInternal(String src, boolean recursive, boolean enforcePermission) throws AccessControlException, SafeModeException, UnresolvedLinkException, IOException, StorageException { ArrayList<Block> collectedBlocks = new ArrayList<Block>(); FSPermissionChecker pc = getPermissionChecker(); if (isInSafeMode()) { throw new SafeModeException("Cannot delete " + src, safeMode); } if (!recursive && dir.isNonEmptyDirectory(src)) { throw new IOException(src + " is non empty"); } if (enforcePermission && isPermissionEnabled) { checkPermission(pc, src, false, null, FsAction.WRITE, null, FsAction.ALL); } // Unlink the target directory from directory tree if (!dir.delete(src, collectedBlocks)) { return false; } removeBlocks(collectedBlocks); // Incremental deletion of blocks collectedBlocks.clear(); if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog .debug("DIR* Namesystem.delete: " + src + " is removed"); } return true; } /** * From the given list, incrementally remove the blocks from blockManager * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to * ensure that other waiters on the lock can get in. See HDFS-2938 */ private void removeBlocks(List<Block> blocks) throws StorageException, TransactionContextException { int start = 0; int end = 0; while (start < blocks.size()) { end = BLOCK_DELETION_INCREMENT + start; end = end > blocks.size() ? blocks.size() : end; for (int i = start; i < end; i++) { blockManager.removeBlock(blocks.get(i)); } start = end; } } void removePathAndBlocks(String src, List<Block> blocks) throws StorageException, IOException { leaseManager.removeLeaseWithPrefixPath(src); if (blocks == null) { return; } for (Block b : blocks) { blockManager.removeBlock(b); } } /** * Get the file info for a specific file. * * @param src * The string representation of the path to the file * @param resolveLink * whether to throw UnresolvedLinkException * if src refers to a symlink * @return object containing information regarding the file * or null if file not found * @throws AccessControlException * if access is denied * @throws UnresolvedLinkException * if a symlink is encountered. 
*/ public HdfsFileStatus getFileInfo(final String src, final boolean resolveLink) throws AccessControlException, UnresolvedLinkException, IOException { HopsTransactionalRequestHandler getFileInfoHandler = new HopsTransactionalRequestHandler(HDFSOperationType.GET_FILE_INFO, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(true/*skip quota*/,nameNode, INodeLockType.READ, INodeResolveType.PATH, resolveLink, src)); } @Override public Object performTask() throws IOException { HdfsFileStatus stat = null; FSPermissionChecker pc = getPermissionChecker(); try { if (isPermissionEnabled) { checkTraverse(pc, src); } stat = dir.getFileInfo(src, resolveLink); } catch (AccessControlException e) { logAuditEvent(false, "getfileinfo", src); throw e; } logAuditEvent(true, "getfileinfo", src); return stat; } }; if (!DFSUtil.isValidName(src)) { throw new InvalidPathException("Invalid file name: " + src); } return (HdfsFileStatus) getFileInfoHandler.handle(this); } /** * Create all the necessary directories */ boolean mkdirs(final String src, final PermissionStatus permissions, final boolean createParent) throws IOException, UnresolvedLinkException { final boolean resolvedLink = false; HopsTransactionalRequestHandler mkdirsHandler = new HopsTransactionalRequestHandler(HDFSOperationType.MKDIRS, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false,nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, resolvedLink, src)); } @Override public Object performTask() throws StorageException, IOException { try { return mkdirsInt(src, permissions, createParent); } catch (AccessControlException e) { logAuditEvent(false, "mkdirs", src); throw e; } } }; return (Boolean) mkdirsHandler.handle(this); } private boolean mkdirsInt(String src, PermissionStatus permissions, boolean createParent) throws IOException, UnresolvedLinkException, StorageException { HdfsFileStatus resultingStat = null; boolean status = false; if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog .debug(this.getNamenodeId() + ") DIR* NameSystem.mkdirs: " + src); } FSPermissionChecker pc = getPermissionChecker(); status = mkdirsInternal(pc, src, permissions, createParent); if (status) { resultingStat = dir.getFileInfo(src, false); } if (status) { logAuditEvent(true, "mkdirs", src, null, resultingStat); } return status; } /** * Create all the necessary directories */ private boolean mkdirsInternal(FSPermissionChecker pc, String src, PermissionStatus permissions, boolean createParent) throws IOException, UnresolvedLinkException, StorageException { if (isInSafeMode()) { throw new SafeModeException("Cannot create directory " + src, safeMode); } if (isPermissionEnabled) { checkTraverse(pc, src); } if (dir.isDir(src)) { // all the users of mkdirs() are used to expect 'true' even if // a new directory is not created. return true; } if (!DFSUtil.isValidName(src)) { throw new InvalidPathException(src); } if (isPermissionEnabled) { checkAncestorAccess(pc, src, FsAction.WRITE); } if (!createParent) { verifyParentDir(src); } // validate that we have enough inodes. This is, at best, a // heuristic because the mkdirs() operation migth need to // create multiple inodes. 
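// NOTE (editorial, illustrative only): the client-side equivalent of this code path, assuming a
// Configuration `conf` and a hypothetical path; mkdirs() intentionally behaves like `mkdir -p` and
// returns true even when the directory already exists (see the dir.isDir(src) check above):
//
//   FileSystem fs = FileSystem.get(conf);
//   fs.mkdirs(new Path("/user/alice/reports"), FsPermission.getDefault());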
checkFsObjectLimit(); if (!dir.mkdirs(src, permissions, false, now())) { throw new IOException("Failed to create directory: " + src); } return true; } ContentSummary getContentSummary(final String src) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException { if (isLegacyConentSummaryEnabled()) { throw new UnsupportedActionException("Legacy Content Summary is not supported"); } else { return multiTransactionalGetContentSummary(src); } } // ContentSummary getContentSummaryLegacy(final String src) // throws AccessControlException, FileNotFoundException, // UnresolvedLinkException, IOException { // HopsTransactionalRequestHandler getContentSummaryHandler = // new HopsTransactionalRequestHandler( // HDFSOperationType.GET_CONTENT_SUMMARY, src) { // @Override // public void acquireLock(TransactionLocks locks) throws IOException { // LockFactory lf = getInstance(); // locks.add(lf.getINodeLock(nameNode, INodeLockType.READ, // INodeResolveType.PATH_AND_ALL_CHILDREN_RECURSIVELY, src)) // .add(lf.getBlockLock()); // } // // @Override // public Object performTask() throws IOException { // FSPermissionChecker pc = // new FSPermissionChecker(fsOwnerShortUserName, supergroup); // if (isPermissionEnabled) { // checkPermission(pc, src, false, null, null, null, // FsAction.READ_EXECUTE); // } // return dir.getContentSummary(src); // } // }; // return (ContentSummary) getContentSummaryHandler.handle(this); // } /** * Set the namespace quota and diskspace quota for a directory. * See {@link ClientProtocol#setQuota(String, long, long)} for the * contract. */ // void setQuota(final String path, final long nsQuota, final long dsQuota) // throws IOException, UnresolvedLinkException { // HopsTransactionalRequestHandler setQuotaHandler = // new HopsTransactionalRequestHandler(HDFSOperationType.SET_QUOTA, path) { // @Override // public void acquireLock(TransactionLocks locks) throws IOException { // LockFactory lf = getInstance(); // locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, // INodeResolveType.PATH_AND_ALL_CHILDREN_RECURSIVELY, path)) // .add(lf.getBlockLock()); // } // // @Override // public Object performTask() throws StorageException, IOException { // checkSuperuserPrivilege(); // if (isInSafeMode()) { // throw new SafeModeException("Cannot set quota on " + path, // safeMode); // } // dir.setQuota(path, nsQuota, dsQuota); // return null; // } // }; // setQuotaHandler.handle(this); // } /** * Persist all metadata about this file. * * @param src * The string representation of the path * @param clientName * The string representation of the client * @param lastBlockLength * The length of the last block * under construction reported from client. 
* @throws IOException * if path does not exist */ void fsync(final String src, final String clientName, final long lastBlockLength) throws IOException, UnresolvedLinkException { new HopsTransactionalRequestHandler(HDFSOperationType.FSYNC, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add( lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)).add(lf.getLeaseLock(LockType.READ, clientName)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)) .add(lf.getBlockLock()); } @Override public Object performTask() throws IOException { NameNode.stateChangeLog .info("BLOCK* fsync: " + src + " for " + clientName); if (isInSafeMode()) { throw new SafeModeException("Cannot fsync file " + src, safeMode); } INodeFileUnderConstruction pendingFile = checkLease(src, clientName); if (lastBlockLength > 0) { pendingFile.updateLengthOfLastBlock(lastBlockLength); } dir.persistBlocks(src, pendingFile); pendingFile.recomputeFileSize(); return null; } }.handle(this); } /** * Move a file that is being written to be immutable. * * @param src * The filename * @param lease * The lease for the client creating the file * @param recoveryLeaseHolder * reassign lease to this holder if the last block * needs recovery; keep current holder if null. * @return true if file has been successfully finalized and closed or * false if block recovery has been initiated. Since the lease owner * has been changed and logged, caller should call logSync(). * @throws AlreadyBeingCreatedException * if file is waiting to achieve minimal * replication;<br> * RecoveryInProgressException if lease recovery is in progress.<br> * IOException in case of an error. */ boolean internalReleaseLease(Lease lease, String src, String recoveryLeaseHolder) throws AlreadyBeingCreatedException, IOException, UnresolvedLinkException, StorageException { LOG.info("Recovering " + lease + ", src=" + src); assert !isInSafeMode(); final INodeFileUnderConstruction pendingFile = INodeFileUnderConstruction.valueOf(dir.getINode(src), src); int nrBlocks = pendingFile.numBlocks(); BlockInfo[] blocks = pendingFile.getBlocks(); int nrCompleteBlocks; BlockInfo curBlock = null; for (nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) { curBlock = blocks[nrCompleteBlocks]; if (!curBlock.isComplete()) { break; } assert blockManager.checkMinReplication(curBlock) : "A COMPLETE block is not minimally replicated in " + src; } // If there are no incomplete blocks associated with this file, // then reap lease immediately and close the file. if (nrCompleteBlocks == nrBlocks) { finalizeINodeFileUnderConstruction(src, pendingFile); NameNode.stateChangeLog.warn("BLOCK*" + " internalReleaseLease: All existing blocks are COMPLETE," + " lease removed, file closed."); return true; // closed! } // Only the last and the penultimate blocks may be in non COMPLETE state. // If the penultimate block is not COMPLETE, then it must be COMMITTED. 
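// NOTE (editorial summary of the logic below): depending on the state of the last block, lease
// recovery resolves in one of three ways:
//
//   last block COMPLETE            -> handled earlier: all blocks complete, file already finalized,
//                                     true returned above
//   last block COMMITTED           -> close the file now if the last (and penultimate) block meet
//                                     min replication, otherwise throw AlreadyBeingCreatedException
//   last block UNDER_CONSTRUCTION
//   or UNDER_RECOVERY              -> assign a new recovery generation stamp, reassign the lease if
//                                     a recovery holder was given, start block recovery, return false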
if (nrCompleteBlocks < nrBlocks - 2 || nrCompleteBlocks == nrBlocks - 2 && curBlock != null && curBlock.getBlockUCState() != BlockUCState.COMMITTED) { final String message = "DIR* NameSystem.internalReleaseLease: " + "attempt to release a create lock on " + src + " but file is already closed."; NameNode.stateChangeLog.warn(message); throw new IOException(message); } // The last block is not COMPLETE, and // that the penultimate block if exists is either COMPLETE or COMMITTED final BlockInfo lastBlock = pendingFile.getLastBlock(); BlockUCState lastBlockState = lastBlock.getBlockUCState(); BlockInfo penultimateBlock = pendingFile.getPenultimateBlock(); boolean penultimateBlockMinReplication; BlockUCState penultimateBlockState; if (penultimateBlock == null) { penultimateBlockState = BlockUCState.COMPLETE; // If penultimate block doesn't exist then its minReplication is met penultimateBlockMinReplication = true; } else { penultimateBlockState = BlockUCState.COMMITTED; penultimateBlockMinReplication = blockManager.checkMinReplication(penultimateBlock); } assert penultimateBlockState == BlockUCState.COMPLETE || penultimateBlockState == BlockUCState.COMMITTED : "Unexpected state of penultimate block in " + src; switch (lastBlockState) { case COMPLETE: assert false : "Already checked that the last block is incomplete"; break; case COMMITTED: // Close file if committed blocks are minimally replicated if (penultimateBlockMinReplication && blockManager.checkMinReplication(lastBlock)) { finalizeINodeFileUnderConstruction(src, pendingFile); NameNode.stateChangeLog.warn("BLOCK*" + " internalReleaseLease: Committed blocks are minimally replicated," + " lease removed, file closed."); return true; // closed! } // Cannot close file right now, since some blocks // are not yet minimally replicated. // This may potentially cause infinite loop in lease recovery // if there are no valid replicas on data-nodes. String message = "DIR* NameSystem.internalReleaseLease: " + "Failed to release lease for file " + src + ". Committed blocks are waiting to be minimally replicated." + " Try again later."; NameNode.stateChangeLog.warn(message); throw new AlreadyBeingCreatedException(message); case UNDER_CONSTRUCTION: case UNDER_RECOVERY: final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction) lastBlock; // setup the last block locations from the blockManager if not known if (uc.getNumExpectedLocations() == 0) { uc.setExpectedLocations(blockManager.getNodes(lastBlock)); } // start recovery of the last block for this file long blockRecoveryId = pendingFile.nextGenerationStamp(); lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile); uc.initializeBlockRecovery(blockRecoveryId, getBlockManager().getDatanodeManager()); leaseManager.renewLease(lease); // Cannot close file right now, since the last block requires recovery. // This may potentially cause infinite loop in lease recovery // if there are no valid replicas on data-nodes. NameNode.stateChangeLog.warn("DIR* NameSystem.internalReleaseLease: " + "File " + src + " has not been closed." + " Lease recovery is in progress. 
" + "RecoveryId = " + blockRecoveryId + " for block " + lastBlock); break; } return false; } private Lease reassignLease(Lease lease, String src, String newHolder, INodeFileUnderConstruction pendingFile) throws StorageException, TransactionContextException { if (newHolder == null) { return lease; } return reassignLeaseInternal(lease, src, newHolder, pendingFile); } Lease reassignLeaseInternal(Lease lease, String src, String newHolder, INodeFileUnderConstruction pendingFile) throws StorageException, TransactionContextException { pendingFile.setClientName(newHolder); return leaseManager.reassignLease(lease, src, newHolder); } private void commitOrCompleteLastBlock( final INodeFileUnderConstruction fileINode, final Block commitBlock) throws IOException { if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) { return; } fileINode.recomputeFileSize(); if (dir.isQuotaEnabled()) { final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes(); if (diff > 0) { // Adjust disk space consumption if required String path = leaseManager.findPath(fileINode); dir.updateSpaceConsumed(path, 0, -diff * fileINode.getBlockReplication()); } } try { if (fileINode.isPathMetaEnabled()) { SizeLogDataAccess da = (SizeLogDataAccess) HdfsStorageFactory.getDataAccess(SizeLogDataAccess.class); da.add(new SizeLogEntry(fileINode.getId(), fileINode.getSize())); } } catch (StorageCallPreventedException e) { // Path is not available during block synchronization but it is OK // for us if search results are off by one block } } private void finalizeINodeFileUnderConstruction(String src, INodeFileUnderConstruction pendingFile) throws IOException, UnresolvedLinkException, StorageException { leaseManager.removeLease(pendingFile.getClientName(), src); // The file is no longer pending. // Create permanent INode, update blocks INodeFile newFile = pendingFile.convertToInodeFile(); // close file and persist block allocations for this file dir.closeFile(src, newFile); blockManager.checkReplication(newFile); } void commitBlockSynchronization(final ExtendedBlock lastblock, final long newgenerationstamp, final long newlength, final boolean closeFile, final boolean deleteblock, final DatanodeID[] newtargets, final String[] newtargetstorages) throws IOException, UnresolvedLinkException { new HopsTransactionalRequestHandler( HDFSOperationType.COMMIT_BLOCK_SYNCHRONIZATION) { INodeIdentifier inodeIdentifier; @Override public void setUp() throws StorageException { inodeIdentifier = INodeUtil.resolveINodeFromBlock(lastblock.getLocalBlock()); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = getInstance(); locks.add( lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier, true)).add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)) .add(lf.getBlockLock(lastblock.getBlockId(), inodeIdentifier)) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER, BLK.UC, BLK.UR)); } @Override public Object performTask() throws IOException { String src = ""; // If a DN tries to commit to the standby, the recovery will // fail, and the next retry will succeed on the new NN. 
if (isInSafeMode()) { throw new SafeModeException( "Cannot commitBlockSynchronization while in safe mode", safeMode); } LOG.info("commitBlockSynchronization(lastblock=" + lastblock + ", newgenerationstamp=" + newgenerationstamp + ", newlength=" + newlength + ", newtargets=" + Arrays.asList(newtargets) + ", closeFile=" + closeFile + ", deleteBlock=" + deleteblock + ")"); final BlockInfo storedBlock = blockManager.getStoredBlock(ExtendedBlock.getLocalBlock(lastblock)); if (storedBlock == null) { throw new IOException("Block (=" + lastblock + ") not found"); } INodeFile iFile = (INodeFile) storedBlock.getBlockCollection(); if (!iFile.isUnderConstruction() || storedBlock.isComplete()) { throw new IOException( "Unexpected block (=" + lastblock + ") since the file (=" + iFile.getLocalName() + ") is not under construction"); } long recoveryId = ((BlockInfoUnderConstruction) storedBlock).getBlockRecoveryId(); if (recoveryId != newgenerationstamp) { throw new IOException("The recovery id " + newgenerationstamp + " does not match current recovery id " + recoveryId + " for block " + lastblock); } INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) iFile; if (deleteblock) { pendingFile.removeLastBlock(ExtendedBlock.getLocalBlock(lastblock)); blockManager.removeBlockFromMap(storedBlock); } else { // update last block storedBlock.setGenerationStamp(newgenerationstamp); storedBlock.setNumBytes(newlength); iFile.recomputeFileSize(); // find the DatanodeDescriptor objects // There should be no locations in the blockManager till now because the // file is underConstruction DatanodeDescriptor[] descriptors = null; if (newtargets.length > 0) { descriptors = new DatanodeDescriptor[newtargets.length]; for (int i = 0; i < newtargets.length; i++) { descriptors[i] = blockManager.getDatanodeManager().getDatanode(newtargets[i]); } } if ((closeFile) && (descriptors != null)) { // the file is getting closed. Insert block locations into blockManager. // Otherwise fsck will report these blocks as MISSING, especially if the // blocksReceived from Datanodes take a long time to arrive. 
for (int i = 0; i < descriptors.length; i++) { descriptors[i].addBlock(storedBlock); } } // add pipeline locations into the INodeUnderConstruction pendingFile.setLastBlock(storedBlock, descriptors); } src = leaseManager.findPath(pendingFile); if (closeFile) { // commit the last block and complete it if it has minimum replicas commitOrCompleteLastBlock(pendingFile, storedBlock); //remove lease, close file finalizeINodeFileUnderConstruction(src, pendingFile); } else { // If this commit does not want to close the file, persist blocks dir.persistBlocks(src, pendingFile); } if (closeFile) { LOG.info( "commitBlockSynchronization(newblock=" + lastblock + ", file=" + src + ", newgenerationstamp=" + newgenerationstamp + ", newlength=" + newlength + ", newtargets=" + Arrays.asList(newtargets) + ") successful"); } else { LOG.info("commitBlockSynchronization(" + lastblock + ") successful"); } return null; } }.handle(this); } /** * Renew the lease(s) held by the given client */ void renewLease(final String holder) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.RENEW_LEASE) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getLeaseLock(LockType.WRITE, holder)); } @Override public Object performTask() throws IOException { if (isInSafeMode()) { throw new SafeModeException("Cannot renew lease for " + holder, safeMode); } leaseManager.renewLease(holder); return null; } }.handle(this); } /** * Get a partial listing of the indicated directory * * @param src * the directory name * @param startAfter * the name to start after * @param needLocation * if blockLocations need to be returned * @return a partial listing starting after startAfter * @throws AccessControlException * if access is denied * @throws UnresolvedLinkException * if symbolic link is encountered * @throws IOException * if other I/O error occurred */ DirectoryListing getListing(final String src, final byte[] startAfter, final boolean needLocation) throws AccessControlException, UnresolvedLinkException, IOException { HopsTransactionalRequestHandler getListingHandler = new HopsTransactionalRequestHandler(HDFSOperationType.GET_LISTING, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(true/*skip INodeAttr*/, nameNode, INodeLockType.READ, INodeResolveType.PATH_AND_IMMEDIATE_CHILDREN, src)); if(needLocation){ locks .add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UC)); } } @Override public Object performTask() throws IOException { try { return getListingInt(src, startAfter, needLocation); } catch (AccessControlException e) { logAuditEvent(false, "listStatus", src); throw e; } } }; return (DirectoryListing) getListingHandler.handle(this); } private DirectoryListing getListingInt(String src, byte[] startAfter, boolean needLocation) throws AccessControlException, UnresolvedLinkException, IOException, StorageException { DirectoryListing dl; FSPermissionChecker pc = getPermissionChecker(); if (isPermissionEnabled) { if (dir.isDir(src)) { checkPathAccess(pc, src, FsAction.READ_EXECUTE); } else { checkTraverse(pc, src); } } logAuditEvent(true, "listStatus", src); dl = dir.getListing(src, startAfter, needLocation); return dl; } ///////////////////////////////////////////////////////// // // These methods are called by datanodes // ///////////////////////////////////////////////////////// /** * Register Datanode. 
* <p/> * The purpose of registration is to identify whether the new datanode * serves a new data storage, and will report new data block copies, * which the namenode was not aware of; or the datanode is a replacement * node for the data storage that was previously served by a different * or the same (in terms of host:port) datanode. * The data storages are distinguished by their storageIDs. When a new * data storage is reported the namenode issues a new unique storageID. * <p/> * Finally, the namenode returns its namespaceID as the registrationID * for the datanodes. * namespaceID is a persistent attribute of the name space. * The registrationID is checked every time the datanode is communicating * with the namenode. * Datanodes with inappropriate registrationID are rejected. * If the namenode stops, and then restarts it can restore its * namespaceID and will continue serving the datanodes that has previously * registered with the namenode without restarting the whole cluster. * * @see org.apache.hadoop.hdfs.server.datanode.DataNode */ void registerDatanode(DatanodeRegistration nodeReg) throws IOException { getBlockManager().getDatanodeManager().registerDatanode(nodeReg); checkSafeMode(); } /** * Get registrationID for datanodes based on the namespaceID. * * @return registration ID * @see #registerDatanode(DatanodeRegistration) */ String getRegistrationID() throws IOException { return Storage.getRegistrationID(StorageInfo.getStorageInfoFromDB()); } /** * The given node has reported in. This method should: * 1) Record the heartbeat, so the datanode isn't timed out * 2) Adjust usage stats for future block allocation * <p/> * If a substantial amount of time passed since the last datanode * heartbeat then request an immediate block report. * * @return an array of datanode commands * @throws IOException */ HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg, long capacity, long dfsUsed, long remaining, long blockPoolUsed, int xceiverCount, int xmitsInProgress, int failedVolumes) throws IOException { final int maxTransfer = blockManager.getMaxReplicationStreams() - xmitsInProgress; DatanodeCommand[] cmds = blockManager.getDatanodeManager() .handleHeartbeat(nodeReg, blockPoolId, capacity, dfsUsed, remaining, blockPoolUsed, xceiverCount, maxTransfer, failedVolumes); return new HeartbeatResponse(cmds); } /** * Returns whether or not there were available resources at the last check of * resources. * * @return true if there were sufficient resources available, false otherwise. */ boolean nameNodeHasResourcesAvailable() { return hasResourcesAvailable; } /** * Periodically calls hasAvailableResources of NameNodeResourceChecker, and * if * there are found to be insufficient resources available, causes the NN to * enter safe mode. If resources are later found to have returned to * acceptable levels, this daemon will cause the NN to exit safe mode. */ class NameNodeResourceMonitor implements Runnable { boolean shouldNNRmRun = true; @Override public void run() { try { while (fsRunning && shouldNNRmRun) { if (!nameNodeHasResourcesAvailable()) { String lowResourcesMsg = "NameNode low on available disk space. 
"; if (!isInSafeMode()) { FSNamesystem.LOG.warn(lowResourcesMsg + "Entering safe mode."); } else { FSNamesystem.LOG.warn(lowResourcesMsg + "Already in safe mode."); } enterSafeMode(true); } try { Thread.sleep(resourceRecheckInterval); } catch (InterruptedException ie) { // Deliberately ignore } } } catch (Exception e) { FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e); } } public void stopMonitor() { shouldNNRmRun = false; } } private void checkBlock(ExtendedBlock block) throws IOException { if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) { throw new IOException( "Unexpected BlockPoolId " + block.getBlockPoolId() + " - expected " + blockPoolId); } } @Metric({"MissingBlocks", "Number of missing blocks"}) public long getMissingBlocksCount() throws IOException { // not locking return blockManager.getMissingBlocksCount(); } @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"}) public int getExpiredHeartbeats() { return datanodeStatistics.getExpiredHeartbeats(); } /** * @see ClientProtocol#getStats() */ long[] getStats() throws IOException { final long[] stats = datanodeStatistics.getStats(); stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks(); stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks(); stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount(); return stats; } @Override // FSNamesystemMBean @Metric({"CapacityTotal", "Total raw capacity of data nodes in bytes"}) public long getCapacityTotal() { return datanodeStatistics.getCapacityTotal(); } @Metric({"CapacityTotalGB", "Total raw capacity of data nodes in GB"}) public float getCapacityTotalGB() { return DFSUtil.roundBytesToGB(getCapacityTotal()); } @Override // FSNamesystemMBean @Metric( {"CapacityUsed", "Total used capacity across all data nodes in bytes"}) public long getCapacityUsed() { return datanodeStatistics.getCapacityUsed(); } @Metric({"CapacityUsedGB", "Total used capacity across all data nodes in GB"}) public float getCapacityUsedGB() { return DFSUtil.roundBytesToGB(getCapacityUsed()); } @Override // FSNamesystemMBean @Metric({"CapacityRemaining", "Remaining capacity in bytes"}) public long getCapacityRemaining() { return datanodeStatistics.getCapacityRemaining(); } @Metric({"CapacityRemainingGB", "Remaining capacity in GB"}) public float getCapacityRemainingGB() { return DFSUtil.roundBytesToGB(getCapacityRemaining()); } @Metric({"CapacityUsedNonDFS", "Total space used by data nodes for non DFS purposes in bytes"}) public long getCapacityUsedNonDFS() { return datanodeStatistics.getCapacityUsedNonDFS(); } /** * Total number of connections. 
*/ @Override // FSNamesystemMBean @Metric public int getTotalLoad() { return datanodeStatistics.getXceiverCount(); } int getNumberOfDatanodes(DatanodeReportType type) { return getBlockManager().getDatanodeManager().getDatanodeListForReport(type) .size(); } DatanodeInfo[] datanodeReport(final DatanodeReportType type) throws AccessControlException { checkSuperuserPrivilege(); final DatanodeManager dm = getBlockManager().getDatanodeManager(); final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type); DatanodeInfo[] arr = new DatanodeInfo[results.size()]; for (int i = 0; i < arr.length; i++) { arr[i] = new DatanodeInfo(results.get(i)); } return arr; } Date getStartTime() { return new Date(startTime); } void refreshNodes() throws IOException { checkSuperuserPrivilege(); getBlockManager().getDatanodeManager() .refreshNodes(new HdfsConfiguration()); } void setBalancerBandwidth(long bandwidth) throws IOException { checkSuperuserPrivilege(); getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth); } /** * SafeModeInfo contains information related to the safe mode. * <p/> * An instance of {@link SafeModeInfo} is created when the name node * enters safe mode. * <p/> * During name node startup {@link SafeModeInfo} counts the number of * <em>safe blocks</em>, those that have at least the minimal number of * replicas, and calculates the ratio of safe blocks to the total number * of blocks in the system, which is the size of blocks in * {@link FSNamesystem#blockManager}. When the ratio reaches the * {@link #threshold} it starts the {@link SafeModeMonitor} daemon in order * to monitor whether the safe mode {@link #extension} is passed. * Then it leaves safe mode and destroys itself. * <p/> * If safe mode is turned on manually then the number of safe blocks is * not tracked because the name node is not intended to leave safe mode * automatically in the case. * * @see ClientProtocol#setSafeMode * @see SafeModeMonitor */ class SafeModeInfo { // configuration fields /** * Safe mode threshold condition %. */ private double threshold; /** * Safe mode minimum number of datanodes alive */ private int datanodeThreshold; /** * Safe mode extension after the threshold. */ private int extension; /** * Min replication required by safe mode. */ private int safeReplication; /** * threshold for populating needed replication queues */ private double replQueueThreshold; // internal fields /** * Time when threshold was reached. * <p/> * <br>-1 safe mode is off * <br> 0 safe mode is on, but threshold is not reached yet */ private long reached = -1; /** * Total number of blocks. */ int blockTotal; /** * Number of blocks needed to satisfy safe mode threshold condition */ private int blockThreshold; /** * Number of blocks needed before populating replication queues */ private int blockReplQueueThreshold; /** * time of the last status printout */ private long lastStatusReport = 0; /** * flag indicating whether replication queues have been initialized */ boolean initializedReplQueues = false; /** * Was safemode entered automatically because available resources were low. */ private boolean resourcesLow = false; public ThreadLocal<Boolean> safeModePendingOperation = new ThreadLocal<Boolean>(); /** * Creates SafeModeInfo when the name node enters * automatic safe mode at startup. 
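* <p/>
* For illustration only, a hypothetical configuration sketch showing the keys this
* constructor reads (the values below are made up, not defaults asserted by this class):
* <pre>{@code
* Configuration conf = new HdfsConfiguration();
* conf.setFloat(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY, 0.999f);
* conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1);
* conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000); // milliseconds
* // the resulting SafeModeInfo then uses these values for threshold,
* // datanodeThreshold and extension respectively
* }</pre>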
* * @param conf * configuration */ private SafeModeInfo(Configuration conf) { this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY, DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT); if (threshold > 1.0) { LOG.warn("The threshold value shouldn't be greater than 1, threshold: " + threshold); } this.datanodeThreshold = conf.getInt(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT); this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0); this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, DFS_NAMENODE_REPLICATION_MIN_DEFAULT); if (this.safeReplication > 1) { LOG.warn("Only safe replication 1 is supported"); this.safeReplication = 1; } LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold); LOG.info( DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold); LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + " = " + extension); // default to safe mode threshold (i.e., don't populate queues before leaving safe mode) this.replQueueThreshold = conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, (float) threshold); this.blockTotal = 0; } /** * Creates SafeModeInfo when safe mode is entered manually, or because * available resources are low. * <p/> * The {@link #threshold} is set to 1.5 so that it could never be reached. * {@link #blockTotal} is set to -1 to indicate that safe mode is manual. * * @see SafeModeInfo */ private SafeModeInfo(boolean resourcesLow) throws IOException { this.threshold = 1.5f; // this threshold can never be reached this.datanodeThreshold = Integer.MAX_VALUE; this.extension = Integer.MAX_VALUE; this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication this.replQueueThreshold = 1.5f; // can never be reached this.blockTotal = -1; this.reached = -1; this.resourcesLow = resourcesLow; enter(); reportStatus("STATE* Safe mode is ON.", true); } /** * Check if safe mode is on. * * @return true if in safe mode */ private boolean isOn() throws IOException { doConsistencyCheck(); return this.reached >= 0 && isClusterInSafeMode(); } /** * Check if we are populating replication queues. */ private boolean isPopulatingReplQueues() { return initializedReplQueues; } /** * Enter safe mode. */ private void enter() { this.reached = 0; } /** * Leave safe mode. * <p/> * Check for invalid, under- & over-replicated blocks at the end of * startup. */ private void leave() throws IOException { // if not done yet, initialize replication queues. // In the standby, do not populate repl queues if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) { initializeReplQueues(); } leaveInternal(); HdfsVariables.exitClusterSafeMode(); HdfsVariables.resetMisReplicatedIndex(); clearSafeBlocks(); } private void leaveInternal() throws IOException { long timeInSafemode = now() - startTime; NameNode.stateChangeLog.info( "STATE* Leaving safe mode after " + timeInSafemode / 1000 + " secs"); NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode); if (reached >= 0) { NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); } reached = -1; safeMode = null; final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology(); NameNode.stateChangeLog.info( "STATE* Network topology has " + nt.getNumOfRacks() + " racks and " + nt.getNumOfLeaves() + " datanodes"); NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has " + blockManager.numOfUnderReplicatedBlocks() + " blocks"); startSecretManagerIfNecessary(); } /** * Initialize replication queues.
*/ private void initializeReplQueues() throws IOException { LOG.info("initializing replication queues"); assert !isPopulatingReplQueues() : "Already initialized repl queues"; long startTimeMisReplicatedScan = now(); blockManager.processMisReplicatedBlocks(); initializedReplQueues = true; NameNode.stateChangeLog.info("STATE* Replication Queue initialization " + "scan for invalid, over- and under-replicated blocks " + "completed in " + (now() - startTimeMisReplicatedScan) + " msec"); } /** * Check whether we have reached the threshold for * initializing replication queues. */ private boolean canInitializeReplQueues() throws IOException { return shouldPopulateReplQueues() && blockSafe() >= blockReplQueueThreshold; } /** * Safe mode can be turned off iff * another namenode went out of safemode or * the threshold is reached and * the extension time has passed. * * @return true if we can leave, false otherwise. */ private boolean canLeave() throws IOException { if (reached == 0 && isClusterInSafeMode()) { return false; } if (now() - reached < extension) { reportStatus("STATE* Safe mode ON.", false); return false; } return !needEnter(); } /** * This NameNode tries to help the cluster to get out of safemode by * updating the safeblock count. * This call will trigger the {@link SafeModeMonitor} if it's not already * started. * @throws IOException */ private void tryToHelpToGetout() throws IOException { if (isManual()) { return; } startSafeModeMonitor(); } /** * The cluster already left safemode, now it's time for this namenode * to leave as well. * @throws IOException */ private void clusterLeftSafeModeAlready() throws IOException { leaveInternal(); } /** * There is no need to enter safe mode * if DFS is empty or {@link #threshold} == 0 or another namenode already * went out of safemode. */ private boolean needEnter() throws IOException { if (!isClusterInSafeMode()) { return false; } return (threshold != 0 && blockSafe() < blockThreshold) || (getNumLiveDataNodes() < datanodeThreshold) || (!nameNodeHasResourcesAvailable()); } /** * Check and trigger safe mode if needed. */ private void checkMode() throws IOException { // Have to have write-lock since leaving safemode initializes // repl queues, which requires write lock if (needEnter()) { enter(); // check if we are ready to initialize replication queues if (canInitializeReplQueues() && !isPopulatingReplQueues()) { initializeReplQueues(); } reportStatus("STATE* Safe mode ON.", false); return; } // the threshold is reached if (!isOn() || // safe mode is off extension <= 0 || threshold <= 0) { // don't need to wait this.leave(); // leave safe mode return; } if (reached > 0) { // threshold has already been reached before reportStatus("STATE* Safe mode ON.", false); return; } // start monitor reached = now(); startSafeModeMonitor(); reportStatus("STATE* Safe mode extension entered.", true); // check if we are ready to initialize replication queues if (canInitializeReplQueues() && !isPopulatingReplQueues()) { initializeReplQueues(); } } private synchronized void startSafeModeMonitor() { if (smmthread == null) { smmthread = new Daemon(new SafeModeMonitor()); smmthread.start(); } } /** * Set total number of blocks.
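* <p/>
* Worked example with hypothetical numbers: for {@code total = 1000000},
* {@code threshold = 0.999} and {@code replQueueThreshold = 0.999}, the assignments in this
* method give {@code blockThreshold = (int) (1000000 * 0.999) = 999000} and
* {@code blockReplQueueThreshold = 999000}, i.e. 999,000 blocks must be reported safe before
* the startup safe mode threshold is considered reached.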
*/ private synchronized void setBlockTotal(int total) throws IOException { this.blockTotal = total; this.blockThreshold = (int) (blockTotal * threshold); this.blockReplQueueThreshold = (int) (blockTotal * replQueueThreshold); setSafeModePendingOperation(true); } /** * Increment number of safe blocks if current block has * reached minimal replication. * * @param blk * current block */ private void incrementSafeBlockCount(Block blk) throws IOException { addSafeBlock(blk.getBlockId()); setSafeModePendingOperation(true); } /** * Decrement number of safe blocks if current block has * fallen below minimal replication. * @param blk * current block * @param replication * current replication */ private void decrementSafeBlockCount(Block blk, short replication) throws IOException { if (replication == safeReplication - 1) { removeSafeBlock(blk.getBlockId()); setSafeModePendingOperation(true); } } /** * Check if safe mode was entered manually or automatically (at startup, or * when disk space is low). */ private boolean isManual() { return extension == Integer.MAX_VALUE && !resourcesLow; } /** * Set manual safe mode. */ private synchronized void setManual() { extension = Integer.MAX_VALUE; } /** * Check if safe mode was entered due to resources being low. */ private boolean areResourcesLow() { return resourcesLow; } /** * Set that resources are low for this instance of safe mode. */ private void setResourcesLow() { resourcesLow = true; } /** * A tip on how safe mode is to be turned off: manually or automatically. */ String getTurnOffTip() { if (reached < 0) { return "Safe mode is OFF."; } String leaveMsg = ""; if (areResourcesLow()) { leaveMsg = "Resources are low on NN. " + "Please add or free up more resources then turn off safe mode manually. " + "NOTE: If you turn off safe mode before adding resources, " + "the NN will immediately return to safe mode."; } else { leaveMsg = "Safe mode will be turned off automatically"; } if (isManual()) { leaveMsg = "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off"; } if (blockTotal < 0) { return leaveMsg + "."; } int numLive = getNumLiveDataNodes(); String msg = ""; long blockSafe; try { blockSafe = blockSafe(); } catch (IOException ex) { LOG.error(ex); return "got exception " + ex.getMessage(); } if (reached == 0) { if (blockSafe < blockThreshold) { msg += String.format("The reported blocks %d needs additional %d" + " blocks to reach the threshold %.4f of total blocks %d.", blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal); } if (numLive < datanodeThreshold) { if (!"".equals(msg)) { msg += "\n"; } msg += String.format( "The number of live datanodes %d needs an additional %d live " + "datanodes to reach the minimum number %d.", numLive, (datanodeThreshold - numLive), datanodeThreshold); } msg += " " + leaveMsg; } else { msg = String.format("The reported blocks %d has reached the threshold" + " %.4f of total blocks %d.", blockSafe, threshold, blockTotal); if (datanodeThreshold > 0) { msg += String.format(" The number of live datanodes %d has reached " + "the minimum number %d.", numLive, datanodeThreshold); } msg += " " + leaveMsg; } if (reached == 0 || isManual()) { // threshold is not reached or manual return msg + "."; } // extension period is in progress return msg + " in " + Math.abs(reached + extension - now()) / 1000 + " seconds."; } /** * Print status every 20 seconds. 
*/ private void reportStatus(String msg, boolean rightNow) throws IOException { long curTime = now(); if (!rightNow && (curTime - lastStatusReport < 20 * 1000)) { return; } NameNode.stateChangeLog.error(msg + " \n" + getTurnOffTip()); lastStatusReport = curTime; } @Override public String toString() { String blockSafe; try { blockSafe = "" + blockSafe(); } catch (IOException ex) { blockSafe = ex.getMessage(); } String resText = "Current safe blocks = " + blockSafe + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold + ". Minimal replication = " + safeReplication + "."; if (reached > 0) { resText += " Threshold was reached " + new Date(reached) + "."; } return resText; } /** * Checks consistency of the class state. * This is costly so only runs if asserts are enabled. */ private void doConsistencyCheck() throws IOException { boolean assertsOn = false; assert assertsOn = true; // set to true if asserts are on if (!assertsOn) { return; } if (blockTotal == -1 /*&& blockSafe == -1*/) { return; // manual safe mode } long blockSafe = blockSafe(); int activeBlocks = blockManager.getActiveBlockCount(); if ((blockTotal != activeBlocks) && !(blockSafe >= 0 && blockSafe <= blockTotal)) { throw new AssertionError(" SafeMode: Inconsistent filesystem state: " + "SafeMode data: blockTotal=" + blockTotal + " blockSafe=" + blockSafe + "; " + "BlockManager data: active=" + activeBlocks); } } private void adjustBlockTotals(int deltaSafe, int deltaTotal) throws IOException { //FIXME ?! } private void setSafeModePendingOperation(Boolean val) { LOG.debug("SafeModeX Some operation are put on hold"); safeModePendingOperation.set(val); } private void adjustSafeBlocks(Set<Long> safeBlocks) throws IOException { int lastSafeBlockSize = blockSafe(); addSafeBlocks(safeBlocks); int newSafeBlockSize = blockSafe(); if (LOG.isDebugEnabled()) { LOG.debug("Adjusting safe blocks from " + lastSafeBlockSize + "/" + blockTotal + " to " + newSafeBlockSize + "/" + blockTotal); } checkMode(); } private void performSafeModePendingOperation() throws IOException { if (safeModePendingOperation.get() != null) { if (safeModePendingOperation.get().booleanValue() == true) { LOG.debug("SafeMode about to perform pending safemode operation"); safeModePendingOperation.set(false); checkMode(); } } } /** * Get number of safeblocks from the database * @return * @throws IOException */ int blockSafe() throws IOException { return getBlockSafe(); } } /** * Periodically check whether it is time to leave safe mode. * This thread starts when the threshold level is reached. 
*/ class SafeModeMonitor implements Runnable { /** * interval in msec for checking safe mode: {@value} */ private static final long recheckInterval = 1000; /** */ @Override public void run() { try { while (fsRunning && (safeMode != null && !safeMode.canLeave())) { safeMode.checkMode(); try { Thread.sleep(recheckInterval); } catch (InterruptedException ie) { } } if (!fsRunning) { LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread"); } else { try { // leave safe mode and stop the monitor leaveSafeMode(); } catch (IOException ex) { LOG.error(ex); } } smmthread = null; } catch (IOException ex) { LOG.error(ex); } } } boolean setSafeMode(SafeModeAction action) throws IOException { if (action != SafeModeAction.SAFEMODE_GET) { checkSuperuserPrivilege(); switch (action) { case SAFEMODE_LEAVE: // leave safe mode leaveSafeMode(); break; case SAFEMODE_ENTER: // enter safe mode enterSafeMode(false); break; default: LOG.error("Unexpected safe mode action"); } } return isInSafeMode(); } @Override public void checkSafeMode() throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode != null) { safeMode.checkMode(); } } @Override public boolean isInSafeMode() throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return false; } if (!isClusterInSafeMode()) { safeMode.clusterLeftSafeModeAlready(); return false; } else { safeMode.tryToHelpToGetout(); } return safeMode.isOn(); } @Override public boolean isInStartupSafeMode() throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return false; } return !safeMode.isManual() && safeMode.isOn(); } @Override public boolean isPopulatingReplQueues() { if (!shouldPopulateReplQueues()) { return false; } // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return true; } return safeMode.isPopulatingReplQueues(); } private boolean shouldPopulateReplQueues() { return true; } @Override public void incrementSafeBlockCount(BlockInfo blk) throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return; } safeMode.incrementSafeBlockCount(blk); } @Override public void decrementSafeBlockCount(BlockInfo b) throws StorageException, IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) // mostly true { return; } if (b.isComplete()) { safeMode.decrementSafeBlockCount(b, (short) blockManager.countNodes(b).liveReplicas()); } } /** * Adjust the total number of blocks safe and expected during safe mode. * If safe mode is not currently on, this is a no-op. * * @param deltaSafe * the change in number of safe blocks * @param deltaTotal * the change in number of total blocks expected */ @Override public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return; } safeMode.adjustBlockTotals(deltaSafe, deltaTotal); } /** * Set the total number of blocks in the system.
*/ public void setBlockTotal() throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return; } safeMode.setBlockTotal((int) blockManager.getTotalCompleteBlocks()); } /** * Get the total number of blocks in the system. */ @Override // FSNamesystemMBean @Metric public long getBlocksTotal() throws IOException { return blockManager.getTotalBlocks(); } /** * Enter safe mode manually. * * @throws IOException */ void enterSafeMode(boolean resourcesLow) throws IOException { // Stop the secret manager, since rolling the master key would // try to write to the edit log stopSecretManager(); if (!isInSafeMode()) { safeMode = new SafeModeInfo(resourcesLow); HdfsVariables.enterClusterSafeMode(); return; } if (resourcesLow) { safeMode.setResourcesLow(); } safeMode.setManual(); NameNode.stateChangeLog .info("STATE* Safe mode is ON" + safeMode.getTurnOffTip()); } /** * Leave safe mode. * * @throws IOException */ void leaveSafeMode() throws IOException { if (!isInSafeMode()) { NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); return; } safeMode.leave(); } String getSafeModeTip() throws IOException { if (!isInSafeMode()) { return ""; } return safeMode.getTurnOffTip(); } PermissionStatus createFsOwnerPermissions(FsPermission permission) { return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission); } private void checkOwner(FSPermissionChecker pc, String path) throws IOException { checkPermission(pc, path, true, null, null, null, null); } private void checkPathAccess(FSPermissionChecker pc, String path, FsAction access) throws IOException { checkPermission(pc, path, false, null, null, access, null); } private void checkParentAccess(FSPermissionChecker pc, String path, FsAction access) throws IOException { checkPermission(pc, path, false, null, access, null, null); } private void checkAncestorAccess(FSPermissionChecker pc, String path, FsAction access) throws IOException { checkPermission(pc, path, false, access, null, null, null); } private void checkTraverse(FSPermissionChecker pc, String path) throws IOException { checkPermission(pc, path, false, null, null, null, null); } @Override public void checkSuperuserPrivilege() throws AccessControlException { if (isPermissionEnabled) { FSPermissionChecker pc = getPermissionChecker(); pc.checkSuperuserPrivilege(); } } /** * Check whether the current user has permission to access the path. For more * details of the parameters, see * {@link FSPermissionChecker#checkPermission}. */ private void checkPermission(FSPermissionChecker pc, String path, boolean doCheckOwner, FsAction ancestorAccess, FsAction parentAccess, FsAction access, FsAction subAccess) throws IOException { if (!pc.isSuperUser()) { pc.checkPermission(path, dir.getRootDir(), doCheckOwner, ancestorAccess, parentAccess, access, subAccess); } } /** * Check to see if we have exceeded the limit on the number * of inodes. */ void checkFsObjectLimit() throws IOException, StorageException { if (maxFsObjects != 0 && maxFsObjects <= dir.totalInodes() + getBlocksTotal()) { throw new IOException("Exceeded the configured number of objects " + maxFsObjects + " in the filesystem."); } } /** * Get the total number of objects in the system.
*/ long getMaxObjects() { return maxFsObjects; } @Override // FSNamesystemMBean @Metric public long getFilesTotal() { try { return this.dir.totalInodes(); } catch (Exception ex) { LOG.error(ex); return -1; } } @Override // FSNamesystemMBean @Metric public long getPendingReplicationBlocks() { return blockManager.getPendingReplicationBlocksCount(); } @Override // FSNamesystemMBean @Metric public long getUnderReplicatedBlocks() { return blockManager.getUnderReplicatedBlocksCount(); } /** * Returns number of blocks with corrupt replicas */ @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"}) public long getCorruptReplicaBlocks() { return blockManager.getCorruptReplicaBlocksCount(); } @Override // FSNamesystemMBean @Metric public long getScheduledReplicationBlocks() { return blockManager.getScheduledReplicationBlocksCount(); } @Metric public long getPendingDeletionBlocks() throws IOException { return blockManager.getPendingDeletionBlocksCount(); } @Metric public long getExcessBlocks() { return blockManager.getExcessBlocksCount(); } // HA-only metric @Metric public long getPostponedMisreplicatedBlocks() { return blockManager.getPostponedMisreplicatedBlocksCount(); } @Metric public int getBlockCapacity() { return blockManager.getCapacity(); } @Override // FSNamesystemMBean public String getFSState() throws IOException { return isInSafeMode() ? "safeMode" : "Operational"; } private ObjectName mbeanName; /** * Register the FSNamesystem MBean using the name * "hadoop:service=NameNode,name=FSNamesystemState" */ private void registerMBean() { // We can only implement one MXBean interface, so we keep the old one. try { StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class); mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean); } catch (NotCompliantMBeanException e) { throw new RuntimeException("Bad MBean setup", e); } LOG.info("Registered FSNamesystemState MBean"); } /** * shutdown FSNamesystem */ void shutdown() { if (mbeanName != null) { MBeans.unregister(mbeanName); } } @Override // FSNamesystemMBean public int getNumLiveDataNodes() { return getBlockManager().getDatanodeManager().getNumLiveDataNodes(); } @Override // FSNamesystemMBean public int getNumDeadDataNodes() { return getBlockManager().getDatanodeManager().getNumDeadDataNodes(); } @Override // FSNamesystemMBean @Metric({"StaleDataNodes", "Number of datanodes marked stale due to delayed heartbeat"}) public int getNumStaleDataNodes() { return getBlockManager().getDatanodeManager().getNumStaleNodes(); } private INodeFileUnderConstruction checkUCBlock(ExtendedBlock block, String clientName) throws IOException, StorageException { if (isInSafeMode()) { throw new SafeModeException("Cannot get a new generation stamp and an " + "access token for block " + block, safeMode); } // check stored block state BlockInfo storedBlock = blockManager.getStoredBlock(ExtendedBlock.getLocalBlock(block)); if (storedBlock == null || storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) { throw new IOException(block + " does not exist or is not under Construction" + storedBlock); } // check file inode INodeFile file = (INodeFile) storedBlock.getBlockCollection(); if (file == null || !file.isUnderConstruction()) { throw new IOException("The file " + storedBlock + " belonged to does not exist or it is not under construction."); } // check lease INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) file; if (clientName == null || !clientName.equals(pendingFile.getClientName())) { throw new 
LeaseExpiredException("Lease mismatch: " + block + " is accessed by a non lease holder " + clientName); } return pendingFile; } /** * Client is reporting some bad block locations. */ void reportBadBlocks(LocatedBlock[] blocks) throws IOException { NameNode.stateChangeLog.info("*DIR* reportBadBlocks"); for (int i = 0; i < blocks.length; i++) { ExtendedBlock blk = blocks[i].getBlock(); DatanodeInfo[] nodes = blocks[i].getLocations(); for (int j = 0; j < nodes.length; j++) { DatanodeInfo dn = nodes[j]; blockManager .findAndMarkBlockAsCorrupt(blk, dn, "client machine reported it"); } } } /** * Get a new generation stamp together with an access token for * a block under construction * <p/> * This method is called for recovering a failed pipeline or setting up * a pipeline to append to a block. * * @param block * a block * @param clientName * the name of a client * @return a located block with a new generation stamp and an access token * @throws IOException * if any error occurs */ LocatedBlock updateBlockForPipeline(final ExtendedBlock block, final String clientName) throws IOException { HopsTransactionalRequestHandler updateBlockForPipelineHandler = new HopsTransactionalRequestHandler( HDFSOperationType.UPDATE_BLOCK_FOR_PIPELINE) { INodeIdentifier inodeIdentifier; @Override public void setUp() throws StorageException { Block b = block.getLocalBlock(); inodeIdentifier = INodeUtil.resolveINodeFromBlock(b); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add( lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier)) .add(lf.getBlockLock(block.getBlockId(), inodeIdentifier)); } @Override public Object performTask() throws IOException { LocatedBlock locatedBlock; // check validity of parameters checkUCBlock(block, clientName); INodeFile pendingFile = (INodeFile) EntityManager .find(INode.Finder.ByINodeIdFTIS, inodeIdentifier.getInodeId()); // get a new generation stamp and an access token block.setGenerationStamp(pendingFile.nextGenerationStamp()); locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]); blockManager.setBlockToken(locatedBlock, AccessMode.WRITE); return locatedBlock; } }; return (LocatedBlock) updateBlockForPipelineHandler.handle(this); } /** * Update a pipeline for a block under construction * * @param clientName * the name of the client * @param oldBlock * an old block * @param newBlock * a new block with a new generation stamp and length * @param newNodes * datanodes in the pipeline * @throws IOException * if any error occurs */ void updatePipeline(final String clientName, final ExtendedBlock oldBlock, final ExtendedBlock newBlock, final DatanodeID[] newNodes) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.UPDATE_PIPELINE) { INodeIdentifier inodeIdentifier; @Override public void setUp() throws StorageException { Block b = oldBlock.getLocalBlock(); inodeIdentifier = INodeUtil.resolveINodeFromBlock(b); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add( lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier, true)) .add(lf.getLeaseLock(LockType.READ)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)) .add(lf.getBlockLock(oldBlock.getBlockId(), inodeIdentifier)) .add(lf.getBlockRelated(BLK.UC)); } @Override public Object performTask() throws IOException { if (isInSafeMode()) { throw new SafeModeException("Pipeline not updated", safeMode); } assert newBlock.getBlockId()
== oldBlock.getBlockId() : newBlock + " and " + oldBlock + " have different block identifiers"; LOG.info("updatePipeline(block=" + oldBlock + ", newGenerationStamp=" + newBlock.getGenerationStamp() + ", newLength=" + newBlock.getNumBytes() + ", newNodes=" + Arrays.asList(newNodes) + ", clientName=" + clientName + ")"); updatePipelineInternal(clientName, oldBlock, newBlock, newNodes); LOG.info( "updatePipeline(" + oldBlock + ") successfully to " + newBlock); return null; } }.handle(this); } /** * @see #updatePipeline(String, ExtendedBlock, ExtendedBlock, DatanodeID[]) */ private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock, DatanodeID[] newNodes) throws IOException, StorageException { // check the validity of the block and lease holder name final INodeFileUnderConstruction pendingFile = checkUCBlock(oldBlock, clientName); pendingFile.updateLastTwoBlocks(leaseManager.getLease(clientName)); final BlockInfoUnderConstruction blockinfo = (BlockInfoUnderConstruction) pendingFile.getLastBlock(); // check new GS & length: this is not expected if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() || newBlock.getNumBytes() < blockinfo.getNumBytes()) { String msg = "Update " + oldBlock + " (len = " + blockinfo.getNumBytes() + ") to an older state: " + newBlock + " (len = " + newBlock.getNumBytes() + ")"; LOG.warn(msg); throw new IOException(msg); } // Update old block with the new generation stamp and new length blockinfo.setGenerationStamp(newBlock.getGenerationStamp()); blockinfo.setNumBytes(newBlock.getNumBytes()); pendingFile.recomputeFileSize(); // find the DatanodeDescriptor objects final DatanodeManager dm = getBlockManager().getDatanodeManager(); DatanodeDescriptor[] descriptors = null; if (newNodes.length > 0) { descriptors = new DatanodeDescriptor[newNodes.length]; for (int i = 0; i < newNodes.length; i++) { descriptors[i] = dm.getDatanode(newNodes[i]); } } blockinfo.setExpectedLocations(descriptors); } // rename was successful. If any part of the renamed subtree had // files that were being written to, update with new filename. void unprotectedChangeLease(String src, String dst) throws StorageException, TransactionContextException { leaseManager.changeLease(src, dst); } static class CorruptFileBlockInfo { String path; Block block; public CorruptFileBlockInfo(String p, Block b) { path = p; block = b; } @Override public String toString() { return block.getBlockName() + "\t" + path; } } /** * @param path * Restrict corrupt files to this portion of namespace.
* @param cookieTab * Support for continuation; the set of files we return * back is ordered by blockid; startBlockAfter tells where to start from * @return a list in which each entry describes a corrupt file/block * @throws AccessControlException * @throws IOException */ Collection<CorruptFileBlockInfo> listCorruptFileBlocks(final String path, String[] cookieTab) throws IOException { checkSuperuserPrivilege(); if (!isPopulatingReplQueues()) { throw new IOException("Cannot run listCorruptFileBlocks because " + "replication queues have not been initialized."); } // print a limited # of corrupt files per call final int[] count = {0}; final ArrayList<CorruptFileBlockInfo> corruptFiles = new ArrayList<CorruptFileBlockInfo>(); final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator(); if (cookieTab == null) { cookieTab = new String[]{null}; } final int[] skip = {getIntCookie(cookieTab[0])}; for (int i = 0; i < skip[0] && blkIterator.hasNext(); i++) { blkIterator.next(); } HopsTransactionalRequestHandler listCorruptFileBlocksHandler = new HopsTransactionalRequestHandler( HDFSOperationType.LIST_CORRUPT_FILE_BLOCKS) { INodeIdentifier iNodeIdentifier; @Override public void setUp() throws StorageException { Block block = (Block) getParams()[0]; iNodeIdentifier = INodeUtil.resolveINodeFromBlock(block); } @Override public void acquireLock(TransactionLocks locks) throws IOException { Block block = (Block) getParams()[0]; LockFactory lf = LockFactory.getInstance(); locks.add(lf.getIndividualINodeLock(INodeLockType.READ_COMMITTED, iNodeIdentifier, true)) .add(lf.getBlockLock(block.getBlockId(), iNodeIdentifier)) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.ER)); } @Override public Object performTask() throws IOException { Block blk = (Block) getParams()[0]; INode inode = (INodeFile) blockManager.getBlockCollection(blk); skip[0]++; if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) { String src = FSDirectory.getFullPathName(inode); if (src.startsWith(path)) { corruptFiles.add(new CorruptFileBlockInfo(src, blk)); count[0]++; } } return null; } }; while (blkIterator.hasNext()) { Block blk = blkIterator.next(); listCorruptFileBlocksHandler.setParams(blk); listCorruptFileBlocksHandler.handle(this); if (count[0] >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED) { break; } } cookieTab[0] = String.valueOf(skip[0]); LOG.info("list corrupt file blocks returned: " + count[0]); return corruptFiles; } /** * Convert string cookie to integer. */ private static int getIntCookie(String cookie) { int c; if (cookie == null) { c = 0; } else { try { c = Integer.parseInt(cookie); } catch (NumberFormatException e) { c = 0; } } c = Math.max(0, c); return c; } /** * Create delegation token secret manager */ private DelegationTokenSecretManager createDelegationTokenSecretManager( Configuration conf) { return new DelegationTokenSecretManager( conf.getLong(DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY, DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT), conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY, DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT), conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT), DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL, this); } /** * Returns the DelegationTokenSecretManager instance in the namesystem. 
* * @return delegation token secret manager object */ DelegationTokenSecretManager getDelegationTokenSecretManager() { return dtSecretManager; } /** * @param renewer * @return Token<DelegationTokenIdentifier> * @throws IOException */ Token<DelegationTokenIdentifier> getDelegationToken(final Text renewer) throws IOException { //FIXME This does not seem to be persisted HopsTransactionalRequestHandler getDelegationTokenHandler = new HopsTransactionalRequestHandler( HDFSOperationType.GET_DELEGATION_TOKEN) { @Override public void acquireLock(TransactionLocks locks) throws IOException { } @Override public Object performTask() throws StorageException, IOException { Token<DelegationTokenIdentifier> token; if (isInSafeMode()) { throw new SafeModeException("Cannot issue delegation token", safeMode); } if (!isAllowedDelegationTokenOp()) { throw new IOException( "Delegation Token can be issued only with kerberos or web authentication"); } if (dtSecretManager == null || !dtSecretManager.isRunning()) { LOG.warn("trying to get DT with no secret manager running"); return null; } UserGroupInformation ugi = getRemoteUser(); String user = ugi.getUserName(); Text owner = new Text(user); Text realUser = null; if (ugi.getRealUser() != null) { realUser = new Text(ugi.getRealUser().getUserName()); } DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner, renewer, realUser); token = new Token<DelegationTokenIdentifier>(dtId, dtSecretManager); long expiryTime = dtSecretManager.getTokenExpiryTime(dtId); return token; } }; return (Token<DelegationTokenIdentifier>) getDelegationTokenHandler .handle(this); } /** * @param token * @return New expiryTime of the token * @throws InvalidToken * @throws IOException */ long renewDelegationToken(final Token<DelegationTokenIdentifier> token) throws InvalidToken, IOException { //FIXME This does not seem to be persisted HopsTransactionalRequestHandler renewDelegationTokenHandler = new HopsTransactionalRequestHandler( HDFSOperationType.RENEW_DELEGATION_TOKEN) { @Override public void acquireLock(TransactionLocks locks) throws IOException { } @Override public Object performTask() throws StorageException, IOException { long expiryTime; if (isInSafeMode()) { throw new SafeModeException("Cannot renew delegation token", safeMode); } if (!isAllowedDelegationTokenOp()) { throw new IOException( "Delegation Token can be renewed only with kerberos or web authentication"); } String renewer = getRemoteUser().getShortUserName(); expiryTime = dtSecretManager.renewToken(token, renewer); DelegationTokenIdentifier id = new DelegationTokenIdentifier(); ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier()); DataInputStream in = new DataInputStream(buf); id.readFields(in); return expiryTime; } }; return (Long) renewDelegationTokenHandler.handle(this); } /** * @param token * @throws IOException */ void cancelDelegationToken(final Token<DelegationTokenIdentifier> token) throws IOException { //FIXME This does not seem to be persisted HopsTransactionalRequestHandler cancelDelegationTokenHandler = new HopsTransactionalRequestHandler( HDFSOperationType.CANCEL_DELEGATION_TOKEN) { @Override public void acquireLock(TransactionLocks locks) throws IOException { } @Override public Object performTask() throws StorageException, IOException { if (isInSafeMode()) { throw new SafeModeException("Cannot cancel delegation token", safeMode); } String canceller = getRemoteUser().getUserName(); DelegationTokenIdentifier id = dtSecretManager.cancelToken(token, canceller); return null; } }; 
cancelDelegationTokenHandler.handle(this); } /** * @param out * save state of the secret manager */ void saveSecretManagerState(DataOutputStream out) throws IOException { dtSecretManager.saveSecretManagerState(out); } /** * @param in * load the state of secret manager from input stream */ void loadSecretManagerState(DataInputStream in) throws IOException { dtSecretManager.loadSecretManagerState(in); } /** * Log the updateMasterKey operation to edit logs * * @param key * new delegation key. */ public void logUpdateMasterKey(DelegationKey key) throws IOException { assert !isInSafeMode() : "this should never be called while in safemode, since we stop " + "the DT manager before entering safemode!"; // No need to hold FSN lock since we don't access any internal // structures, and this is stopped before the FSN shuts itself // down, etc. } /** * @return true if delegation token operation is allowed */ private boolean isAllowedDelegationTokenOp() throws IOException { AuthenticationMethod authMethod = getConnectionAuthenticationMethod(); if (UserGroupInformation.isSecurityEnabled() && (authMethod != AuthenticationMethod.KERBEROS) && (authMethod != AuthenticationMethod.KERBEROS_SSL) && (authMethod != AuthenticationMethod.CERTIFICATE)) { return false; } return true; } /** * Returns authentication method used to establish the connection * * @return AuthenticationMethod used to establish connection * @throws IOException */ private AuthenticationMethod getConnectionAuthenticationMethod() throws IOException { UserGroupInformation ugi = getRemoteUser(); AuthenticationMethod authMethod = ugi.getAuthenticationMethod(); if (authMethod == AuthenticationMethod.PROXY) { authMethod = ugi.getRealUser().getAuthenticationMethod(); } return authMethod; } /** * Client invoked methods are invoked over RPC and will be in * RPC call context even if the client exits. */ private boolean isExternalInvocation() { return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation(); } private static InetAddress getRemoteIp() { InetAddress ip = Server.getRemoteIp(); if (ip != null) { return ip; } return NamenodeWebHdfsMethods.getRemoteIp(); } // optimize ugi lookup for RPC operations to avoid a trip through // UGI.getCurrentUser which is synch'ed private static UserGroupInformation getRemoteUser() throws IOException { UserGroupInformation ugi = null; if (Server.isRpcInvocation()) { ugi = Server.getRemoteUser(); } return (ugi != null) ? ugi : UserGroupInformation.getCurrentUser(); } /** * Log fsck event in the audit log */ void logFsckEvent(String src, InetAddress remoteAddress) throws IOException { if (isAuditEnabled()) { logAuditEvent(true, getRemoteUser(), remoteAddress, "fsck", src, null, null); } } /** * Register NameNodeMXBean */ private void registerMXBean() { MBeans.register("NameNode", "NameNodeInfo", this); } /** * Class representing Namenode information for JMX interfaces */ @Override // NameNodeMXBean public String getVersion() { return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision(); } @Override // NameNodeMXBean public long getUsed() { return this.getCapacityUsed(); } @Override // NameNodeMXBean public long getFree() { return this.getCapacityRemaining(); } @Override // NameNodeMXBean public long getTotal() { return this.getCapacityTotal(); } @Override // NameNodeMXBean public String getSafemode() throws IOException { if (!this.isInSafeMode()) { return ""; } return "Safe mode is ON." 
+ this.getSafeModeTip(); } @Override // NameNodeMXBean public boolean isUpgradeFinalized() { throw new UnsupportedOperationException("HOP: Upgrade is not supported"); } @Override // NameNodeMXBean public long getNonDfsUsedSpace() { return datanodeStatistics.getCapacityUsedNonDFS(); } @Override // NameNodeMXBean public float getPercentUsed() { return datanodeStatistics.getCapacityUsedPercent(); } @Override // NameNodeMXBean public long getBlockPoolUsedSpace() { return datanodeStatistics.getBlockPoolUsed(); } @Override // NameNodeMXBean public float getPercentBlockPoolUsed() { return datanodeStatistics.getPercentBlockPoolUsed(); } @Override // NameNodeMXBean public float getPercentRemaining() { return datanodeStatistics.getCapacityRemainingPercent(); } @Override // NameNodeMXBean public long getTotalBlocks() throws IOException { return getBlocksTotal(); } @Override // NameNodeMXBean @Metric public long getTotalFiles() { return getFilesTotal(); } @Override // NameNodeMXBean public long getNumberOfMissingBlocks() throws IOException { return getMissingBlocksCount(); } @Override // NameNodeMXBean public int getThreads() { return ManagementFactory.getThreadMXBean().getThreadCount(); } /** * Returned information is a JSON representation of map with host name as the * key and value is a map of live node attribute keys to its values */ @Override // NameNodeMXBean public String getLiveNodes() throws IOException { final Map<String, Map<String, Object>> info = new HashMap<String, Map<String, Object>>(); final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); blockManager.getDatanodeManager().fetchDatanodes(live, null, true); for (DatanodeDescriptor node : live) { final Map<String, Object> innerinfo = new HashMap<String, Object>(); innerinfo.put("lastContact", getLastContact(node)); innerinfo.put("usedSpace", getDfsUsed(node)); innerinfo.put("adminState", node.getAdminState().toString()); innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed()); innerinfo.put("capacity", node.getCapacity()); innerinfo.put("numBlocks", node.numBlocks()); info.put(node.getHostName(), innerinfo); } return JSON.toString(info); } /** * Returned information is a JSON representation of map with host name as the * key and value is a map of dead node attribute keys to its values */ @Override // NameNodeMXBean public String getDeadNodes() { final Map<String, Map<String, Object>> info = new HashMap<String, Map<String, Object>>(); final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>(); blockManager.getDatanodeManager().fetchDatanodes(null, dead, true); for (DatanodeDescriptor node : dead) { final Map<String, Object> innerinfo = new HashMap<String, Object>(); innerinfo.put("lastContact", getLastContact(node)); innerinfo.put("decommissioned", node.isDecommissioned()); info.put(node.getHostName(), innerinfo); } return JSON.toString(info); } /** * Returned information is a JSON representation of map with host name as the * key and value is a map of decommissioning node attribute keys to its values */ @Override // NameNodeMXBean public String getDecomNodes() { final Map<String, Map<String, Object>> info = new HashMap<String, Map<String, Object>>(); final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager().getDecommissioningNodes(); for (DatanodeDescriptor node : decomNodeList) { final Map<String, Object> innerinfo = new HashMap<String, Object>(); innerinfo.put("underReplicatedBlocks", node.decommissioningStatus.getUnderReplicatedBlocks()); innerinfo.put("decommissionOnlyReplicas",
node.decommissioningStatus.getDecommissionOnlyReplicas()); innerinfo.put("underReplicateInOpenFiles", node.decommissioningStatus.getUnderReplicatedInOpenFiles()); info.put(node.getHostName(), innerinfo); } return JSON.toString(info); } private long getLastContact(DatanodeDescriptor alivenode) { return (Time.now() - alivenode.getLastUpdate()) / 1000; } private long getDfsUsed(DatanodeDescriptor alivenode) { return alivenode.getDfsUsed(); } @Override // NameNodeMXBean public String getClusterId() { String cid = ""; try { cid = StorageInfo.getStorageInfoFromDB().getClusterID(); } catch (IOException e) { } return cid; } @Override // NameNodeMXBean public String getBlockPoolId() { return blockPoolId; } @Override // NameNodeMXBean public String getNameDirStatuses() { throw new UnsupportedOperationException( "HOP: there are no name dirs any more"); } /** * @return the block manager. */ public BlockManager getBlockManager() { return blockManager; } /** * Verifies that the given identifier and password are valid and match. * * @param identifier * Token identifier. * @param password * Password in the token. * @throws InvalidToken */ public synchronized void verifyToken(DelegationTokenIdentifier identifier, byte[] password) throws InvalidToken { getDelegationTokenSecretManager().verifyToken(identifier, password); } @Override public boolean isGenStampInFuture(long genStamp) throws StorageException { throw new UnsupportedOperationException("Not supported anymore."); } @VisibleForTesting public SafeModeInfo getSafeModeInfoForTests() { return safeMode; } @Override public boolean isAvoidingStaleDataNodesForWrite() { return this.blockManager.getDatanodeManager() .shouldAvoidStaleDataNodesForWrite(); } /** * Default AuditLogger implementation; used when no access logger is * defined in the config file. It can also be explicitly listed in the * config file. */ private static class DefaultAuditLogger implements AuditLogger { @Override public void initialize(Configuration conf) { // Nothing to do. 
} @Override public void logAuditEvent(boolean succeeded, String userName, InetAddress addr, String cmd, String src, String dst, FileStatus status) { if (auditLog.isInfoEnabled()) { final StringBuilder sb = auditBuffer.get(); sb.setLength(0); sb.append("allowed=").append(succeeded).append("\t"); sb.append("ugi=").append(userName).append("\t"); sb.append("ip=").append(addr).append("\t"); sb.append("cmd=").append(cmd).append("\t"); sb.append("src=").append(src).append("\t"); sb.append("dst=").append(dst).append("\t"); if (null == status) { sb.append("perm=null"); } else { sb.append("perm="); sb.append(status.getOwner()).append(":"); sb.append(status.getGroup()).append(":"); sb.append(status.getPermission()); } auditLog.info(sb); } } } public void hopSpecificInitialization(Configuration conf) throws IOException { HdfsStorageFactory.setConfiguration(conf); } @Override public boolean isLeader() { return nameNode.isLeader(); } @Override public long getNamenodeId() { return nameNode.getLeCurrentId(); } public String getSupergroup() { return this.supergroup; } public void performPendingSafeModeOperation() throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode != null) { safeMode.performSafeModePendingOperation(); } } public void changeConf(List<String> props, List<String> newVals) throws IOException { for (int i = 0; i < props.size(); i++) { String prop = props.get(i); String value = newVals.get(i); if (prop.equals(DFSConfigKeys.DFS_RESOLVING_CACHE_ENABLED) || prop.equals(DFSConfigKeys.DFS_SET_PARTITION_KEY_ENABLED)) { LOG.info("change configuration for " + prop + " to " + value); conf.set(prop, value); if (prop.equals(DFSConfigKeys.DFS_RESOLVING_CACHE_ENABLED)) { Cache.getInstance() .enableOrDisable(Boolean.parseBoolean(value)); } } else { LOG.info("change configuration for " + prop + " to " + value + " is not supported yet"); } } } public void flushCache(String userName, String groupName){ Users.flushCache(userName, groupName); } public class FNode implements Comparable<FNode> { private String parentPath; private INode inode; public FNode(String parentPath, INode inode) { this.parentPath = parentPath; this.inode = inode; } public String getPath() { if (parentPath.endsWith("/")) { return parentPath + inode.getLocalName(); } else { return parentPath + "/" + inode.getLocalName(); } } public INode getINode() { return inode; } public String getParentPath() { return parentPath; } @Override public int compareTo(FNode o) { int obj1Length = INode.getPathComponents(getPath()).length; int obj2Length = INode.getPathComponents(o.getPath()).length; if (obj1Length == obj2Length) { return 0; } else if (obj1Length < obj2Length) { return 1; } else { return -1; } } } @Override public void adjustSafeModeBlocks(Set<Long> safeBlocks) throws IOException { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) { return; } safeMode.adjustSafeBlocks(safeBlocks); } QuotaUpdateManager getQuotaUpdateManager() { return quotaUpdateManager; } public String getFilePathAncestorLockType() { return conf.get(DFSConfigKeys.DFS_STORAGE_ANCESTOR_LOCK_TYPE, DFSConfigKeys.DFS_STORAGE_ANCESTOR_LOCK_TYPE_DEFAULT); } /** * Update safeblocks in the database * @param safeBlock * block to be added to safeblocks * @throws IOException */ private void addSafeBlock(final Long safeBlock) throws IOException { Set<Long> safeBlocks = new HashSet<Long>(); safeBlocks.add(safeBlock); addSafeBlocks(safeBlocks); } 
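// Illustrative sketch (not additional functionality): the safe-block helpers below all follow
// the same LightWeightRequestHandler pattern, wrapping a single data-access call and running it
// via handle(). For example, persisting a set of safe blocks looks roughly like:
//
//   new LightWeightRequestHandler(HDFSOperationType.ADD_SAFE_BLOCKS) {
//     @Override
//     public Object performTask() throws StorageException, IOException {
//       SafeBlocksDataAccess da = (SafeBlocksDataAccess) HdfsStorageFactory
//           .getDataAccess(SafeBlocksDataAccess.class);
//       da.insert(safeBlocks); // or da.remove(...), da.countAll(), da.removeAll()
//       return null;
//     }
//   }.handle();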
/** * Remove a block that is not considered safe anymore * @param safeBlock * block to be removed from safeblocks * @throws IOException */ private void removeSafeBlock(final Long safeBlock) throws IOException { new LightWeightRequestHandler(HDFSOperationType.REMOVE_SAFE_BLOCKS) { @Override public Object performTask() throws StorageException, IOException { SafeBlocksDataAccess da = (SafeBlocksDataAccess) HdfsStorageFactory .getDataAccess(SafeBlocksDataAccess.class); da.remove(safeBlock); return null; } }.handle(); }
/** * Update safeblocks in the database * @param safeBlocks * list of blocks to be added to safeblocks * @throws IOException */ private void addSafeBlocks(final Set<Long> safeBlocks) throws IOException { new LightWeightRequestHandler(HDFSOperationType.ADD_SAFE_BLOCKS) { @Override public Object performTask() throws StorageException, IOException { SafeBlocksDataAccess da = (SafeBlocksDataAccess) HdfsStorageFactory .getDataAccess(SafeBlocksDataAccess.class); da.insert(safeBlocks); return null; } }.handle(); }
/** * Get the number of blocks considered safe in the current cluster * @return number of safe blocks * @throws IOException */ private int getBlockSafe() throws IOException { return (Integer) new LightWeightRequestHandler( HDFSOperationType.GET_SAFE_BLOCKS_COUNT) { @Override public Object performTask() throws StorageException, IOException { SafeBlocksDataAccess da = (SafeBlocksDataAccess) HdfsStorageFactory .getDataAccess(SafeBlocksDataAccess.class); return da.countAll(); } }.handle(); }
/** * Delete all safeblocks * @throws IOException */ private void clearSafeBlocks() throws IOException { new LightWeightRequestHandler(HDFSOperationType.CLEAR_SAFE_BLOCKS) { @Override public Object performTask() throws StorageException, IOException { SafeBlocksDataAccess da = (SafeBlocksDataAccess) HdfsStorageFactory .getDataAccess(SafeBlocksDataAccess.class); da.removeAll(); return null; } }.handle(); }
/** * Check whether the cluster is in safe mode. * @return true if the cluster is in safe mode, false otherwise. * @throws IOException */ private boolean isClusterInSafeMode() throws IOException { return HdfsVariables.isClusterInSafeMode(); }
boolean isPermissionEnabled() { return isPermissionEnabled; } ExecutorService getSubtreeOperationsExecutor() { return subtreeOperationsExecutor; } boolean isLegacyDeleteEnabled() { return legacyDeleteEnabled; } boolean isLegacyRenameEnabled() { return legacyRenameEnabled; } boolean isLegacyConentSummaryEnabled() { return legacyContentSummaryEnabled; } boolean isLegacySetQuotaEnabled() { return legacySetQuotaEnabled; }
/** * Setting the quota of a directory in multiple transactions. Calculating the * namespace counts of a large directory tree might take too much time for a * single transaction. Hence, this function first reads the whole tree in * multiple transactions while calculating its quota counts before setting * the quota in a single transaction using these counts. * The subtree is locked during these operations in order to prevent any * concurrent modification.
 * * @param path * the path of the directory where the quota should be set * @param nsQuota * the namespace quota to be set * @param dsQuota * the diskspace quota to be set * @throws IOException, UnresolvedLinkException */ void multiTransactionalSetQuota(final String path, final long nsQuota, final long dsQuota) throws IOException, UnresolvedLinkException { checkSuperuserPrivilege(); if (isInSafeMode()) { throw new SafeModeException("Cannot set quota on " + path, safeMode); } if (!isLeader()) { throw new RuntimeException("Asked non leading node to setQuota"); } INodeIdentifier subtreeRoot = null; boolean removeSTOLock = false; try { PathInformation pathInfo = getPathExistingINodesFromDB(path, false, null, null, null, null); INode lastComp = pathInfo.getPathInodes()[pathInfo.getPathComponents().length-1]; if(lastComp == null){ throw new FileNotFoundException("Directory does not exist: " + path); }else if(!lastComp.isDirectory()){ throw new FileNotFoundException(path + ": Is not a directory"); } else if(lastComp.isRoot() && nsQuota == HdfsConstants.QUOTA_RESET){ throw new IllegalArgumentException( "Cannot clear namespace quota on root."); }
//check if the path is root
if(INode.getPathNames(path).length == 0){
// this method returns an empty array when path = "/"
subtreeRoot = INodeDirectory.getRootIdentifier(); }else{ subtreeRoot = lockSubtree(path, SubTreeOperation.StoOperationType.QUOTA_STO); if(subtreeRoot == null){
// in the meantime the dir has been deleted by someone else
throw new FileNotFoundException("Directory does not exist: " + path); } removeSTOLock = true; } final AbstractFileTree.IdCollectingCountingFileTree fileTree = new AbstractFileTree.IdCollectingCountingFileTree(this, subtreeRoot); fileTree.buildUp(); Iterator<Integer> idIterator = fileTree.getOrderedIds().descendingIterator(); synchronized (idIterator) { quotaUpdateManager.addPrioritizedUpdates(idIterator); try { idIterator.wait(); } catch (InterruptedException e) {
// Not sure if this can happen if we are not shutting down, but we need to abort in case it happens.
throw new IOException("Operation failed due to an Interrupt"); } } HopsTransactionalRequestHandler setQuotaHandler = new HopsTransactionalRequestHandler(HDFSOperationType.SET_QUOTA, path) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, true, true, path)) .add(lf.getBlockLock()); } @Override public Object performTask() throws StorageException, IOException { dir.setQuota(path, nsQuota, dsQuota, fileTree.getNamespaceCount(), fileTree.getDiskspaceCount()); return null; } }; setQuotaHandler.handle(this); } finally { if(removeSTOLock){ unlockSubtree(path); } } }
/** * Creates the content summary of a directory tree in multiple transactions. * Creating the content summary of a large directory tree might take too much * time for a single transaction. Hence, this function first builds up an * in-memory representation of the directory tree before reading its attributes * level by level. The directory tree is locked during the operation to prevent * any concurrent modification.
 * * @param path * the path * @return * the content summary for the given path * @throws IOException */
// [S] what if you call content summary on the root
// I have removed sub tree locking from the content summary for now
// TODO : fix content summary sub tree locking
ContentSummary multiTransactionalGetContentSummary(final String path) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException { PathInformation pathInfo = getPathExistingINodesFromDB(path, false, null, null, null, null); if(pathInfo.getPathInodes()[pathInfo.getPathComponents().length-1] == null){ throw new FileNotFoundException("File does not exist: " + path); } final INode subtreeRoot = pathInfo.getPathInodes()[pathInfo.getPathComponents().length-1]; final INodeIdentifier subtreeRootIdentifier = new INodeIdentifier(subtreeRoot.getId(),subtreeRoot.getParentId(), subtreeRoot.getLocalName(),subtreeRoot.getPartitionId()); subtreeRootIdentifier.setDepth(((short) (INodeDirectory.ROOT_DIR_DEPTH + pathInfo.getPathComponents().length-1 ))); final AbstractFileTree.CountingFileTree fileTree = new AbstractFileTree.CountingFileTree(this, subtreeRootIdentifier, FsAction.READ_EXECUTE); fileTree.buildUp(); return (ContentSummary) new LightWeightRequestHandler( HDFSOperationType.GET_SUBTREE_ATTRIBUTES) { @Override public Object performTask() throws StorageException, IOException { INodeAttributesDataAccess<INodeAttributes> dataAccess = (INodeAttributesDataAccess<INodeAttributes>) HdfsStorageFactory .getDataAccess(INodeAttributesDataAccess.class); INodeAttributes attributes = dataAccess.findAttributesByPk(subtreeRoot.getId()); if(attributes!=null){
// assert fileTree.getDiskspaceCount() == attributes.getDiskspace(): "Diskspace count did not match fileTree "+fileTree.getDiskspaceCount()+" attributes "+attributes.getDiskspace();
// assert fileTree.getNamespaceCount() == attributes.getNsCount(): "Namespace count did not match fileTree "+fileTree.getNamespaceCount()+" attributes "+attributes.getNsCount();
} return new ContentSummary(fileTree.getFileSizeSummary(), fileTree.getFileCount(), fileTree.getDirectoryCount(), attributes == null ? subtreeRoot.getNsQuota() : attributes.getNsQuota(), fileTree.getDiskspaceCount(), attributes == null ? subtreeRoot.getDsQuota() : attributes.getDsQuota()); } }.handle(this); }
/** * Renaming a directory tree in multiple transactions. Renaming a large * directory tree might take too much time for a single transaction when * its quota counts need to be calculated. Hence, this function first * reads the whole tree in multiple transactions while calculating its * quota counts before executing the rename in a single transaction. * The subtree is locked during these operations in order to prevent any * concurrent modification. * * @param src * the source * @param dst * the destination * @throws IOException */ void multiTransactionalRename(final String src, final String dst, final Options.Rename...
options) throws IOException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.multiTransactionalRename: with options - " + src + " to " + dst); } if (isInSafeMode()) { throw new SafeModeException("Cannot rename " + src, safeMode); } if (!DFSUtil.isValidName(dst)) { throw new InvalidPathException("Invalid name: " + dst); } if (dst.equals(src)) { throw new FileAlreadyExistsException( "The source " + src + " and destination " + dst + " are the same"); } // dst cannot be a directory or a file under src if (dst.startsWith(src) && dst.charAt(src.length()) == Path.SEPARATOR_CHAR) { String error = "Rename destination " + dst + " is a directory or file under source " + src; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new IOException(error); } //-- boolean overwrite = false; if (null != options) { for (Rename option : options) { if (option == Rename.OVERWRITE) { overwrite = true; } } } String error = null; PathInformation srcInfo = getPathExistingINodesFromDB(src, false, null, FsAction.WRITE, null, null); INode[] srcInodes = srcInfo.getPathInodes(); INode srcInode = srcInodes[srcInodes.length - 1]; // validate source if (srcInode == null) { error = "rename source " + src + " is not found."; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new FileNotFoundException(error); } if (srcInodes.length == 1) { error = "rename source cannot be the root"; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new IOException(error); } if (srcInode.isSymlink() && dst.equals(((INodeSymlink) srcInode).getLinkValue())) { throw new FileAlreadyExistsException( "Cannot rename symlink " + src + " to its target " + dst); } //validate dst PathInformation dstInfo = getPathExistingINodesFromDB(dst, false, FsAction.WRITE, null, null, null); INode[] dstInodes = dstInfo.getPathInodes(); INode dstInode = dstInodes[dstInodes.length - 1]; if (dstInodes.length == 1) { error = "rename destination cannot be the root"; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new IOException(error); } if (dstInode != null) { // Destination exists // It's OK to rename a file to a symlink and vice versa if (dstInode.isDirectory() != srcInode.isDirectory()) { error = "Source " + src + " and destination " + dst + " must both be directories"; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new IOException(error); } if (!overwrite) { // If destination exists, overwrite flag must be true error = "rename destination " + dst + " already exists"; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new FileAlreadyExistsException(error); } short depth = (short) (INodeDirectory.ROOT_DIR_DEPTH + dstInfo.getPathInodes().length-1); boolean areChildrenRandomlyPartitioned = INode.isTreeLevelRandomPartitioned(depth); if (dstInode.isDirectory() && dir.hasChildren(dstInode.getId(),areChildrenRandomlyPartitioned)) { error = "rename cannot overwrite non empty destination directory " + dst; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new IOException(error); } } if (dstInodes[dstInodes.length - 2] == null) { error = "rename destination parent " + dst + " not found."; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new FileNotFoundException(error); } if (!dstInodes[dstInodes.length - 2].isDirectory()) { error = 
"rename destination parent " + dst + " is a file."; NameNode.stateChangeLog .warn("DIR* FSDirectory.unprotectedRenameTo: " + error); throw new ParentNotDirectoryException(error); } INode srcDataset = getMetaEnabledParent(srcInodes); INode dstDataset = getMetaEnabledParent(dstInodes); Collection<MetadataLogEntry> logEntries = Collections.EMPTY_LIST; //-- //TODO [S] if src is a file then there is no need for sub tree locking //mechanism on the src and dst //However the quota is enabled then all the quota update on the dst //must be applied before the move operation. long srcNsCount = srcInfo.getNsCount(); //if not dir then it will return zero long srcDsCount = srcInfo.getDsCount(); long dstNsCount = dstInfo.getNsCount(); long dstDsCount = dstInfo.getDsCount(); boolean isUsingSubTreeLocks = srcInfo.isDir(); boolean renameTransactionCommitted = false; INodeIdentifier srcSubTreeRoot = null; String subTreeLockDst = INode.constructPath(dstInfo.getPathComponents(), 0, dstInfo.getNumExistingComp()); if(subTreeLockDst.equals(INodeDirectory.ROOT_NAME)){ subTreeLockDst = "/"; // absolute path } try { if (isUsingSubTreeLocks) { LOG.debug("Rename src: " + src + " dst: " + dst + " requires sub-tree locking mechanism"); srcSubTreeRoot = lockSubtreeAndCheckPathPermission(src, false, null, FsAction.WRITE, null, null, SubTreeOperation.StoOperationType.RENAME_STO); if (srcSubTreeRoot != null) { AbstractFileTree.QuotaCountingFileTree srcFileTree; if (pathIsMetaEnabled(srcInodes) || pathIsMetaEnabled(dstInodes)) { srcFileTree = new AbstractFileTree.LoggingQuotaCountingFileTree(this, srcSubTreeRoot, srcDataset, dstDataset); srcFileTree.buildUp(); logEntries = ((AbstractFileTree.LoggingQuotaCountingFileTree) srcFileTree).getMetadataLogEntries(); } else { srcFileTree = new AbstractFileTree.QuotaCountingFileTree(this, srcSubTreeRoot); srcFileTree.buildUp(); } srcNsCount = srcFileTree.getNamespaceCount(); srcDsCount = srcFileTree.getDiskspaceCount(); } } else { LOG.debug("Rename src: " + src + " dst: " + dst + " does not require sub-tree locking mechanism"); } renameTo(src, dst, srcNsCount, srcDsCount, dstNsCount, dstDsCount, isUsingSubTreeLocks, subTreeLockDst, logEntries, options); renameTransactionCommitted = true; } finally { if (!renameTransactionCommitted) { if (srcSubTreeRoot != null) { //only unlock if locked unlockSubtree(src); } } } } private boolean pathIsMetaEnabled(INode[] pathComponents) { return getMetaEnabledParent(pathComponents) == null ? false : true; } private INode getMetaEnabledParent(INode[] pathComponents) { for (INode node : pathComponents) { if (node != null && node.isDirectory()) { INodeDirectory dir = (INodeDirectory) node; if (dir.isMetaEnabled()) { return dir; } } } return null; } private void renameTo(final String src, final String dst, final long srcNsCount, final long srcDsCount, final long dstNsCount, final long dstDsCount, final boolean isUsingSubTreeLocks, final String subTreeLockDst, final Collection<MetadataLogEntry> logEntries, final Options.Rename... 
options ) throws IOException, UnresolvedLinkException { new HopsTransactionalRequestHandler( isUsingSubTreeLocks?HDFSOperationType.SUBTREE_RENAME: HDFSOperationType.RENAME, src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getRenameINodeLock(nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, true, src, dst)) .add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.CR, BLK.UC, BLK.UR, BLK.IV, BLK.PE, BLK.ER)); if (dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(true, src, dst)); } if(!isUsingSubTreeLocks){ locks.add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)); }else{ locks.add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.WRITE, src)); } if (erasureCodingEnabled) { locks.add(lf.getEncodingStatusLock(LockType.WRITE, dst)); } } @Override public Object performTask() throws IOException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.renameTo: with options - " + src + " to " + dst); } if (isInSafeMode()) { throw new SafeModeException("Cannot rename " + src, safeMode); } if (!DFSUtil.isValidName(dst)) { throw new InvalidPathException("Invalid name: " + dst); } for (MetadataLogEntry logEntry : logEntries) { EntityManager.add(logEntry); } for (Options.Rename op : options) { if (op == Rename.KEEP_ENCODING_STATUS) { INode[] srcNodes = dir.getRootDir().getExistingPathINodes(src, false); INode[] dstNodes = dir.getRootDir().getExistingPathINodes(dst, false); INode srcNode = srcNodes[srcNodes.length - 1]; INode dstNode = dstNodes[dstNodes.length - 1]; EncodingStatus status = EntityManager.find( EncodingStatus.Finder.ByInodeId, dstNode.getId()); EncodingStatus newStatus = new EncodingStatus(status); newStatus.setInodeId(srcNode.getId()); EntityManager.add(newStatus); EntityManager.remove(status); break; } } removeSubTreeLocksForRenameInternal(src, isUsingSubTreeLocks, subTreeLockDst); dir.renameTo(src, dst, srcNsCount, srcDsCount, dstNsCount, dstDsCount, options); return null; } }.handle(this); } private void removeSubTreeLocksForRenameInternal(final String src, final boolean isUsingSubTreeLocks, final String subTreeLockDst) throws StorageException, TransactionContextException, UnresolvedLinkException { if (isUsingSubTreeLocks) { INode[] nodes = null; INode inode = null; if (!src.equals("/")) { EntityManager.remove(new SubTreeOperation(getSubTreeLockPathPrefix(src))); nodes = dir.getRootDir().getExistingPathINodes(src, false); inode = nodes[nodes.length - 1]; if (inode != null && inode.isSubtreeLocked()) { inode.setSubtreeLocked(false); EntityManager.update(inode); } } } } @Deprecated boolean multiTransactionalRename(final String src, final String dst) throws IOException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "DIR* NameSystem.multiTransactionalRename: with options - " + src + " to " + dst); } if (isInSafeMode()) { throw new SafeModeException("Cannot rename " + src, safeMode); } if (!DFSUtil.isValidName(dst)) { throw new InvalidPathException("Invalid name: " + dst); } if (INode.getPathComponents(src).length == 1) { NameNode.stateChangeLog.warn( "DIR* FSDirectory.unprotectedRenameTo: " + "failed to rename " + src + " to " + dst + " because source is the root"); return false; } PathInformation srcInfo = getPathExistingINodesFromDB(src, false, null, FsAction.WRITE, null, null); INode[] srcInodes = srcInfo.getPathInodes(); INode 
srcInode = srcInodes[srcInodes.length - 1]; if(srcInode == null){ NameNode.stateChangeLog.warn( "DIR* FSDirectory.unprotectedRenameTo: " + "failed to rename " + src + " to " + dst + " because source does not exist"); return false; } PathInformation dstInfo = getPathExistingINodesFromDB(dst, false, FsAction.WRITE, null, null, null); String actualDst = dst; if(dstInfo.isDir()){ actualDst += Path.SEPARATOR + new Path(src).getName(); } if (actualDst.equals(src)) { return true; } INode[] dstInodes = dstInfo.getPathInodes(); if(dstInodes[dstInodes.length-2] == null){ NameNode.stateChangeLog.warn( "DIR* FSDirectory.unprotectedRenameTo: " + "failed to rename " + src + " to " + dst + " because destination's parent does not exist"); return false; } if (actualDst.startsWith(src) && actualDst.charAt(src.length()) == Path.SEPARATOR_CHAR) { NameNode.stateChangeLog.warn( "DIR* FSDirectory.unprotectedRenameTo: " + "failed to rename " + src + " to " + actualDst + " because destination starts with src"); return false; } INode srcDataset = getMetaEnabledParent(srcInfo.getPathInodes()); INode dstDataset = getMetaEnabledParent(dstInfo.getPathInodes()); Collection<MetadataLogEntry> logEntries = Collections.EMPTY_LIST; //TODO [S] if src is a file then there is no need for sub tree locking //mechanism on the src and dst //However the quota is enabled then all the quota update on the dst //must be applied before the move operation. long srcNsCount = srcInfo.getNsCount(); //if not dir then it will return zero long srcDsCount = srcInfo.getDsCount(); long dstNsCount = dstInfo.getNsCount(); long dstDsCount = dstInfo.getDsCount(); boolean isUsingSubTreeLocks = srcInfo.isDir(); boolean renameTransactionCommitted = false; INodeIdentifier srcSubTreeRoot = null; String subTreeLockDst = INode.constructPath(dstInfo.getPathComponents(), 0, dstInfo.getNumExistingComp()); if(subTreeLockDst.equals(INodeDirectory.ROOT_NAME)){ subTreeLockDst = "/"; // absolute path } try { if (isUsingSubTreeLocks) { LOG.debug("Rename src: "+src+" dst: "+dst+" requires sub-tree locking mechanism"); srcSubTreeRoot = lockSubtreeAndCheckPathPermission(src, false, null, FsAction.WRITE, null, null, SubTreeOperation.StoOperationType.RENAME_STO); if (srcSubTreeRoot != null) { AbstractFileTree.QuotaCountingFileTree srcFileTree; if (pathIsMetaEnabled(srcInfo.pathInodes) || pathIsMetaEnabled(dstInfo.pathInodes)) { srcFileTree = new AbstractFileTree.LoggingQuotaCountingFileTree(this, srcSubTreeRoot, srcDataset, dstDataset); srcFileTree.buildUp(); logEntries = ((AbstractFileTree.LoggingQuotaCountingFileTree) srcFileTree).getMetadataLogEntries(); } else { srcFileTree = new AbstractFileTree.QuotaCountingFileTree(this, srcSubTreeRoot); srcFileTree.buildUp(); } srcNsCount = srcFileTree.getNamespaceCount(); srcDsCount = srcFileTree.getDiskspaceCount(); } } else { LOG.debug("Rename src: " + src + " dst: " + dst + " does not require sub-tree locking mechanism"); } boolean retValue = renameTo(src, dst, srcNsCount, srcDsCount, dstNsCount, dstDsCount, isUsingSubTreeLocks, subTreeLockDst, logEntries); // the rename Tx has commited. it has also remove the subTreelocks renameTransactionCommitted = true; return retValue; } finally { if (!renameTransactionCommitted) { if (srcSubTreeRoot != null) { //only unlock if locked unlockSubtree(src); } } } } /** * Change the indicated filename. * * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} * instead. 
*/ @Deprecated boolean renameTo(final String src, final String dst, final long srcNsCount, final long srcDsCount, final long dstNsCount, final long dstDsCount, final boolean isUsingSubTreeLocks, final String subTreeLockDst, final Collection<MetadataLogEntry> logEntries) throws IOException, UnresolvedLinkException { HopsTransactionalRequestHandler renameToHandler = new HopsTransactionalRequestHandler( isUsingSubTreeLocks ? HDFSOperationType.SUBTREE_DEPRICATED_RENAME : HDFSOperationType.DEPRICATED_RENAME , src) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getLegacyRenameINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/, nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH, true, src, dst)) .add(lf.getBlockLock()) .add(lf.getBlockRelated(BLK.RE, BLK.UC, BLK.IV, BLK.CR, BLK.ER, BLK.PE, BLK.UR)); if(!isUsingSubTreeLocks){ locks.add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)); }else{ locks.add(lf.getLeaseLock(LockType.READ_COMMITTED)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED,src)); } if (dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(true, src, dst)); } } @Override public Object performTask() throws IOException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src + " to " + dst); } if (isInSafeMode()) { throw new SafeModeException("Cannot rename " + src, safeMode); } if (!DFSUtil.isValidName(dst)) { throw new IOException("Invalid name: " + dst); }
// remove the subtree locks
removeSubTreeLocksForRenameInternal(src, isUsingSubTreeLocks, subTreeLockDst); for (MetadataLogEntry logEntry : logEntries) { EntityManager.add(logEntry); } return dir.renameTo(src, dst, srcNsCount, srcDsCount, dstNsCount, dstDsCount); } }; return (Boolean) renameToHandler.handle(this); }
/** * Delete a directory tree in multiple transactions. Deleting a large directory * tree might take too much time for a single transaction. Hence, this function * first builds up an in-memory representation of the directory tree to be * deleted and then deletes it level by level. The directory tree is locked * during the delete to prevent any concurrent modification.
 * * @param path * the path to be deleted * @param recursive * whether or not a non-empty directory should be deleted * @return * true if the delete succeeded * @throws IOException */ boolean multiTransactionalDelete(final String path, final boolean recursive) throws IOException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog .debug("DIR* NameSystem.multiTransactionalDelete: " + path); } boolean ret; try { ret = multiTransactionalDeleteInternal(path, recursive); logAuditEvent(ret, "delete", path); } catch (IOException e) { logAuditEvent(false, "delete", path); throw e; } return ret; } private boolean multiTransactionalDeleteInternal(final String path, final boolean recursive) throws IOException { if (isInSafeMode()) { throw new SafeModeException("Cannot delete " + path, safeMode); } if (!recursive) {
// It is safe to do this as it will only delete a single file or an empty directory
return deleteWithTransaction(path, recursive); } PathInformation pathInfo = this.getPathExistingINodesFromDB(path, false, null, FsAction.WRITE, null, null); INode[] pathInodes = pathInfo.getPathInodes(); INode pathInode = pathInodes[pathInodes.length - 1]; if (pathInode == null) { NameNode.stateChangeLog .debug("Failed to remove " + path + " because it does not exist"); return false; } else if (pathInode.isRoot()) { NameNode.stateChangeLog.warn("Failed to remove " + path + " because the root is not allowed to be deleted"); return false; } INodeIdentifier subtreeRoot = null; if (pathInode.isFile()) { return deleteWithTransaction(path, recursive); } else {
//sub tree operation
try { subtreeRoot = lockSubtreeAndCheckPathPermission(path, false, null, FsAction.WRITE, null, null, SubTreeOperation.StoOperationType.DELETE_STO); AbstractFileTree.FileTree fileTree = new AbstractFileTree.FileTree(this, subtreeRoot, FsAction.ALL); fileTree.buildUp(); if (dir.isQuotaEnabled()) { Iterator<Integer> idIterator = fileTree.getAllINodesIds().iterator(); synchronized (idIterator) { quotaUpdateManager.addPrioritizedUpdates(idIterator); try { idIterator.wait(); } catch (InterruptedException e) {
// Not sure if this can happen if we are not shutting down, but we need to abort in case it happens.
throw new IOException("Operation failed due to an Interrupt"); } } } for (int i = fileTree.getHeight(); i > 0; i--) { if (deleteTreeLevel(path, fileTree, i) == false) { return false; } } } finally { if(subtreeRoot != null){ unlockSubtree(path); } } return true; } } private boolean deleteTreeLevel(final String subtreeRootPath, final AbstractFileTree.FileTree fileTree, int level) { ArrayList<Future> barrier = new ArrayList<Future>(); for (final ProjectedINode dir : fileTree.getDirsByLevel(level)) { if (fileTree.countChildren(dir.getId()) <= BIGGEST_DELETEABLE_DIR) { final String path = fileTree.createAbsolutePath(subtreeRootPath, dir); Future f = multiTransactionDeleteInternal(path); barrier.add(f); } else {
//delete the content of the directory one by one.
for (final ProjectedINode inode : fileTree.getChildren(dir.getId())) { if(!inode.isDirectory()) { final String path = fileTree.createAbsolutePath(subtreeRootPath, inode); Future f = multiTransactionDeleteInternal(path); barrier.add(f); } }
// the dir is empty now. delete it.
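// Subdirectories at deeper levels were already removed in earlier iterations and the directory's
// files were submitted just above, so only the now-empty directory itself remains; its Future
// joins the same barrier that is awaited below.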
final String path = fileTree.createAbsolutePath(subtreeRootPath, dir); Future f = multiTransactionDeleteInternal(path); barrier.add(f); } } boolean result = true; for (Future f : barrier) { try { if (((Boolean) f.get()) == false) { result = false; } } catch (Exception e) { result = false; LOG.error("Exception was thrown during partial delete", e); } } return result; } private Future multiTransactionDeleteInternal(final String path){ return subtreeOperationsExecutor.submit(new Callable<Boolean>() { @Override public Boolean call() throws Exception { HopsTransactionalRequestHandler deleteHandler = new HopsTransactionalRequestHandler(HDFSOperationType.SUBTREE_DELETE) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/, nameNode, INodeLockType.WRITE_ON_TARGET_AND_PARENT, INodeResolveType.PATH_AND_ALL_CHILDREN_RECURSIVELY, false, true, path)) .add(lf.getLeaseLock(LockType.WRITE)) .add(lf.getLeasePathLock(LockType.READ_COMMITTED)) .add(lf.getBlockLock()).add( lf.getBlockRelated(BLK.RE, BLK.CR, BLK.UC, BLK.UR, BLK.PE, BLK.IV)); if (dir.isQuotaEnabled()) { locks.add(lf.getQuotaUpdateLock(true, path)); } if (erasureCodingEnabled) { locks.add(lf.getEncodingStatusLock(true,LockType.WRITE, path)); } } @Override public Object performTask() throws IOException { return deleteInternal(path,true,false); } }; return (Boolean) deleteHandler.handle(this); } }); } /** * Lock a subtree of the filesystem tree. * Locking a subtree prevents it from any concurrent write operations. * * @param path * the root of the subtree to be locked * @return * the inode representing the root of the subtree * @throws IOException */ @VisibleForTesting INodeIdentifier lockSubtree(final String path, SubTreeOperation.StoOperationType stoType) throws IOException { return lockSubtreeAndCheckPathPermission(path, false, null, null, null, null, stoType); } /** * Lock a subtree of the filesystem tree and ensure that the client has * sufficient permissions. Locking a subtree prevents it from any concurrent * write operations. * * @param path * the root of the subtree to be locked * @param doCheckOwner * whether or not to check the owner * @param ancestorAccess * the requested ancestor access * @param parentAccess * the requested parent access * @param access * the requested access * @param subAccess * the requested subaccess * @return * the inode representing the root of the subtree * @throws IOException */ @VisibleForTesting INodeIdentifier lockSubtreeAndCheckPathPermission(final String path, final boolean doCheckOwner, final FsAction ancestorAccess, final FsAction parentAccess, final FsAction access, final FsAction subAccess, final SubTreeOperation.StoOperationType stoType) throws IOException { if(path.compareTo("/")==0){ return null; } return (INodeIdentifier) new HopsTransactionalRequestHandler( HDFSOperationType.SET_SUBTREE_LOCK) { @Override public void setUp() throws IOException { super.setUp(); if(LOG.isDebugEnabled()) { LOG.debug("About to lock \"" + path + "\""); } } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/, nameNode, INodeLockType .WRITE, INodeResolveType.PATH, false, path)). 
//READ_COMMITTED because it is an index scan and locking is a bad idea
//INode lock is sufficient
add(lf.getSubTreeOpsLock(LockType.READ_COMMITTED, getSubTreeLockPathPrefix(path))); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); if (isPermissionEnabled && !pc.isSuperUser()) { pc.checkPermission(path, dir.getRootDir(), doCheckOwner, ancestorAccess, parentAccess, access, subAccess); } INode[] nodes = dir.getRootDir().getExistingPathINodes(path, false); INode inode = nodes[nodes.length - 1]; if (inode != null && inode.isDirectory() && !inode.isRoot()) {
// never lock the fs root
checkSubTreeLocks(getSubTreeLockPathPrefix(path)); inode.setSubtreeLocked(true); inode.setSubtreeLockOwner(getNamenodeId()); EntityManager.update(inode); if(LOG.isDebugEnabled()) { LOG.debug("Lock the INode with sub tree lock flag. Path: \"" + path + "\" " + " id: " + inode.getId() + " pid: " + inode.getParentId() + " name: " + inode.getLocalName()); } EntityManager.update(new SubTreeOperation(getSubTreeLockPathPrefix(path) ,nameNode.getId(),stoType)); INodeIdentifier iNodeIdentifier = new INodeIdentifier(inode.getId(), inode.getParentId(), inode.getLocalName(), inode.getPartitionId()); iNodeIdentifier.setDepth(inode.myDepth()); return iNodeIdentifier; }else{ if(LOG.isInfoEnabled()) { LOG.info("No component was locked in the path using the sub tree flag. " + "Path: \"" + path + "\""); } return null; } } }.handle(this); }
/** * Adds / at the end of the path. * Suppose /aa/bb is locked and we want to lock another folder /a. * When we search for all prefixes of "/a" it would also return subtree ops in other * folders, i.e. /aa*. Adding / at the end of the path solves the problem. * @param path * @return /path + "/" */ private String getSubTreeLockPathPrefix(String path){ String subTreeLockPrefix = path; if(!subTreeLockPrefix.endsWith("/")){ subTreeLockPrefix+="/"; } return subTreeLockPrefix; }
/** * Check for sub tree locks in the descendant tree and fail if an active * subtree operation is found on any descendant. */ private void checkSubTreeLocks(String path) throws TransactionContextException, StorageException{ List<SubTreeOperation> ops = (List<SubTreeOperation>) EntityManager.findList(SubTreeOperation.Finder.ByPathPrefix, path);
// THIS RETURNS ONLY ONE SUBTREE OP IN THE CHILD TREE. INCREASE THE LIMIT IN IMPL LAYER IF NEEDED
Set<Long> activeNameNodeIds = new HashSet<Long>(); for(ActiveNode node:nameNode.getActiveNameNodes().getActiveNodes()){ activeNameNodeIds.add(node.getId()); } for(SubTreeOperation op : ops){ if(activeNameNodeIds.contains(op.getNameNodeId())){ throw new SubtreeLockedException("There is at least one ongoing subtree operation " + "on the descendants. Path: "+op.getPath() +" Operation "+op.getOpType()+" NameNodeId "+op.getNameNodeId()); }else{
// operation started by a dead namenode.
//TODO: what if the activeNameNodeIds does not contain all new namenode ids
//An operation belonging to a new namenode might be considered dead
//handle this by maintaining a list of dead namenodes.
EntityManager.remove(op); } } }
/** * Unlock a subtree in the filesystem tree.
* * @param path * the root of the subtree * @throws IOException */ @VisibleForTesting void unlockSubtree(final String path) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.RESET_SUBTREE_LOCK) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/,nameNode, INodeLockType.WRITE, INodeResolveType.PATH, false, true, path)); } @Override public Object performTask() throws IOException { INode[] nodes = dir.getRootDir().getExistingPathINodes(path, false); INode inode = nodes[nodes.length - 1]; if (inode != null && inode.isSubtreeLocked()) { inode.setSubtreeLocked(false); EntityManager.update(inode); } EntityManager.remove(new SubTreeOperation(getSubTreeLockPathPrefix(path))); return null; } }.handle(this); } private int pid(String param) { StringTokenizer tok = new StringTokenizer(param); tok.nextElement(); return Integer.parseInt((String) tok.nextElement()); } private String pname(String param) { StringTokenizer tok = new StringTokenizer(param); return (String) tok.nextElement(); } public NameNode getNameNode() { return nameNode; } /** * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#getEncodingStatus */ public EncodingStatus getEncodingStatus(final String filePath) throws IOException { HopsTransactionalRequestHandler findReq = new HopsTransactionalRequestHandler( HDFSOperationType.FIND_ENCODING_STATUS) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.READ_COMMITTED, INodeResolveType.PATH, filePath)).add( lf.getEncodingStatusLock(LockType.READ_COMMITTED, filePath)); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); try { if (isPermissionEnabled) { checkPathAccess(pc, filePath, FsAction.READ); } } catch (AccessControlException e){ logAuditEvent(false, "getEncodingStatus", filePath); throw e; } INode targetNode = getINode(filePath); if (targetNode == null) { throw new FileNotFoundException(); } return EntityManager .find(EncodingStatus.Finder.ByInodeId, targetNode.getId()); } }; Object result = findReq.handle(); if (result == null) { return new EncodingStatus(EncodingStatus.Status.NOT_ENCODED); } return (EncodingStatus) result; } /** * Get the inode with the given id. * * @param id * the inode id * @return * the inode * @throws IOException */ public INode findInode(final int id) throws IOException { LightWeightRequestHandler findHandler = new LightWeightRequestHandler(HDFSOperationType.GET_INODE) { @Override public Object performTask() throws IOException { INodeDataAccess<INode> dataAccess = (INodeDataAccess) HdfsStorageFactory .getDataAccess(INodeDataAccess.class); return dataAccess.findInodeByIdFTIS(id); } }; return (INode) findHandler.handle(); } /** * Get the path of a file with the given inode id. 
 * * @param id * the inode id of the file * @return * the path * @throws IOException */ public String getPath(int id) throws IOException { LinkedList<INode> resolvedInodes = new LinkedList<INode>(); boolean resolved[] = new boolean[1]; INodeUtil.findPathINodesById(id, resolvedInodes, resolved); if (!resolved[0]) { throw new IOException( "Path could not be resolved for inode with id " + id); } return INodeUtil.constructPath(resolvedInodes); }
/** * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#getMissingBlockLocations */ public LocatedBlocks getMissingBlockLocations(final String clientMachine, final String filePath) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException { LocatedBlocks blocks = getBlockLocations(clientMachine, filePath, 0, Long.MAX_VALUE); Iterator<LocatedBlock> iterator = blocks.getLocatedBlocks().iterator(); while (iterator.hasNext()) { LocatedBlock b = iterator.next(); if ((b.isCorrupt() || (b.getLocations().length == 0 && b.getBlockSize() > 0)) == false) { iterator.remove(); } } return blocks; }
/** * Add an encoding status for a file. * * @param sourcePath * the file path * @param policy * the policy to be used * @param status * the initial encoding status * @throws IOException */ public void addEncodingStatus(final String sourcePath, final EncodingPolicy policy, final EncodingStatus.Status status) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.ADD_ENCODING_STATUS) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, sourcePath)); locks.add(lf.getEncodingStatusLock(LockType.WRITE, sourcePath)); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); try { if (isPermissionEnabled) { checkPathAccess(pc, sourcePath, FsAction.WRITE); } } catch (AccessControlException e){ logAuditEvent(false, "encodeFile", sourcePath); throw e; } INode target = getINode(sourcePath); EncodingStatus existing = EntityManager.find( EncodingStatus.Finder.ByInodeId, target.getId()); if (existing != null) { throw new IOException("Attempting to request encoding for an" + " encoded file"); } int inodeId = dir.getINode(sourcePath).getId(); EncodingStatus encodingStatus = new EncodingStatus(inodeId, status, policy, System.currentTimeMillis()); EntityManager.add(encodingStatus); return null; } }.handle(); }
/** * Remove the status of an erasure-coded file. * * @param encodingStatus * the status of the file * @throws IOException */ public void removeEncodingStatus(final EncodingStatus encodingStatus) throws IOException {
// All referring inodes are already deleted. No more lock necessary.
LightWeightRequestHandler removeHandler = new LightWeightRequestHandler(EncodingStatusOperationType.DELETE) { @Override public Object performTask() throws StorageException, IOException { BlockChecksumDataAccess blockChecksumDataAccess = (BlockChecksumDataAccess) HdfsStorageFactory .getDataAccess(BlockChecksumDataAccess.class); EncodingStatusDataAccess encodingStatusDataAccess = (EncodingStatusDataAccess) HdfsStorageFactory .getDataAccess(EncodingStatusDataAccess.class); blockChecksumDataAccess.deleteAll(encodingStatus.getInodeId()); blockChecksumDataAccess .deleteAll(encodingStatus.getParityInodeId()); encodingStatusDataAccess.delete(encodingStatus); return null; } }; removeHandler.handle(); }
/** * Remove the status of an erasure-coded file.
* * @param path * the path of the file * @param encodingStatus * the status of the file * @throws IOException */ public void removeEncodingStatus(final String path, final EncodingStatus encodingStatus) throws IOException { new HopsTransactionalRequestHandler( HDFSOperationType.DELETE_ENCODING_STATUS) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, path)) .add(lf.getEncodingStatusLock(LockType.WRITE, path)); } @Override public Object performTask() throws IOException { EntityManager.remove(encodingStatus); return null; } }.handle(); } /** * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#revokeEncoding */ public void revokeEncoding(final String filePath, short replication) throws IOException { setReplication(filePath, replication); new HopsTransactionalRequestHandler( HDFSOperationType.REVOKE_ENCODING_STATUS) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, filePath)) .add(lf.getEncodingStatusLock(LockType.WRITE, filePath)); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); try { if (isPermissionEnabled) { checkPathAccess(pc, filePath, FsAction.WRITE); } } catch (AccessControlException e){ logAuditEvent(false, "revokeEncoding", filePath); throw e; } INode targetNode = getINode(filePath); EncodingStatus encodingStatus = EntityManager .find(EncodingStatus.Finder.ByInodeId, targetNode.getId()); encodingStatus.setRevoked(true); EntityManager.update(encodingStatus); return null; } }.handle(); } /** * Set the status of an erasure-coded file. * * @param sourceFile * the file path * @param status * the file status * @throws IOException */ public void updateEncodingStatus(String sourceFile, EncodingStatus.Status status) throws IOException { updateEncodingStatus(sourceFile, status, null, null); } /** * Set the parity status of an erasure-coded file. * * @param sourceFile * the file path * @param parityStatus * the parity file status * @throws IOException */ public void updateEncodingStatus(String sourceFile, EncodingStatus.ParityStatus parityStatus) throws IOException { updateEncodingStatus(sourceFile, null, parityStatus, null); } /** * Set the status of an erasure-coded file. * * @param sourceFile * the file path * @param status * the file status * @param parityFile * the parity file name * @throws IOException */ public void updateEncodingStatus(String sourceFile, EncodingStatus.Status status, String parityFile) throws IOException { updateEncodingStatus(sourceFile, status, null, parityFile); } /** * Set the status of an erasure-coded file and its parity file. 
* * @param sourceFile * the file path * @param status * the file status * @param parityStatus * the parity status * @param parityFile * the parity file name * @throws IOException */ public void updateEncodingStatus(final String sourceFile, final EncodingStatus.Status status, final EncodingStatus.ParityStatus parityStatus, final String parityFile) throws IOException { new HopsTransactionalRequestHandler( HDFSOperationType.UPDATE_ENCODING_STATUS) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, sourceFile)) .add(lf.getEncodingStatusLock(LockType.WRITE, sourceFile)); } @Override public Object performTask() throws StorageException, IOException { INode targetNode = getINode(sourceFile); EncodingStatus encodingStatus = EntityManager .find(EncodingStatus.Finder.ByInodeId, targetNode.getId()); if (status != null) { encodingStatus.setStatus(status); encodingStatus.setStatusModificationTime(System.currentTimeMillis()); } if (parityFile != null) { encodingStatus.setParityFileName(parityFile); // Should be updated together with the status so the modification time is already set } if (parityStatus != null) { encodingStatus.setParityStatus(parityStatus); encodingStatus.setStatusModificationTime(System.currentTimeMillis()); } EntityManager.update(encodingStatus); return null; } }.handle(); } public INode getINode(String path) throws UnresolvedLinkException, StorageException, TransactionContextException { INode[] inodes = dir.getExistingPathINodes(path); return inodes[inodes.length - 1]; } public boolean isErasureCodingEnabled() { return erasureCodingEnabled; } /** * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#addBlockChecksum */ public void addBlockChecksum(final String src, final int blockIndex, final long checksum) throws IOException { new HopsTransactionalRequestHandler(HDFSOperationType.ADD_BLOCK_CHECKSUM) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(nameNode, INodeLockType.WRITE, INodeResolveType.PATH, src)); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); try { if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.WRITE); } } catch (AccessControlException e){ logAuditEvent(false, "addBlockChecksum", src); throw e; } int inodeId = dir.getINode(src).getId(); BlockChecksum blockChecksum = new BlockChecksum(inodeId, blockIndex, checksum); EntityManager.add(blockChecksum); return null; } }.handle(); } /** * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#getBlockChecksum */ public long getBlockChecksum(final String src, final int blockIndex) throws IOException { return (Long) new HopsTransactionalRequestHandler( HDFSOperationType.GET_BLOCK_CHECKSUM) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add( lf.getINodeLock(nameNode, INodeLockType.READ, INodeResolveType.PATH, src)).add(lf.getBlockChecksumLock(src, blockIndex)); } @Override public Object performTask() throws IOException { FSPermissionChecker pc = getPermissionChecker(); try { if (isPermissionEnabled) { checkPathAccess(pc, src, FsAction.READ); } } catch (AccessControlException e){ logAuditEvent(false, "getBlockChecksum", src); throw e; } INode node = dir.getINode(src); BlockChecksumDataAccess.KeyTuple key = new 
BlockChecksumDataAccess.KeyTuple(node.getId(), blockIndex); BlockChecksum checksum = EntityManager.find(BlockChecksum.Finder.ByKeyTuple, key); if (checksum == null) { throw new IOException("No checksum was found for " + key); } return checksum.getChecksum(); } }.handle(); } /** * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#getRepairedBlockLocations */ public LocatedBlock getRepairedBlockLocations(String clientMachine, String sourcePath, String parityPath, LocatedBlock block, boolean isParity) throws IOException { EncodingStatus status = getEncodingStatus(sourcePath); Codec codec = Codec.getCodec(status.getEncodingPolicy().getCodec()); ArrayList<LocatedBlock> sourceLocations = new ArrayList(getBlockLocations(clientMachine, sourcePath, 0, Long.MAX_VALUE).getLocatedBlocks()); Collections.sort(sourceLocations, LocatedBlock.blockIdComparator); ArrayList<LocatedBlock> parityLocations = new ArrayList(getBlockLocations(clientMachine, parityPath, 0, Long.MAX_VALUE).getLocatedBlocks()); Collections.sort(parityLocations, LocatedBlock.blockIdComparator); HashMap<Node, Node> excluded = new HashMap<Node, Node>(); int stripe = isParity ? getStripe(block, parityLocations, codec.getParityLength()) : getStripe(block, sourceLocations, codec.getStripeLength()); // Exclude all nodes from the related source stripe int index = stripe * codec.getStripeLength(); for (int i = index; i < sourceLocations.size() && i < index + codec.getStripeLength(); i++) { DatanodeInfo[] nodes = sourceLocations.get(i).getLocations(); for (DatanodeInfo node : nodes) { excluded.put(node, node); } } // Exclude all nodes from the related parity blocks index = stripe * codec.getParityLength(); for (int i = index; i < parityLocations.size() && i < index + codec.getParityLength(); i++) { DatanodeInfo[] nodes = parityLocations.get(i).getLocations(); for (DatanodeInfo node : nodes) { excluded.put(node, node); } } BlockPlacementPolicyDefault placementPolicy = (BlockPlacementPolicyDefault) getBlockManager().getBlockPlacementPolicy(); List<DatanodeDescriptor> chosenNodes = new LinkedList<DatanodeDescriptor>(); DatanodeDescriptor[] descriptors = placementPolicy .chooseTarget(isParity ? parityPath : sourcePath, isParity ? 
1 : status.getEncodingPolicy().getTargetReplication(), null, chosenNodes, false, excluded, block.getBlockSize()); return new LocatedBlock(block.getBlock(), descriptors); } private int getStripe(LocatedBlock block, ArrayList<LocatedBlock> locatedBlocks, int length) { int i = 0; for (LocatedBlock b : locatedBlocks) { if (block.getBlock().getBlockId() == b.getBlock().getBlockId()) { break; } i++; } return i / length; } private PathInformation getPathExistingINodesFromDB(final String path, final boolean doCheckOwner, final FsAction ancestorAccess, final FsAction parentAccess, final FsAction access, final FsAction subAccess) throws IOException{ HopsTransactionalRequestHandler handler = new HopsTransactionalRequestHandler(HDFSOperationType.SUBTREE_PATH_INFO) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getINodeLock(!dir.isQuotaEnabled()?true:false/*skip INode Attr Lock*/,nameNode, INodeLockType.READ_COMMITTED, INodeResolveType.PATH, false, path)).add(lf.getBlockLock()); // blk lock only if file } @Override public Object performTask() throws StorageException, IOException { FSPermissionChecker pc = getPermissionChecker(); if (isPermissionEnabled && !pc.isSuperUser()) { pc.checkPermission(path, dir.getRootDir(), doCheckOwner, ancestorAccess, parentAccess, access, subAccess); } byte[][] pathComponents = INode.getPathComponents(path); INode[] pathInodes = new INode[pathComponents.length]; boolean isDir = false; INode.DirCounts srcCounts = new INode.DirCounts(); int numExistingComp = dir.getRootDir(). getExistingPathINodes(pathComponents, pathInodes, false); if(pathInodes[pathInodes.length - 1] != null){ // complete path resolved if(pathInodes[pathInodes.length - 1] instanceof INodeFile ){ isDir = false; //do ns and ds counts for file only pathInodes[pathInodes.length - 1].spaceConsumedInTree(srcCounts); }else{ isDir =true; } } return new PathInformation(path, pathComponents, pathInodes,numExistingComp,isDir, srcCounts.getNsCount(), srcCounts.getDsCount()); } }; return (PathInformation)handler.handle(this); } private class PathInformation{ private String path; private byte[][] pathComponents; private INode[] pathInodes; private boolean dir; private long nsCount; private long dsCount; private int numExistingComp; public PathInformation(String path, byte[][] pathComponents, INode[] pathInodes, int numExistingComp, boolean dir, long nsCount, long dsCount) { this.path = path; this.pathComponents = pathComponents; this.pathInodes = pathInodes; this.dir = dir; this.nsCount = nsCount; this.dsCount = dsCount; this.numExistingComp = numExistingComp; } public String getPath() { return path; } public byte[][] getPathComponents() { return pathComponents; } public INode[] getPathInodes() { return pathInodes; } public boolean isDir() { return dir; } public long getNsCount() { return nsCount; } public long getDsCount() { return dsCount; } public int getNumExistingComp() { return numExistingComp; } } public ExecutorService getExecutorService(){ return subtreeOperationsExecutor; } }