/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InterruptedIOException; import java.net.InetSocketAddress; import java.net.Socket; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import org.apache.hadoop.fs.FSOutputSummer; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Syncable; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.DFSClient.MultiDataInputStream; import org.apache.hadoop.hdfs.DFSClient.MultiDataOutputStream; import org.apache.hadoop.hdfs.protocol.AppendBlockHeader; import org.apache.hadoop.hdfs.profiling.DFSWriteProfilingData; import org.apache.hadoop.hdfs.profiling.DFSWriteProfilingData.WritePacketClientProfile; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import 
org.apache.hadoop.hdfs.protocol.DSQuotaExceededException; import org.apache.hadoop.io.WriteOptions; import org.apache.hadoop.hdfs.protocol.DataTransferProtocol; import org.apache.hadoop.hdfs.protocol.DataTransferProtocol.PipelineAck; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlockWithMetaInfo; import org.apache.hadoop.hdfs.protocol.LocatedBlockWithOldGS; import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException; import org.apache.hadoop.hdfs.protocol.VersionedLocatedBlock; import org.apache.hadoop.hdfs.protocol.WriteBlockHeader; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.namenode.INode; import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException; import org.apache.hadoop.hdfs.server.protocol.BlockAlreadyCommittedException; import org.apache.hadoop.hdfs.util.InjectionEvent; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.ipc.Client; import org.apache.hadoop.ipc.ProtocolProxy; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.DataChecksum; import org.apache.hadoop.util.InjectionHandler; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.NativeCrc32; import org.apache.hadoop.util.StringUtils; /**************************************************************** * DFSOutputStream creates files from a stream of bytes. * * The client application writes data that is cached internally by * this stream. Data is broken up into packets, each packet is * typically 64K in size. A packet comprises of chunks. Each chunk * is typically 512 bytes and has an associated checksum with it. 
 *
 * When a client application fills up the currentPacket, it is
 * enqueued into dataQueue.  The DataStreamer thread picks up
 * packets from the dataQueue, sends it to the first datanode in
 * the pipeline and moves it from the dataQueue to the ackQueue.
 * The ResponseProcessor receives acks from the datanodes. When a
 * successful ack for a packet is received from all datanodes, the
 * ResponseProcessor removes the corresponding packet from the
 * ackQueue.
 *
 * In case of error, all outstanding packets are moved from
 * ackQueue.  A new pipeline is setup by eliminating the bad
 * datanode from the original pipeline. The DataStreamer now
 * starts sending packets from the dataQueue.
 ****************************************************************/
class DFSOutputStream extends FSOutputSummer implements Syncable, Replicable {
  private final DFSClient dfsClient;
  // Sockets to the datanodes of the current pipeline (one per target when
  // writing in parallel).
  private Socket[] s;
  boolean closed = false;

  private String src;                      // path of the file being written
  private MultiDataOutputStream blockStream;      // output to first/all datanode(s)
  private MultiDataInputStream blockReplyStream;  // acks back from datanode(s)
  private Block block;                     // block currently being written
  final private long blockSize;
  // Whether packets carry a version field on the wire; mirrored from DFSClient.
  private boolean pktIncludeVersion = false;
  final private int packetVersion;
  private DataChecksum checksum;
  // Packets waiting to be sent to the pipeline. Guarded by its own monitor.
  private LinkedList<DFSOutputStreamPacket> dataQueue
      = new LinkedList<DFSOutputStreamPacket>();
  // Packets sent but not yet acknowledged. Guarded by its own monitor;
  // when both locks are needed, dataQueue is always taken first.
  private LinkedList<DFSOutputStreamPacket> ackQueue
      = new LinkedList<DFSOutputStreamPacket>();
  // Heartbeat packets in flight (they never enter ackQueue).
  private int numPendingHeartbeats = 0;
  // Timestamp of the last packet sent; used to detect ack timeouts.
  private long lastPacketSentTime = 0;
  private final long packetTimeout;
  private DFSOutputStreamPacket currentPacket = null;
  private int maxPackets = 80; // each packet 64K, total 5MB
  // private int maxPackets = 1000; // each packet 64K, total 64MB
  private DataStreamer streamer;
  private ResponseProcessor response = null;
  private long currentSeqno = 0;
  private long lastQueuedSeqno = -1;
  private long lastAckedSeqno = -1;
  private long bytesCurBlock = 0; // bytes written in current block
  private int packetSize = 0; // write packet size, including the header.
  private int chunksPerPacket = 0;
  DatanodeInfo[] nodes = null; // list of targets for current block
  private DatanodeInfo[] favoredNodes = null; // put replicas here if possible
  private volatile boolean hasError = false;
  // Index into nodes[] of the first datanode that failed; -1 means "unknown"
  // (e.g. append recovery), in which case no node is removed from the pipeline.
  private volatile int errorIndex = 0;
  volatile IOException lastException = null;
  private long artificialSlowdown = 0; // test hook: sleep between packets
  private long lastFlushOffset = 0; // offset when flush was invoked
  private boolean persistBlocks = false; // persist blocks on namenode
  private int recoveryErrorCount = 0; // number of times block recovery failed
  private final int maxRecoveryErrorCount;
  private volatile boolean appendChunk = false; // appending to existing partial block
  private long initialFileSize = 0; // at time of file open
  private Progressable progress;
  private short blockReplication; // replication factor of file
  private long lastBlkOffset = 0; // end pos of last block already sent
  private boolean forceSync;
  private boolean doParallelWrites = false; // write to all replicas in parallel
  private final WriteOptions options;

  /** Record the first exception seen; later ones are ignored. */
  private void setLastException(IOException e) {
    if (lastException == null) {
      lastException = e;
    }
  }

  /** Reset the end position of the last block already sent (used on reopen). */
  public void setOffsets(long offset) {
    DFSClient.LOG.info("set last block offsets in file: " + src + " pos: " + offset);
    lastBlkOffset = offset;
  }

  /** Decide if the write pipeline supports bidirectional heartbeat or not */
  private boolean supportClientHeartbeat() throws IOException {
    return dfsClient.getDataTransferProtocolVersion() >=
        DataTransferProtocol.CLIENT_HEARTBEAT_VERSION;
  }

  /**
   * Check if the last outstanding packet has not received an ack before
   * it is timed out.
   * If true, for now just log it.
   * We will provide a decent solution to this later on.
   */
  private void checkIfLastPacketTimeout() {
    synchronized (ackQueue) {
      // Only the oldest unacked packet matters: if it has been waiting longer
      // than packetTimeout since the last send, warn (no recovery yet).
      if( !ackQueue.isEmpty() &&
          ( System.currentTimeMillis() - lastPacketSentTime > packetTimeout) ) {
        DFSClient.LOG.warn("Packet " + ackQueue.getLast().seqno +
            " of " + block + " is timed out");
      }
    }
  }

  //
  // The DataStreamer class is responsible for sending data packets to the
  // datanodes in the pipeline. It retrieves a new blockid and block locations
  // from the namenode, and starts streaming packets to the pipeline of
  // Datanodes. Every packet has a sequence number associated with
  // it. When all the packets for a block are sent out and acks for each
  // of them are received, the DataStreamer closes the current block.
  //
  private class DataStreamer extends Daemon {
    private volatile boolean closed = false;
    // Time the last packet (data or heartbeat) was written to the pipeline;
    // drives the client->datanode heartbeat interval in waitForWork().
    private long lastPacket;
    // Set by processDatanodeError() when we should back off before retrying.
    private boolean doSleep;

    DataStreamer() throws IOException {
      // explicitly invoke RPC so avoiding RPC in waitForWork
      // that might cause timeout
      dfsClient.getDataTransferProtocolVersion();
    }

    /**
     * Block until there is a packet to send, an error, or (when the pipeline
     * supports client heartbeats) it is time to emit a heartbeat packet.
     * Called with the dataQueue monitor held (see run()), so dataQueue.wait()
     * releases it while sleeping.
     */
    private void waitForWork() throws IOException {
      if ( supportClientHeartbeat() ) { // send heart beat
        long now = System.currentTimeMillis();
        // Wake up at half the socket timeout so a heartbeat goes out before
        // the datanode's read times out.
        while ((!closed && !hasError && dfsClient.clientRunning
            && dataQueue.size() == 0 &&
            (blockStream == null || (
                blockStream != null &&
                now - lastPacket < dfsClient.timeoutValue/2))) || doSleep) {
          long timeout = dfsClient.timeoutValue/2 - (now-lastPacket);
          timeout = timeout <= 0 ? 1000 : timeout;
          try {
            dataQueue.wait(timeout);
            checkIfLastPacketTimeout();
            now = System.currentTimeMillis();
          } catch (InterruptedException e) {
          }
          doSleep = false;
        }
      } else { // no sending heart beat
        while ((!closed && !hasError && dfsClient.clientRunning
            && dataQueue.size() == 0) || doSleep) {
          try {
            dataQueue.wait(1000);
          } catch (InterruptedException e) {
          }
          doSleep = false;
        }
      }
    }

    /**
     * Main streaming loop: pop (or synthesize a heartbeat) packet, open a new
     * block pipeline when needed, write the packet, and move it from
     * dataQueue to ackQueue for the ResponseProcessor. On any error it sets
     * hasError/errorIndex and lets processDatanodeError() drive recovery on
     * the next iteration.
     */
    public void run() {
      while (!closed && dfsClient.clientRunning) {

        // if the Responder encountered an error, shutdown Responder
        if (hasError && response != null) {
          try {
            response.close();
            response.join();
            response = null;
          } catch (InterruptedException e) {
          }
        }

        DFSOutputStreamPacket one = null;

        // process IO errors if any
        doSleep = processDatanodeError(hasError, false);

        try {
          synchronized (dataQueue) {
            // wait for a packet to be sent.
            waitForWork();

            if (closed || hasError || !dfsClient.clientRunning) {
              continue;
            }
            InjectionHandler.processEventIO(
                InjectionEvent.DFSCLIENT_DATASTREAM_AFTER_WAIT, blockStream);

            // get packet to be sent.
            if (dataQueue.isEmpty()) {
              // Nothing queued: waitForWork() returned because the heartbeat
              // interval elapsed, so send a heartbeat packet.
              one = DFSOutputStreamPacketFactory.getHeartbeatPacket(
                  DFSOutputStream.this, ifPacketIncludeVersion(),
                  getPacketVersion());
            } else {
              one = dataQueue.getFirst(); // regular data packet
              one.eventPopFromDataQueue();
            }
          }

          long offsetInBlock = one.offsetInBlock;

          // get new block from namenode.
          if (blockStream == null) {
            DFSClient.LOG.debug("Allocating new block: " + src +
                " pos: " + lastBlkOffset);
            nodes = nextBlockOutputStream(src);
            this.setName("DataStreamer for file " + src + " block " + block);
            response = new ResponseProcessor(nodes);
            response.start();
          }

          // A packet may end exactly at blockSize only if it is the empty
          // trailing "last packet in block".
          if (offsetInBlock > blockSize ||
              (offsetInBlock == blockSize &&
               (one.dataLength > 0 || !one.lastPacketInBlock))) {
            throw new IOException("BlockSize " + blockSize +
                " is smaller than data size. " +
                " Offset of packet in block " + offsetInBlock +
                " Aborting file " + src);
          }

          ByteBuffer buf = one.getBuffer();

          InjectionHandler.processEventIO(
              InjectionEvent.DFSCLIENT_DATASTREAM_BEFORE_WRITE, blockStream);

          // write out data to remote datanode
          blockStream.write(buf.array(), buf.position(), buf.remaining());

          if (one.lastPacketInBlock) {
            blockStream.writeInt(0); // indicate end-of-block
          }
          blockStream.flush();
          lastPacket = System.currentTimeMillis();

          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("DataStreamer block " + block +
                " wrote packet seqno:" + one.seqno +
                " size:" + buf.remaining() +
                " offsetInBlock:" + one.offsetInBlock +
                " lastPacketInBlock:" + one.lastPacketInBlock);
          }

          // move packet from dataQueue to ackQueue
          synchronized (dataQueue) {
            if (!one.isHeartbeatPacket()) {
              dataQueue.removeFirst();
              dataQueue.notifyAll();
              synchronized (ackQueue) {
                ackQueue.addLast(one);
                one.eventAddToAckQueue();
                lastPacketSentTime = System.currentTimeMillis();
                ackQueue.notifyAll();
              }
            } else {
              // Heartbeats are not queued for acking; only counted, so the
              // ResponseProcessor knows to consume their acks.
              synchronized (ackQueue) {
                numPendingHeartbeats++;
                ackQueue.notifyAll();
              }
              DFSClient.LOG.info("Sending a heartbeat packet for block " + block);
            }
          }
        } catch (Throwable e) {
          dfsClient.incWriteExpCntToStats();
          DFSClient.LOG.warn("DataStreamer Exception: ", e);
          if (e instanceof IOException) {
            setLastException((IOException)e);
          }
          hasError = true;
          if (blockStream != null) {
            // find the first datanode to which we could not write data.
            int possibleError = blockStream.getErrorIndex();
            if (possibleError != -1) {
              errorIndex = possibleError;
              DFSClient.LOG.warn("DataStreamer bad datanode in pipeline:" +
                  possibleError);
            }
          }
        }

        if (closed || hasError || !dfsClient.clientRunning) {
          continue;
        }

        // Is this block full?
        if (one.lastPacketInBlock) {
          // Drain all outstanding acks, then tear down the pipeline streams
          // so the next packet allocates a fresh block.
          synchronized (ackQueue) {
            while (!hasError && ackQueue.size() != 0 && dfsClient.clientRunning) {
              try {
                ackQueue.wait(); // wait for acks to arrive from datanodes
              } catch (InterruptedException e) {
              }
            }
          }
          DFSClient.LOG.debug("Closing old block " + block);
          this.setName("DataStreamer for file " + src);

          response.close(); // ignore all errors in Response
          try {
            response.join();
            response = null;
          } catch (InterruptedException e) {
          }

          if (closed || hasError || !dfsClient.clientRunning) {
            continue;
          }

          synchronized (dataQueue) {
            try {
              blockStream.close();
              blockReplyStream.close();
            } catch (IOException e) {
            }
            nodes = null;
            response = null;
            blockStream = null;
            blockReplyStream = null;
          }
        }
        if (progress != null) { progress.progress(); }

        // This is used by unit test to trigger race conditions.
        if (artificialSlowdown != 0 && dfsClient.clientRunning) {
          DFSClient.sleepForUnitTest(artificialSlowdown);
        }
      }
    }

    // shutdown thread
    void close() {
      closed = true;
      synchronized (dataQueue) {
        dataQueue.notifyAll();
      }
      synchronized (ackQueue) {
        ackQueue.notifyAll();
      }
      this.interrupt();
    }
  }

  //
  // Processes responses from the datanodes.  A packet is removed
  // from the ackQueue when its response arrives.
  //
  private class ResponseProcessor extends Thread {
    private volatile boolean closed = false;
    private DatanodeInfo[] targets = null; // pipeline this responder reads acks for
    // Set when the ack for the block's final packet arrives; ends the loop.
    private boolean lastPacketInBlock = false;

    ResponseProcessor (DatanodeInfo[] targets) {
      this.targets = targets;
    }

    /**
     * Ack-reading loop. Reads one ack (pipelined or parallel, heartbeat or
     * data) per iteration, validates per-datanode status, and on success pops
     * the matching packet off ackQueue and advances lastAckedSeqno. On any
     * failure records the suspect datanode index in errorIndex and sets
     * hasError so the DataStreamer starts recovery.
     */
    public void run() {
      this.setName("ResponseProcessor for block " + block);

      while (!closed && dfsClient.clientRunning && !lastPacketInBlock) {
        // process responses from datanodes.
        int recordError = 0;
        try {
          long seqno = 0;
          // Wait until there is at least one outstanding data packet or
          // heartbeat whose ack we expect to read.
          synchronized (ackQueue) {
            while (!closed && dfsClient.clientRunning && ackQueue.isEmpty()
                && numPendingHeartbeats == 0) {
              try {
                ackQueue.wait();
              } catch (InterruptedException e) {
                // If the thread is being interrupted when waiting for
                // packet, we log the exception and treat it as a normal
                // exception.
                //
                DFSClient.LOG.info("ResponseProcessor thread interrupted when " +
                    "waiting for new packets");
                throw e;
              }
            }
          }

          if (closed || !dfsClient.clientRunning) {
            break;
          }
          eventStartReceiveAck();
          PipelineAck pipelineAck = null;
          if (!doParallelWrites) {
            // verify seqno from datanode
            if (supportClientHeartbeat()) {
              // Modern protocol: a single structured ack carries the status
              // of every datanode in the pipeline.
              pipelineAck = new PipelineAck();
              // NOTE(review): profileData appears to be a field declared
              // outside this chunk — verify it tracks the per-stream profile.
              pipelineAck.readFields(blockReplyStream.get(0), targets.length,
                  profileData != null);
              seqno = pipelineAck.getSeqno();

              if (!pipelineAck.isSuccess()) {
                for (int i = 0; i < targets.length && dfsClient.clientRunning; i++) {
                  short reply = pipelineAck.getReply(i);
                  if (reply != DataTransferProtocol.OP_STATUS_SUCCESS) {
                    recordError = i; // first bad datanode
                    throw new IOException("Bad response " + reply + " for block " +
                        block + " from datanode " + targets[i].getName());
                  }
                }
              }
            } else {
              // Backward compatibility codes.
              seqno = blockReplyStream.get(0).readLong();
              DFSClient.LOG.debug("DFSClient received ack for seqno " + seqno);
              if (seqno == DFSOutputStreamPacket.HEART_BEAT_SEQNO) {
                continue;
              }
              // regular ack
              // processes response status from all datanodes.
              for (int i = 0; i < targets.length && dfsClient.clientRunning; i++) {
                short reply = blockReplyStream.get(0).readShort();
                if (reply != DataTransferProtocol.OP_STATUS_SUCCESS) {
                  recordError = i; // first bad datanode
                  throw new IOException("Bad response " + reply + " for block " +
                      block + " from datanode " + targets[i].getName());
                }
              }
            }
          } else {
            // The client is writing to all replicas in parallel. It also
            // expects an ack from all replicas.
            long lastsn = 0;
            assert blockReplyStream.size() > 0;
            for (int i = 0; i < blockReplyStream.size(); i++) {
              recordError = i; // remember the current slot
              seqno = blockReplyStream.get(i).readLong();
              if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("DFSClient for block " + block + " " + seqno);
              }
              // All replicas must ack the same seqno (-2 means "unknown").
              if (i != 0 && seqno != -2 && seqno != lastsn) {
                String msg = "Responses from datanodes do not match " +
                    " this replica acked " + seqno +
                    " but previous replica acked " + lastsn;
                DFSClient.LOG.warn(msg);
                throw new IOException(msg);
              }
              short reply = blockReplyStream.get(i).readShort();
              if (reply != DataTransferProtocol.OP_STATUS_SUCCESS) {
                recordError = i; // first bad datanode
                throw new IOException("Bad parallel response " + reply +
                    " for block " + block + " from datanode " +
                    targets[i].getName());
              }
              lastsn = seqno;
            }
          }

          assert seqno != -2 : "Ack for unkown seqno should be a failed ack!";
          if (seqno == DFSOutputStreamPacket.HEART_BEAT_SEQNO) { // a heartbeat ack
            assert supportClientHeartbeat();
            synchronized(ackQueue) {
              assert numPendingHeartbeats > 0;
              numPendingHeartbeats--;
            }
            continue;
          }

          // Acks arrive strictly in order, so this ack must be for the head
          // of the ackQueue.
          DFSOutputStreamPacket one = null;
          synchronized (ackQueue) {
            assert !ackQueue.isEmpty();
            one = ackQueue.getFirst();
          }
          if (one.seqno != seqno) {
            throw new IOException("Responseprocessor: Expecting seqno " +
                " for block " + block + one.seqno + " but received " + seqno);
          }
          lastPacketInBlock = one.lastPacketInBlock;
          if (lastPacketInBlock) {
            // The block is fully acked: advance the file-level offset of the
            // last completed block.
            if (DFSClient.LOG.isDebugEnabled()) {
              DFSClient.LOG.debug("Update pos in file: " + src +
                  " curBlckOffset: " + lastBlkOffset +
                  " blockSize: " + one.getEndPosInCurrBlk());
            }
            lastBlkOffset += one.getEndPosInCurrBlk();
          }

          synchronized (ackQueue) {
            assert seqno == lastAckedSeqno + 1;
            lastAckedSeqno = seqno;
            ackQueue.removeFirst();
            ackQueue.notifyAll();
          }
          one.eventAckReceived();
          if (getProfileData() != null) {
            getProfileData().finishPacket(one.profile, pipelineAck);
            long slowWriteProfileThreshold =
                options.getLogSlowWriteProfileDataThreshold();
            long totalTime = getProfileData().recentPacketProfile.getTotalTime();
            if (slowWriteProfileThreshold > 0
                && totalTime > slowWriteProfileThreshold) {
              DFSClient.LOG.warn("Slow Write Packet for block : " + block +
                  ", packet seqno : " + one.seqno +
                  ", total time : " + totalTime + " \n" +
                  getProfileData().recentPacketProfile);
            }
          }
        } catch (Exception e) {
          if (!closed) {
            hasError = true;
            errorIndex = recordError;
            if (e instanceof IOException) {
              setLastException((IOException)e);
            }
            DFSClient.LOG.warn("DFSOutputStream ResponseProcessor exception " +
                " for block " + block + StringUtils.stringifyException(e));
            closed = true;
          }
        }

        // Wake anyone blocked on either queue (writers, closers, streamer).
        synchronized (dataQueue) {
          dataQueue.notifyAll();
        }
        synchronized (ackQueue) {
          ackQueue.notifyAll();
        }
      }
    }

    void close() {
      closed = true;
      this.interrupt();
    }
  }

  // If this stream has encountered any errors so far, shutdown
  // threads and mark stream as closed. Returns true if we should
  // sleep for a while after returning from this call.
  //
  private boolean processDatanodeError(boolean hasError, boolean isAppend) {
    if (!hasError) {
      return false;
    }
    // Wait for the responder to exit before touching shared pipeline state.
    if (response != null) {
      DFSClient.LOG.info("Error Recovery for block " + block +
          " waiting for responder to exit. ");
      return true;
    }
    dfsClient.incWriteExpCntToStats();
    if (errorIndex >= 0) {
      DFSClient.LOG.warn("Error Recovery for block " + block +
          " bad datanode[" + errorIndex + "] " +
          (nodes == null? "nodes == null": nodes[errorIndex].getName()));
    }

    if (blockStream != null) {
      try {
        blockStream.close();
        blockReplyStream.close();
      } catch (IOException e) {
      }
    }
    blockStream = null;
    blockReplyStream = null;

    // move packets from ack queue to front of the data queue
    synchronized (dataQueue) {
      synchronized (ackQueue) {
        if (!ackQueue.isEmpty()) {
          DFSClient.LOG.info("First unacked packet in " + block +
              " starts at " + ackQueue.getFirst().offsetInBlock);
          dataQueue.addAll(0, ackQueue);
          ackQueue.clear();
        }
        numPendingHeartbeats = 0;
      }
    }

    boolean success = false;
    while (!success && dfsClient.clientRunning) {
      DatanodeInfo[] newnodes = null;
      if (nodes == null) {
        String msg = "Could not get block locations. " +
            "Source file \"" + src + "\" - Aborting...";
        DFSClient.LOG.warn(msg);
        setLastException(new IOException(msg));
        closed = true;
        if (streamer != null) streamer.close();
        return false;
      }
      StringBuilder pipelineMsg = new StringBuilder();
      for (int j = 0; j < nodes.length; j++) {
        pipelineMsg.append(nodes[j].getName());
        if (j < nodes.length - 1) {
          pipelineMsg.append(", ");
        }
      }
      // remove bad datanode from list of datanodes.
      // If errorIndex was not set (i.e. appends), then do not remove
      // any datanodes
      //
      if (errorIndex < 0) {
        newnodes = nodes;
      } else {
        if (nodes.length <= 1) {
          lastException = new IOException("All datanodes " + pipelineMsg +
              " are bad. Aborting...");
          closed = true;
          if (streamer != null) streamer.close();
          return false;
        }
        DFSClient.LOG.warn("Error Recovery for block " + block +
            " in pipeline " + pipelineMsg +
            ": bad datanode " + nodes[errorIndex].getName());
        newnodes = new DatanodeInfo[nodes.length-1];
        System.arraycopy(nodes, 0, newnodes, 0, errorIndex);
        System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,
            newnodes.length-errorIndex);
      }

      // Tell the primary datanode to do error recovery
      // by stamping appropriate generation stamps.
      //
      LocatedBlock newBlock = null;
      DatanodeInfo primaryNode = null;
      // Does the namenode support the client coordinating recovery itself
      // (instead of delegating to a primary datanode)?
      boolean clientAdRecoveryPrimaryProtocolSupported = false;
      try {
        clientAdRecoveryPrimaryProtocolSupported =
            dfsClient.namenodeProtocolProxy.isMethodSupported(
                "nextGenerationStamp", Block.class, boolean.class);
      } catch (InterruptedIOException iie) {
        return false;
      } catch (IOException ioe) {
        DFSClient.LOG.warn(
            "Error when trying to determine whether namenode protocol " +
            "supports client as block recovoery coordinator.", ioe);
      }
      boolean clientAsRecoveryPrimary = dfsClient.conf.getBoolean(
          "dfs.client.as.block.recovery.primary", true)
          && clientAdRecoveryPrimaryProtocolSupported;
      try {
        if (clientAsRecoveryPrimary) {
          // Client acts as the recovery coordinator directly.
          BlockRecoveryCoordinator brc = new BlockRecoveryCoordinator(
              DFSClient.LOG, dfsClient.conf, dfsClient.socketTimeout, null,
              new BlockSyncer(dfsClient.getNamespaceId(),
                  dfsClient.getNameNodeRPC(), DFSClient.LOG), null);
          newBlock = brc.recoverBlock(dfsClient.getNamespaceId(), block, false,
              newnodes, false,
              System.currentTimeMillis() + dfsClient.socketTimeout * 8000);
        } else {
          // Pick the "least" datanode as the primary datanode to avoid
          // deadlock.
          primaryNode = Collections.min(Arrays.asList(newnodes));
          newBlock = recoverBlockFromPrimaryDataNode(primaryNode, newnodes,
              isAppend);
        }
        if (newBlock == null) {
          throw new IOException("all datanodes do not have the block");
        }
        // The recovered block may be shorter than what we already queued;
        // drop packets that the new block length makes redundant.
        boolean isEmpty;
        long nextByteToSend;
        long newBlockSize = newBlock.getBlockSize();
        int numPktRemoved;
        synchronized (dataQueue) {
          numPktRemoved = adjustDataQueueAfterBlockRecovery(newBlockSize);
          isEmpty = dataQueue.isEmpty();
          if (isEmpty) {
            if (currentPacket != null) {
              nextByteToSend = currentPacket.offsetInBlock;
            } else {
              nextByteToSend = bytesCurBlock;
            }
          } else {
            nextByteToSend = dataQueue.getFirst().offsetInBlock;
          }
        }
        if (numPktRemoved > 0) {
          DFSClient.LOG.info("Remove " + numPktRemoved +
              " packets in the packet queue after block recovery");
          if (nextByteToSend > newBlockSize) {
            DFSClient.LOG.warn(
                "Missing bytes after removing packets! It should never happen. nextByteToSend " +
                nextByteToSend + " new block size " + newBlockSize);
          }
        } else if (nextByteToSend > newBlockSize) {
          DFSClient.LOG.warn("Missing bytes! Error Recovery for block " + block +
              " end up with " + newBlock.getBlockSize() +
              " bytes but client already sent " + nextByteToSend +
              " bytes and data queue is " + (isEmpty ? "" : "not ") + "empty.");
        } else if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Didn't remove any block. nextByteToSend " +
              nextByteToSend + " new block size " + newBlockSize);
        }
      } catch (BlockAlreadyCommittedException e) {
        // Block is finalized on the datanodes; no further recovery possible.
        dfsClient.incWriteExpCntToStats();
        DFSClient.LOG.warn("Error Recovery for block " + block + " failed " +
            " because block is already committed according to primary datanode " +
            primaryNode + ". " + " Pipeline was " + pipelineMsg +
            ". Aborting...", e);
        lastException = e;
        closed = true;
        if (streamer != null) streamer.close();
        return false; // abort with IOexception
      } catch (IOException e) {
        dfsClient.incWriteExpCntToStats();
        DFSClient.LOG.warn("Failed recovery attempt #" + recoveryErrorCount +
            " from primary datanode " + primaryNode, e);
        recoveryErrorCount++;
        // For client as primary, no need to retry as all failures thrown by
        // data nodes are already handled.
        if (clientAsRecoveryPrimary || recoveryErrorCount > maxRecoveryErrorCount) {
          if (!clientAsRecoveryPrimary && nodes.length > 1) {
            // if the primary datanode failed, remove it from the list.
            // The original bad datanode is left in the list because it is
            // conservative to remove only one datanode in one iteration.
            for (int j = 0; j < nodes.length; j++) {
              if (nodes[j].equals(primaryNode)) {
                errorIndex = j; // forget original bad node.
              }
            }
            // remove primary node from list
            newnodes = new DatanodeInfo[nodes.length-1];
            System.arraycopy(nodes, 0, newnodes, 0, errorIndex);
            System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,
                newnodes.length-errorIndex);
            nodes = newnodes;
            DFSClient.LOG.warn("Error Recovery for block " + block + " failed " +
                " because recovery from primary datanode " + primaryNode +
                " failed " + recoveryErrorCount + " times. " +
                " Pipeline was " + pipelineMsg +
                ". Marking primary datanode as bad.");
            recoveryErrorCount = 0;
            errorIndex = -1;
            return true; // sleep when we return from here
          }
          String emsg = "Error Recovery for block " + block + " failed " +
              " because recovery from primary datanode " + primaryNode +
              " failed " + recoveryErrorCount + " times. " +
              " Pipeline was " + pipelineMsg + ". Aborting...";
          DFSClient.LOG.warn(emsg);
          lastException = new IOException(emsg);
          closed = true;
          if (streamer != null) streamer.close();
          return false; // abort with IOexception
        }
        DFSClient.LOG.warn("Error Recovery for block " + block + " failed " +
            " because recovery from primary datanode " + primaryNode +
            " failed " + recoveryErrorCount + " times. " +
            " Pipeline was " + pipelineMsg + ". Will retry...");
        return true; // sleep when we return from here
      } finally {
      }

      recoveryErrorCount = 0; // block recovery successful

      // If the block recovery generated a new generation stamp, use that
      // from now on.  Also, setup new pipeline
      //
      if (newBlock != null) {
        block = newBlock.getBlock();
        nodes = newBlock.getLocations();
      }

      this.hasError = false;
      lastException = null;
      errorIndex = 0;
      success = createBlockOutputStream(nodes, dfsClient.clientName,
          true, false);
    }

    response = new ResponseProcessor(nodes);
    response.start();
    return false; // do not sleep, continue processing
  }

  /**
   * Ask the given primary datanode to run block recovery over the surviving
   * pipeline, picking the richest recoverBlock RPC variant the datanode
   * supports. Wraps a remote BlockAlreadyCommittedException so the caller can
   * distinguish it.
   *
   * @param primaryNode datanode coordinating the recovery
   * @param newnodes surviving pipeline replicas
   * @param isAppend true when recovering for an append
   * @return the recovered block with its new generation stamp/locations
   */
  LocatedBlock recoverBlockFromPrimaryDataNode(DatanodeInfo primaryNode,
      DatanodeInfo[] newnodes, boolean isAppend) throws IOException {
    ProtocolProxy<ClientDatanodeProtocol> primary = null;
    try {
      // Copied from org.apache.hadoop.ipc.Client
      int connectTimeout = dfsClient.conf.getInt(
          Client.CONNECT_TIMEOUT_KEY, Client.CONNECT_TIMEOUT_DEFAULT);
      int maxRetries = dfsClient.conf.getInt(
          Client.CONNECT_MAX_RETRIES_KEY, Client.CONNECT_MAX_RETRIES_DEFAULT);
      /*
       * considering pipeline recovery needs 3 RPCs to DataNodes and 2 RPCs to
       * NameNode; So rpcTimeout sets to be 5 times of client socketTimeout.
       * Also each datanode RPC might take upto (connectTimeout * maxRetries)
       * to establish connection.
       */
      int recoverTimeout = 5 * dfsClient.socketTimeout +
          3 * (connectTimeout * maxRetries);
      primary = DFSClient.createClientDNProtocolProxy(primaryNode,
          dfsClient.conf, recoverTimeout);
      try {
        if (primary.isMethodSupported("recoverBlock", int.class, Block.class,
            boolean.class, DatanodeInfo[].class, long.class)) {
          // The deadline is up to RPC time out minus one socket timeout
          // to be more conservative.
          return primary.getProxy().recoverBlock(dfsClient.namespaceId, block,
              isAppend, newnodes, System.currentTimeMillis() + recoverTimeout -
              dfsClient.socketTimeout - (maxRetries * connectTimeout));
        } else if (primary.isMethodSupported("recoverBlock", int.class,
            Block.class, boolean.class, DatanodeInfo[].class)) {
          return primary.getProxy().recoverBlock(
              dfsClient.namespaceId, block, isAppend, newnodes);
        } else {
          return primary.getProxy().recoverBlock(block, isAppend, newnodes);
        }
      } catch (RemoteException re) {
        if (re.unwrapRemoteException() instanceof BlockAlreadyCommittedException) {
          throw new BlockAlreadyCommittedException(re);
        } else {
          throw re;
        }
      }
    } finally {
      if (primary != null) {
        RPC.stopProxy(primary.getProxy());
      }
    }
  }

  /**
   * After block recovery, drop packets from the head of dataQueue that the
   * recovered block length proves were already persisted on all remaining
   * replicas. Caller must hold the dataQueue monitor.
   *
   * @param newBlockSize length of the block as reported by recovery
   * @return number of packets removed from dataQueue
   */
  private int adjustDataQueueAfterBlockRecovery(long newBlockSize) {
    // New block size should be one of the packet's ending position
    // If the block offset of the first packet is not the new block
    // size, we should be able to remove several packets in packet
    // queue and make sure the first packet is the new block size.
    // Otherwise, something went wrong.
    //
    // We are conservative here: if the first unacked packet starts
    // with a full chunk, it can always be a clean checkpoint. We
    // keep the packets starting from it.
    //
    int bytesPerChecksum = checksum.getBytesPerChecksum();
    int numPktRemoved = 0;
    long newAckedSeqno = -1;
    while (!dataQueue.isEmpty()) {
      DFSOutputStreamPacket first = dataQueue.getFirst();
      long endOffsetOfBlock = first.getEndPosInCurrBlk();
      if (first.isHeartbeatPacket()) {
        dataQueue.removeFirst();
        numPktRemoved++;
      } else if (first.offsetInBlock % bytesPerChecksum == 0) {
        // The first unacked packet starts with a full chunk.
        //
        break;
      } else if (endOffsetOfBlock <= newBlockSize) {
        if (first.lastPacketInBlock) {
          // Last block is already acked in all remaining replicas
          // Resend an empty one to force the stream to finish.
          //
          if (endOffsetOfBlock != newBlockSize) {
            DFSClient.LOG.warn("Packet is the last packet in block with " +
                endOffsetOfBlock +
                " but new block length after block recovery is " +
                newBlockSize + ". Something went wrong.");
          }
          first.cleanup();
          first.offsetInBlock = endOffsetOfBlock;
          DFSClient.LOG.info(
              "Resend last packet in block and make it empty, new offsetInBlock " +
              endOffsetOfBlock);
          break;
        } else {
          // Packet fully covered by the recovered length: already durable.
          dataQueue.removeFirst();
          numPktRemoved++;
          if (first.seqno > lastAckedSeqno) {
            newAckedSeqno = first.seqno;
          }
        }
      } else {
        if (first.offsetInBlock != newBlockSize) {
          DFSClient.LOG.warn("Packet has start offset " + first.offsetInBlock +
              " and end offset " + endOffsetOfBlock +
              " but new block length after block recovery is " +
              newBlockSize + ". Something went wrong.");
        }
        break;
      }
    }
    if (numPktRemoved > 0 && newAckedSeqno != -1) {
      // Removed packets count as acked; wake waiters tracking lastAckedSeqno.
      synchronized (ackQueue) {
        lastAckedSeqno = newAckedSeqno;
        ackQueue.notifyAll();
      }
    }
    return numPktRemoved;
  }

  /** Rethrow the recorded failure if the stream is no longer usable. */
  private void isClosed() throws IOException {
    if ((closed || !dfsClient.clientRunning) && lastException != null) {
      throw lastException;
    }
  }

  //
  // returns the list of targets, if any, that is being currently used.
//
  /**
   * Returns a defensive copy of the current pipeline targets, or null if no
   * pipeline is set up yet. Copies under the dataQueue lock so callers see a
   * consistent snapshot.
   */
  DatanodeInfo[] getPipeline() {
    synchronized (dataQueue) {
      if (nodes == null) {
        return null;
      }
      DatanodeInfo[] value = new DatanodeInfo[nodes.length];
      for (int i = 0; i < nodes.length; i++) {
        value[i] = nodes[i];
      }
      return value;
    }
  }

  /**
   * Fetches (and clears) the per-thread write profile prepared for the next
   * output stream. If auto-printing is enabled in the client config, a
   * profile is created on demand and marked to print itself on close().
   */
  static private DFSWriteProfilingData getProfile(DFSClient dfsClient) {
    DFSWriteProfilingData profile =
        DFSClient.getAndResetProfileDataForNextOutputStream();
    if (dfsClient != null) {
      boolean ifAutoPrint = dfsClient.conf.getBoolean(
          FSConstants.FS_OUTPUT_STREAM_AUTO_PRINT_PROFILE, false);
      if (ifAutoPrint) {
        if (profile == null) {
          profile = new DFSWriteProfilingData();
        }
        profile.setAutoPrintWhileClose(true);
      }
    }
    return profile;
  }

  /**
   * Common constructor: records stream parameters, starts the DataStreamer
   * thread, and validates that the block size is a multiple of the checksum
   * chunk size. All public constructors delegate here.
   */
  private DFSOutputStream(DFSClient dfsClient, String src, long blockSize,
      Progressable progress, int bytesPerChecksum, short replication,
      boolean forceSync, boolean doParallelWrites,
      DatanodeInfo[] favoredNodes, WriteOptions options) throws IOException {
    super(new NativeCrc32(), bytesPerChecksum, 4, getProfile(dfsClient));
    this.dfsClient = dfsClient;
    this.forceSync = forceSync;
    this.doParallelWrites = doParallelWrites;
    this.src = src;
    this.blockSize = blockSize;
    this.blockReplication = replication;
    this.progress = progress;
    this.options = options;
    this.pktIncludeVersion = dfsClient.ifPacketIncludeVersion();
    this.packetVersion = dfsClient.getOutPacketVersion();
    streamer = new DataStreamer();
    packetTimeout =
        dfsClient.conf.getLong("dfs.client.packet.timeout", 15000); // 15 seconds
    // try block recovery 5 times:
    maxRecoveryErrorCount =
        dfsClient.conf.getInt("dfs.client.block.recovery.retries", 5);
    if (progress != null) {
      DFSClient.LOG.debug("Set non-null progress callback on DFSOutputStream "+src);
    }
    this.favoredNodes = favoredNodes;
    if ( bytesPerChecksum < 1 || blockSize % bytesPerChecksum != 0) {
      throw new IOException("io.bytes.per.checksum(" + bytesPerChecksum +
                            ") and blockSize(" + blockSize +
                            ") do not match. " + "blockSize should be a " +
                            "multiple of io.bytes.per.checksum");
    }
    checksum = DataChecksum.newDataChecksum(FSConstants.CHECKSUM_TYPE,
        bytesPerChecksum, new NativeCrc32());
  }

  /**
   * Create a new output stream to the given DataNode.
   * @see ClientProtocol#create(String, FsPermission, String, boolean, short, long)
   */
  DFSOutputStream(DFSClient dfsClient, String src, int buffersize,
      Progressable progress, LocatedBlock lastBlock, FileStatus stat,
      int bytesPerChecksum) throws IOException {
    // Delegates to the namespace-id variant with namespaceId 0.
    this(dfsClient, src, buffersize, progress, lastBlock, stat,
        bytesPerChecksum, 0);
  }

  // Convenience overload that uses default WriteOptions.
  DFSOutputStream(DFSClient dfsClient, String src, FsPermission masked,
      boolean overwrite, boolean createParent, short replication,
      long blockSize, Progressable progress, int buffersize,
      int bytesPerChecksum, boolean forceSync, boolean doParallelWrites,
      DatanodeInfo[] favoredNodes) throws IOException {
    this(dfsClient, src, masked, overwrite, createParent, replication,
        blockSize, progress, buffersize, bytesPerChecksum, forceSync,
        doParallelWrites, favoredNodes, new WriteOptions());
  }

  /**
   * Create a new output stream to the given DataNode.
* @see ClientProtocol#create(String, FsPermission, String, boolean, short, long)
   */
  DFSOutputStream(DFSClient dfsClient, String src, FsPermission masked,
      boolean overwrite, boolean createParent, short replication,
      long blockSize, Progressable progress,int buffersize,
      int bytesPerChecksum, boolean forceSync, boolean doParallelWrites,
      DatanodeInfo[] favoredNodes, WriteOptions options) throws IOException {
    this(dfsClient, src, blockSize, progress, bytesPerChecksum, replication,
        forceSync, doParallelWrites, favoredNodes, options);
    computePacketChunkSize(dfsClient.writePacketSize, bytesPerChecksum);
    try {
      // Prefer the newer create() that takes createParent when the
      // namenode supports it; otherwise fall back to the legacy signature.
      if (dfsClient.namenodeProtocolProxy != null
          && dfsClient.namenodeProtocolProxy.isMethodSupported("create",
              String.class, FsPermission.class, String.class, boolean.class,
              boolean.class, short.class, long.class)) {
        dfsClient.namenode.create(src, masked, dfsClient.clientName,
            overwrite, createParent, replication, blockSize);
      } else {
        dfsClient.namenode.create(src, masked, dfsClient.clientName,
            overwrite, replication, blockSize);
      }
    } catch(RemoteException re) {
      dfsClient.incWriteExpCntToStats();
      // Unwrap the well-known failure types so callers catch them directly.
      throw re.unwrapRemoteException(AccessControlException.class,
          FileAlreadyExistsException.class,
          FileNotFoundException.class,
          NSQuotaExceededException.class,
          DSQuotaExceededException.class);
    }
    streamer.start();
  }

  /**
   * Create a new output stream to the given DataNode with namespace id.
   * Used for append: if {@code lastBlock} is non-null, the stream is set up
   * to continue writing into the file's partially-filled last block.
   */
  DFSOutputStream(DFSClient dfsClient, String src, int buffersize,
      Progressable progress, LocatedBlock lastBlock, FileStatus stat,
      int bytesPerChecksum, int namespaceId) throws IOException {
    this(dfsClient, src, stat.getBlockSize(), progress, bytesPerChecksum,
        stat.getReplication(), false, false, null, new WriteOptions());
    initialFileSize = stat.getLen(); // length of file when opened
    dfsClient.updateNamespaceIdIfNeeded(namespaceId);
    //
    // The last partial block of the file has to be filled.
    //
    if (lastBlock != null) {
      block = lastBlock.getBlock();
      long usedInLastBlock = stat.getLen() % blockSize;
      int freeInLastBlock = (int)(blockSize - usedInLastBlock);
      // calculate the amount of free space in the pre-existing
      // last crc chunk
      int usedInCksum = (int)(stat.getLen() % bytesPerChecksum);
      super.bytesSentInChunk = usedInCksum;
      int freeInCksum = bytesPerChecksum - usedInCksum;
      // if there is space in the last block, then we have to
      // append to that block
      if (freeInLastBlock > blockSize) {
        throw new IOException("The last block for file " + src +
                              " is full.");
      }
      int dataProtocolVersion = dfsClient.getDataTransferProtocolVersion();
      // indicate that we are appending to an existing block
      if (dataProtocolVersion >= DataTransferProtocol.APPEND_BLOCK_VERSION) {
        bytesCurBlock = lastBlock.getBlock().getNumBytes();
      } else {
        bytesCurBlock = lastBlock.getBlockSize();
      }
      if (usedInCksum > 0 && freeInCksum > 0) {
        // if there is space in the last partial chunk, then
        // setup in such a way that the next packet will have only
        // one chunk that fills up the partial chunk.
        //
        computePacketChunkSize(0, freeInCksum);
        resetChecksumChunk();
        this.appendChunk = true;
      } else {
        // if the remaining space in the block is smaller than
        // that expected size of of a packet, then create
        // smaller size packet.
        //
        computePacketChunkSize(
            Math.min(dfsClient.writePacketSize, freeInLastBlock),
            bytesPerChecksum);
      }
      // setup pipeline to append to the last block
      nodes = lastBlock.getLocations();
      errorIndex = -1; // no errors yet.
      if (nodes.length < 1) {
        throw new IOException("Unable to retrieve blocks locations" +
                              " for append to last block " + block +
                              " of file " + src);
      }
      if (dataProtocolVersion < DataTransferProtocol.APPEND_BLOCK_VERSION) {
        // go through the block recovery process to setup the pipeline for append
        while(processDatanodeError(true, true)) {
          try {
            Thread.sleep(1000);
          } catch (InterruptedException e) {
            lastException = new IOException(e);
            break;
          }
        }
      } else {
        setupPipelineForAppend(lastBlock);
      }
      if (lastException != null) {
        throw lastException;
      }
    } else {
      computePacketChunkSize(dfsClient.writePacketSize, bytesPerChecksum);
    }
    // Start writing at the beginning of the (possibly partial) last block.
    long blockOffset = stat.getLen();
    blockOffset -= blockOffset % blockSize;
    setOffsets(blockOffset);
    streamer.start();
  }

  /**
   * Setup the Append pipeline, the length of current pipeline will shrink
   * if any datanodes are dead during the process.
   *
   * @return true if the pipeline was set up directly; false if it fell back
   *         to the block recovery path (or could not be set up at all)
   */
  private boolean setupPipelineForAppend(LocatedBlock lastBlock)
      throws IOException {
    if (nodes == null || nodes.length == 0) {
      String msg = "Could not get block locations. " + "Source file \""
          + src + "\" - Aborting...";
      DFSClient.LOG.warn(msg);
      setLastException(new IOException(msg));
      closed = true;
      if (streamer != null) streamer.close();
      return false;
    }
    boolean success = createBlockOutputStream(nodes, dfsClient.clientName,
        false, true);
    long oldGenerationStamp =
        ((LocatedBlockWithOldGS)lastBlock).getOldGenerationStamp();
    if (success) {
      // bump up the generation stamp in NN.
      Block newBlock = lastBlock.getBlock();
      Block oldBlock = new Block(newBlock.getBlockId(),
          newBlock.getNumBytes(), oldGenerationStamp);
      dfsClient.namenode.updatePipeline(dfsClient.clientName, oldBlock,
          newBlock, nodes);
    } else {
      DFSClient.LOG.warn("Fall back to block recovery process when trying"
          + " to setup the append pipeline for file " + src);
      // set the old generation stamp
      block.setGenerationStamp(oldGenerationStamp);
      // fall back the block recovery
      while(processDatanodeError(true, true)) {
        try {
          Thread.sleep(1000);
        } catch (InterruptedException e) {
          lastException = new IOException(e);
          break;
        }
      }
    }
    return success;
  }

  /**
   * Recomputes how many checksum chunks fit into one packet given the
   * desired packet payload size {@code psize} and chunk data size
   * {@code csize}; updates {@code chunksPerPacket} and {@code packetSize}.
   */
  private void computePacketChunkSize(int psize, int csize) {
    int chunkSize = csize + checksum.getChecksumSize();
    int n = getPacketHeaderLen() + DFSClient.SIZE_OF_INTEGER;
    // Always send at least one chunk per packet.
    chunksPerPacket = Math.max((psize - n + chunkSize-1)/chunkSize, 1);
    packetSize = n + chunkSize*chunksPerPacket;
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("computePacketChunkSize: src=" + src +
                ", chunkSize=" + chunkSize +
                ", chunksPerPacket=" + chunksPerPacket +
                ", packetSize=" + packetSize);
    }
  }

  /**
   * Open a DataOutputStream to a DataNode so that it can be written to.
   * This happens when a file is created and each time a new block is allocated.
   * Must get block ID and the IDs of the destinations from the namenode.
   * Returns the list of target datanodes.
   */
  private DatanodeInfo[] nextBlockOutputStream(String client)
      throws IOException {
    LocatedBlock lb = null;
    boolean retry = false;
    DatanodeInfo[] nodes;
    ArrayList<DatanodeInfo> excludedNodes = new ArrayList<DatanodeInfo>();
    int count = dfsClient.conf.getInt("dfs.client.block.write.retries", 3);
    boolean success;
    do {
      hasError = false;
      lastException = null;
      errorIndex = 0;
      retry = false;
      nodes = null;
      success = false;
      long startTime = System.currentTimeMillis();
      DatanodeInfo[] excluded = excludedNodes.toArray(new DatanodeInfo[0]);
      lb = locateFollowingBlock(startTime, excluded.length > 0 ?
          excluded : null);
      block = lb.getBlock();
      nodes = lb.getLocations();
      //
      // Connect to first DataNode in the list.
      //
      success = createBlockOutputStream(nodes, dfsClient.clientName,
          false, false);
      if (!success) {
        // Give the block back to the namenode and exclude the datanode
        // that failed before asking for a fresh allocation.
        DFSClient.LOG.info("Abandoning block " + block + " for file " + src);
        dfsClient.namenode.abandonBlock(block, src, dfsClient.clientName);
        if (errorIndex < nodes.length) {
          DFSClient.LOG.debug("Excluding datanode " + nodes[errorIndex]);
          excludedNodes.add(nodes[errorIndex]);
        }
        // Connection failed. Let's wait a little bit and retry
        retry = true;
      }
    } while (retry && --count >= 0);
    if (!success && nodes != null) {
      // in the last fail time, we will retry with the remaining nodes.
      while (nodes.length > 1 && !success) {
        if (errorIndex >= nodes.length) {
          break;
        }
        // Drop the failed node and retry with the survivors.
        DatanodeInfo[] remainingNodes = new DatanodeInfo[nodes.length - 1];
        for (int i = 0; i < errorIndex; i++) {
          remainingNodes[i] = nodes[i];
        }
        for (int i = errorIndex + 1; i < nodes.length; i++) {
          remainingNodes[i - 1] = nodes[i];
        }
        nodes = remainingNodes;
        success = createBlockOutputStream(nodes, dfsClient.clientName,
            false, false);
      }
    }
    if (!success) {
      throw new IOException("Unable to create new block.");
    }
    return nodes;
  }

  // For pipelined writes, connects to the first datanode in the pipeline.
  // For parallel writes, connect to all specified datanodes.
  // Returns true if success, otherwise return failure.
//
  private boolean createBlockOutputStream(DatanodeInfo[] nodes, String client,
      boolean recoveryFlag, boolean appendFlag) {
    String firstBadLink = "";
    if (DFSClient.LOG.isDebugEnabled()) {
      for (int i = 0; i < nodes.length; i++) {
        DFSClient.LOG.debug("pipeline = " + nodes[i].getName());
      }
    }
    // persist blocks on namenode on next flush
    persistBlocks = true;
    boolean result = false;
    int curNode = 0;
    int length = 0;
    int pipelineDepth;
    if (doParallelWrites) {
      length = nodes.length; // connect to all datanodes
      pipelineDepth = 1;
    } else {
      length = 1; // connect to only the first datanode
      pipelineDepth = nodes.length;
    }
    DataOutputStream[] tmpOut = new DataOutputStream[length];
    DataInputStream[] replyIn = new DataInputStream[length];
    Socket[] sockets = new Socket[length];
    try {
      for (curNode = 0; curNode < length; curNode++) {
        DFSClient.LOG.debug("Connecting to " + nodes[curNode].getName());
        InetSocketAddress target =
            NetUtils.createSocketAddr(nodes[curNode].getName());
        Socket s = dfsClient.socketFactory.createSocket();
        sockets[curNode] = s;
        // Timeouts grow with pipeline depth: each extra hop gets an
        // extension on top of the base socket timeout.
        dfsClient.timeoutValue = dfsClient.socketReadExtentionTimeout
            * pipelineDepth + dfsClient.socketTimeout;
        NetUtils.connect(s, target, dfsClient.timeoutValue,
            dfsClient.ipTosValue);
        s.setSoTimeout(dfsClient.timeoutValue);
        s.setSendBufferSize(DFSClient.DEFAULT_DATA_SOCKET_SIZE);
        DFSClient.LOG.debug("Send buf size " + s.getSendBufferSize());
        long writeTimeout = dfsClient.datanodeWriteExtentionTimeout
            * pipelineDepth + dfsClient.datanodeWriteTimeout;
        //
        // Xmit header info to datanode (see DataXceiver.java)
        //
        DataOutputStream out = new DataOutputStream(
            new BufferedOutputStream(NetUtils.getOutputStream(s, writeTimeout),
                DataNode.SMALL_BUFFER_SIZE));
        tmpOut[curNode] = out;
        DataInputStream brs = new DataInputStream(NetUtils.getInputStream(s));
        replyIn[curNode] = brs;
        if (getProfileData() != null) {
          getProfileData().nextBlock();
        }
        int version = dfsClient.getDataTransferProtocolVersion();
        // write the header
        if (!appendFlag) {
          WriteBlockHeader header = new WriteBlockHeader(version,
              dfsClient.namespaceId, block.getBlockId(),
              block.getGenerationStamp(), pipelineDepth, recoveryFlag, false,
              null, pipelineDepth - 1, nodes, client);
          header.getWritePipelineInfo().setWriteOptions(options);
          header.getWritePipelineInfo().getWriteOptions()
              .setIfProfileEnabled(profileData != null);
          header.writeVersionAndOpCode(out);
          header.write(out);
        } else {
          AppendBlockHeader header = new AppendBlockHeader(version,
              dfsClient.namespaceId, block.getBlockId(), block.getNumBytes(),
              block.getGenerationStamp(), pipelineDepth, false, null,
              pipelineDepth - 1, nodes, client);
          header.writeVersionAndOpCode(out);
          header.write(out);
        }
        checksum.writeHeader(out);
        out.flush();
        // receive ack for connect
        firstBadLink = Text.readString(brs);
        if (firstBadLink.length() != 0) {
          throw new IOException("Bad connect ack with firstBadLink "
              + firstBadLink);
        }
      }
      result = true; // success
      blockStream = dfsClient.new MultiDataOutputStream(tmpOut);
      blockReplyStream = dfsClient.new MultiDataInputStream(replyIn);
      this.s = sockets;
      if (appendFlag) {
        // start the responseProcessor if the pipeline is successfully setup
        // for append only
        response = new ResponseProcessor(nodes);
        response.start();
      }
    } catch (IOException ie) {
      DFSClient.LOG.info("Exception in createBlockOutputStream "
          + nodes[curNode].getName() + " " + " for file " + src + ie);
      dfsClient.incWriteExpCntToStats();
      // find the datanode that matches
      if (firstBadLink.length() != 0) {
        for (int i = 0; i < nodes.length; i++) {
          if (nodes[i].getName().equals(firstBadLink)) {
            errorIndex = i;
            break;
          }
        }
      } else {
        // if we are doing parallel writes, then record the datanode that is bad
        errorIndex = curNode;
      }
      hasError = true;
      setLastException(ie);
      blockReplyStream = null;
      result = false;
    } finally {
      if (!result) {
        // Close every socket we managed to open before the failure.
        for (int i = 0; i < sockets.length; i++) {
          IOUtils.closeSocket(sockets[i]);
        }
        this.s = null;
      }
    }
    return result;
  }

  /**
   * Asks the namenode to allocate the next block of the file, probing for
   * the newest supported addBlock* RPC variant and falling back through the
   * older ones. Retries on NotReplicatedYetException with exponential
   * backoff.
   */
  private LocatedBlock locateFollowingBlock(long start,
                                            DatanodeInfo[] excludedNodes
                                            ) throws IOException {
    int retries = dfsClient.conf.getInt(
        "dfs.client.block.write.locateFollowingBlock.retries", 5);
    long sleeptime = 400;
    while (true) {
      long localstart = System.currentTimeMillis();
      while (true) {
        try {
          VersionedLocatedBlock loc = null;
          // Dispatch ladder: newest RPC first, oldest last.
          if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class, DatanodeInfo[].class, long.class,
                  Block.class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes, favoredNodes,
                this.lastBlkOffset, getLastBlock());
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class, DatanodeInfo[].class, long.class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes, favoredNodes,
                this.lastBlkOffset);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class, long.class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes, this.lastBlkOffset);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchVersion", String.class, String.class,
                  DatanodeInfo[].class)) {
            loc = dfsClient.namenode.addBlockAndFetchVersion(src,
                dfsClient.clientName, excludedNodes);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported("addBlock",
                  String.class, String.class, DatanodeInfo[].class)) {
            return dfsClient.namenode.addBlock(src, dfsClient.clientName,
                excludedNodes);
          } else {
            return dfsClient.namenode.addBlock(src, dfsClient.clientName);
          }
          dfsClient.updateDataTransferProtocolVersionIfNeeded(
              loc.getDataProtocolVersion());
          if (loc instanceof LocatedBlockWithMetaInfo) {
            LocatedBlockWithMetaInfo metaLoc = (LocatedBlockWithMetaInfo)loc;
            dfsClient.updateNamespaceIdIfNeeded(metaLoc.getNamespaceID());
            dfsClient.getNewNameNodeIfNeeded(metaLoc.getMethodFingerPrint());
          }
          return loc;
        } catch (RemoteException e) {
          IOException ue =
              e.unwrapRemoteException(FileNotFoundException.class,
                  AccessControlException.class,
                  NSQuotaExceededException.class,
                  DSQuotaExceededException.class);
          if (ue != e) {
            throw ue; // no need to retry these exceptions
          }
          if (NotReplicatedYetException.class.getName().
              equals(e.getClassName())) {
            if (retries == 0) {
              throw e;
            } else {
              --retries;
              DFSClient.LOG.info(StringUtils.stringifyException(e));
              if (System.currentTimeMillis() - localstart > 5000) {
                DFSClient.LOG.info("Waiting for replication for "
                    + (System.currentTimeMillis() - localstart) / 1000
                    + " seconds");
              }
              try {
                DFSClient.LOG.warn("NotReplicatedYetException sleeping " + src
                    + " retries left " + retries);
                Thread.sleep(sleeptime);
                sleeptime *= 2; // exponential backoff
              } catch (InterruptedException ie) {
                // NOTE(review): interrupt is swallowed here; the original
                // behavior is preserved — consider restoring the flag.
              }
            }
          } else {
            throw e;
          }
        }
      }
    }
  }

  // Bump the client-side write metrics by one op / len bytes.
  @Override
  protected void incMetrics(int len){
    dfsClient.metrics.incWriteOps();
    dfsClient.metrics.incWriteSize(len);
  }

  // @see FSOutputSummer#writeChunk()
  @Override
  protected synchronized void writeChunk(byte[] b, int offset, int len,
      byte[] checksum) throws IOException {
    dfsClient.checkOpen();
    isClosed();
    int cklen = checksum.length;
    int bytesPerChecksum = this.checksum.getBytesPerChecksum();
    if (len > bytesPerChecksum) {
      throw new IOException("writeChunk() buffer size is " + len +
                            " is larger than supported  bytesPerChecksum " +
                            bytesPerChecksum);
    }
    if (checksum.length != this.checksum.getChecksumSize()) {
      throw new IOException("writeChunk() checksum size is supposed to be " +
                            this.checksum.getChecksumSize() +
                            " but found to be " + checksum.length);
    }
    eventStartEnqueuePacket();
    synchronized (dataQueue) {
      // If queue is full, then wait till we can create enough space
      while (!closed && dataQueue.size() + ackQueue.size() > maxPackets) {
        try {
          dataQueue.wait(packetTimeout);
          checkIfLastPacketTimeout();
        } catch (InterruptedException e) {
          // NOTE(review): interrupt swallowed; loop re-checks the queue.
        }
      }
      isClosed();
      if (currentPacket == null) {
        WritePacketClientProfile pktProfile = null;
        if (getProfileData() != null) {
          pktProfile = getProfileData().getWritePacketClientProfile();
        }
        currentPacket = DFSOutputStreamPacketFactory.getPacket(
            DFSOutputStream.this, ifPacketIncludeVersion(),
            getPacketVersion(), packetSize, chunksPerPacket, bytesCurBlock,
            pktProfile);
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("DFSClient writeChunk allocating new packet seqno="
              + currentPacket.seqno + ", src=" + src + ", packetSize="
              + packetSize + ", chunksPerPacket=" + chunksPerPacket
              + ", bytesCurBlock=" + bytesCurBlock + ", forceSync="
              + forceSync + ", doParallelWrites=" + doParallelWrites
              + ", len=" + len + ", blocksize=" + blockSize);
        }
      }
      // Packet layout depends on the negotiated packet version:
      // checksums before data, or data before inline checksums.
      if (packetVersion ==
          DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST) {
        currentPacket.writeChecksum(checksum, 0, cklen);
        currentPacket.writeData(b, offset, len);
      } else {
        // packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_INLINE
        currentPacket.writeData(b, offset, len);
        currentPacket.writeChecksum(checksum, 0, cklen);
      }
      currentPacket.numChunks++;
      bytesCurBlock += len;
      // If packet is full, enqueue it for transmission
      if (currentPacket.numChunks == currentPacket.maxChunks ||
          bytesCurBlock == blockSize) {
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("DFSClient writeChunk packet full seqno="
              + currentPacket.seqno + ", src=" + src + ", bytesCurBlock="
              + bytesCurBlock + ", blockSize=" + blockSize
              + ", appendChunk=" + appendChunk);
        }
        //
        // if we allocated a new packet because we encountered a block
        // boundary, reset bytesCurBlock.
        //
        if (bytesCurBlock == blockSize) {
          currentPacket.lastPacketInBlock = true;
          bytesCurBlock = 0;
          lastFlushOffset = 0;
        }
        enqueueCurrentPacket();
        eventEndEnquePacket();
        // If this was the first write after reopening a file, then the above
        // write filled up any partial chunk. Tell the summer to generate full
        // crc chunks from now on.
        if (appendChunk) {
          appendChunk = false;
          resetChecksumChunk();
        }
        int psize = Math.min((int)(blockSize-bytesCurBlock),
            dfsClient.writePacketSize);
        computePacketChunkSize(psize, bytesPerChecksum);
      }
    }
    //LOG.debug("DFSClient writeChunk done length " + len +
    //          " checksum length " + cklen);
  }

  // Moves currentPacket onto dataQueue and wakes the streamer thread.
  private synchronized void enqueueCurrentPacket() {
    synchronized (dataQueue) {
      if (currentPacket == null) return;
      dataQueue.addLast(currentPacket);
      currentPacket.eventAddToDataQueue();
      dataQueue.notifyAll();
      lastQueuedSeqno = currentPacket.seqno;
      currentPacket = null;
    }
  }

  /**
   * All data is written out to datanodes. It is not guaranteed
   * that data has been flushed to persistent store on the
   * datanode. Block allocations are persisted on namenode.
   */
  public void sync() throws IOException {
    long start = System.currentTimeMillis();
    try {
      long toWaitFor;
      synchronized (this) {
        eventStartSync();
        /* Record current blockOffset. This might be changed inside
         * flushBuffer() where a partial checksum chunk might be flushed.
         * After the flush, reset the bytesCurBlock back to its previous value,
         * any partial checksum chunk will be sent now and in next packet.
         */
        long saveOffset = bytesCurBlock;
        DFSOutputStreamPacket oldCurrentPacket = currentPacket;
        // flush checksum buffer as an incomplete chunk
        flushBuffer(false, shouldKeepPartialChunkData());
        // bytesCurBlock potentially incremented if there was buffered data
        eventSyncStartWaitAck();
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("DFSClient flush() : bytesCurBlock "
              + bytesCurBlock + " lastFlushOffset " + lastFlushOffset);
        }
        // Flush only if we haven't already flushed till this offset.
        if (lastFlushOffset != bytesCurBlock) {
          assert bytesCurBlock > lastFlushOffset;
          // record the valid offset of this flush
          lastFlushOffset = bytesCurBlock;
          enqueueCurrentPacket();
        } else {
          // just discard the current packet since it is already been sent.
          if (oldCurrentPacket == null && currentPacket != null) {
            // If we didn't previously have a packet queued, and now we do,
            // but we don't plan on sending it, then we should not
            // skip a sequence number for it!
            currentSeqno--;
          }
          currentPacket = null;
        }
        if (shouldKeepPartialChunkData()) {
          // Restore state of stream. Record the last flush offset
          // of the last full chunk that was flushed.
          //
          bytesCurBlock = saveOffset;
        }
        toWaitFor = lastQueuedSeqno;
      }
      waitForAckedSeqno(toWaitFor);
      eventSyncPktAcked();
      // If any new blocks were allocated since the last flush,
      // then persist block locations on namenode.
      //
      boolean willPersist;
      synchronized (this) {
        willPersist = persistBlocks;
        persistBlocks = false;
      }
      if (willPersist) {
        dfsClient.namenode.fsync(src, dfsClient.clientName);
      }
      long timeval = System.currentTimeMillis() - start;
      dfsClient.metrics.incSyncTime(timeval);
      eventEndSync();
    } catch (IOException e) {
      // A failed sync is fatal to the stream: record, close, rethrow.
      lastException = new IOException("IOException flush:", e);
      closed = true;
      closeThreads();
      throw e;
    }
  }

  // Current (last allocated) block of the file, if any.
  private Block getLastBlock() {
    return this.block;
  }

  /**
   * Returns the number of replicas of current block. This can be different
   * from the designated replication factor of the file because the NameNode
   * does not replicate the block to which a client is currently writing to.
   * The client continues to write to a block even if a few datanodes in the
   * write pipeline have failed. If the current block is full and the next
   * block is not yet allocated, then this API will return 0 because there are
   * no replicas in the pipeline.
*/
  public int getNumCurrentReplicas() throws IOException {
    synchronized(dataQueue) {
      if (nodes == null) {
        // No pipeline yet: report the designated replication factor.
        return blockReplication;
      }
      return nodes.length;
    }
  }

  // Profiling data for this stream, or null when profiling is disabled.
  public DFSWriteProfilingData getProfileData() {
    return (DFSWriteProfilingData) profileData;
  }

  /**
   * Waits till all existing data is flushed and confirmations
   * received from datanodes.
   */
  private void flushInternal() throws IOException {
    isClosed();
    dfsClient.checkOpen();
    long toWaitFor;
    synchronized (this) {
      enqueueCurrentPacket();
      toWaitFor = lastQueuedSeqno;
    }
    waitForAckedSeqno(toWaitFor);
  }

  /**
   * Blocks until the given sequence number has been acknowledged by the
   * pipeline (or the stream is closed). Restores the thread's interrupt
   * status if it was interrupted while waiting.
   */
  private void waitForAckedSeqno(long seqnumToWaitFor) throws IOException {
    boolean interrupted = false;
    synchronized (ackQueue) {
      while (!closed) {
        isClosed();
        if (lastAckedSeqno >= seqnumToWaitFor) {
          break;
        }
        try {
          ackQueue.wait();
        } catch (InterruptedException ie) {
          interrupted = true;
        }
      }
    }
    if (interrupted) {
      // Re-assert the interrupt for callers further up the stack.
      Thread.currentThread().interrupt();
    }
    isClosed();
  }

  /**
   * Closes this output stream and releases any system
   * resources associated with this stream.
   */
  @Override
  public void close() throws IOException {
    try {
      if (closed) {
        // Idempotent close: rethrow a previously recorded failure, if any.
        IOException e = lastException;
        if (e == null)
          return;
        else
          throw e;
      }
      try {
        closeInternal();
        if (s != null) {
          for (int i = 0; i < s.length; i++) {
            s[i].close();
          }
          s = null;
        }
      } catch (IOException e) {
        lastException = e;
        throw e;
      }
      if (profileData != null && profileData.isAutoPrintWhileClose()) {
        DFSClient.LOG.info("Write Profile for " + this.src + ":"
            + profileData.toString());
      }
    } finally {
      // We always try to remove the connection from the lease to
      // avoid memory leak. In case of failed close(), it is possible
      // that later users' retry of close() could succeed but fail on
      // lease expiration. Since clients don't possibly write more data
      // after calling close(), this case doesn't change any guarantee
      // of data itself.
      dfsClient.leasechecker.remove(src);
    }
  }

  /**
   * Harsh abort method that should only be used from tests - this
   * is in order to prevent pipeline recovery when eg a DN shuts down.
   */
  void abortForTests() throws IOException {
    if (streamer != null) {
      streamer.close();
    }
    if (response != null) {
      response.close();
    }
    closed = true;
  }

  /**
   * Aborts this output stream and releases any system
   * resources associated with this stream.
   */
  synchronized void abort() throws IOException {
    if (closed) {
      return;
    }
    setLastException(new IOException("Lease timeout of "
        + (dfsClient.hdfsTimeout/1000) + " seconds expired."));
    closeThreads();
  }

  // shutdown datastreamer and responseprocessor threads.
  private void closeThreads() throws IOException {
    try {
      if (streamer != null) {
        streamer.close();
        streamer.join();
      }
      // shutdown response after streamer has exited.
      if (response != null) {
        response.close();
        response.join();
        response = null;
      }
    } catch (InterruptedException e) {
      throw new InterruptedIOException("Failed to shutdown response thread");
    }
  }

  /**
   * Closes this output stream and releases any system
   * resources associated with this stream.
   */
  private synchronized void closeInternal() throws IOException {
    dfsClient.checkOpen();
    isClosed();
    try {
      eventStartWrite();
      flushBuffer(true, false); // flush from all upper layers
      eventCloseAfterFlushBuffer();
      // Mark that this packet is the last packet in block.
      // If there are no outstanding packets and the last packet
      // was not the last one in the current block, then create a
      // packet with empty payload.
      synchronized (dataQueue) {
        if (currentPacket == null && bytesCurBlock != 0) {
          WritePacketClientProfile pktProfile = null;
          if (getProfileData() != null) {
            pktProfile = getProfileData().getWritePacketClientProfile();
          }
          currentPacket = DFSOutputStreamPacketFactory.getPacket(
              DFSOutputStream.this, ifPacketIncludeVersion(),
              getPacketVersion(), packetSize, chunksPerPacket, bytesCurBlock,
              pktProfile);
        }
        if (currentPacket != null) {
          currentPacket.lastPacketInBlock = true;
        }
      }
      flushInternal(); // flush all data to Datanodes
      isClosed(); // check to see if flushInternal had any exceptions
      closed = true; // allow closeThreads() to shut down threads
      closeThreads();
      synchronized (dataQueue) {
        if (blockStream != null) {
          blockStream.writeInt(0); // indicate end-of-block to datanode
          blockStream.close();
          blockReplyStream.close();
        }
        if (s != null) {
          for (int i = 0; i < s.length; i++) {
            s[i].close();
          }
          s = null;
        }
      }
      streamer = null;
      blockStream = null;
      blockReplyStream = null;
      eventCloseReceivedAck();
      // Tell the namenode the file is complete.
      dfsClient.closeFile(src, lastBlkOffset, getLastBlock());
      eventEndClose();
    } finally {
      closed = true;
    }
  }

  // Test hook: artificial delay injected into the write path.
  void setArtificialSlowdown(long period) {
    artificialSlowdown = period;
  }

  // Test hook: caps chunksPerPacket and recomputes packetSize accordingly.
  synchronized void setChunksPerPacket(int value) {
    chunksPerPacket = Math.min(chunksPerPacket, value);
    packetSize = getPacketHeaderLen() + DFSClient.SIZE_OF_INTEGER +
                 (checksum.getBytesPerChecksum() +
                  checksum.getChecksumSize()) * chunksPerPacket;
  }

  // Test hook: overrides the source path used in logs and RPCs.
  synchronized void setTestFilename(String newname) {
    src = newname;
  }

  /**
   * Returns the size of a file as it was when this stream was opened
   */
  long getInitialLen() {
    return initialFileSize;
  }

  // --- Profiling event hooks: each is a no-op unless profiling is on. ---

  private void eventStartEnqueuePacket() {
    if (getProfileData() != null) {
      getProfileData().startEnqueuePacket();
    }
  }

  private void eventEndEnquePacket() {
    if (getProfileData() != null) {
      getProfileData().endEnquePacket();
    }
  }

  private void eventStartSync() {
    if (getProfileData() != null) {
      getProfileData().startSync();
    }
  }

  private void eventSyncStartWaitAck() {
    if (getProfileData() != null) {
      getProfileData().syncStartWaitAck();
    }
  }

  private void eventSyncPktAcked() {
    if (getProfileData() != null) {
      getProfileData().syncPktAcked();
    }
  }

  private void eventEndSync() {
    if (getProfileData() != null) {
      getProfileData().endSync();
    }
  }

  private void eventCloseAfterFlushBuffer() {
    if (getProfileData() != null) {
      getProfileData().closeAfterFlushBuffer();
    }
  }

  private void eventCloseReceivedAck() {
    if (getProfileData() != null) {
      getProfileData().closeReceivedAck();
    }
  }

  private void eventEndClose() {
    if (getProfileData() != null) {
      getProfileData().endClose();
    }
  }

  public void eventStartReceiveAck() {
    if (getProfileData() != null) {
      getProfileData().startReceiveAck();
    }
  }

  // Header length depends on whether packets carry a version field.
  int getPacketHeaderLen() {
    return DataNode.getPacketHeaderLen(ifPacketIncludeVersion());
  }

  // Returns the current sequence number and then increments it.
  long incAndGetCurrentSeqno() {
    return currentSeqno++;
  }

  int getPacketVersion() {
    return packetVersion;
  }

  boolean ifPacketIncludeVersion() {
    return pktIncludeVersion;
  }

  boolean ifForceSync() {
    return forceSync;
  }

  int getBytesPerChecksum() {
    return checksum.getBytesPerChecksum();
  }

  int getChecksumSize() {
    return checksum.getChecksumSize();
  }

  // Older datanodes require partial chunk data to be resent on sync;
  // newer protocol versions do not.
  @Override
  protected boolean shouldKeepPartialChunkData() throws IOException {
    return this.dfsClient.getDataTransferProtocolVersion() <
        DataTransferProtocol.NOT_RESEND_PARTIAL_CHUNK_VERSION;
  }
}