/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.channels.ClosedChannelException;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import org.apache.hadoop.fs.FSInputChecker;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.server.datanode.BlockDataFile.RandomAccessor;
import org.apache.hadoop.util.CrcConcat;
import org.apache.hadoop.util.DataChecksum;
/**
 * Writer for a local inline checksum block file.
 * The expected block file name is:
 * blk_(blockId)_(generation_stamp)_(version)_(checksum_type)_(bytes_per_checksum)
 *
 * The file format is as follows:
 * +---------------------------+
 * |      Checksum Header      |
 * +---------------------------+
 * |                           |
 * |     Data for Chunk 1      |
 * |          ......           |
 * |                           |
 * +---------------------------+
 * |   Checksum for Chunk 1    |
 * +---------------------------+
 * |                           |
 * |     Data for Chunk 2      |
 * |          ......           |
 * |                           |
 * +---------------------------+
 * |   Checksum for Chunk 2    |
 * +---------------------------+
 * |             .             |
 * |             .             |
 * |             .             |
 * +---------------------------+
 * |    Data for Last Chunk    |
 * |     (Can be Partial)      |
 * +---------------------------+
 * |  Checksum for Last Chunk  |
 * +---------------------------+
*
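 * For example (illustrative values, assuming CRC32 checksums with
 * checksumSize = 4 and bytesPerChecksum = 512): a block of 1300 bytes
 * spans ceil(1300 / 512) = 3 chunks, so the block file holds the header,
 * 1300 data bytes and 3 * 4 = 12 checksum bytes, interleaved as above.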
*/
public class BlockInlineChecksumWriter extends DatanodeBlockWriter {
  protected final BlockDataFile blockDataFile;
protected BlockDataFile.Writer blockDataWriter = null;
private int checksumType = DataChecksum.CHECKSUM_UNKNOWN;
private final int writePacketSize;
public BlockInlineChecksumWriter(BlockDataFile blockDataFile, int checksumType,
int bytesPerChecksum, int writePacketSize) {
this.blockDataFile = blockDataFile;
this.bytesPerChecksum = bytesPerChecksum;
this.checksumType = checksumType;
this.writePacketSize = writePacketSize;
}
public void initializeStreams(int bytesPerChecksum, int checksumSize,
Block block, String inAddr, int namespaceId, DataNode datanode)
throws FileNotFoundException, IOException {
    if (this.blockDataWriter == null) {
      // Buffer roughly two packets' worth of data and checksums; use the
      // packet size this writer was constructed with.
      this.blockDataWriter = blockDataFile.getWriter(writePacketSize * 2);
firstChunkOffset = 0;
}
setParameters(bytesPerChecksum, checksumSize, block, inAddr, namespaceId,
datanode);
}
@Override
public void writeHeader(DataChecksum checksum) throws IOException {
    // In the current version, no header is written.
}
@Override
public void fadviseStream(int advise, long offset, long len)
throws IOException {
fadviseStream(advise, offset, len, false);
}
@Override
public void fadviseStream(int advise, long offset, long len, boolean sync)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("posix_fadvise with advise : " + advise + " for : " + blockDataFile.getFile());
}
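    // Convert the advised block range into the corresponding file range,
    // accounting for the inline checksums interleaved with the data.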
long fileOffset = BlockInlineChecksumReader.getPosFromBlockOffset(
offset, bytesPerChecksum, checksumSize);
long fileLen = BlockInlineChecksumReader.getFileLengthFromBlockSize(
len + offset, bytesPerChecksum, checksumSize) - fileOffset;
blockDataWriter.posixFadviseIfPossible(fileOffset, fileLen, advise, sync);
}
@Override
  public void writePacket(byte[] pktBuf, int len, int startDataOff,
      int pktBufStartOff, int numChunks, int packetVersion) throws IOException {
if (len == 0) {
return;
}
int chunkOffset = firstChunkOffset;
int remain = len;
if (packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST) {
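      // In this packet version all chunk checksums are laid out at the front
      // of the packet buffer (from pktBufStartOff) and the chunk data follows
      // (from startDataOff), so data and checksums must be interleaved chunk
      // by chunk while writing to the inline checksum file.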
int dataOff = startDataOff;
int checksumOff = pktBufStartOff;
for (int i = 0; i < numChunks; i++) {
assert remain > 0;
int lenToWrite = (remain > bytesPerChecksum - chunkOffset) ? bytesPerChecksum
- chunkOffset
: remain;
        // finally, write the chunk data to disk
blockDataWriter.write(pktBuf, dataOff, lenToWrite);
if (chunkOffset > 0) {
// Partial chunk
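          // This packet continues a partially written chunk: concatenate the
          // remembered CRC of the earlier bytes with the CRC of the new bytes
          // and write the updated checksum for the combined chunk.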
int crcPart2 = DataChecksum.getIntFromBytes(pktBuf, pktBufStartOff);
partialCrcInt = CrcConcat.concatCrc(partialCrcInt, crcPart2,
lenToWrite);
byte[] tempBuf = new byte[4];
DataChecksum.writeIntToBuf(partialCrcInt, tempBuf, 0);
blockDataWriter.write(tempBuf);
LOG.debug("Writing out partial crc for data len " + lenToWrite);
} else {
blockDataWriter.write(pktBuf, checksumOff, checksumSize);
if (lenToWrite < bytesPerChecksum) {
// partial chunk, need to remember the partial CRC
partialCrcInt = DataChecksum.getIntFromBytes(pktBuf, checksumOff);
}
}
chunkOffset = (chunkOffset + lenToWrite) % bytesPerChecksum;
dataOff += lenToWrite;
remain -= lenToWrite;
checksumOff += checksumSize;
}
} else if (packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_INLINE) {
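      // In this packet version each chunk's checksum immediately follows its
      // data in the packet buffer, which already matches the on-disk layout,
      // so runs of full chunks can be written with a single write call.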
int firstChunkSize = 0;
int dataOff = startDataOff;
if (chunkOffset > 0) {
// Figure out size of first chunk
firstChunkSize = (len > bytesPerChecksum - chunkOffset) ? bytesPerChecksum
- chunkOffset : len;
// Partial chunk
int crcPart2 = DataChecksum.getIntFromBytes(pktBuf, pktBufStartOff
+ firstChunkSize);
partialCrcInt = CrcConcat.concatCrc(partialCrcInt, crcPart2,
firstChunkSize);
byte[] tempBuf = new byte[4];
DataChecksum.writeIntToBuf(partialCrcInt, tempBuf, 0);
blockDataWriter.write(pktBuf, dataOff, firstChunkSize);
blockDataWriter.write(tempBuf);
dataOff += firstChunkSize + checksumSize;
LOG.debug("Writing out partial crc for data len " + firstChunkSize);
remain -= firstChunkSize;
chunkOffset = (chunkOffset + firstChunkSize) % bytesPerChecksum;
}
if (remain > 0) {
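        // The rest of the buffer is full chunks, each followed by its inline
        // checksum, plus possibly one trailing partial chunk.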
int numFullChunks = remain / bytesPerChecksum;
chunkOffset = remain % bytesPerChecksum;
int bytesLeftInBuf = remain + checksumSize * numFullChunks;
if (chunkOffset > 0) {
// last chunk is partial
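          // remember its CRC so a later packet can extend it; the partial
          // checksum itself is still written out together with the data.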
partialCrcInt = DataChecksum.getIntFromBytes(pktBuf, dataOff
+ bytesLeftInBuf);
bytesLeftInBuf += checksumSize;
}
blockDataWriter.write(pktBuf, dataOff, bytesLeftInBuf);
}
} else {
throw new IOException("inline checksum doesn't support packet version "
+ packetVersion);
}
blockDataWriter.flush();
firstChunkOffset = chunkOffset;
}
  /**
   * Retrieves the position in the block file at which the next write will
   * write data.
   */
public long getChannelPosition() throws IOException {
return blockDataWriter.getChannelPosition();
}
@Override
public void setPosAndRecomputeChecksumIfNeeded(long offsetInBlock, DataChecksum checksum) throws IOException {
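    // Translate the block offset into the physical file length it implies,
    // counting one inline checksum per (possibly partial) chunk.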
long expectedFileLength = BlockInlineChecksumReader.getFileLengthFromBlockSize(
offsetInBlock, bytesPerChecksum, checksumSize);
if (getChannelPosition() == expectedFileLength) {
if (offsetInBlock % bytesPerChecksum != firstChunkOffset) {
throw new IOException("chunk Offset " + firstChunkOffset + " doesn't match offset in block " +
offsetInBlock+ " which should never happen.");
}
if (offsetInBlock % bytesPerChecksum != 0) {
// Previous packet is a partial chunk.
// If the position is the expected file length, we assume
// everything is fine and we just set to the correct position.
setChannelPosition(expectedFileLength - checksumSize, true);
}
return;
}
if (blockDataWriter != null) {
blockDataWriter.flush();
}
long positionToSeekTo = expectedFileLength;
// If this is a partial chunk, then read in pre-existing checksum
if (offsetInBlock % bytesPerChecksum != 0) {
// Previous packet is a partial chunk.
positionToSeekTo -= checksumSize;
LOG.info("setBlockPosition trying to set position to " + offsetInBlock
+ " for block " + block
+ " which is not a multiple of bytesPerChecksum " + bytesPerChecksum);
computePartialChunkCrc(offsetInBlock, bytesPerChecksum, checksum);
}
firstChunkOffset = (int) (offsetInBlock % bytesPerChecksum);
// set the position of the block file
if (LOG.isDebugEnabled()) {
LOG.debug("Changing block file offset of block " + block + " from "
+ getChannelPosition() + " to " + positionToSeekTo);
}
setChannelPosition(positionToSeekTo, firstChunkOffset > 0);
}
  /**
   * Sets the offset in the block file at which the next write will write
   * data.
   */
public void setChannelPosition(long dataOffset, boolean startWithPartialChunk)
throws IOException {
    long channelSize = blockDataWriter.getChannelSize();
    if (channelSize < dataOffset) {
      String msg = "Trying to change block file offset of block " + block
          + " file "
          + ((blockDataFile.getFile() != null) ? blockDataFile.getFile()
              : "unknown") + " to " + dataOffset
          + " but actual size of file is " + channelSize;
      throw new IOException(msg);
    }
if (startWithPartialChunk && channelSize != dataOffset + checksumSize) {
DFSClient.LOG.warn("Inline Checksum Block " + block + " channel size "
+ channelSize + " but packet needs to start from " + dataOffset);
}
// This flush should be a no-op since we always flush at the end of
// writePacket() and hence the buffer should be empty.
// However we do this just to be extra careful so that the
// channel.position() doesn't mess up things with respect to the
// buffered dataOut stream.
blockDataWriter.flush();
blockDataWriter.position(dataOffset);
}
  /**
   * Reads in the pre-existing data of the partial chunk and recomputes its
   * checksum.
   */
private void computePartialChunkCrc(long blkoff, int bytesPerChecksum,
DataChecksum checksum) throws IOException {
// find offset of the beginning of partial chunk.
//
int sizePartialChunk = (int) (blkoff % bytesPerChecksum);
int checksumSize = checksum.getChecksumSize();
long fileOff = BlockInlineChecksumReader.getPosFromBlockOffset(blkoff
- sizePartialChunk, bytesPerChecksum, checksumSize);
LOG.info("computePartialChunkCrc sizePartialChunk " + sizePartialChunk
+ " block " + block + " offset in block " + blkoff);
    // read the pre-existing partial data chunk and its checksum from the
    // block file into temporary buffers
    //
byte[] buf = new byte[sizePartialChunk];
byte[] crcbuf = new byte[checksumSize];
BlockDataFile.Reader blockReader = blockDataFile.getReader(datanode);
blockReader.readFully(buf, 0, sizePartialChunk, fileOff, true);
blockReader.readFully(crcbuf, 0, crcbuf.length, fileOff + sizePartialChunk,
true);
// compute crc of partial chunk from data read in the block file.
Checksum partialCrc = new CRC32();
partialCrc.update(buf, 0, sizePartialChunk);
LOG.info("Read in partial CRC chunk from disk for block " + block);
// paranoia! verify that the pre-computed crc matches what we
// recalculated just now
if (partialCrc.getValue() != FSInputChecker.checksum2long(crcbuf)) {
String msg = "Partial CRC " + partialCrc.getValue()
+ " does not match value computed the "
+ " last time file was closed "
+ FSInputChecker.checksum2long(crcbuf);
throw new IOException(msg);
}
partialCrcInt = (int) partialCrc.getValue();
}
  /**
   * Flush the data and checksum out to the stream. Pass forceSync = true to
   * make sure the data is actually written through to disk.
   *
   * @throws IOException
   */
@Override
public void flush(boolean forceSync)
throws IOException {
if (blockDataWriter != null) {
blockDataWriter.flush();
if (forceSync) {
blockDataWriter.force(true);
}
}
}
@Override
public void fileRangeSync(long lastBytesToSync, int flags) throws IOException {
if (blockDataWriter.hasChannel() && lastBytesToSync > 0) {
long channelPos = blockDataWriter.getChannelPosition();
long blockPos = BlockInlineChecksumReader.getBlockSizeFromFileLength(
channelPos, this.checksumType, this.bytesPerChecksum);
long startOffsetInBlock = blockPos - lastBytesToSync;
if (startOffsetInBlock < 0) {
startOffsetInBlock = 0;
}
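      // Map startOffsetInBlock back to a channel offset: every full chunk
      // before it carries an inline checksum, while bytes within the
      // trailing partial chunk map one to one.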
long lastChunkSizeForStartOffset = startOffsetInBlock % bytesPerChecksum;
long startOffsetInChannel = BlockInlineChecksumReader
.getFileLengthFromBlockSize(startOffsetInBlock
- lastChunkSizeForStartOffset, bytesPerChecksum, checksumSize)
+ lastChunkSizeForStartOffset;
if (LOG.isDebugEnabled()) {
LOG.debug("file_range_sync " + block + " channel position "
+ blockDataWriter.getChannelPosition() + " offset "
+ startOffsetInChannel);
}
blockDataWriter.syncFileRangeIfPossible(startOffsetInChannel, channelPos
- startOffsetInChannel, flags);
}
}
public void truncateBlock(long newBlockLen)
throws IOException {
if (newBlockLen == 0) {
// Special case for truncating to 0 length, since there's no previous
// chunk.
RandomAccessor ra = blockDataFile.getRandomAccessor();
try {
ra.setLength(BlockInlineChecksumReader.getHeaderSize());
} finally {
ra.close();
}
return;
}
DataChecksum dcs = DataChecksum.newDataChecksum(this.checksumType, this.bytesPerChecksum);
this.checksumSize = dcs.getChecksumSize();
long newBlockFileSize = BlockInlineChecksumReader
.getFileLengthFromBlockSize(newBlockLen, bytesPerChecksum, checksumSize);
int lastchunksize = (int) (newBlockLen % bytesPerChecksum);
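    // If the truncated block ends in a partial chunk, the checksum of that
    // chunk must be recomputed from its surviving bytes.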
RandomAccessor ra = blockDataFile.getRandomAccessor();
try {
// truncate blockFile
ra.setLength(newBlockFileSize);
if (lastchunksize != 0) {
// Calculate last partial checksum.
long lastchunkoffset = BlockInlineChecksumReader.getPosFromBlockOffset(
newBlockLen - lastchunksize, bytesPerChecksum, checksumSize);
byte[] b = new byte[Math.max(lastchunksize, checksumSize)];
// read last chunk
ra.seek(lastchunkoffset);
ra.readFully(b, 0, lastchunksize);
// compute checksum
dcs.update(b, 0, lastchunksize);
dcs.writeValue(b, 0, false);
ra.seek(newBlockFileSize - checksumSize);
ra.write(b, 0, checksumSize);
}
} finally {
ra.close();
}
}
@Override
public void close() throws IOException {
close(0);
}
public void close(int fadvise) throws IOException {
IOException ioe = null;
// close block file
try {
try {
flush(datanode.syncOnClose);
if (fadvise != 0) {
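          // A length of 0 is interpreted by posix_fadvise as "to the end of
          // the file", so this advises the whole block file.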
fadviseStream(fadvise, 0, 0, true);
}
} finally {
if (blockDataWriter != null) {
blockDataWriter.close();
blockDataWriter = null;
}
}
} catch (IOException e) {
ioe = e;
}
// disk check
// We don't check disk for ClosedChannelException as close() can be
// called twice and it is possible that out.close() throws.
// No need to check or recheck disk then.
//
if (ioe != null) {
if (!(ioe instanceof ClosedChannelException)) {
datanode.checkDiskError(ioe);
}
throw ioe;
}
}
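  /**
   * Builds the on-disk file name for an inline checksum block:
   * blk_(blockId)_(generation_stamp)_(version)_(checksum_type)_(bytes_per_checksum),
   * where (version) is FSDataset.FORMAT_VERSION_INLINECHECKSUM. For example
   * (illustrative values only), block id 123 with generation stamp 1001,
   * CRC32 checksums and 512 bytes per checksum yields a name like
   * blk_123_1001_(version)_1_512.
   */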
  public static String getInlineChecksumFileName(Block block, int checksumType,
      int bytesPerChecksum) {
assert checksumType != DataChecksum.CHECKSUM_UNKNOWN;
return block.getBlockName() + "_" + block.getGenerationStamp() + "_"
+ FSDataset.FORMAT_VERSION_INLINECHECKSUM + "_" + checksumType + "_"
+ bytesPerChecksum;
}
/**
* Only used for testing
*/
public BlockDataFile getBlockDataFile() {
return blockDataFile;
}
}