/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.net.SocketException;
import java.nio.channels.FileChannel;
import java.security.MessageDigest;
import java.util.Arrays;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockSender.InputStreamFactory;
import org.apache.hadoop.hdfs.server.datanode.BlockWithChecksumFileReader.MemoizedBlock;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.SocketOutputStream;
import org.apache.hadoop.util.ChecksumUtil;
import org.apache.hadoop.util.CrcConcat;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;
/**
* The class to read from inline checksum block file and stream it to
* output packet buffer. The expected block file name is:
* blk_(blockId)_(generation_id)
*
 * The file format is as follows:
* +---------------------------+
* | Checksum Header |
* +---------------------------+
* | |
* | Data for Chunk 1 |
* | ...... |
* | |
* +---------------------------+
* | Checksum for Chunk 1 |
* +---------------------------+
* | |
* | Data for Chunk 2 |
* | ...... |
* | |
* +---------------------------+
* | Checksum for Chunk 2 |
* +---------------------------+
* | |
* | Data for Chunk 3 |
* | . |
* | . |
* | . |
* | |
* +---------------------------+
* | Data for Last Chunk |
* | (Can be Partial) |
* +---------------------------+
* | Checksum for Last Chunk |
* +---------------------------+
*
* After the file header, chunks are saved. For every chunk, first data
* are saved, and then checksums.
*
*/
public class BlockInlineChecksumReader extends DatanodeBlockReader {
// Factory that supplies the reader over the underlying block data file.
private BlockInputStreamFactory streamFactory;
private BlockDataFile.Reader blockDataFileReader;
// Position within the block file; -1 until positioned.
// NOTE(review): not referenced by any code visible in this file -- confirm
// external use before removing.
long blockInPosition = -1;
// Remembers the block length seen at initialize() time so sendChunks() can
// detect concurrent growth of a non-finalized block.
MemoizedBlock memoizedBlock;
// Checksum parameters captured at construction; used to lazily build the
// DataChecksum in getChecksumToSend().
private int initChecksumType;
private int initBytesPerChecksum;
// Scratch buffer reused across sendChunks() calls to avoid per-packet
// allocation (grown on demand, never shrunk).
private byte[] tempBuffer = null;

/**
 * Creates a reader for an inline-checksum block file.
 *
 * @param namespaceId namespace the block belongs to
 * @param block the block to read
 * @param isFinalized whether the block is finalized (immutable)
 * @param ignoreChecksum if true, checksum bytes are not sent to the client
 * @param verifyChecksum if true, chunk checksums are verified while reading
 * @param corruptChecksumOk if true, checksum mismatches are tolerated
 * @param streamFactory source of the block data file reader
 * @param checksumType checksum algorithm id (see DataChecksum)
 * @param bytesPerChecksum data bytes covered by each checksum
 */
BlockInlineChecksumReader(int namespaceId, Block block,
    boolean isFinalized, boolean ignoreChecksum, boolean verifyChecksum,
    boolean corruptChecksumOk, BlockInputStreamFactory streamFactory,
    int checksumType, int bytesPerChecksum) {
  super(namespaceId, block, isFinalized, ignoreChecksum, verifyChecksum,
      corruptChecksumOk);
  this.streamFactory = streamFactory;
  this.initChecksumType = checksumType;
  this.initBytesPerChecksum = bytesPerChecksum;
}
/**
 * Forwards an fadvise hint for block byte range [offset, offset + len) to
 * the OS, after translating it into the corresponding byte range of the
 * inline-checksum file (which interleaves checksum bytes with the data).
 */
@Override
public void fadviseStream(int advise, long offset, long len)
    throws IOException {
  long fileOffset = BlockInlineChecksumReader.getPosFromBlockOffset(offset,
      bytesPerChecksum, checksumSize);
  // File length occupied by (offset + len) block bytes, minus the start
  // position, yields the file-range length including interleaved checksums.
  long fileLen = BlockInlineChecksumReader.getFileLengthFromBlockSize(len
      + offset, bytesPerChecksum, checksumSize)
      - fileOffset;
  blockDataFileReader.posixFadviseIfPossible(fileOffset, fileLen, advise);
}
/**
 * Returns the checksum object to advertise to the client for a block of
 * the given length, lazily constructing it from the parameters captured at
 * construction time. When the client asked to skip checksums, a NULL
 * checksum with the same chunk size is returned instead of the real one.
 */
@Override
public DataChecksum getChecksumToSend(long blockLength) throws IOException {
  if (checksum == null) {
    // First call: materialize the checksum from the recorded parameters.
    assert initChecksumType != DataChecksum.CHECKSUM_UNKNOWN;
    checksum = DataChecksum.newDataChecksum(initChecksumType,
        initBytesPerChecksum);
    super.getChecksumInfo(blockLength);
  }
  assert checksum != null;
  return ignoreChecksum
      ? DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_NULL,
          checksum.getBytesPerChecksum())
      : checksum;
}
/**
 * Computes the on-disk file length for a block of the given logical size:
 * the data bytes, one checksum per (possibly partial) chunk, and the file
 * header.
 *
 * @param blockSize logical number of data bytes in the block
 * @param bytesPerChecksum data bytes covered by each checksum
 * @param checksumSize size in bytes of one checksum
 * @return total file length in bytes
 */
public static long getFileLengthFromBlockSize(long blockSize,
    int bytesPerChecksum, int checksumSize) {
  long fullChunks = blockSize / bytesPerChecksum;
  // A trailing partial chunk still carries its own checksum.
  long numChunks = (blockSize % bytesPerChecksum == 0)
      ? fullChunks
      : fullChunks + 1;
  return blockSize + numChunks * checksumSize
      + BlockInlineChecksumReader.getHeaderSize();
}
/**
 * Maps a logical block offset to its byte position in the inline-checksum
 * file. Only chunk-boundary offsets are supported, because reads always
 * cover whole chunks.
 *
 * @param offsetInBlock block offset; must be a multiple of bytesPerChecksum
 * @param bytesPerChecksum data bytes covered by each checksum
 * @param checksumSize size in bytes of one checksum
 * @return byte position in the data file
 */
public static long getPosFromBlockOffset(long offsetInBlock, int bytesPerChecksum,
    int checksumSize) {
  // Only full-chunk reads are supported.
  assert offsetInBlock % bytesPerChecksum == 0;
  // A chunk boundary's file position equals the file length that a block of
  // exactly offsetInBlock bytes would occupy.
  return getFileLengthFromBlockSize(offsetInBlock, bytesPerChecksum,
      checksumSize);
}
/**
 * Prepares the reader for serving data: opens the block data file reader
 * and records the block length visible now, which MemoizedBlock later uses
 * to detect concurrent growth of a non-finalized block.
 *
 * NOTE(review): the offset parameter is unused in this implementation --
 * presumably required by the DatanodeBlockReader contract; confirm.
 */
public void initialize(long offset, long blockLength)
    throws IOException {
  blockDataFileReader = streamFactory.getBlockDataFileReader();
  memoizedBlock = new MemoizedBlock(blockLength);
}
/**
 * Zero-copy transferTo is never used for inline-checksum files: checksum
 * bytes are interleaved with the data on disk, so the file bytes cannot be
 * streamed to the wire verbatim.
 */
@Override
public boolean prepareTransferTo() throws IOException {
  return false;
}
/**
 * Reads [startOffset, startOffset + len) of the block from the
 * inline-checksum file, lays out chunk data and per-chunk checksums in
 * {@code buf} according to {@code packetVersion}, and writes the packet to
 * {@code out}.
 *
 * For PACKET_VERSION_CHECKSUM_FIRST the file bytes (data and checksum
 * interleaved per chunk) are read into a scratch buffer and then split so
 * that all checksums precede all data in the packet. For
 * PACKET_VERSION_CHECKSUM_INLINE the on-disk layout already matches the
 * packet layout, so the bytes are read directly into {@code buf}.
 *
 * @param out destination stream for the assembled packet
 * @param buf packet buffer; payload begins at bufStartOff
 * @param startOffset block offset of the first byte to send
 * @param bufStartOff offset in buf where the payload begins
 * @param numChunks number of chunks covered by this packet
 * @param len number of data bytes to send
 * @param crcUpdater optional accumulator for a whole-block CRC; may be null
 * @param packetVersion one of the PACKET_VERSION_* constants
 * @throws IOException on read failure, checksum mismatch (as
 *         ChecksumException), unknown packet version, or socket error
 */
@Override
public void sendChunks(OutputStream out, byte[] buf, long startOffset,
    int bufStartOff, int numChunks, int len, BlockCrcUpdater crcUpdater, int packetVersion)
    throws IOException {
  long offset = startOffset;
  long endOffset = startOffset + len;
  int checksumOff = bufStartOff;
  // No checksum bytes go into the packet when the client asked to skip them.
  int checksumLen = ignoreChecksum ? 0 : (numChunks * checksumSize);
  // On disk every chunk carries its checksum, so both are always read.
  int bytesToRead = len + checksumSize * numChunks;
  long offsetInFile = BlockInlineChecksumReader
      .getPosFromBlockOffset(offset, bytesPerChecksum, checksumSize);
  if (packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST) {
    // The packet wants all checksums before all data, but the file
    // interleaves them: read into a scratch buffer, then split chunk by
    // chunk.
    if (tempBuffer == null || tempBuffer.length < bytesToRead) {
      tempBuffer = new byte[bytesToRead];
    }
    blockDataFileReader.readFully(tempBuffer, 0, bytesToRead, offsetInFile,
        true);
    if (dnData != null) {
      dnData.recordReadChunkInlineTime();
    }
    int tempBufferPos = 0;
    // In the packet, the data region starts right after the checksum region.
    int dataOff = checksumOff + checksumLen;
    int remain = len;
    for (int i = 0; i < numChunks; i++) {
      assert remain > 0;
      // The final chunk may be shorter than bytesPerChecksum.
      int lenToRead = (remain > bytesPerChecksum) ? bytesPerChecksum : remain;
      System.arraycopy(tempBuffer, tempBufferPos, buf, dataOff, lenToRead);
      if (dnData != null) {
        dnData.recordCopyChunkDataTime();
      }
      tempBufferPos += lenToRead;
      if (!ignoreChecksum) {
        System.arraycopy(tempBuffer, tempBufferPos, buf, checksumOff,
            checksumSize);
        if (dnData != null) {
          dnData.recordCopyChunkChecksumTime();
        }
        if (crcUpdater != null) {
          // Fold this chunk's stored CRC into the running block CRC.
          crcUpdater.updateBlockCrc(offset + dataOff - bufStartOff
              - checksumLen, lenToRead,
              DataChecksum.getIntFromBytes(buf, checksumOff));
        }
      } else {
        if (crcUpdater != null) {
          // Without checksum bytes the block CRC cannot be maintained.
          crcUpdater.disable();
        }
      }
      // Skip past the checksum in the scratch buffer even when it was not
      // copied into the packet.
      tempBufferPos += checksumSize;
      if (verifyChecksum && !corruptChecksumOk) {
        checksum.reset();
        checksum.update(buf, dataOff, lenToRead);
        if (!checksum.compare(buf, checksumOff)) {
          throw new ChecksumException("Checksum failed at "
              + (offset + len - remain), len);
        }
        if (dnData != null) {
          dnData.recordVerifyCheckSumTime();
        }
      }
      dataOff += lenToRead;
      checksumOff += checksumSize;
      remain -= lenToRead;
    }
    // only recompute checksum if we can't trust the meta data due to
    // concurrent writes
    if ((checksumSize != 0 && endOffset % bytesPerChecksum != 0)
        && memoizedBlock.hasBlockChanged(endOffset)) {
      ChecksumUtil.updateChunkChecksum(buf, bufStartOff, bufStartOff
          + checksumLen, len, checksum);
    }
  } else if (packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_INLINE){
    // Packet layout matches file layout: a single read straight into buf.
    blockDataFileReader.readFully(buf, bufStartOff, bytesToRead,
        offsetInFile, true);
    if (dnData != null) {
      dnData.recordReadChunkInlineTime();
    }
    if (verifyChecksum && !corruptChecksumOk) {
      int dataOff = bufStartOff;
      int remain = len;
      for (int i = 0; i < numChunks; i++) {
        assert remain > 0;
        int lenToRead = (remain > bytesPerChecksum) ? bytesPerChecksum : remain;
        checksum.reset();
        checksum.update(buf, dataOff, lenToRead);
        dataOff += lenToRead;
        // Each chunk's checksum immediately follows its data bytes.
        if (!checksum.compare(buf, dataOff)) {
          throw new ChecksumException("Checksum failed at "
              + (offset + len - remain), len);
        }
        dataOff += checksumSize;
        remain -= lenToRead;
      }
      if (dnData != null) {
        dnData.recordVerifyCheckSumTime();
      }
    }
    // only recompute checksum if we can't trust the meta data due to
    // concurrent writes
    // NOTE(review): the argument order passed to updateChunkChecksum here
    // differs from the CHECKSUM_FIRST branch above -- verify both against
    // the ChecksumUtil.updateChunkChecksum signature.
    if ((checksumSize != 0 && endOffset % bytesPerChecksum != 0)
        && memoizedBlock.hasBlockChanged(endOffset)) {
      ChecksumUtil.updateChunkChecksum(buf, bufStartOff + len, bufStartOff,
          len, checksum);
      if (dnData != null) {
        dnData.recordUpdateChunkCheckSumTime();
      }
    }
  } else {
    throw new IOException("Unidentified packet version.");
  }
  try {
    out.write(buf, 0, bufStartOff + bytesToRead);
    if (dnData != null) {
      dnData.recordSendChunkToClientTime();
    }
  } catch (IOException e) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("IOException when reading block " + block + " offset "
          + offset, e);
    }
    throw BlockSender.ioeToSocketException(e);
  }
}
/**
 * The inline packet layout matches this reader's on-disk layout, so it is
 * the cheapest version to produce (single read, no buffer shuffling).
 */
@Override
public int getPreferredPacketVersion() {
  return DataTransferProtocol.PACKET_VERSION_CHECKSUM_INLINE;
}
/**
 * Releases resources held by this reader.
 *
 * This reader does not directly own any closeable resource: the underlying
 * BlockDataFile.Reader is obtained from (and owned by) the stream factory.
 * The method is retained to satisfy the DatanodeBlockReader contract; the
 * throws clause is kept for interface compatibility.
 */
@Override
public void close() throws IOException {
  // Nothing to release. (A previously declared local IOException was never
  // assigned and the conditional rethrow was dead code.)
}
/**
 * Helper used to decide whether the block's stored checksums can still be
 * trusted, by comparing the data file's current size against the block
 * length observed when this reader was initialized.
 */
class MemoizedBlock {
  // visible block length at the time this reader was initialized
  private long blockLength;

  private MemoizedBlock(long blockLength)
      throws IOException {
    this.blockLength = blockLength;
  }

  /**
   * Returns true iff the data file's current size equals the size that a
   * block of {@code blockLength} bytes occupies on disk (header + data +
   * interleaved checksums).
   */
  boolean isChannelSizeMatchBlockLength() throws IOException {
    long currentLength = blockDataFileReader.size();
    return (currentLength == BlockInlineChecksumReader
        .getFileLengthFromBlockSize(blockLength, bytesPerChecksum,
            checksumSize));
  }

  /**
   * Returns true when the block may have grown since this reader was
   * created, i.e. the stored CRC data can no longer be trusted.
   *
   * NOTE(review): the endOffset parameter is currently unused; the original
   * comment here said "always return true" while the code returns false for
   * finalized blocks -- the documentation below describes the actual
   * behavior, confirm intent against append() handling.
   */
  boolean hasBlockChanged(long endOffset) throws IOException {
    if (isFinalized) {
      // A finalized block is treated as immutable: report "unchanged" so
      // the stored checksums are used as-is, and rely on checksum
      // verification to catch any corruption.
      return false;
    }
    // Block still being written: any size mismatch against the length seen
    // at open time means concurrent writes may have invalidated the CRCs.
    return !isChannelSizeMatchBlockLength();
  }
}
/**
 * Implements a scatter/gather-style read for the legacy "block accelerator"
 * protocol: the client expects the checksum header, then all chunk
 * checksums, then all chunk data. Because the inline-checksum file
 * interleaves each chunk's checksum with its data, the file is traversed
 * twice -- once collecting checksums, once collecting data -- so this path
 * is not necessarily faster than a normal read; it exists for backward
 * compatibility.
 *
 * @param s socket connected back to the requesting client
 * @param replica the replica being served
 * @param dataFile inline-checksum data file for the block
 * @param block the block being served
 * @param startOffset requested start offset within the block
 * @param length requested number of bytes
 * @param datanode owning datanode (for metrics and socket timeout)
 * @return number of block bytes sent (after chunk alignment), or -1 if the
 *         client closed the connection
 * @throws IOException on read or transfer failure
 */
static long readBlockAccelerator(Socket s, ReplicaToRead replica,
    File dataFile, Block block, long startOffset, long length,
    DataNode datanode) throws IOException {
  FileInputStream datain = new FileInputStream(dataFile);
  FileChannel dch = datain.getChannel();
  int type = replica.getChecksumType();
  int bytesPerChecksum = replica.getBytesPerChecksum();
  long checksumSize = DataChecksum.getChecksumSizeByType(type);
  DataChecksum checksum = DataChecksum.newDataChecksum(type, bytesPerChecksum);
  // align the startOffset with the previous bytesPerChecksum boundary.
  long delta = startOffset % bytesPerChecksum;
  startOffset -= delta;
  length += delta;
  // align the length to encompass the entire last checksum chunk
  delta = length % bytesPerChecksum;
  if (delta != 0) {
    delta = bytesPerChecksum - delta;
    length += delta;
  }
  // Compute chunk positions in the file.
  // NOTE(review): positions are computed without adding the file header
  // size; this relies on getHeaderSize() being 0 -- revisit if the header
  // ever becomes non-empty.
  long startChunkNumber = startOffset / bytesPerChecksum;
  long numChunks = length / bytesPerChecksum;
  // get a connection back to the client
  SocketOutputStream out = new SocketOutputStream(s, datanode.socketWriteTimeout);
  try {
    // Write checksum information
    checksum.writeHeader(new DataOutputStream(out));
    // First pass -- transfer checksums: for each chunk, skip over its data
    // bytes and send the trailing checksum bytes.
    int remain = (int) length;
    long pos = startChunkNumber * (bytesPerChecksum + checksumSize);
    for (int i = 0; i < numChunks; i++) {
      assert remain > 0;
      int lenToRead = (remain > bytesPerChecksum) ? bytesPerChecksum : remain;
      pos += lenToRead;
      // NOTE(review): transferTo() uses its explicit position argument, so
      // this position() call appears redundant -- confirm before removing.
      dch.position(pos);
      long val = dch.transferTo(pos, checksumSize, out);
      if (val != checksumSize) {
        String msg = "readBlockAccelerator for block " + block +
            " at offset " + pos +
            " Cannot read the full checksum.";
        LOG.warn(msg);
        throw new IOException(msg);
      }
      pos += checksumSize;
      remain -= lenToRead;
    }
    // Second pass -- transfer data: send each chunk's data bytes, skipping
    // the interleaved checksum that follows it.
    remain = (int) length;
    pos = startChunkNumber * (bytesPerChecksum + checksumSize);
    for (int i = 0; i < numChunks; i++) {
      assert remain > 0;
      dch.position(pos);
      int lenToRead = (remain > bytesPerChecksum) ? bytesPerChecksum : remain;
      long val = dch.transferTo(pos, lenToRead, out);
      if (val != lenToRead) {
        String msg = "readBlockAccelerator for block " + block +
            " at offset " + pos +
            " Cannot read a full chunk.";
        LOG.warn(msg);
        throw new IOException(msg);
      }
      pos += lenToRead + checksumSize;
      remain -= lenToRead;
    }
    return length;
  } catch ( SocketException ignored ) {
    // Its ok for remote side to close the connection anytime.
    datanode.myMetrics.blocksRead.inc();
    return -1;
  } catch ( IOException ioe ) {
    /* What exactly should we do here?
     * Earlier version shutdown() datanode if there is disk error.
     */
    LOG.warn(datanode.getDatanodeInfo() +
        ":readBlockAccelerator:Got exception while serving " +
        block + " to " +
        s.getInetAddress() + ":\n" +
        StringUtils.stringifyException(ioe) );
    throw ioe;
  } finally {
    IOUtils.closeStream(out);
    IOUtils.closeStream(datain);
  }
}
/**
 * Computes the CRC checksum of the whole block by concatenating the stored
 * per-chunk CRCs (via CrcConcat), reading only the inline checksum bytes
 * and skipping over the data bytes.
 *
 * @param datanode datanode serving the block (used to report corruption)
 * @param ri replica to read
 * @param namespaceId namespace of the block
 * @param block the block whose CRC is wanted
 * @return the concatenated CRC32 over the block's data
 * @throws IOException if the replica is not CRC32-checksummed, appears
 *         truncated/corrupt, or cannot be read
 */
static public int getBlockCrc(DataNode datanode, ReplicaToRead ri,
    int namespaceId, Block block) throws IOException {
  InputStream rawStreamIn = null;
  DataInputStream streamIn = null;
  int blockCrc = 0;
  try {
    int bytesPerCRC;
    int checksumSize;
    bytesPerCRC = ri.getBytesPerChecksum();
    int checksumType = ri.getChecksumType();
    // CRC concatenation below is only defined for CRC32.
    if (checksumType != DataChecksum.CHECKSUM_CRC32) {
      throw new IOException("File Checksum now is only supported for CRC32");
    }
    DataChecksum dataChecksum = DataChecksum.newDataChecksum(checksumType,
        bytesPerCRC);
    checksumSize = dataChecksum.getChecksumSize();
    rawStreamIn = ri.getBlockInputStream(datanode, 0);
    streamIn = new DataInputStream(new BufferedInputStream(rawStreamIn,
        FSConstants.BUFFER_SIZE));
    IOUtils.skipFully(streamIn, BlockInlineChecksumReader.getHeaderSize());
    // Bytes of interleaved (data + checksum) chunks after the header.
    long lengthLeft = ((FileInputStream) rawStreamIn).getChannel().size()
        - BlockInlineChecksumReader.getHeaderSize();
    if (lengthLeft == 0) {
      // Empty block: CRC over zero bytes.
      blockCrc = (int) dataChecksum.getValue();
    } else {
      byte[] buffer = new byte[checksumSize];
      boolean firstChecksum = true;
      while (lengthLeft > 0) {
        long dataByteLengh;
        if (lengthLeft >= bytesPerCRC + checksumSize) {
          // Full chunk: bytesPerCRC data bytes plus its checksum.
          lengthLeft -= bytesPerCRC + checksumSize;
          dataByteLengh = bytesPerCRC;
        } else if (lengthLeft > checksumSize) {
          // Trailing partial chunk: whatever data remains plus checksum.
          dataByteLengh = lengthLeft - checksumSize;
          lengthLeft = 0;
        } else {
          // Not even room for one checksum: the file is truncated.
          // report to name node the corruption.
          DataBlockScanner.reportBadBlocks(block, namespaceId, datanode);
          throw new IOException("File for namespace " + namespaceId
              + " block " + block + " seems to be corrupted");
        }
        // Skip the chunk's data and read only its stored checksum.
        IOUtils.skipFully(streamIn, dataByteLengh);
        IOUtils.readFully(streamIn, buffer, 0, buffer.length);
        int intChecksum = DataChecksum.getIntFromBytes(buffer, 0);
        if (firstChecksum) {
          blockCrc = intChecksum;
          firstChecksum = false;
        } else {
          // Fold this chunk's CRC into the accumulated block CRC.
          blockCrc = CrcConcat.concatCrc(blockCrc, intChecksum,
              (int) dataByteLengh);
        }
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("block=" + block + ", bytesPerCRC=" + bytesPerCRC
          + ", crc=" + blockCrc);
    }
    return blockCrc;
  } finally {
    IOUtils.closeStream(streamIn);
    IOUtils.closeStream(rawStreamIn);
  }
}
/**
 * Derives the logical block size from the length of an inline-checksum
 * file: total length minus the header and minus one checksum per chunk.
 * Logs a warning (but still returns a value) when the length is not
 * consistent with the interleaved chunk layout.
 *
 * @param fileSize on-disk file length in bytes
 * @param checksumType checksum algorithm id (must not be UNKNOWN)
 * @param bytesPerChecksum data bytes covered by each checksum
 * @return the number of data bytes in the block
 */
static public long getBlockSizeFromFileLength(long fileSize, int checksumType,
    int bytesPerChecksum) {
  assert checksumType != DataChecksum.CHECKSUM_UNKNOWN;
  long header = BlockInlineChecksumReader.getHeaderSize();
  if (fileSize <= header) {
    // No payload beyond the header: empty block.
    return 0;
  }
  long crcSize = DataChecksum.getChecksumSizeByType(checksumType);
  long payload = fileSize - header;
  // Number of (data + checksum) chunks, counting a trailing partial chunk.
  long numChunks = (payload - 1) / (bytesPerChecksum + crcSize) + 1;
  // Sanity bound: the last chunk must hold at least one data byte.
  long minValidSize = header + crcSize * numChunks
      + bytesPerChecksum * (numChunks - 1);
  if (fileSize <= minValidSize) {
    DataNode.LOG.warn("Block File has wrong size: size " + fileSize
        + " checksumType: " + checksumType + " bytesPerChecksum"
        + bytesPerChecksum);
  }
  return payload - crcSize * numChunks;
}
/**
 * Simple holder for the generation stamp and checksum parameters parsed
 * from an inline-checksum block file name.
 */
public static class GenStampAndChecksum {
  long generationStamp;
  int checksumType;
  int bytesPerChecksum;

  public GenStampAndChecksum(long generationStamp, int checksumType,
      int bytesPerChecksum) {
    this.generationStamp = generationStamp;
    this.checksumType = checksumType;
    this.bytesPerChecksum = bytesPerChecksum;
  }

  /** @return the checksum algorithm id */
  public int getChecksumType() {
    return checksumType;
  }

  /** @return data bytes covered by each checksum */
  public int getBytesPerChecksum() {
    return bytesPerChecksum;
  }
}
/**
 * Parses an inline-checksum block file name of the form
 * blk_(id)_(genStamp)_(version)_(checksumType)_(bytesPerChecksum) and
 * returns its generation stamp and checksum parameters.
 *
 * @throws IOException if the name does not have exactly six '_'-separated
 *         fields or carries an unsupported format version
 */
public static GenStampAndChecksum getGenStampAndChecksumFromInlineChecksumFile(
    String fileName) throws IOException {
  String[] parts = StringUtils.split(fileName, '_');
  // Expected fields: blk, blockId, genStamp, version, checksumType,
  // bytesPerChecksum.
  if (parts.length != 6) {
    throw new IOException("unidentified block name format: " + fileName);
  }
  // Only the current inline-checksum metadata version is understood.
  if (Integer.parseInt(parts[3]) != FSDataset.FORMAT_VERSION_INLINECHECKSUM) {
    throw new IOException("Unsupported format version for file "
        + fileName);
  }
  return new GenStampAndChecksum(Long.parseLong(parts[2]),
      Integer.parseInt(parts[4]), Integer.parseInt(parts[5]));
}
/**
 * Extracts the generation stamp (third '_'-separated field) from an
 * inline-checksum block file name.
 *
 * @throws IOException if the name does not have exactly six fields
 */
static long getGenerationStampFromInlineChecksumFile(String blockName)
    throws IOException {
  String[] parts = StringUtils.split(blockName, '_');
  // Expected fields: blk, blockId, genStamp, version, checksumType,
  // bytesPerChecksum.
  if (parts.length != 6) {
    throw new IOException("unidentified block name format: " + blockName);
  }
  return Long.parseLong(parts[2]);
}
/**
 * Returns the size in bytes of the header at the start of an
 * inline-checksum data file. The current format has no header, but callers
 * consistently go through this method so a future header only needs a
 * change here.
 */
public static int getHeaderSize() {
  return 0;
}
}