/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.net.SocketException;
import java.nio.channels.FileChannel;
import java.util.Arrays;

import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.SocketOutputStream;
import org.apache.hadoop.util.ChecksumUtil;
import org.apache.hadoop.util.CrcConcat;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;

/**
 * Read from blocks with separate checksum files.
 * Block file name:
 * blk_(blockId)
 *
 * Checksum file name:
 * blk_(blockId)_(generation_stamp).meta
 *
 * The on disk file format is:
 * Data file keeps just data in the block:
 *
 *  +---------------+
 *  |               |
 *  |     Data      |
 *  |      .        |
 *  |      .        |
 *  |      .        |
 *  |      .        |
 *  |      .        |
 *  |      .        |
 *  |               |
 *  +---------------+
 *
 *  Checksum file:
 *  +----------------------+
 *  |   Checksum Header    |
 *  +----------------------+
 *  | Checksum for Chunk 1 |
 *  +----------------------+
 *  | Checksum for Chunk 2 |
 *  +----------------------+
 *  |          .           |
 *  |          .           |
 *  |          .           |
 *  +----------------------+
 *  |  Checksum for last   |
 *  |   Chunk (Partial)    |
 *  +----------------------+
 *
 */
public class BlockWithChecksumFileReader extends DatanodeBlockReader {
  /** Source of the checksum stream and of the block data file reader. */
  private InputStreamWithChecksumFactory streamFactory;
  private DataInputStream checksumIn; // checksum datastream
  /** Reader for the block data file; assigned in {@link #initialize}. */
  private BlockDataFile.Reader blockDataFileReader;
  /** Set by {@link #prepareTransferTo()}; selects the transferTo() path. */
  boolean useTransferTo = false;
  /** Tracks whether stored checksums can be trusted; assigned in {@link #initialize}. */
  MemoizedBlock memoizedBlock;

  /**
   * Creates a reader and opens the checksum stream through the factory.
   *
   * @param streamFactory provides the checksum stream now and the block data
   *          file reader later, in {@link #initialize}
   * @throws IOException if the checksum stream cannot be obtained
   */
  BlockWithChecksumFileReader(int namespaceId, Block block,
      boolean isFinalized, boolean ignoreChecksum,
      boolean verifyChecksum, boolean corruptChecksumOk,
      InputStreamWithChecksumFactory streamFactory) throws IOException {
    super(namespaceId, block, isFinalized, ignoreChecksum, verifyChecksum,
        corruptChecksumOk);
    this.streamFactory = streamFactory;
    this.checksumIn = streamFactory.getChecksumStream();
    this.block = block;
  }

  /**
   * Forwards an fadvise hint for the given range to the block data file
   * reader. Requires {@link #initialize} to have been called first, since that
   * is where the reader is assigned.
   */
  @Override
  public void fadviseStream(int advise, long offset, long len)
      throws IOException {
    blockDataFileReader.posixFadviseIfPossible(offset, len, advise);
  }

  /** Drops the checksum stream and switches to a NULL-type checksum. */
  private void initializeNullChecksum() {
    checksumIn = null;
    // This only decides the buffer size. Use BUFFER_SIZE?
    checksum = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_NULL,
        16 * 1024);
  }

  /**
   * Reads the metadata header from the checksum stream and returns the
   * checksum to be used for this block.
   *
   * <p>If the header cannot be read and {@code blockLength} is zero, or if
   * there is no checksum stream and corrupt checksums are tolerated, a NULL
   * checksum is used instead.
   *
   * @param blockLength length of the block, passed on to
   *          {@code getChecksumInfo}
   * @return the checksum selected for this block
   * @throws IOException if the header cannot be read for a non-empty block
   */
  public DataChecksum getChecksumToSend(long blockLength) throws IOException {
    if (!corruptChecksumOk || checksumIn != null) {
      // read and handle the common header here. For now just a version
      try {
        BlockMetadataHeader header = BlockMetadataHeader.readHeader(checksumIn);
        short version = header.getVersion();

        if (version != FSDataset.FORMAT_VERSION_NON_INLINECHECKSUM) {
          LOG.warn("Wrong version (" + version + ") for metadata file for "
              + block + " ignoring ...");
        }
        checksum = header.getChecksum();
      } catch (IOException ioe) {
        if (blockLength == 0) {
          initializeNullChecksum();
        } else {
          throw ioe;
        }
      }
    } else {
      LOG.warn("Could not find metadata file for " + block);
      initializeNullChecksum();
    }
    super.getChecksumInfo(blockLength);
    return checksum;
  }

  /**
   * Positions the checksum stream at the chunk containing {@code offset} and
   * obtains the block data file reader and the {@link MemoizedBlock}.
   *
   * @param offset starting offset into the block
   * @param blockLength visible block length remembered for change detection
   * @throws IOException if skipping in the checksum stream or obtaining the
   *           data file reader fails
   */
  public void initialize(long offset, long blockLength)
      throws IOException {
    // seek to the right offsets
    if (offset > 0) {
      long checksumSkip = (offset / bytesPerChecksum) * checksumSize;
      // note blockInStream is seeked when created below
      if (checksumSkip > 0) {
        // Should we use seek() for checksum file as well?
        IOUtils.skipFully(checksumIn, checksumSkip);
      }
    }

    blockDataFileReader = streamFactory.getBlockDataFileReader();
    memoizedBlock = new MemoizedBlock(blockLength, streamFactory, block);
  }

  /**
   * Switches this reader to the transferTo() path.
   *
   * @return always {@code true}
   */
  public boolean prepareTransferTo() throws IOException {
    useTransferTo = true;
    return useTransferTo;
  }

  /**
   * Sends one packet worth of chunks to {@code out}.
   *
   * <p>Buffer layout: the checksums of the {@code numChunks} chunks are placed
   * at {@code buf[checksumOff ..]} and the data directly after them; bytes
   * {@code [0, checksumOff)} are written out as they were supplied by the
   * caller.
   *
   * @param out destination; must be a {@link SocketOutputStream} when
   *          transferTo() is in use
   * @param buf packet buffer
   * @param offset offset of the first data byte within the block
   * @param checksumOff position in {@code buf} where checksums start
   * @param numChunks number of chunks in this packet
   * @param len number of data bytes in this packet
   * @param crcUpdater if non-null, receives the CRC of every chunk read
   * @param packetVersion must be
   *          {@link DataTransferProtocol#PACKET_VERSION_CHECKSUM_FIRST}
   * @throws IOException on unsupported packet version, on checksum read
   *           failure when corrupt checksums are not tolerated, on checksum
   *           mismatch when verifying, or on read/write failure
   */
  @Override
  public void sendChunks(OutputStream out, byte[] buf, long offset,
      int checksumOff, int numChunks, int len, BlockCrcUpdater crcUpdater,
      int packetVersion) throws IOException {
    if (packetVersion != DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST) {
      throw new IOException("packet version " + packetVersion
          + " is not supported by non-inline checksum blocks.");
    }
    int checksumLen = numChunks * checksumSize;

    if (checksumSize > 0 && checksumIn != null) {
      try {
        checksumIn.readFully(buf, checksumOff, checksumLen);
        if (dnData != null) {
          dnData.recordReadChunkCheckSumTime();
        }

        if (crcUpdater != null) {
          long tempOffset = offset;
          long remain = len;
          for (int i = 0; i < checksumLen; i += checksumSize) {
            long chunkSize = (remain > bytesPerChecksum) ? bytesPerChecksum
                : remain;
            crcUpdater.updateBlockCrc(tempOffset, (int) chunkSize,
                DataChecksum.getIntFromBytes(buf, checksumOff + i));
            remain -= chunkSize;
            // Advance to the next chunk so each CRC is reported at its own
            // offset rather than at the start of the packet.
            tempOffset += chunkSize;
          }
        }
      } catch (IOException e) {
        LOG.warn(" Could not read or failed to veirfy checksum for data"
            + " at offset " + offset + " for block " + block + " got : "
            + StringUtils.stringifyException(e));
        IOUtils.closeStream(checksumIn);
        checksumIn = null;
        if (corruptChecksumOk) {
          if (checksumLen > 0) {
            // Just fill the checksum area with zeros. The end index of
            // Arrays.fill is exclusive and absolute, hence off + len.
            Arrays.fill(buf, checksumOff, checksumOff + checksumLen, (byte) 0);
            if (dnData != null) {
              dnData.recordReadChunkCheckSumTime();
            }
          }
        } else {
          throw e;
        }
      }
    }

    int dataOff = checksumOff + checksumLen;

    if (!useTransferTo) {
      // normal transfer
      blockDataFileReader.readFully(buf, dataOff, len, offset, true);
      if (dnData != null) {
        dnData.recordReadChunkDataTime();
      }

      if (verifyChecksum) {
        int dOff = dataOff;
        int cOff = checksumOff;
        int dLeft = len;

        for (int i = 0; i < numChunks; i++) {
          checksum.reset();
          int dLen = Math.min(dLeft, bytesPerChecksum);
          checksum.update(buf, dOff, dLen);
          if (!checksum.compare(buf, cOff)) {
            throw new ChecksumException("Checksum failed at "
                + (offset + len - dLeft), len);
          }
          dLeft -= dLen;
          dOff += dLen;
          cOff += checksumSize;
        }
        if (dnData != null) {
          dnData.recordVerifyCheckSumTime();
        }
      }

      // only recompute checksum if we can't trust the meta data due to
      // concurrent writes
      if (memoizedBlock.hasBlockChanged(len, offset)) {
        ChecksumUtil.updateChunkChecksum(buf, checksumOff, dataOff, len,
            checksum);
        if (dnData != null) {
          dnData.recordUpdateChunkCheckSumTime();
        }
      }

      try {
        out.write(buf, 0, dataOff + len);
        if (dnData != null) {
          dnData.recordSendChunkToClientTime();
        }
      } catch (IOException e) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("IOException when reading block " + block + " offset "
              + offset, e);
        }
        throw BlockSender.ioeToSocketException(e);
      }
    } else {
      try {
        // use transferTo(). Checks on out and blockIn are already done.
        SocketOutputStream sockOut = (SocketOutputStream) out;
        if (memoizedBlock.hasBlockChanged(len, offset)) {
          // Stored checksums cannot be trusted: read the data into the
          // buffer, recompute the checksums and send everything in one write.
          blockDataFileReader.readFully(buf, dataOff, len, offset, true);
          if (dnData != null) {
            dnData.recordReadChunkDataTime();
          }

          ChecksumUtil.updateChunkChecksum(buf, checksumOff, dataOff, len,
              checksum);
          if (dnData != null) {
            dnData.recordUpdateChunkCheckSumTime();
          }

          sockOut.write(buf, 0, dataOff + len);
          if (dnData != null) {
            dnData.recordSendChunkToClientTime();
          }
        } else {
          // first write the packet
          sockOut.write(buf, 0, dataOff);
          // no need to flush. since we know out is not a buffered stream.
          blockDataFileReader.transferToSocketFully(sockOut, offset, len);
          if (dnData != null) {
            dnData.recordTransferChunkToClientTime();
          }
        }
      } catch (IOException e) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("IOException when reading block " + block + " offset "
              + offset, e);
        }
        /*
         * exception while writing to the client (well, with transferTo(), it
         * could also be while reading from the local file).
         */
        throw BlockSender.ioeToSocketException(e);
      }
    }
  }

  /**
   * @return {@link DataTransferProtocol#PACKET_VERSION_CHECKSUM_FIRST}, the
   *         only packet version {@link #sendChunks} accepts
   */
  @Override
  public int getPreferredPacketVersion() {
    return DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST;
  }

  /**
   * Closes the checksum stream, if one is open.
   *
   * @throws IOException if closing the checksum stream fails; the stream
   *           reference is cleared either way
   */
  public void close() throws IOException {
    IOException ioe = null;

    // close checksum file
    if (checksumIn != null) {
      try {
        checksumIn.close();
      } catch (IOException e) {
        ioe = e;
      }
      checksumIn = null;
    }

    // throw IOException if there is any
    if (ioe != null) {
      throw ioe;
    }
  }

  /**
   * helper class used to track if a block's meta data is verifiable or not
   */
  class MemoizedBlock {
    // visible block length
    private long blockLength;
    private final Block block;
    private final InputStreamWithChecksumFactory isf;

    private MemoizedBlock(long blockLength,
        InputStreamWithChecksumFactory isf, Block block) {
      this.blockLength = blockLength;
      this.isf = isf;
      this.block = block;
    }

    /**
     * Tells whether the stored checksum for the requested range may be stale.
     *
     * <p>logic: if we are starting or ending on a partial chunk and the block
     * has more data than we were told at construction, the block has
     * 'changed' in a way that we care about (ie, we can't trust crc data)
     *
     * @param dataLen number of data bytes being sent
     * @param offset offset into the block
     * @return {@code true} if the checksums must be recomputed
     */
    boolean hasBlockChanged(long dataLen, long offset) throws IOException {
      if (isFinalized) {
        // A block that was finalized at open time is always reported as
        // unchanged, so its stored checksums are sent as they are and never
        // recomputed here. A finalized block whose size no longer matches is
        // treated as an error case; there might be false positives for the
        // append() case. We made the trade-off to avoid false negatives:
        // data integrity is left to checksum checking.
        return false;
      }

      boolean partialChunk = offset % bytesPerChecksum != 0
          || dataLen % bytesPerChecksum != 0;

      // check if we are using transferTo since we tell if the file has changed
      // (blockInPosition >= 0 => we are using transferTo and File Channels
      if (useTransferTo) {
        long currentLength = blockDataFileReader.size();

        return partialChunk && currentLength > blockLength;
      } else {
        FSDatasetInterface ds = null;
        if (isf instanceof DatanodeBlockReader.BlockInputStreamFactory) {
          ds = ((DatanodeBlockReader.BlockInputStreamFactory) isf).getDataset();
        }

        // offset is the offset into the block
        return partialChunk
            && ds != null
            && ds.getOnDiskLength(namespaceId, block) > blockLength;
      }
    }
  }

  /**
   * Factory that, in addition to what {@link BlockSender.InputStreamFactory}
   * offers, can create a data stream at an offset and hand out the checksum
   * stream.
   */
  public static interface InputStreamWithChecksumFactory extends
      BlockSender.InputStreamFactory {
    public InputStream createStream(long offset) throws IOException;

    public DataInputStream getChecksumStream() throws IOException;
  }

  /**
   * Find the metadata file for the specified block file.
   * Return the generation stamp from the name of the metafile.
   *
   * @param listdir names of the files in the block's directory
   * @param blockName name of the block file, e.g. {@code blk_<blkid>}
   * @return the generation stamp parsed from the first matching meta file
   *         name, or {@link Block#GRANDFATHER_GENERATION_STAMP} if none
   *         matches
   */
  static long getGenerationStampFromSeperateChecksumFile(String[] listdir,
      String blockName) {
    // The separator is part of the prefix so that "blk_1" does not match a
    // meta file that belongs to "blk_12".
    final String metaPrefix = blockName + "_";
    for (int j = 0; j < listdir.length; j++) {
      String path = listdir[j];
      if (!path.startsWith(metaPrefix)) {
        continue;
      }
      String[] vals = StringUtils.split(path, '_');
      if (vals.length != 3) { // blk, blkid, genstamp.meta
        continue;
      }
      String[] str = StringUtils.split(vals[2], '.');
      if (str.length != 2) {
        continue;
      }
      return Long.parseLong(str[0]);
    }
    DataNode.LOG.warn("Block " + blockName + " does not have a metafile!");
    return Block.GRANDFATHER_GENERATION_STAMP;
  }

  /**
   * Find generation stamp from block file and meta file.
   *
   * <p>The stamp is the part of the meta file name between the block file
   * name plus one separator character and the metadata extension.
   *
   * @param blockFile the block data file
   * @param metaFile the meta file of that block
   * @return the generation stamp encoded in the meta file name
   * @throws IOException if that part of the name is not a number
   */
  static long parseGenerationStampInMetaFile(File blockFile, File metaFile
      ) throws IOException {
    String metaname = metaFile.getName();
    String gs = metaname.substring(blockFile.getName().length() + 1,
        metaname.length() - FSDataset.METADATA_EXTENSION.length());
    try {
      return Long.parseLong(gs);
    } catch (NumberFormatException nfe) {
      throw (IOException) new IOException("blockFile=" + blockFile
          + ", metaFile=" + metaFile).initCause(nfe);
    }
  }

  /**
   * This class provides the input stream and length of the metadata
   * of a block
   *
   */
  static class MetaDataInputStream extends FilterInputStream {
    MetaDataInputStream(InputStream stream, long len) {
      super(stream);
      length = len;
    }

    private long length;

    public long getLength() {
      return length;
    }
  }

  /**
   * Resolves the meta file that goes with block {@code b} by looking up its
   * block file in {@code dataset}.
   */
  static protected File getMetaFile(FSDatasetInterface dataset, int namespaceId,
      Block b) throws IOException {
    return BlockWithChecksumFileWriter.getMetaFile(
        dataset.getBlockFile(namespaceId, b), b);
  }

  /**
   * Does the meta file exist for this block?
   * @param dataset - the dataset used to locate the block file
   * @param namespaceId - parent namespace id
   * @param b - the block
   * @return true of the metafile for specified block exits
   * @throws IOException
   */
  static public boolean metaFileExists(FSDatasetInterface dataset,
      int namespaceId, Block b) throws IOException {
    return getMetaFile(dataset, namespaceId, b).exists();
  }

  /**
   * Returns metaData of block b as an input stream (and its length)
   * @param dataset - the dataset used to locate the block file
   * @param namespace - parent namespace id
   * @param b - the block
   * @return the metadata input stream; the caller is responsible for
   *         closing it
   * @throws IOException
   */
  static public MetaDataInputStream getMetaDataInputStream(
      FSDatasetInterface dataset, int namespace, Block b) throws IOException {
    File checksumFile = getMetaFile(dataset, namespace, b);
    return new MetaDataInputStream(new FileInputStream(checksumFile),
        checksumFile.length());
  }

  /**
   * Reads the entire meta file of a block into memory.
   *
   * @return the raw bytes of the meta file
   * @throws IOException if the file is empty, is 2 GiB or larger, or cannot
   *           be read
   */
  static byte[] getMetaData(FSDatasetInterface dataset, int namespaceId,
      Block block) throws IOException {
    MetaDataInputStream checksumIn = null;
    try {
      checksumIn = getMetaDataInputStream(dataset, namespaceId, block);

      long fileSize = checksumIn.getLength();

      if (fileSize >= 1L << 31 || fileSize <= 0) {
        throw new IOException("Unexpected size for checksumFile of block"
            + block);
      }

      byte[] buf = new byte[(int) fileSize];
      IOUtils.readFully(checksumIn, buf, 0, buf.length);
      return buf;
    } finally {
      IOUtils.closeStream(checksumIn);
    }
  }

  /**
   * Calculate CRC Checksum of the whole block. Implemented by concatenating
   * checksums of all the chunks.
   *
   * @param datanode datanode whose dataset holds the block
   * @param ri replica whose visible length determines the size of the last
   *          chunk
   * @param namespaceId parent namespace id
   * @param block the block
   * @return the CRC of the whole block, or 0 if the meta file holds no chunk
   *         checksums
   * @throws IOException if the meta file cannot be read or its checksum type
   *           is not CRC32
   */
  static public int getBlockCrc(DataNode datanode, ReplicaToRead ri,
      int namespaceId, Block block) throws IOException {
    InputStream rawStreamIn = null;
    DataInputStream streamIn = null;
    try {
      int bytesPerCRC;
      int checksumSize;

      long crcPerBlock;

      rawStreamIn = BlockWithChecksumFileReader.getMetaDataInputStream(
          datanode.data, namespaceId, block);
      streamIn = new DataInputStream(new BufferedInputStream(rawStreamIn,
          FSConstants.BUFFER_SIZE));

      final BlockMetadataHeader header = BlockMetadataHeader
          .readHeader(streamIn);
      final DataChecksum checksum = header.getChecksum();
      if (checksum.getChecksumType() != DataChecksum.CHECKSUM_CRC32) {
        throw new IOException("File Checksum now is only supported for CRC32");
      }

      bytesPerCRC = checksum.getBytesPerChecksum();
      checksumSize = checksum.getChecksumSize();
      crcPerBlock = (((BlockWithChecksumFileReader.MetaDataInputStream) rawStreamIn)
          .getLength() - BlockMetadataHeader.getHeaderSize()) / checksumSize;

      int blockCrc = 0;
      byte[] buffer = new byte[checksumSize];
      for (int i = 0; i < crcPerBlock; i++) {
        IOUtils.readFully(streamIn, buffer, 0, buffer.length);
        // Assemble the stored checksum from its first four bytes, most
        // significant byte first.
        int intChecksum = ((buffer[0] & 0xff) << 24)
            | ((buffer[1] & 0xff) << 16) | ((buffer[2] & 0xff) << 8)
            | ((buffer[3] & 0xff));

        if (i == 0) {
          blockCrc = intChecksum;
        } else {
          int chunkLength;
          if (i != crcPerBlock - 1 || ri.getBytesVisible() % bytesPerCRC == 0) {
            chunkLength = bytesPerCRC;
          } else {
            // Take the remainder on the long value and only then narrow it;
            // casting first would truncate lengths above Integer.MAX_VALUE.
            chunkLength = (int) (ri.getBytesVisible() % bytesPerCRC);
          }
          blockCrc = CrcConcat.concatCrc(blockCrc, intChecksum, chunkLength);
        }
      }
      return blockCrc;
    } finally {
      if (streamIn != null) {
        IOUtils.closeStream(streamIn);
      }
      if (rawStreamIn != null) {
        IOUtils.closeStream(rawStreamIn);
      }
    }
  }

  /**
   * Serves a byte range of a block straight to a socket using
   * {@link FileChannel#transferTo}.
   *
   * <p>The requested range is widened to whole checksum chunks. The checksum
   * header (without the two version bytes), the checksums for those chunks
   * and finally the data are written to the socket, in that order.
   *
   * @param s socket of the client
   * @param dataFile block data file
   * @param block the block being served
   * @param startOffset requested start offset; rounded down to a chunk
   *          boundary
   * @param length requested length; extended to cover whole chunks
   * @param datanode supplies the socket write timeout and metrics
   * @return number of data bytes transferred, or -1 if a
   *         {@link SocketException} occurred while transferring
   * @throws IOException if the meta file header is short or any transfer
   *           moves fewer bytes than requested
   */
  static long readBlockAccelerator(Socket s, File dataFile, Block block,
      long startOffset, long length, DataNode datanode) throws IOException {
    File checksumFile = BlockWithChecksumFileWriter.getMetaFile(dataFile, block);

    FileInputStream datain = null;
    FileInputStream metain = null;
    SocketOutputStream out = null;
    // Everything is acquired inside this try so that a failure while opening
    // the files or parsing the header cannot leak an already opened stream.
    try {
      datain = new FileInputStream(dataFile);
      metain = new FileInputStream(checksumFile);
      FileChannel dch = datain.getChannel();
      FileChannel mch = metain.getChannel();

      // read in type of crc and bytes-per-checksum from metadata file
      int versionSize = 2; // the first two bytes in meta file is the version
      byte[] cksumHeader = new byte[versionSize + DataChecksum.HEADER_LEN];
      int numread = metain.read(cksumHeader);
      if (numread != versionSize + DataChecksum.HEADER_LEN) {
        String msg = "readBlockAccelerator: metafile header should be atleast "
            + (versionSize + DataChecksum.HEADER_LEN) + " bytes "
            + " but could read only " + numread + " bytes.";
        LOG.warn(msg);
        throw new IOException(msg);
      }
      DataChecksum ckHdr = DataChecksum.newDataChecksum(cksumHeader,
          versionSize);

      int bytesPerChecksum = ckHdr.getBytesPerChecksum();
      long cheaderSize = DataChecksum.getChecksumHeaderSize();

      // align the startOffset with the previous bytesPerChecksum boundary.
      long delta = startOffset % bytesPerChecksum;
      startOffset -= delta;
      length += delta;

      // align the length to encompass the entire last checksum chunk
      delta = length % bytesPerChecksum;
      if (delta != 0) {
        delta = bytesPerChecksum - delta;
        length += delta;
      }

      // find the offset in the metafile
      long startChunkNumber = startOffset / bytesPerChecksum;
      long numChunks = length / bytesPerChecksum;
      long checksumSize = ckHdr.getChecksumSize();
      long startMetaOffset = versionSize + cheaderSize + startChunkNumber
          * checksumSize;
      long metaLength = numChunks * checksumSize;

      // get a connection back to the client
      out = new SocketOutputStream(s, datanode.socketWriteTimeout);

      try {
        // write out the checksum type and bytesperchecksum to client
        // skip the first two bytes that describe the version
        long val = mch.transferTo(versionSize, cheaderSize, out);
        if (val != cheaderSize) {
          String msg = "readBlockAccelerator for block " + block
              + " at offset " + 0
              + " but could not transfer checksum header.";
          LOG.warn(msg);
          throw new IOException(msg);
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("readBlockAccelerator metaOffset " + startMetaOffset
              + " mlength " + metaLength);
        }
        // write out the checksums back to the client
        val = mch.transferTo(startMetaOffset, metaLength, out);
        if (val != metaLength) {
          String msg = "readBlockAccelerator for block " + block
              + " at offset " + startMetaOffset
              + " but could not transfer checksums of size " + metaLength
              + ". Transferred only " + val;
          LOG.warn(msg);
          throw new IOException(msg);
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("readBlockAccelerator dataOffset " + startOffset
              + " length " + length);
        }
        // send data block back to client
        long read = dch.transferTo(startOffset, length, out);
        if (read != length) {
          String msg = "readBlockAccelerator for block " + block
              + " at offset " + startOffset
              + " but block size is only " + length
              + " and could transfer only " + read;
          LOG.warn(msg);
          throw new IOException(msg);
        }
        return read;
      } catch (SocketException ignored) {
        // Its ok for remote side to close the connection anytime.
        datanode.myMetrics.blocksRead.inc();
        return -1;
      } catch (IOException ioe) {
        /* What exactly should we do here?
         * Earlier version shutdown() datanode if there is disk error.
         */
        LOG.warn(datanode.getDatanodeInfo()
            + ":readBlockAccelerator:Got exception while serving "
            + block + " to " + s.getInetAddress() + ":\n"
            + StringUtils.stringifyException(ioe));
        throw ioe;
      }
    } finally {
      IOUtils.closeStream(out);
      IOUtils.closeStream(datain);
      IOUtils.closeStream(metain);
    }
  }

  /**
   * @return true if {@code name} starts with the block file prefix and ends
   *         with the metadata extension
   */
  public static boolean isMetaFilename(String name) {
    return name.startsWith(Block.BLOCK_FILE_PREFIX)
        && name.endsWith(Block.METADATA_EXTENSION);
  }

  /**
   * Returns array of two longs: the first one is the block id, and the second
   * one is genStamp. The method works under assumption that metafile name has
   * the following format: "blk_<blkid>_<gensmp>.meta"
   *
   * @throws IllegalArgumentException if the name does not have exactly three
   *           '_'-separated parts or the last part does not have exactly two
   *           '.'-separated parts
   */
  public static long[] parseMetafileName(String path) {
    String[] groundSeparated = StringUtils.split(path, '_');
    if (groundSeparated.length != 3) { // blk, blkid, genstamp.meta
      throw new IllegalArgumentException("Not a valid meta file name");
    }
    String[] dotSeparated = StringUtils.split(groundSeparated[2], '.');
    if (dotSeparated.length != 2) {
      throw new IllegalArgumentException("Not a valid meta file name");
    }
    return new long[] { Long.parseLong(groundSeparated[1]),
        Long.parseLong(dotSeparated[0]) };
  }
}