package org.apache.hadoop.mapred;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.DistributedRaidFileSystem;
import org.apache.hadoop.hdfs.GeneralConstant;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.Random;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.commons.logging.Log;
import org.apache.hadoop.mapred.GenWriterThread.TokenBucket;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
@SuppressWarnings("deprecation")
/**
* DatanodeBenThread is used for benchmark datanode. Its test type is dnben
* There are two steps in the dnben:
* 1. prepare step:
* Has multiple mappers, each mapper is running on one datanode.
* Each mapper writes N files that have only 1 replica.
* Because the first replica is written locally, it ends up with
* N 1-replica files for each datanode
* 2. Stress test step
* It will first randomly pick N (defined by -dn) victim datanodes
* from the cluster, these victim datanodes need to satisfy two conditions:
* a. it's alive in all namespaces
* b. it has at least M (defined by -minfile) files under workdir
* Then in each mapper, we will spawn T threads and T/X threads per
* namespace where X is the number of namespaces
* Each thread will have P (defined by pread) probability to become
* a read thread and 1-P probability to become a write thread.
* For read thread:
* it will keep picking a random file belongs to one victim node and
* read one buffer data from the file. Because the victim node contains
* the only replica of the file. All the reads will go to victim nodes.
* For write thread:
* It will pass victim nodes as favornodes to the DFSClient and namenode
* will allocate victim nodes to the new files. Most of the writes will
* go to victim nodes
* @author weiyan
*/
public class DatanodeBenThread extends GenThread implements
GeneralConstant {
// Logger shared by every thread of this benchmark.
private static final Log LOG = LogFactory.getLog(DatanodeBenThread.class);
// Name under which this benchmark is selected (test type "dnben").
public static final String TEST_TYPE = "dnben";
// Defaults for the tunables parsed in prepare() and the constructor.
public static final long DEFAULT_MAX_TIME_SEC = 60;
public static final long DEFAULT_MIN_NUMBER_OF_FILES_PER_DATANODE = 60;
public static final long DEFAULT_FILE_SIZE = 10; //10MB
public static final long DEFAULT_DATANODE_NUMBER = 1;
public static final float DEFAULT_READ_PERCENT = 1.0f;
public static final long DEFAULT_MAX_NUMBER_OF_FILES_PER_THREAD =
    Long.MAX_VALUE;
public static final short DEFAULT_REPLICATION_NUM = 3;
// Configuration keys for the tunables above.
public static final String MAX_TIME_SEC_KEY = "max.time.sec";
public static final String FILE_SIZE_KEY = "dfs.MB.file.size";
public static final String DATANODE_NUMBER_KEY =
    "dfs.stress.test.datanode.num";
public static final String READ_PERCENT_KEY = "dfs.read.percent";
public static final String MIN_FILE_PER_DATANODE_KEY =
    "dfs.min.file.per.datanode";
public static final String RUNNING_TYPE_KEY = "dfs.running.type";
public static final String MAX_NUMBER_OF_FILES_PER_THREAD_KEY =
    "dfs.max.nfile.per.thread";
public static final String VICTIM_DATANODE_KEY =
    "dfs.victim.datanodes";
// Role of a thread: PREPARE writes the per-datanode seed files;
// GENERAL is resolved to READ or WRITE in the constructor.
public static enum RUNNING_TYPE {
  PREPARE, GENERAL, READ, WRITE
}
// Run-time constants shared by all threads spawned from one mapper.
public class DatanodeBenRunTimeConstants extends RunTimeConstants {
  long max_time = DEFAULT_MAX_TIME_SEC * 1000;       // run length, milliseconds
  long data_rate = DEFAULT_DATA_RATE * 1024;         // feeds TokenBucket (config value scaled by 1024)
  long min_file = DEFAULT_MIN_NUMBER_OF_FILES_PER_DATANODE;
  long file_size = DEFAULT_FILE_SIZE * 1024 * 1024;  // bytes
  long max_files = DEFAULT_MAX_NUMBER_OF_FILES_PER_THREAD;
  String task_name = null;
  //Used by prepare: hostname of the datanode this mapper runs on
  String cur_datanode = null;
  //Used by General: the datanodes under stress
  InetSocketAddress[] victims = null;
  Set<String> victimSet = null;   // victim hostnames only
  // namespace default uri -> {datanode host name -> list of files}
  HashMap<String, HashMap<String, ArrayList<Path>>> pickLists = null;
}
private DatanodeBenRunTimeConstants rtc = null;
private int id;                      // index of this thread within the mapper
private String file_prefix = "";     // prefix of the files this thread writes
private RUNNING_TYPE running_type = RUNNING_TYPE.GENERAL;
private Random rb = new Random();
// Pick list of this thread's namespace: datanode hostname -> hosted files.
private HashMap<String, ArrayList<Path>> nsPickLists = null;
private DistributedFileSystem dfs = null;
public TokenBucket tb = null;        // throttles write throughput to data_rate
public short replication = 3;
public float pread = DEFAULT_READ_PERCENT;   // probability of becoming a READ thread
public long max_size = DEFAULT_FILE_SIZE * 1024 * 1024;  // roll files at this size
public String thread_name = null;
// Counters
private long read_size = 0;          // bytes read by this thread
private long write_size = 0;         // bytes written by this thread
private float average_read_rate = 0;   // aggregation state for analyze()/output()
private float average_write_rate = 0;
private long total_num = 0;            // number of stat maps folded in by analyze()
/** No-arg constructor; presumably instantiated reflectively by the framework — verify. */
public DatanodeBenThread() {
}
/**
 * Builds one benchmark thread.
 * PREPARE threads keep their role and prefix file names with the local
 * datanode; otherwise the thread becomes READ with probability pread,
 * else WRITE (with a private sub-directory under outputPath).
 */
public DatanodeBenThread(Configuration conf, Path input, Path output, int id,
    RUNNING_TYPE init_type, DatanodeBenRunTimeConstants rtc) throws IOException{
  super(conf, input, output, rtc);
  this.rtc = rtc;
  this.replication = (short)conf.getInt(REPLICATION_KEY, DEFAULT_REPLICATION_NUM);
  this.max_size = conf.getLong(FILE_SIZE_KEY, DEFAULT_FILE_SIZE) * 1024 * 1024;
  this.pread = conf.getFloat(READ_PERCENT_KEY, DEFAULT_READ_PERCENT);
  this.tb = new TokenBucket(rtc.data_rate);
  this.id = id;
  this.thread_name = rtc.task_name + "_" + id;
  this.running_type = init_type;
  if (running_type.equals(RUNNING_TYPE.PREPARE)) {
    // Prefix with the datanode name so files can be attributed to this node.
    this.file_prefix = rtc.cur_datanode + thread_name + "_part";
  } else {
    this.file_prefix = thread_name + "_part";
    // Pick list for this thread's namespace (keyed by fs default URI).
    this.nsPickLists = rtc.pickLists.get(conf.get(FileSystem.FS_DEFAULT_NAME_KEY));
    this.dfs = (DistributedFileSystem)fs;
    float f = rb.nextFloat();
    // Small epsilon guards against float rounding at the pread boundary.
    if (f < pread + 1e-9) {
      this.running_type = RUNNING_TYPE.READ;
    } else {
      // Write threads get their own output sub-directory.
      this.outputPath = new Path(outputPath, thread_name);
      this.running_type = RUNNING_TYPE.WRITE;
    }
  }
  fs.mkdirs(this.outputPath);
}
/** Convenience constructor with no input path. */
public DatanodeBenThread(Configuration conf, Path output, int id,
    RUNNING_TYPE running_type, DatanodeBenRunTimeConstants rtc)
    throws IOException{
  this(conf, null, output, id, running_type, rtc);
}
/** Minimal constructor; used when only the stats-aggregation methods are needed. */
public DatanodeBenThread(JobConf conf) {
  super(conf);
}
/*
 * Scans the output directory and counts the single-block files that
 * have a replica hosted on the datanode this mapper is running on
 * (rtc.cur_datanode).
 */
public int getNumberOfFiles() throws IOException {
  DistributedFileSystem dfs = (DistributedFileSystem) fs;
  RemoteIterator<LocatedFileStatus> statusIter = dfs.listLocatedStatus(outputPath);
  int count = 0;
  while (statusIter.hasNext()) {
    LocatedFileStatus status = statusIter.next();
    // Only plain files made of exactly one block are of interest.
    if (status.isDir() || status.getBlockLocations().length != 1) {
      continue;
    }
    String[] hosts = status.getBlockLocations()[0].getHosts();
    for (int i = 0; i < hosts.length; i++) {
      if (rtc.cur_datanode.equals(hosts[i])) {
        count++;
        break;
      }
    }
  }
  LOG.info(" Found " + count + " files in " + dfs.getUri());
  return count;
}
/**
 * Writes files until rtc.max_time elapses or rtc.max_files files are
 * produced.  Each file is filled with random data, throttled by the
 * token bucket, and rolled once another buffer would exceed max_size.
 * rtc.victims is handed to create() as favored nodes (null in PREPARE
 * runs), steering new blocks onto the victim datanodes.
 */
public void write() throws Exception {
  long endTime = System.currentTimeMillis() + rtc.max_time;
  long currentId = 0;
  FSDataOutputStream out = null;
  DistributedFileSystem dfs = (DistributedFileSystem) fs;
  while (System.currentTimeMillis() < endTime
      && currentId < rtc.max_files) {
    if (running_type == RUNNING_TYPE.PREPARE) {
      //The number of files reach the minimum limit, exit
      if (getNumberOfFiles() > rtc.min_file)
        break;
    }
    Path fileName = new Path(outputPath, file_prefix + currentId);
    try {
      out = dfs.create(fileName,
          FsPermission.getDefault(),
          false,
          dfs.getConf().getInt("io.file.buffer.size", 4096),
          (short)replication,
          dfs.getDefaultBlockSize(),
          dfs.getConf().getInt("io.bytes.per.checksum", 512),
          null,
          rtc.victims);   // favored nodes; null during PREPARE
      long size = 0;
      while (true) {
        rb.nextBytes(buffer);
        tb.getTokens(rtc.buffer_size);   // throttle to the configured data rate
        out.write(buffer, 0, rtc.buffer_size);
        size += rtc.buffer_size;
        if (System.currentTimeMillis() > endTime
            || size + rtc.buffer_size > max_size) {
          // Roll the file
          out.close();
          out = null;   // avoid double-close in the finally block
          currentId++;
          files_processed++;
          processed_size += size;
          write_size += size;
          // Log which datanodes actually host the first block.
          Path fullName = fs.makeQualified(fileName);
          BlockLocation bl = dfs.getClient().getBlockLocations(
              fullName.toUri().getPath(), 0L, 1L)[0];
          String hosts = "";
          for (String host: bl.getHosts()) {
            hosts += host + " ";
          }
          LOG.info("[close (" + size + "B)] " + hosts + " file " + fullName);
          break;
        }
      }
    } catch (Exception e) {
      // Best-effort: record the error and move on to the next file.
      LOG.error("Error in writing file:" + fileName, e);
      this.errors.add(e);
    } finally {
      IOUtils.closeStream(out);
    }
  }
}
/**
 * Reads until rtc.max_time elapses: each iteration picks a random
 * victim datanode and a random file hosted there, then reads one
 * buffer's worth of data at a random offset.  Since every file has a
 * single replica, all reads are served by the victim nodes.
 */
public void read() throws Exception {
  long endTime = System.currentTimeMillis() + rtc.max_time;
  while (System.currentTimeMillis() < endTime) {
    // Randomly pick a datanode from victims
    int idx = rb.nextInt(rtc.victims.length);
    // Randomly pick a file to read
    ArrayList<Path> fileList = nsPickLists.get(rtc.victims[idx].getHostName());
    int fid = rb.nextInt(fileList.size());
    Path readFile = fileList.get(fid);
    FSDataInputStream in = null;
    try {
      in = fs.open(readFile);
      if (in.isUnderConstruction()) {
        LOG.info("file " + readFile + " is still open");
      }
      FileStatus fileStatus = fs.getFileStatus(readFile);
      // Offset chosen so a full buffer fits whenever the file is big enough.
      long offset = rb.nextInt((int)Math.max(
          fileStatus.getLen() - rtc.buffer_size, 0) + 1);
      int size = 0;
      in.seek(offset);
      size = in.read(buffer, 0, rtc.buffer_size);
      if (size < 0) {
        continue;   // hit EOF; the finally block still closes the stream
      }
      processed_size += size;
      read_size += size;
      LOG.info("Read file " + readFile + " from " +
          offset + " to " + (offset + size));
    } catch (Exception e) {
      LOG.error("Error in read file: " + readFile, e);
      this.errors.add(e);
    } finally {
      IOUtils.closeStream(in);
    }
    files_processed++;   // counts this attempt (skipped when the read hit EOF above)
    Thread.sleep(5);
  }
}
/**
 * Entry point of the thread: dispatches to write() or read() based on
 * the role chosen in the constructor.  GENERAL performs no work here.
 * Any exception is logged and recorded in the shared error list.
 */
public void run() {
  try {
    if (running_type == RUNNING_TYPE.PREPARE) {
      write();
    } else if (running_type == RUNNING_TYPE.READ) {
      LOG.info("Read Thread: " + thread_name);
      read();
    } else if (running_type == RUNNING_TYPE.WRITE) {
      LOG.info("Write Thread:" + thread_name);
      write();
    }
    LOG.info("Thread " + thread_name + " is done.");
  } catch (Exception ioe) {
    LOG.error("Error: ", ioe);
    this.errors.add(ioe);
  }
}
/*
 * Builds, for one namespace, a map from datanode hostname to the list
 * of single-block files under workdir hosted on that node.  When
 * victims is non-null only those hosts are collected; when it is null,
 * hosts with fewer than minFile files are dropped from the result.
 */
private static HashMap<String, ArrayList<Path>> getNSPickLists(
    DistributedFileSystem dfs, String workdir, long minFile, Set<String> victims)
    throws IOException {
  HashMap<String, ArrayList<Path>> filesByHost =
      new HashMap<String, ArrayList<Path>>();
  RemoteIterator<LocatedFileStatus> statusIter =
      dfs.listLocatedStatus(new Path(workdir));
  while (statusIter.hasNext()) {
    LocatedFileStatus status = statusIter.next();
    // Only plain single-block files participate in the benchmark.
    if (status.isDir() || status.getBlockLocations().length != 1) {
      continue;
    }
    for (String hostname : status.getBlockLocations()[0].getHosts()) {
      // Skip the uninterested datanodes
      if (victims != null && !victims.contains(hostname)) {
        continue;
      }
      ArrayList<Path> paths = filesByHost.get(hostname);
      if (paths == null) {
        paths = new ArrayList<Path>();
        filesByHost.put(hostname, paths);
      }
      paths.add(status.getPath());
    }
  }
  if (victims == null) {
    //Remove the datanodes with not enough files
    String[] hosts = filesByHost.keySet().toArray(new String[0]);
    for (int i = 0; i < hosts.length; i++) {
      if (filesByHost.get(hosts[i]).size() < minFile) {
        filesByHost.remove(hosts[i]);
      }
    }
  }
  return filesByHost;
}
/** Unwraps a raid filesystem if needed and returns the underlying DFS. */
private static DistributedFileSystem getDFS(FileSystem fs)
    throws IOException {
  FileSystem underlying = fs;
  if (underlying instanceof DistributedRaidFileSystem) {
    underlying = ((DistributedRaidFileSystem) underlying).getFileSystem();
  }
  return (DistributedFileSystem) underlying;
}
/**
 * Builds the pick lists for every namespace: maps each namespace's
 * default URI to its {datanode hostname -> hosted files} table.
 */
private static HashMap<String, HashMap<String, ArrayList<Path>>>
    getPickLists(List<JobConf> nameNodeConfs, String workdir, long minFile,
    Set<String> victims) throws IOException {
  HashMap<String, HashMap<String, ArrayList<Path>>> byNamespace =
      new HashMap<String, HashMap<String, ArrayList<Path>>>();
  for (JobConf nsConf : nameNodeConfs) {
    DistributedFileSystem dfs = getDFS(FileSystem.get(nsConf));
    byNamespace.put(nsConf.get(FileSystem.FS_DEFAULT_NAME_KEY),
        getNSPickLists(dfs, workdir, minFile, victims));
  }
  return byNamespace;
}
/**
 * Returns the live, non-decommissioning datanodes of one namespace that
 * host at least one file in that namespace's pick list.
 */
private static Set<DatanodeInfo> getValidDatanodes(JobConf nameNodeConf,
    DatanodeBenRunTimeConstants rtc) throws IOException {
  DistributedFileSystem dfs = getDFS(FileSystem.get(nameNodeConf));
  HashMap<String, ArrayList<Path>> filesByHost =
      rtc.pickLists.get(nameNodeConf.get(FileSystem.FS_DEFAULT_NAME_KEY));
  Set<DatanodeInfo> valid = new HashSet<DatanodeInfo>();
  for (DatanodeInfo dn : dfs.getLiveDataNodeStats()) {
    boolean leaving = dn.isDecommissioned() || dn.isDecommissionInProgress();
    if (!leaving && filesByHost.containsKey(dn.getHostName())) {
      valid.add(dn);
    }
  }
  return valid;
}
/**
 * We randomly pick nDatanode datanodes to do stress tests.
 * Valid datanodes should satisfy two conditions:
 * 1. alive in all namespaces
 * 2. have at least min_file files under the working directory.
 * Returns fewer than nDatanode nodes when not enough are valid.
 */
public List<DatanodeInfo> getTestDatanodes(List<JobConf> nameNodeConfs,
    String workdir, long nDatanode, long minFile) throws IOException {
  this.rtc = new DatanodeBenRunTimeConstants();
  rtc.pickLists = getPickLists(nameNodeConfs, workdir, minFile, null);
  // Intersect the valid-node sets of all namespaces: only datanodes
  // that qualify in every namespace survive.
  Set<DatanodeInfo> validDatanodes = getValidDatanodes(nameNodeConfs.get(0), rtc);
  for (int i = 1; i < nameNodeConfs.size(); i++) {
    validDatanodes.retainAll(getValidDatanodes(nameNodeConfs.get(i), rtc));
  }
  LOG.info("There are " + validDatanodes.size() + " valid datanodes.");
  // StringBuilder instead of repeated String concatenation in the loop.
  StringBuilder logInfo = new StringBuilder();
  List<DatanodeInfo> dnList = new ArrayList<DatanodeInfo>();
  for (DatanodeInfo dn : validDatanodes) {
    logInfo.append(' ').append(dn.getHostName());
    dnList.add(dn);
  }
  LOG.info(logInfo.toString());
  if (dnList.size() < nDatanode)
    return dnList;
  // Randomly select nDatanode victims.
  Collections.shuffle(dnList);
  return dnList.subList(0, (int)nDatanode);
}
/**
 * Write a small file to figure out which datanode we are running on:
 * with replication 1 the single replica is placed on the local node,
 * so the block's host is our host.
 */
private String getRunningDatanode(Configuration conf)
    throws IOException {
  // newInstance() bypasses the FileSystem cache, so this instance must
  // be closed before returning or the client leaks.
  FileSystem fs = FileSystem.newInstance(conf);
  try {
    fs.mkdirs(new Path("/tmp"));
    // Timestamp + random suffix keeps concurrent mappers from colliding.
    Path fileName = new Path("/tmp", rtc.task_name + System.currentTimeMillis()
        + rb.nextInt());
    if (fs.exists(fileName)) {
      fs.delete(fileName);
    }
    FSDataOutputStream out = null;
    byte[] buffer = new byte[1];
    buffer[0] = '0';
    try {
      out = fs.create(fileName, (short)1);
      out.write(buffer, 0, 1);
    } finally {
      IOUtils.closeStream(out);
    }
    DistributedFileSystem dfs = getDFS(fs);
    BlockLocation[] lbs = dfs.getClient().getBlockLocations(
        fileName.toUri().getPath(), 0, 1);
    dfs.delete(fileName);
    return lbs[0].getHosts()[0];
  } finally {
    fs.close();
  }
}
/**
 * Creates one JobConf per namenode, each pointing its RPC address and
 * default URI at that namenode.
 */
public static List<JobConf> getNameNodeConfs(JobConf conf)
    throws IOException {
  List<InetSocketAddress> rpcAddrs =
      DFSUtil.getClientRpcAddresses(conf, null);
  List<JobConf> confs = new ArrayList<JobConf>(rpcAddrs.size());
  for (InetSocketAddress addr : rpcAddrs) {
    JobConf nsConf = new JobConf(conf);
    nsConf.set(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY,
        addr.getHostName() + ":" + addr.getPort());
    NameNode.setupDefaultURI(nsConf);
    confs.add(nsConf);
  }
  return confs;
}
/**
 * Per-mapper setup: parses the benchmark tunables from conf and builds
 * one DatanodeBenThread per thread slot.  key carries the task name,
 * value the working directory.  PREPARE runs seed files on the local
 * datanode; otherwise victim datanodes are parsed from
 * dfs.victim.datanodes ("hostname:port" entries) and the per-namespace
 * pick lists are built.  Threads are assigned to namespaces round-robin.
 */
@Override
public GenThread[] prepare(JobConf conf, Text key, Text value)
    throws IOException {
  this.rtc = new DatanodeBenRunTimeConstants();
  super.prepare(conf, key, value, rtc);
  rtc.task_name = key.toString() + rtc.taskID;
  rtc.min_file = conf.getLong(MIN_FILE_PER_DATANODE_KEY,
      DEFAULT_MIN_NUMBER_OF_FILES_PER_DATANODE);
  rtc.max_time = conf.getLong(MAX_TIME_SEC_KEY, DEFAULT_MAX_TIME_SEC) * 1000;
  rtc.data_rate = conf.getLong(WRITER_DATARATE_KEY, DEFAULT_DATA_RATE) * 1024;
  rtc.file_size = conf.getLong(FILE_SIZE_KEY, DEFAULT_FILE_SIZE) * 1024*1024;
  LOG.info("data rate: " + rtc.data_rate);
  String working_dir = value.toString();
  int run_type = conf.getInt(RUNNING_TYPE_KEY,
      RUNNING_TYPE.GENERAL.ordinal());
  List<JobConf> nameNodeConfs = getNameNodeConfs(conf);
  if (run_type == RUNNING_TYPE.PREPARE.ordinal()) {
    rtc.cur_datanode = getRunningDatanode(conf);
    LOG.info("Current datanode is " + rtc.cur_datanode);
    // Make sure each namespace has same number of threads
    long nthread_per_namespace = ((rtc.nthreads-1) / nameNodeConfs.size() + 1);
    // Split the per-datanode file quota across the namespace's threads.
    rtc.max_files = (rtc.min_file-1) / nthread_per_namespace + 1;
    rtc.nthreads = nthread_per_namespace * nameNodeConfs.size();
    LOG.info("Number of threads: " + rtc.nthreads + " max_file:" + rtc.max_files);
    DatanodeBenThread[] threads = new DatanodeBenThread[(int)rtc.nthreads];
    int nsIdx = 0;
    // Round-robin the threads over the namespaces.
    for (int i=0; i < rtc.nthreads; i++) {
      threads[i] = new DatanodeBenThread(nameNodeConfs.get(nsIdx),
          new Path(working_dir), i,
          RUNNING_TYPE.PREPARE, rtc);
      nsIdx++;
      if (nsIdx == nameNodeConfs.size()) {
        nsIdx = 0;
      }
    }
    return threads;
  } else {
    // NOTE(review): getStrings() returns null when dfs.victim.datanodes
    // is unset — presumably the driver always sets it for this run type;
    // verify against the job driver.
    String[] victimStrs = conf.getStrings(VICTIM_DATANODE_KEY);
    LOG.info("Victim datanodes are :" + conf.get(VICTIM_DATANODE_KEY));
    rtc.victims = new InetSocketAddress[victimStrs.length];
    rtc.victimSet = new HashSet<String>();
    for (int i = 0; i < victimStrs.length; i++) {
      //hostname:port
      String[] values = victimStrs[i].split(":");
      rtc.victims[i] = new InetSocketAddress(values[0],
          Integer.parseInt(values[1]));
      rtc.victimSet.add(values[0]);
    }
    // Collect every victim-hosted file; Long.MAX_VALUE disables the
    // minimum-file filter here (it only applies when victims is null).
    rtc.pickLists = getPickLists(nameNodeConfs, working_dir,
        Long.MAX_VALUE, rtc.victimSet);
    DatanodeBenThread[] threads = new DatanodeBenThread[(int)rtc.nthreads];
    int nsIdx = 0;
    // Round-robin the threads over the namespaces.
    for (int i=0; i < rtc.nthreads; i++) {
      threads[i] = new DatanodeBenThread(nameNodeConfs.get(nsIdx),
          new Path(working_dir),
          new Path(rtc.output_dir),
          i, RUNNING_TYPE.GENERAL, rtc);
      nsIdx++;
      if (nsIdx == nameNodeConfs.size()) {
        nsIdx = 0;
      }
    }
    return threads;
  }
}
/**
 * Sums the per-thread byte counters, converts them to MB/sec over the
 * execution time, and adds "readrate"/"writerate" entries on top of the
 * base-class statistics.
 */
@Override
public Map<String, String> collectStats(JobConf conf,
    GenThread[] threads, long execTime) throws IOException {
  long bytesRead = 0;
  long bytesWritten = 0;
  for (int i = 0; i < threads.length; i++) {
    DatanodeBenThread benThread = (DatanodeBenThread) threads[i];
    bytesRead += benThread.read_size;
    bytesWritten += benThread.write_size;
  }
  float readRateMbSec = (float) bytesRead * 1000 / (execTime * MEGA);
  float writeRateMbSec = (float) bytesWritten * 1000 / (execTime * MEGA);
  LOG.info("Read IO rate = " + readRateMbSec);
  LOG.info("Write IO rate = " + writeRateMbSec);
  Map<String, String> stat = super.collectStats(conf, threads, execTime);
  stat.put("readrate", String.valueOf(readRateMbSec));
  stat.put("writerate", String.valueOf(writeRateMbSec));
  return stat;
}
/** Clears the accumulators before aggregating a new set of job stats. */
@Override
public void reset() {
  average_read_rate = 0;
  average_write_rate = 0;
  total_num = 0;
}
/**
 * Accumulates one job's read/write rates (as produced by collectStats)
 * for later averaging in output().
 */
@Override
public void analyze(Map<String, String> stat) throws IOException {
  average_read_rate += Float.parseFloat(stat.get("readrate"));
  average_write_rate += Float.parseFloat(stat.get("writerate"));
  total_num++;
}
/**
 * Writes the averaged read/write rates accumulated by analyze() to the
 * result stream.
 */
@Override
public void output(FSDataOutputStream out) throws IOException {
  float avgRead = average_read_rate / total_num;
  float avgWrite = average_write_rate / total_num;
  out.writeChars("Average Read (MB/sec): \t\t\t" + avgRead + "\n");
  out.writeChars("Average Write (MB/sec): \t\t" + avgWrite + "\n");
}
}