package org.apache.hadoop.mapred;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.GeneralConstant;
import java.io.IOException;
import java.util.Map;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.commons.logging.Log;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;

/**
 * A {@link GenThread} that generates write traffic: it repeatedly creates
 * files named {@code part<N>} under its output directory, fills them with
 * random bytes at a rate limited by a {@link TokenBucket}, and keeps a
 * running checksum of everything it wrote so the data can be verified later.
 */
@SuppressWarnings("deprecation")
public class GenWriterThread extends GenThread implements GeneralConstant {
  private static final Log LOG = LogFactory.getLog(GenWriterThread.class);

  public static final String TEST_TYPE = "write";

  // Defaults are expressed in seconds; a non-positive roll/sync interval
  // disables the corresponding periodic action (see run()).
  public static final long DEFAULT_ROLL_INTERVAL_SEC = -1;
  public static final long DEFAULT_SYNC_INTERVAL_SEC = -1;
  public static final long DEFAULT_MAX_TIME_SEC = 60;

  public static final String WRITER_ROLL_INTERVAL_KEY =
      "writer.roll.interval.sec";
  public static final String WRITER_SYNC_INTERVAL_KEY =
      "writer.sync.interval.sec";
  public static final String MAX_TIME_SEC_KEY = "max.time.sec";

  /**
   * Restricts the maximum rate at which tokens can be claimed and allows the
   * maximum burst size to be bounded.
   *
   * <p>Tokens accrue at {@code maxTokensPerSec} per elapsed wall-clock second
   * (one-second granularity) and are capped at {@code maxTokenBurst}.
   * {@link #getTokens(long)} is synchronized; the bucket starts empty.
   */
  public static class TokenBucket {
    private final long maxTokensPerSec;
    private final long maxTokenBurst;
    private long availableTokens = 0;
    private long lastCheckpointSecs = getNowSecs();
    private final Random rb = new Random();
    // Upper bound (exclusive), in milliseconds, of the random extra delay
    // added to every wait in getTokens().
    private static final int SLEEP_INTERVAL = 1000;

    /**
     * @param maxTokensPerSec number of tokens added per elapsed second
     * @param maxTokenBurst upper bound on the number of accumulated tokens
     */
    public TokenBucket(long maxTokensPerSec, long maxTokenBurst) {
      this.maxTokensPerSec = maxTokensPerSec;
      this.maxTokenBurst = maxTokenBurst;
    }

    /**
     * Creates a bucket whose burst size is effectively unbounded
     * ({@code Long.MAX_VALUE}).
     *
     * @param maxTokensPerSec number of tokens added per elapsed second
     */
    public TokenBucket(long maxTokensPerSec) {
      this(maxTokensPerSec, Long.MAX_VALUE);
    }

    /** @return the current wall-clock time truncated to whole seconds */
    private static long getNowSecs() {
      return System.currentTimeMillis() / 1000;
    }

    /**
     * Credits the tokens earned since the last checkpoint and moves the
     * checkpoint to now.
     */
    private void updateAvailableTokens() {
      long nowSecs = getNowSecs();
      // The wall clock may step backwards (e.g. a clock adjustment). Treat
      // that as "no time elapsed" so that already-earned tokens are never
      // taken away, which would otherwise stall callers for the size of the
      // step.
      long elapsedSecs = Math.max(0L, nowSecs - lastCheckpointSecs);
      availableTokens += maxTokensPerSec * elapsedSecs;
      // Bound the maximum number of available tokens
      availableTokens = Math.min(availableTokens, maxTokenBurst);
      lastCheckpointSecs = nowSecs;
    }

    /**
     * Blocks until the requested number of tokens is available and then
     * claims them.
     *
     * @param requestedSize number of tokens to claim
     * @throws IllegalArgumentException if {@code requestedSize} exceeds the
     *         maximum burst size (it could never be satisfied)
     * @throws InterruptedException if the thread is interrupted while
     *         waiting for tokens
     */
    public synchronized void getTokens(long requestedSize)
        throws InterruptedException {
      if (requestedSize > maxTokenBurst) {
        throw new IllegalArgumentException("Cannot request more tokens " +
            "then the max burst size");
      }
      updateAvailableTokens();
      while (requestedSize > availableTokens) {
        // Sleep until the tokens become available
        long tokensStillMissing = requestedSize - availableTokens;
        // Estimate how long (in milliseconds) we need to wait to get enough
        // tokens, plus a random delay of less than SLEEP_INTERVAL ms.
        long sleepMillis =
            (long) Math.ceil((double) tokensStillMissing / maxTokensPerSec)
                * 1000
            + rb.nextInt(SLEEP_INTERVAL);
        Thread.sleep(sleepMillis);
        updateAvailableTokens();
      }
      availableTokens -= requestedSize;
      assert availableTokens >= 0;
    }
  }

  /**
   * Run-time settings of a writer task. The interval and time fields hold
   * milliseconds (the second-based defaults/config values times 1000).
   */
  public class GenWriterRunTimeConstants extends RunTimeConstants {
    String input = null;
    long roll_interval = DEFAULT_ROLL_INTERVAL_SEC * 1000;
    long sync_interval = DEFAULT_SYNC_INTERVAL_SEC * 1000;
    long max_time = DEFAULT_MAX_TIME_SEC * 1000;
    // Used as the token rate of the writer's TokenBucket, where one token
    // is spent per byte written.
    // NOTE(review): presumably configured in KB/s and converted to bytes/s
    // by the factor 1024 -- confirm against DEFAULT_DATA_RATE's definition.
    long data_rate = DEFAULT_DATA_RATE * 1024;
    String task_name = null;
  }

  /**
   * For each file, we compute its CRC32 checksum. Then we XOR the CRC32
   * checksums of all files to generate a directory checksum.
   */
  public static class DirectoryChecksum {
    private Checksum fileckm = new CRC32();
    private long dirckm = 0L;

    /** @return XOR of the checksums of all files closed so far */
    public long getDirectoryChecksum() {
      return dirckm;
    }

    /** @return the checksum accumulator of the file currently being written */
    public Checksum getFileChecksum() {
      return fileckm;
    }

    /** Starts a new file: resets the per-file checksum. */
    public void openFile() {
      fileckm.reset();
    }

    /** Finishes the current file: folds its checksum into the directory one. */
    public void closeFile() {
      dirckm ^= fileckm.getValue();
    }
  }

  private DirectoryChecksum dc = new DirectoryChecksum();
  private int id;
  private String name;
  private Random rb = new Random();
  public TokenBucket tb = null;
  public GenWriterRunTimeConstants rtc = null;

  public GenWriterThread() {
  }

  /**
   * @param conf configuration passed to the parent constructor
   * @param p output directory of this thread
   * @param name task name, used in log messages
   * @param id index of this thread within the task
   * @param rtc run-time settings shared by the threads of the task
   * @throws IOException if the parent constructor fails
   */
  public GenWriterThread(Configuration conf, Path p, String name, int id,
      GenWriterRunTimeConstants rtc) throws IOException {
    super(conf, p, rtc);
    this.rtc = rtc;
    this.id = id;
    this.name = name;
    rb.nextBytes(buffer);
    tb = new TokenBucket(rtc.data_rate);
  }

  /**
   * Returns a random delay in {@code [0, intervalMillis)} used to spread
   * periodic actions of different threads, or 0 when the interval is
   * disabled (non-positive).
   *
   * <p>{@link Random#nextInt(int)} throws for a non-positive bound, and a
   * plain {@code (int)} cast of a large interval could turn negative, so
   * both cases are handled here.
   */
  private long jitterMillis(long intervalMillis) {
    if (intervalMillis <= 0) {
      return 0;
    }
    return rb.nextInt((int) Math.min(intervalMillis, (long) Integer.MAX_VALUE));
  }

  /**
   * Writes random data into successive files until {@code rtc.max_time}
   * milliseconds have elapsed.
   *
   * <p>The current file is closed ("rolled") when the end time has passed,
   * or when rolling is enabled and the roll interval has elapsed. When
   * syncing is enabled the file is synced once the sync interval has
   * elapsed. Failures are logged and appended to {@code errors}; they do
   * not propagate out of this method.
   */
  public void run() {
    try {
      fs.mkdirs(outputPath);
      long endTime = System.currentTimeMillis() + rtc.max_time;
      long lastRollTime = System.currentTimeMillis();
      long lastSyncTime = System.currentTimeMillis();
      long currentId = 0;
      while (System.currentTimeMillis() < endTime) {
        Path fileName = new Path(outputPath, "part" + currentId);
        // Scoped per file so that a failed create() never leads to closing
        // the previous file's stream a second time.
        FSDataOutputStream out = null;
        try {
          out = fs.create(fileName, (short) 3);
          dc.openFile();
          long size = 0;
          while (true) {
            rb.nextBytes(buffer);
            dc.getFileChecksum().update(buffer, 0, rtc.buffer_size);
            // Throttle: one token per byte about to be written.
            tb.getTokens(rtc.buffer_size);
            out.write(buffer, 0, rtc.buffer_size);
            size += rtc.buffer_size;

            if (rtc.sync_interval > 0
                && System.currentTimeMillis() - lastSyncTime
                    > rtc.sync_interval) {
              // Sync the file
              out.sync();
              LOG.info("file " + fileName + " is synced");
              lastSyncTime =
                  System.currentTimeMillis() + jitterMillis(rtc.sync_interval);
            }

            if (System.currentTimeMillis() > endTime
                || (rtc.roll_interval > 0
                    && System.currentTimeMillis() - lastRollTime
                        > rtc.roll_interval)) {
              // Roll the file
              out.close();
              out = null;
              currentId++;
              files_processed++;
              processed_size += size;
              LOG.info("file " + fileName + " is closed with " + size
                  + " bytes");
              // This branch is also taken at the end of the run while
              // rolling is disabled (roll_interval <= 0); jitterMillis()
              // then yields 0 instead of making Random.nextInt throw.
              lastRollTime =
                  System.currentTimeMillis() + jitterMillis(rtc.roll_interval);
              break;
            }
          }
        } catch (Exception e) {
          LOG.error("Error in writing file: " + fileName, e);
          this.errors.add(e);
        } finally {
          IOUtils.closeStream(out);
          dc.closeFile();
        }
      }
      LOG.info("Checksum of files under dir " + outputPath + " is "
          + dc.getDirectoryChecksum());
      LOG.info("Thread " + name + "_" + id + " is done.");
    } catch (Exception ioe) {
      LOG.error("Error:", ioe);
      this.errors.add(ioe);
    }
  }

  /**
   * Each mapper writes one checksum file.
   * The checksum file contains N pairs, where N is the number of threads.
   * Each pair has two entries: outputPath and checksum.
   * outputPath is the directory of files written by the thread;
   * checksum is the directory checksum of all files under that directory.
   *
   * @param fs file system to write the checksum file to
   * @param name checksum file name (".checksum" is appended)
   * @param threads array of writer threads; every element must be a
   *        {@link GenWriterThread}
   * @return checksum file path
   * @throws IOException if the file cannot be created or written
   */
  private Path writeChecksumFile(FileSystem fs, String name,
      GenThread[] threads) throws IOException {
    Path checksumFile = new Path(rtc.output_dir, name + ".checksum");
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, fs.getConf(),
        checksumFile, Text.class, Text.class, CompressionType.NONE);
    try {
      for (GenThread rawThread : threads) {
        GenWriterThread thread = (GenWriterThread) rawThread;
        writer.append(new Text(thread.outputPath.toString()),
            new Text(Long.toString(thread.dc.getDirectoryChecksum())));
      }
    } finally {
      writer.close();
    }
    return checksumFile;
  }

  /**
   * This is used for verification.
   * Each mapper writes one control file.
   * The control file only contains the base directory written by this
   * mapper and the checksum file path, so that a read mapper can scan the
   * files under the base directory and verify their checksums against the
   * information given in the checksum file.
   *
   * @param fs file system to write the control file to
   * @param outputPath base directory of mapper
   * @param checksumFile location of checksum file
   * @param name name of control file, created under
   *        {@code <rtc.input>/filelists}
   * @throws IOException if the directory or file cannot be created/written
   */
  private void writeControlFile(FileSystem fs, Path outputPath,
      Path checksumFile, String name) throws IOException {
    SequenceFile.Writer writer = null;
    try {
      Path parentDir = new Path(rtc.input, "filelists");
      if (!fs.exists(parentDir)) {
        fs.mkdirs(parentDir);
      }
      Path controlFile = new Path(parentDir, name);
      writer = SequenceFile.createWriter(fs, fs.getConf(), controlFile,
          Text.class, Text.class, CompressionType.NONE);
      writer.append(new Text(outputPath.toString()),
          new Text(checksumFile.toString()));
    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }

  /**
   * Creates a number of threads to generate write traffic.
   *
   * @param conf job configuration the writer settings are read from
   * @param key name of the mapper
   * @param value location of data input
   * @return one writer thread per configured thread; thread {@code i}
   *         writes to {@code <input>/<task_name>/<task_name>_<i>}
   * @throws IOException if a thread cannot be constructed
   */
  @Override
  public GenThread[] prepare(JobConf conf, Text key, Text value)
      throws IOException {
    this.rtc = new GenWriterRunTimeConstants();
    super.prepare(conf, key, value, rtc);
    rtc.task_name = key.toString() + rtc.taskID;
    // Configuration values are in seconds; the run-time fields hold millis.
    rtc.roll_interval = conf.getLong(WRITER_ROLL_INTERVAL_KEY,
        DEFAULT_ROLL_INTERVAL_SEC) * 1000;
    rtc.sync_interval = conf.getLong(WRITER_SYNC_INTERVAL_KEY,
        DEFAULT_SYNC_INTERVAL_SEC) * 1000;
    rtc.max_time = conf.getLong(MAX_TIME_SEC_KEY,
        DEFAULT_MAX_TIME_SEC) * 1000;
    rtc.data_rate = conf.getLong(WRITER_DATARATE_KEY,
        DEFAULT_DATA_RATE) * 1024;
    rtc.input = value.toString();
    LOG.info("data rate: " + rtc.data_rate);

    GenWriterThread[] threads = new GenWriterThread[(int) rtc.nthreads];
    Path taskDir = new Path(rtc.input, rtc.task_name);
    for (int i = 0; i < rtc.nthreads; i++) {
      threads[i] = new GenWriterThread(conf,
          new Path(taskDir, rtc.task_name + "_" + i),
          rtc.task_name, i, rtc);
    }
    return threads;
  }

  /**
   * Writes the checksum file and the control file of this task and then
   * delegates to the parent implementation for the statistics.
   *
   * @param conf job configuration
   * @param threads the writer threads returned by {@code prepare}
   * @param execTime execution time, passed through to the parent
   * @return the statistics produced by the parent implementation
   * @throws IOException if the checksum or control file cannot be written
   */
  @Override
  public Map<String, String> collectStats(JobConf conf, GenThread[] threads,
      long execTime) throws IOException {
    // FileSystem.newInstance hands out a private instance that this method
    // owns, so it must be released here or it leaks for the task's lifetime.
    FileSystem fs = FileSystem.newInstance(conf);
    try {
      // write checksum file
      Path checksumFile = writeChecksumFile(fs, rtc.task_name, threads);
      writeControlFile(fs, new Path(rtc.input, rtc.task_name), checksumFile,
          rtc.task_name);
    } finally {
      // Both writers are already closed at this point; closeStream keeps a
      // failure while closing from masking an exception thrown above.
      IOUtils.closeStream(fs);
    }
    return super.collectStats(conf, threads, execTime);
  }
}