/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mgmt.upload;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.google.common.collect.Lists;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntimeException;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.HadoopEoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormatConverter;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;
/**
* This class allows copying and transforming data in a distributed manner.
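* <p>A minimal usage sketch (the HDFS paths below are hypothetical):</p>
* <pre>
* Configuration conf = new Configuration();
* Path jobPath = new Path("hdfs:///tmp/eoulsan-distcp");
* Map&lt;DataFile, DataFile&gt; entries = new HashMap&lt;&gt;();
* entries.put(new DataFile("hdfs:///data/sample_1.fq"),
*     new DataFile("hdfs:///work/sample_1.tfq"));
* new DataFileDistCp(conf, jobPath).copy(entries);
* </pre>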
* @since 1.0
* @author Laurent Jourdren
*/
public class DataFileDistCp {
/* Default Charset. */
private static final Charset CHARSET =
Charset.forName(Globals.DEFAULT_FILE_ENCODING);
private final Configuration conf;
private final Path jobPath;
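/* Maximum duration allowed for a single file copy: 2 hours in milliseconds. */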
private static final long MAX_COPY_DURATION = 120 * 60 * 1000;
/**
* This inner class defines the mapper class for the DataFileDistCp
* map-reduce job.
* @author Laurent Jourdren
*/
public static final class DistCpMapper
extends Mapper<LongWritable, Text, Text, Text> {
private static final String COUNTER_GROUP_NAME = "DataSourceDistCp";
/**
* Internal class to store an exception if one occurs while copying.
* @author Laurent Jourdren
*/
private static final class MyIOExceptionWrapper {
public IOException ioexception;
}
@Override
protected void setup(final Context context)
throws IOException, InterruptedException {
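// Initialize the Eoulsan runtime from the Hadoop configuration if needed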
if (!EoulsanRuntime.isRuntime()) {
HadoopEoulsanRuntime.newEoulsanRuntime(context.getConfiguration());
}
}
@Override
protected void map(final LongWritable key, final Text value,
final Context context) throws IOException, InterruptedException {
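// Each input line contains the source and the destination paths separated by a tab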
final String val = value.toString();
final int tabPos = val.indexOf('\t');
if (tabPos == -1) {
return;
}
final Configuration conf = context.getConfiguration();
final String srcPathname = val.substring(0, tabPos);
final Path srcPath = new Path(srcPathname);
final Path destPath = new Path(val.substring(tabPos + 1));
final FileSystem srcFs = srcPath.getFileSystem(conf);
final FileSystem destFs = destPath.getFileSystem(conf);
// Statistic about src file
final FileStatus fStatusSrc = srcFs.getFileStatus(srcPath);
final long srcSize = fStatusSrc == null ? 0 : fStatusSrc.getLen();
getLogger().info("Start copy "
+ srcPathname + " to " + destPath + " (" + srcSize + " bytes)\n");
final long startTime = System.currentTimeMillis();
final DataFile src = new DataFile(srcPathname);
final DataFile dest = new DataFile(destPath.toString());
// Copy the file
copyFile(src, dest, context);
// Compute copy statistics
final long duration = System.currentTimeMillis() - startTime;
final FileStatus fStatusDest = destFs.getFileStatus(destPath);
final long destSize = fStatusDest == null ? 0 : fStatusDest.getLen();
final double speed =
destSize == 0 ? 0 : (double) destSize / (double) duration * 1000;
getLogger().info("End copy "
+ srcPathname + " to " + destPath + " in "
+ StringUtils.toTimeHumanReadable(duration) + " (" + destSize
+ " bytes, " + ((int) speed) + " bytes/s)\n");
context.getCounter(COUNTER_GROUP_NAME, "Input file size")
.increment(srcSize);
context.getCounter(COUNTER_GROUP_NAME, "Output file size")
.increment(destSize);
}
/**
* Copy the file in a separate thread and inform Hadoop that the copy is
* still alive by periodically incrementing a counter.
* @param src source
* @param dest destination
* @param context context object
* @throws InterruptedException if another thread has interrupted the
* current thread
* @throws IOException if an error occurs while copying data
*/
private static void copyFile(final DataFile src, final DataFile dest,
final Context context) throws InterruptedException, IOException {
// Define a wrapper object to store an exception thrown in the copy thread
final MyIOExceptionWrapper exp = new MyIOExceptionWrapper();
// Create the thread for copy
final Thread t = new Thread(new Runnable() {
@Override
public void run() {
try {
new DataFormatConverter(src, dest).convert();
} catch (IOException e) {
exp.ioexception = e;
}
}
});
// Start thread
t.start();
// Create a heartbeat counter, incremented every 5 seconds while the copy runs
final Counter counter =
context.getCounter(COUNTER_GROUP_NAME, "5_seconds");
final long startTime = System.currentTimeMillis();
// Sleep and increment the counter until the copy ends: each counter update
// reports progress and prevents Hadoop from killing the task during a long copy
while (t.isAlive()) {
Thread.sleep(5000);
counter.increment(1);
final long duration = System.currentTimeMillis() - startTime;
if (duration > MAX_COPY_DURATION) {
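// Note: the copy thread is not interrupted here; the failed task attempt
// is expected to be torn down by Hadoop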
throw new IOException("Copy timeout, copy exceed "
+ (MAX_COPY_DURATION / 1000) + " seconds.");
}
}
// Throw Exception if needed
if (exp.ioexception != null) {
throw exp.ioexception;
}
}
}
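/**
* Copy files in a distributed manner using a map-reduce job, with one file
* copied per map task.
* @param entries map of the source files to their destination files
* @throws IOException if an error occurs while copying the files
*/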
public void copy(final Map<DataFile, DataFile> entries) throws IOException {
if (entries == null || entries.isEmpty()) {
return;
}
final Configuration conf = this.conf;
final Path tmpInputDir =
PathUtils.createTempPath(this.jobPath, "distcp-in-", "", conf);
final Path tmpOutputDir =
PathUtils.createTempPath(this.jobPath, "distcp-out-", "", conf);
//
// Create entries for distcp
//
final FileSystem fs = tmpInputDir.getFileSystem(conf);
fs.mkdirs(tmpInputDir);
// Sort files by descending size so that the largest files are copied first
final List<DataFile> inFiles = Lists.newArrayList(entries.keySet());
sortInFilesByDescSize(inFiles);
// Set the number format used to build the copy task identifiers
final NumberFormat nf = NumberFormat.getInstance();
nf.setMinimumIntegerDigits(Integer.toString(inFiles.size()).length());
nf.setGroupingUsed(false);
int count = 0;
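// Write one entry file per copy so that each copy is handled by its own map task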
for (DataFile inFile : inFiles) {
count++;
final DataFile outFile = entries.get(inFile);
final Path f =
new Path(tmpInputDir, "distcp-" + nf.format(count) + ".cp");
getLogger().info("Task copy " + inFile + " in " + f.toString());
try (BufferedWriter bw = new BufferedWriter(
new OutputStreamWriter(fs.create(f), CHARSET))) {
bw.write(inFile.getSource() + "\t" + outFile.getSource() + "\n");
}
}
final Job job = createJobConf(conf, tmpInputDir, tmpOutputDir);
try {
job.waitForCompletion(false);
} catch (InterruptedException | ClassNotFoundException e) {
throw new EoulsanRuntimeException("Error while distcp: " + e.getMessage(),
e);
}
// Remove tmp directory
PathUtils.fullyDelete(tmpInputDir, conf);
PathUtils.fullyDelete(tmpOutputDir, conf);
if (!job.isSuccessful()) {
throw new IOException("Unable to copy files using DataFileDistCp.");
}
}
/**
* Sort a list of DataFile objects by descending file size.
* @param inFiles list of DataFile to sort
*/
private void sortInFilesByDescSize(final List<DataFile> inFiles) {
Collections.sort(inFiles, new Comparator<DataFile>() {
@Override
public int compare(final DataFile f1, final DataFile f2) {
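// A file whose size cannot be read is treated as the smallest (-1)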
long size1;
try {
size1 = f1.getMetaData().getContentLength();
} catch (IOException e) {
size1 = -1;
}
long size2;
try {
size2 = f2.getMetaData().getContentLength();
} catch (IOException e) {
size2 = -1;
}
return Long.compare(size2, size1);
}
});
}
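/**
* Create the Hadoop job that performs the copies.
* @param parentConf parent Hadoop configuration
* @param cpEntriesPath path of the directory that contains the copy entry files
* @param outputPath output path of the job
* @return a configured Job object
* @throws IOException if an error occurs while creating the job
*/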
private static Job createJobConf(final Configuration parentConf,
final Path cpEntriesPath, final Path outputPath) throws IOException {
final Configuration jobConf = new Configuration(parentConf);
// Set the task timeout to the maximum copy duration
jobConf.set("mapreduce.task.timeout", "" + MAX_COPY_DURATION);
// Create the job and its name
final Job job = Job.getInstance(jobConf, "DataFileDistcp");
// Set the jar
job.setJarByClass(DataFileDistCp.class);
// Add input path
FileInputFormat.addInputPath(job, cpEntriesPath);
// Set the input format
job.setInputFormatClass(TextInputFormat.class);
// Set the Mapper class
job.setMapperClass(DistCpMapper.class);
// No reducer class is set: the default identity reducer is used
// Set the output key class
job.setOutputKeyClass(Text.class);
// Set the output value class
job.setOutputValueClass(Text.class);
// Set the number of reducers
job.setNumReduceTasks(1);
// Set the output Path
FileOutputFormat.setOutputPath(job, outputPath);
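// Note: the mappers emit no key/value pairs, so the output directory
// will only contain empty files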
return job;
}
//
// Constructor
//
/**
* Public constructor.
* @param conf Configuration object
* @param jobPath the path where the job temporary files are created
*/
public DataFileDistCp(final Configuration conf, final Path jobPath) {
if (conf == null) {
throw new NullPointerException("The configuration is null");
}
if (jobPath == null) {
throw new NullPointerException("The job Path is null");
}
this.conf = conf;
this.jobPath = jobPath;
}
}