/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static com.google.common.base.Preconditions.checkNotNull;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.google.common.base.Joiner;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
/**
 * This class allows to convert two paired-end FASTQ files into one TFQ file
 * using an Hadoop MapReduce job.
 * @author Laurent Jourdren
 * @since 2.0
 */
public class PairedEndFastqToTfq {

  /**
   * This class defines the reducer required to convert FASTQ files into a TFQ
   * file. For each key, all the values are collected, sorted using the natural
   * {@link String} order and then written as a single tab separated value.
   * @author Laurent Jourdren
   * @since 2.0
   */
  public static final class FastqPairedEndReducer
      extends Reducer<Text, Text, Text, Text> {

    private static final Joiner JOINER = Joiner.on('\t');

    /**
     * Merge all the values of a key in one tab separated output value.
     * @param key the key shared by the values (presumably the read identifier
     *          produced by the input format, to be confirmed)
     * @param values the values to merge
     * @param context the reducer context used to write the merged entry
     * @throws IOException if an error occurs while writing the output
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(final Text key, final Iterable<Text> values,
        final Context context) throws IOException, InterruptedException {

      // Copy the values as Strings: Hadoop may reuse the Text instance returned
      // by the iterator
      final List<String> entries = new ArrayList<>();

      for (Text value : values) {
        entries.add(value.toString());
      }

      // Sort the entries to get a deterministic order of the columns
      Collections.sort(entries);

      context.write(key, new Text(JOINER.join(entries)));
    }
  }

  /**
   * Create the job to convert FASTQ files in a TFQ file.
   * @param parentConf Hadoop configuration
   * @param fastqFile1 the first FASTQ file
   * @param fastqFile2 the second FASTQ file
   * @param outputFile the output TFQ file
   * @param reducerTaskCount the reducer task count, if this value is not
   *          greater than 0 the reducer task count of the job is not changed
   * @return an Hadoop Job
   * @throws IOException if an error occurs while creating the Job
   */
  public static Job convert(final Configuration parentConf,
      final DataFile fastqFile1, final DataFile fastqFile2,
      final DataFile outputFile, final int reducerTaskCount)
      throws IOException {

    checkNotNull(parentConf, "parentConf argument cannot be null");
    checkNotNull(fastqFile1, "fastqFile1 argument cannot be null");
    checkNotNull(fastqFile2, "fastqFile2 argument cannot be null");
    checkNotNull(outputFile, "outputFile argument cannot be null");

    return convert(parentConf, new Path(fastqFile1.getSource()),
        new Path(fastqFile2.getSource()), new Path(outputFile.getSource()),
        reducerTaskCount);
  }

  /**
   * Create the job to convert FASTQ files in a TFQ file.
   * @param parentConf Hadoop configuration
   * @param fastqFile1 Path of the first FASTQ file
   * @param fastqFile2 Path of the second FASTQ file
   * @param outputFile Path of the output TFQ file
   * @param reducerTaskCount the reducer task count, if this value is not
   *          greater than 0 the reducer task count of the job is not changed
   * @return an Hadoop Job
   * @throws IOException if an error occurs while creating the Job
   */
  public static Job convert(final Configuration parentConf,
      final Path fastqFile1, final Path fastqFile2, final Path outputFile,
      final int reducerTaskCount) throws IOException {

    checkNotNull(parentConf, "parentConf argument cannot be null");
    checkNotNull(fastqFile1, "fastqFile1 argument cannot be null");
    checkNotNull(fastqFile2, "fastqFile2 argument cannot be null");
    checkNotNull(outputFile, "outputFile argument cannot be null");

    // Work on a copy of the configuration to leave the caller's one untouched
    final Configuration jobConf = new Configuration(parentConf);

    // Create the job and set its name
    final Job job = Job.getInstance(jobConf,
        "Convert FASTQ paired files in TFQ ("
            + fastqFile1.getName() + ", " + fastqFile2.getName() + ", "
            + outputFile.getName() + ")");

    // Set the jar
    job.setJarByClass(PairedEndFastqToTfq.class);

    // Set input paths
    FileInputFormat.addInputPath(job, fastqFile1);
    FileInputFormat.addInputPath(job, fastqFile2);

    // Set the input format
    job.setInputFormatClass(FastqInputFormat.class);

    // Set the Reducer class
    job.setReducerClass(FastqPairedEndReducer.class);

    // No combiner is set on purpose: FastqPairedEndReducer sorts and joins the
    // values, which is not an associative operation. Hadoop can apply a
    // combiner zero, one or several times on any subset of the values of a
    // key, and sorting partial results that are already joined does not always
    // give the same column order as sorting all the raw values in the reducer.

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the reducer task count
    if (reducerTaskCount > 0) {
      job.setNumReduceTasks(reducerTaskCount);
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, outputFile);

    return job;
  }
}