/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.design;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ADDITIONAL_ANNOTATION_TSV;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GFF;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GTF;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.GENOME_FASTA;
import static java.util.regex.Pattern.compile;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import fr.ens.biologie.genomique.aozan.illumina.samplesheet.SampleSheet;
import fr.ens.biologie.genomique.aozan.illumina.samplesheet.io.SampleSheetCSVReader;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.IlluminaReadId;
import fr.ens.biologie.genomique.eoulsan.bio.io.FastqReader;
import fr.ens.biologie.genomique.eoulsan.core.Naming;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFileMetadata;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.data.DataFormatRegistry;
import fr.ens.biologie.genomique.eoulsan.data.DataFormats;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
/**
* This class allows to easily build a Design object from file paths.
* @since 1.0
* @author Laurent Jourdren
*/
public class DesignBuilder {
private static final int MAX_FASTQ_ENTRIES_TO_READ = 10000;
private static final Pattern ILLUMINA_FASTQ_FILENAME_PATTERN =
compile("^(.+)_\\w+_L\\d\\d\\d_R\\d_\\d\\d\\d$");
private final DataFormatRegistry dfr = DataFormatRegistry.getInstance();
private final Map<String, List<FastqEntry>> fastqMap = new LinkedHashMap<>();
private final Map<String, String> prefixMap = new HashMap<>();
private DataFile genomeFile;
private DataFile gffFile;
private DataFile gtfFile;
private DataFile additionalAnnotationFile;
/**
* This class defines the exception thrown when a FASTQ file contains no
* entry. As it is a dedicated subclass of EoulsanException, callers can catch
* it separately from the other errors, for example to skip empty files with
* a warning instead of failing.
* @author Laurent Jourdren
*/
private static class EmptyFastqException extends EoulsanException {
// Serialization identifier of the exception
private static final long serialVersionUID = 5672764893232380662L;
/**
* Public constructor.
* @param msg exception message, passed unchanged to the parent exception
*/
public EmptyFastqException(final String msg) {
super(msg);
}
}
/**
 * This inner class defines a FASTQ entry: a FASTQ file, the sample
 * information attached to it and the pairing information (prefix and pair
 * member) deduced from the identifier of the first read of the file.
 * Two entries are equal if they have the same path.
 * @author Laurent Jourdren
 */
private static class FastqEntry {

  private static final String DATE_FORMAT = "yyyy-MM-dd";

  /** Length of the "/1" and "/2" pair member suffixes of read identifiers. */
  private static final int PAIR_SUFFIX_LENGTH = 2;

  private final DataFile path;
  private final String sampleId;
  private final String sampleName;
  private final String sampleDesc;
  private final String sampleOperator;
  private final String sampleDate;
  private final String firstReadId;
  private final String prefix;
  private final int pairMember;

  //
  // static methods
  //

  /**
   * Get the last modification date of a file.
   * @param file the file
   * @return the last modification date of the file formatted as "yyyy-MM-dd"
   *         or null if the metadata of the file cannot be read
   */
  private static String getDate(final DataFile file) {

    try {
      final long last = file.getMetaData().getLastModified();
      return new SimpleDateFormat(DATE_FORMAT).format(new Date(last));
    } catch (IOException e) {
      // The date is an optional information: a missing date is not an error
      return null;
    }
  }

  /**
   * Get the identifier of the first read of a fastq file.
   * @param f the input file
   * @return the identifier of the first read of a fastq file as a string
   * @throws EoulsanException if an error occurs while reading the file or if
   *           the read format is invalid. An EmptyFastqException is thrown if
   *           the file does not contain any entry
   */
  private static String getFirstReadSeqId(final DataFile f)
      throws EoulsanException {

    try {
      final FastqReader reader = new FastqReader(f.open());

      boolean empty = true;
      String firstReadName = null;

      try {
        empty = !reader.hasNext();

        if (!empty) {
          // Fetch the first entry while the reader is still open
          firstReadName = reader.next().getName();
        }
      } finally {
        // Always release the reader, even if an unchecked exception occurs
        reader.close();
      }

      // Report the error met by the reader while parsing the entry, if any
      reader.throwException();

      if (empty) {
        throw new EmptyFastqException(
            "Fastq file is empty: " + f.getSource());
      }

      return firstReadName;
    } catch (IOException | BadBioEntryException e) {
      throw new EoulsanException(e);
    }
  }

  /**
   * Define the name of the sample from the name of a FASTQ file. If the
   * filename looks like a Bcl2fastq output file, only the leading part of the
   * filename captured by ILLUMINA_FASTQ_FILENAME_PATTERN is kept.
   * @param path the FASTQ file
   * @return the name of the sample
   */
  private static String defineSampleName(final DataFile path) {

    final String basename = StringUtils.basename(path.getName());

    // Check if filename is a Bcl2fastq output file
    final Matcher matcher = ILLUMINA_FASTQ_FILENAME_PATTERN.matcher(basename);

    return matcher.matches() ? matcher.group(1) : basename;
  }

  /**
   * Compute the prefix and the pair member of the entry from the identifier
   * of its first read. The identifier is first parsed as an Illumina read
   * identifier. If this fails, the "/1" and "/2" suffixes are used, and an
   * identifier without such a suffix is considered as a first pair member
   * whose prefix is the whole identifier.
   * @return an array with the prefix (a String) at index 0 and the pair
   *         member (an Integer) at index 1
   */
  private Object[] initPairedEnd() {

    String prefix = this.firstReadId;
    int pairMember;

    try {
      final IlluminaReadId irid = new IlluminaReadId(this.firstReadId);

      prefix = irid.getInstrumentId()
          + "\t" + irid.getFlowCellLane() + "\t"
          + irid.getTileNumberInFlowCellLane() + "\t"
          + irid.getXClusterCoordinateInTile() + "\t"
          + irid.getYClusterCoordinateInTile();
      pairMember = irid.getPairMember();

    } catch (EoulsanException e) {

      // Not an Illumina identifier: rely on the "/1" and "/2" suffixes
      final boolean firstMember = this.firstReadId.endsWith("/1");

      if (firstMember || this.firstReadId.endsWith("/2")) {
        // Remove exactly the two characters of the suffix, so that the
        // prefix is the full read name shared by the two pair members
        prefix = this.firstReadId.substring(0,
            this.firstReadId.length() - PAIR_SUFFIX_LENGTH);
        pairMember = firstMember ? 1 : 2;
      } else {
        pairMember = 1;
      }
    }

    return new Object[] {prefix, pairMember};
  }

  //
  // Object methods
  //

  @Override
  public boolean equals(final Object obj) {

    if (obj == this) {
      return true;
    }

    if (!(obj instanceof FastqEntry)) {
      return false;
    }

    final FastqEntry that = (FastqEntry) obj;

    // Only the path identifies an entry
    return this.path.equals(that.path);
  }

  @Override
  public int hashCode() {

    return this.path.hashCode();
  }

  @Override
  public String toString() {

    final StringBuilder sb = new StringBuilder();

    sb.append("FastqEntry(Sample: ");
    sb.append(this.sampleId);

    if (this.sampleDesc != null) {
      sb.append(", Description: ");
      sb.append(this.sampleDesc);
    }

    if (this.sampleOperator != null) {
      sb.append(", Operator: ");
      sb.append(this.sampleOperator);
    }

    sb.append(", Path: ");
    sb.append(this.path);
    sb.append(")");

    return sb.toString();
  }

  //
  // Constructors
  //

  /**
   * Constructor. The name of the sample is deduced from the filename and the
   * identifier of the sample is the valid name computed from this name.
   * @param path the FASTQ file
   * @throws EoulsanException if the first read of the file cannot be read or
   *           if the file is empty
   */
  public FastqEntry(final DataFile path) throws EoulsanException {

    this.path = path;
    this.sampleName = defineSampleName(path);
    this.sampleId = Naming.toValidName(this.sampleName);
    this.sampleDesc = null;
    this.sampleOperator = null;
    this.sampleDate = getDate(path);
    this.firstReadId = getFirstReadSeqId(path);

    final Object[] array = initPairedEnd();
    this.prefix = (String) array[0];
    this.pairMember = (Integer) array[1];
  }

  /**
   * Constructor.
   * @param path the FASTQ file
   * @param sampleId identifier of the sample
   * @param sampleName name of the sample
   * @param sampleDesc description of the sample, may be null
   * @param sampleOperator operator of the sample, may be null
   * @throws EoulsanException if the first read of the file cannot be read or
   *           if the file is empty
   */
  public FastqEntry(final DataFile path, final String sampleId,
      final String sampleName, final String sampleDesc,
      final String sampleOperator) throws EoulsanException {

    this.path = path;
    this.sampleId = sampleId;
    this.sampleName = sampleName;
    this.sampleDesc = sampleDesc;
    this.sampleOperator = sampleOperator;
    this.sampleDate = getDate(path);
    this.firstReadId = getFirstReadSeqId(path);

    final Object[] array = initPairedEnd();
    this.prefix = (String) array[0];
    this.pairMember = (Integer) array[1];
  }
}
/**
 * Add a file to the design builder. A FASTQ file is registered as reads of a
 * sample, while a genome, GFF, GTF or additional annotation file replaces the
 * file of the same kind previously registered. A null file is ignored and an
 * empty FASTQ file is skipped with a warning.
 * @param file file to add
 * @throws EoulsanException if the file does not exist or if the type of the
 *           file is unknown
 */
public void addFile(final DataFile file) throws EoulsanException {

  if (file == null) {
    return;
  }

  if (!file.exists()) {
    throw new EoulsanException(
        "File " + file + " does not exist or is not a regular file.");
  }

  final String extension =
      StringUtils.extensionWithoutCompressionExtension(file.getName());

  // The metadata are optional: without them only the extension is used
  DataFileMetadata md;
  try {
    md = file.getMetaData();
  } catch (IOException e) {
    md = null;
  }

  // Handle first the files that are not FASTQ files
  if (!isDataFormatExtension(DataFormats.READS_FASTQ, extension, md)) {

    if (isDataFormatExtension(GENOME_FASTA, extension, md)) {
      this.genomeFile = file;
    } else if (isDataFormatExtension(ANNOTATION_GFF, extension, md)) {
      this.gffFile = file;
    } else if (isDataFormatExtension(ANNOTATION_GTF, extension, md)) {
      this.gtfFile = file;
    } else if (isDataFormatExtension(ADDITIONAL_ANNOTATION_TSV, extension,
        md)) {
      this.additionalAnnotationFile = file;
    } else {
      throw new EoulsanException("Unknown file type: " + file);
    }

    return;
  }

  final FastqEntry entry;
  try {
    entry = new FastqEntry(file);
  } catch (EmptyFastqException e) {
    getLogger().warning(e.getMessage());
    return;
  }

  // Files that share the same read prefix belong to the same sample
  final boolean knownPrefix = this.prefixMap.containsKey(entry.prefix);
  final String sampleId =
      knownPrefix ? this.prefixMap.get(entry.prefix) : entry.sampleId;

  if (!knownPrefix) {
    this.prefixMap.put(entry.prefix, sampleId);
  }

  List<FastqEntry> sampleEntries = this.fastqMap.get(sampleId);
  if (sampleEntries == null) {
    sampleEntries = new ArrayList<>();
    this.fastqMap.put(sampleId, sampleEntries);
  }

  // Don't add previously added file
  if (!sampleEntries.contains(entry)) {
    sampleEntries.add(entry);
  }
}
/**
 * Add a file to the design builder from its filename. A null filename is
 * ignored.
 * @param filename filename of the file to add
 * @throws EoulsanException if the file does not exist or if the type of the
 *           file is unknown
 */
public void addFile(final String filename) throws EoulsanException {

  if (filename != null) {
    getLogger().info("Add file " + filename + " to design.");
    addFile(new DataFile(filename));
  }
}
/**
 * Add several files to the design builder from their filenames. A null array
 * is ignored.
 * @param filenames array with the filenames to add
 * @throws EoulsanException if one of the files does not exist or if its type
 *           is unknown
 */
public void addFiles(final String[] filenames) throws EoulsanException {

  if (filenames != null) {
    for (final String name : filenames) {
      addFile(name);
    }
  }
}
/**
 * Add several files to the design builder from their filenames. A null list
 * is ignored.
 * @param filenames list with the filenames to add
 * @throws EoulsanException if one of the files does not exist or if its type
 *           is unknown
 */
public void addFiles(final List<String> filenames) throws EoulsanException {

  if (filenames != null) {
    for (final String name : filenames) {
      addFile(name);
    }
  }
}
/**
 * Add all the samples from a Bcl2fastq samplesheet. For each sample of the
 * samplesheet that belongs to the requested project, the FASTQ files of the
 * sample are searched in the Bcl2fastq output directory and added to the
 * builder. Samples without data directory are ignored, and empty FASTQ files
 * are skipped with a warning. Nothing is done if the samplesheet or the
 * output directory is null.
 * @param samplesheet The Bcl2fastq samplesheet object
 * @param projectName name of the project, if null the samples of all the
 *          projects are added
 * @param bcl2fastqOutputDir the output directory of Bcl2fastq demultiplexing
 * @throws EoulsanException if an error occurs while adding the Bcl2fastq
 *           samplesheet
 */
public void addBcl2FastqSamplesheetProject(final SampleSheet samplesheet,
    final String projectName, final File bcl2fastqOutputDir)
    throws EoulsanException {

  if (samplesheet == null || bcl2fastqOutputDir == null) {
    return;
  }

  if (!bcl2fastqOutputDir.exists() || !bcl2fastqOutputDir.isDirectory()) {
    throw new EoulsanException(
        "The Bcl2fastq output directory does not exists: "
            + bcl2fastqOutputDir);
  }

  for (fr.ens.biologie.genomique.aozan.illumina.samplesheet.Sample sample : samplesheet) {

    final String sampleProject = sample.getSampleProject();
    final String sampleId = sample.getSampleId();
    final String sampleName = sample.getSampleName();
    final String sampleDesc = sample.getDescription();
    final String sampleOperator = sample.get("Operator");
    final int sampleLane = sample.getLane();

    // Check if sample id field exist for sample
    if (sampleId == null) {
      throw new EoulsanException(
          "No sample Id field found for sample: " + sample);
    }

    final String samplePrefix = sampleName == null ? sampleId : sampleName;

    // Select only project samples
    if (projectName != null && !projectName.equals(sampleProject)) {
      continue;
    }

    // Detect the Bcl2fastq 1 layout from the project of the sample and not
    // from the project filter: the filter may be null, and when it is set it
    // is equal to the project of the sample at this point
    final boolean bcl2fastq1Layout =
        new File(bcl2fastqOutputDir.getPath() + "/Project_" + sampleProject)
            .isDirectory();

    final File dataDir;

    if (bcl2fastq1Layout) {
      dataDir = new File(bcl2fastqOutputDir.getPath()
          + "/Project_" + sampleProject + "/Sample_" + sampleId);
    } else {
      final File projectDir =
          new File(bcl2fastqOutputDir.getPath() + "/" + sampleProject);

      // Check if a sample sub directory may exist
      final String subdir = defineSampleSubDirName(sampleId, sampleName);
      dataDir = "".equals(subdir) ? projectDir : new File(projectDir, subdir);
    }

    // Test if the directory with fastq files exists
    if (!dataDir.exists() || !dataDir.isDirectory()) {
      continue;
    }

    // A lane value of -1 selects the files of all the lanes
    final String laneKey =
        sampleLane == -1 ? "_L" : String.format("_L%03d_", sampleLane);

    // List the input FASTQ files
    final File[] files = dataDir.listFiles(new FileFilter() {

      @Override
      public boolean accept(final File f) {

        final String filename =
            StringUtils.filenameWithoutCompressionExtension(f.getName());

        return (filename.endsWith(".fastq") || filename.endsWith(".fq"))
            && filename.contains(laneKey)
            && samplePrefix.equals(parseSampleNameFromFilename(filename));
      }
    });

    // File.listFiles() returns null if the directory cannot be read
    if (files == null) {
      getLogger().warning("Unable to list the files of the directory: "
          + dataDir);
      continue;
    }

    // Sort the list of input FASTQ files
    Arrays.sort(files);

    final String normalizedSampleId = Naming.toValidName(sampleId);

    for (File fastqFile : files) {

      List<FastqEntry> list = this.fastqMap.get(normalizedSampleId);
      if (list == null) {
        list = new ArrayList<>();
        this.fastqMap.put(normalizedSampleId, list);
      }

      try {
        final FastqEntry entry =
            new FastqEntry(new DataFile(fastqFile), normalizedSampleId,
                samplePrefix, sampleDesc, sampleOperator);

        // Don't add previously added file, as when adding a single file
        if (!list.contains(entry)) {
          list.add(entry);
        }
      } catch (EmptyFastqException e) {
        getLogger().warning(e.getMessage());
      }
    }
  }
}
/**
 * Add all the samples from a Bcl2fastq samplesheet file. The argument can be
 * the samplesheet file itself or a directory that contains exactly one CSV
 * file. The directory of the samplesheet is used as the Bcl2fastq output
 * directory. Nothing is done if the samplesheet file is null.
 * @param samplesheetFile the path to the samplesheet file or to the directory
 *          that contains it
 * @param projectName the name of the project, if null the samples of all the
 *          projects are added
 * @throws EoulsanException if the samplesheet does not exist, if zero or more
 *           than one CSV file is found in the directory or if an error occurs
 *           while reading the samplesheet
 */
public void addBcl2FastqSamplesheetProject(final File samplesheetFile,
    final String projectName) throws EoulsanException {

  if (samplesheetFile == null) {
    return;
  }

  getLogger().info("Add Bcl2fastq samplesheet file "
      + samplesheetFile + " to design with " + (projectName == null
          ? "no project filter." : projectName + " project filter."));

  if (!samplesheetFile.exists()) {
    throw new EoulsanException(
        "The Bcl2fastq samplesheet file does not exists: " + samplesheetFile);
  }

  final File baseDir;
  final File file;

  if (samplesheetFile.isDirectory()) {

    baseDir = samplesheetFile;

    final File[] files = baseDir.listFiles(new FilenameFilter() {

      @Override
      public boolean accept(final File dir, final String filename) {

        return filename.endsWith(".csv");
      }
    });

    if (files == null || files.length == 0) {
      throw new EoulsanException(
          "No Bcl2fastq samplesheet file found in directory: " + baseDir);
    }

    if (files.length > 1) {
      throw new EoulsanException(
          "More than one Bcl2fastq samplesheet found in directory: "
              + baseDir);
    }

    file = files[0];
  } else {

    // The parent of a relative path without directory component is null:
    // resolve the path first, otherwise the samplesheet would be silently
    // ignored as a null output directory is skipped
    baseDir = samplesheetFile.getAbsoluteFile().getParentFile();
    file = samplesheetFile;
  }

  try {
    final SampleSheetCSVReader reader = new SampleSheetCSVReader(file);
    addBcl2FastqSamplesheetProject(reader.read(), projectName, baseDir);
  } catch (IOException e) {
    throw new EoulsanException(e);
  }
}
/**
 * Create a design object from the files added to the builder. The design
 * contains a single experiment named "exp1". In paired-end mode one sample is
 * created for each group of paired files, otherwise one sample is created for
 * each FASTQ file. A letter is appended to the identifier and to the name of
 * the samples when several samples are created for the same sample
 * identifier.
 * @param pairedEndMode true if the paired end mode is enabled
 * @return a new Design object
 * @throws EoulsanException if an error occurs while analyzing input files
 */
public Design getDesign(final boolean pairedEndMode) throws EoulsanException {

  final Design design = DesignFactory.createEmptyDesign();
  design.addExperiment("exp1");

  final FastqFormat defaultFastqFormat =
      EoulsanRuntime.getSettings().getDefaultFastqFormat();

  for (Map.Entry<String, List<FastqEntry>> mapEntry : this.fastqMap
      .entrySet()) {

    final String sampleId = mapEntry.getKey();
    final List<FastqEntry> sampleEntries = mapEntry.getValue();
    final List<List<FastqEntry>> groups = findPairedEndFiles(sampleEntries);

    final boolean uniqueGroup = groups.size() == 1;
    final boolean uniqueEntry = sampleEntries.size() == 1;

    // Index of the sample among the samples that share the same identifier
    int count = 0;

    for (List<FastqEntry> group : groups) {

      // The first file of the group provides the sample information
      final FastqEntry first = group.get(0);
      final String sampleName = first.sampleName;
      final String condition = first.sampleName;

      if (!pairedEndMode) {

        // One sample for each file
        for (FastqEntry fe : group) {

          final String finalSampleId = uniqueEntry
              ? sampleId : sampleId + StringUtils.toLetter(count);
          final String finalSampleName = uniqueEntry
              ? sampleName : sampleName + StringUtils.toLetter(count);

          addSample(design, finalSampleId, finalSampleName, first.sampleDesc,
              condition, first.sampleDate, first.sampleOperator,
              defaultFastqFormat,
              Collections.singletonList(fe.path.getSource()), fe.path);
          count++;
        }

        continue;
      }

      // One sample for each group of paired files
      final String finalSampleId =
          uniqueGroup ? sampleId : sampleId + StringUtils.toLetter(count);
      final String finalSampleName =
          uniqueGroup ? sampleName : sampleName + StringUtils.toLetter(count);

      // Convert the list of DataFiles to a list of filenames
      final List<String> filenames = new ArrayList<>(group.size());
      for (FastqEntry fe : group) {
        filenames.add(fe.path.getSource());
      }

      addSample(design, finalSampleId, finalSampleName, first.sampleDesc,
          condition, first.sampleDate, first.sampleOperator,
          defaultFastqFormat, filenames, first.path);
      count++;
    }
  }

  return design;
}
/**
 * Add a sample to the design and to the first experiment of the design. The
 * genome and annotation files registered in the builder are also set in the
 * metadata of the design. Nothing is done if the design is null.
 * @param design Design object
 * @param sampleId the id of the sample
 * @param sampleName the name of the sample, may be null
 * @param desc description of the sample, may be null
 * @param condition condition, also used as technical replicate group
 * @param date date of the sample, may be null
 * @param operator operator for the sample, may be null
 * @param defaultFastqFormat fastq format to use if the format of the file to
 *          check cannot be identified
 * @param filenames list of the fastq files for the sample
 * @param fileToCheck DataFile of the file to use to check fastq format
 * @throws EoulsanException if an error occurs while adding the sample
 */
private void addSample(final Design design, final String sampleId,
    final String sampleName, final String desc, final String condition,
    final String date, final String operator,
    final FastqFormat defaultFastqFormat, final List<String> filenames,
    final DataFile fileToCheck) throws EoulsanException {

  if (design == null) {
    return;
  }

  // Create the sample
  design.addSample(sampleId);
  final Sample sample = design.getSample(sampleId);

  if (sampleName != null) {
    sample.setName(sampleName);
  }

  final SampleMetadata sampleMetadata = sample.getMetadata();

  // Set the fastq file of the sample
  sampleMetadata.setReads(filenames);

  // Set the description of the sample if exists
  if (desc != null) {
    sampleMetadata.setDescription(desc);
  } else if (sample.getMetadata().containsDescription()) {
    sampleMetadata.setDescription("no description");
  }

  // Set the date of the sample if exists
  if (date != null) {
    sampleMetadata.setDate(date);
  }

  // Set the operator of the sample if exists
  if (operator != null) {
    sampleMetadata.setOperator(operator);
  } else if (sample.getMetadata().containsOperator()) {
    sampleMetadata.setOperator("unknown operator");
  }

  // Set the genome file if exists
  if (this.genomeFile != null) {
    design.getMetadata().setGenomeFile(this.genomeFile.toString());
  }

  // Set the GFF Annotation file
  if (this.gffFile != null) {
    design.getMetadata().setGffFile(this.gffFile.toString());
  }

  // Set the GTF Annotation file
  if (this.gtfFile != null) {
    design.getMetadata().setGtfFile(this.gtfFile.toString());
  }

  // Set additional annotation file
  if (this.additionalAnnotationFile != null) {
    design.getMetadata().setAdditionalAnnotationFile(
        this.additionalAnnotationFile.toString());
  }

  // Identify Fastq format
  final FastqFormat identifiedFormat;
  try {
    getLogger().info("Check fastq format for " + fileToCheck);
    identifiedFormat = FastqFormat.identifyFormat(fileToCheck.open(),
        MAX_FASTQ_ENTRIES_TO_READ);
  } catch (IOException | BadBioEntryException e) {
    throw new EoulsanException(e);
  }

  // Use the default format if the format cannot be identified
  if (identifiedFormat == null) {
    sampleMetadata.setFastqFormat(defaultFastqFormat);
  } else {
    sampleMetadata.setFastqFormat(identifiedFormat);
  }

  // Set replicate technical group
  sampleMetadata.setRepTechGroup(condition);

  // Set UUID for the sample
  sampleMetadata.setUUID(UUID.randomUUID().toString());

  // Get the experiment sample of the unique experiment of the design
  final Experiment experiment = design.getExperiments().get(0);
  final ExperimentSample experimentSample = experiment.addSample(sample);

  // Set the condition
  experimentSample.getMetadata().setCondition(condition);

  // Set the default reference
  experimentSample.getMetadata().setReference(false);
}
/**
 * Test if a file is in a given format. The format declared in the metadata
 * of the file is used when available, otherwise the format is searched among
 * the formats registered for the extension of the file.
 * @param dataFormat the expected format
 * @param extension the extension of the file, without compression extension
 * @param md the metadata of the file, may be null
 * @return true if the file is in the expected format
 */
private boolean isDataFormatExtension(final DataFormat dataFormat,
    final String extension, final DataFileMetadata md) {

  final DataFormat declaredFormat = md == null ? null : md.getDataFormat();

  // The format declared in the metadata takes precedence over the extension
  if (declaredFormat != null) {
    return dataFormat.equals(declaredFormat);
  }

  boolean found = false;

  for (DataFormat candidate : this.dfr
      .getDataFormatsFromExtension(extension)) {
    if (candidate == dataFormat) {
      found = true;
      break;
    }
  }

  return found;
}
/**
 * Group paired end files. The files that share the same read prefix are put
 * in the same group, and the groups are returned in the order of the first
 * occurrence of their prefix. In a group of two files, the file of the first
 * pair member is always put before the file of the second pair member.
 * @param files the FASTQ entries to group
 * @return a list of groups of 1-2 paired end files
 * @throws EoulsanException if a group contains more than two files, or if
 *           the pair members of a group of two files are identical or
 *           invalid
 */
private List<List<FastqEntry>> findPairedEndFiles(
    final List<FastqEntry> files) throws EoulsanException {

  final Map<String, List<FastqEntry>> groupsByPrefix = new HashMap<>();
  final List<List<FastqEntry>> groups = new ArrayList<>();

  // Group the files by prefix
  for (FastqEntry entry : files) {

    List<FastqEntry> group = groupsByPrefix.get(entry.prefix);

    if (group == null) {
      group = new ArrayList<>();
      groupsByPrefix.put(entry.prefix, group);
      groups.add(group);
    }

    group.add(entry);
  }

  // Order the paired end files
  for (List<FastqEntry> group : groups) {

    final int groupSize = group.size();

    // Check invalid number of files
    if (groupSize > 2) {
      throw new EoulsanException(
          "Found more than 2 files for a sample in paired-end mode: " + group);
    }

    // Nothing to check for a single file
    if (groupSize != 2) {
      continue;
    }

    final int member1 = group.get(0).pairMember;
    final int member2 = group.get(1).pairMember;

    if (member1 == member2) {
      throw new EoulsanException(
          "Found two files with the same pair member: " + group);
    }

    if (member1 < 1 || member1 > 2) {
      throw new EoulsanException(
          "Invalid pair member for file: " + group.get(0));
    }

    if (member2 < 1 || member2 > 2) {
      throw new EoulsanException(
          "Invalid pair member for file: " + group.get(1));
    }

    // Change the order of the file if necessary
    if (member1 == 2 && member2 == 1) {
      Collections.swap(group, 0, 1);
    }
  }

  return groups;
}
/**
 * Parse the sample name from its filename. The filename is split on the '_'
 * character and the sample name is made of all the fields except the last
 * four ones.
 * @param filename the filename to parse
 * @return the sample name or null if the filename is null or contains less
 *         than five fields
 */
private static String parseSampleNameFromFilename(final String filename) {

  if (filename == null) {
    return null;
  }

  final String[] fields = filename.split("_");

  // The last four fields are not part of the sample name
  final int nameFieldCount = fields.length - 4;

  if (nameFieldCount < 1) {
    return null;
  }

  final StringBuilder sb = new StringBuilder(fields[0]);

  for (int i = 1; i < nameFieldCount; i++) {
    sb.append('_').append(fields[i]);
  }

  return sb.toString();
}
/**
 * Get the sample sub directory. The sub directory is the trimmed sample
 * identifier, and it is only defined when both the sample identifier and the
 * sample name are set and not blank.
 * @param sampleId sample identifier
 * @param sampleName sample name
 * @return the sample sub directory or an empty string
 */
public static String defineSampleSubDirName(final String sampleId,
    final String sampleName) {

  if (sampleId == null || sampleName == null) {
    return "";
  }

  final String trimmedId = sampleId.trim();

  if (trimmedId.isEmpty() || sampleName.trim().isEmpty()) {
    return "";
  }

  return trimmedId;
}
//
// Constructor
//
/**
* Public constructor. Create a builder without any file: the files must
* then be added with the addFile(), addFiles() or
* addBcl2FastqSamplesheetProject() methods.
*/
public DesignBuilder() {
}
/**
* Public constructor. Create a builder and add the given files to it by
* calling addFiles(), so a null array leaves the builder empty.
* @param filenames filenames to add
* @throws EoulsanException if a file to add to the design does not exist or
* is not handled
*/
public DesignBuilder(final String[] filenames) throws EoulsanException {
// NOTE(review): addFiles() is a public non-final method called from a
// constructor; a subclass overriding it would run before its own
// initialization
addFiles(filenames);
}
}