/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.splitermergers;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.data.DataFormats;

/**
 * This class defines a splitter for BAM files. A BAM file can be split either
 * in chunks holding a maximal number of alignment records (default mode) or in
 * one output file per reference sequence name.
 * @author Laurent Jourdren
 * @since 2.0
 */
public class BAMSplitter implements Splitter {

  private static final int DEFAULT_SPLIT_MAX_ENTRIES = 1000000;

  private int splitMaxEntries = DEFAULT_SPLIT_MAX_ENTRIES;
  private boolean splitByChromosomes;

  @Override
  public DataFormat getFormat() {

    return DataFormats.MAPPER_RESULTS_BAM;
  }

  /**
   * Configure the splitter. The supported parameters are
   * <code>max.entries</code> (maximal number of records per output file, read
   * with a lower bound of 1) and <code>chromosomes</code> (boolean that
   * enables the split by reference sequence name).
   * @param conf parameters of the splitter
   * @throws EoulsanException if a parameter is unknown or if the value of a
   *           parameter is rejected by the parameter accessors
   */
  @Override
  public void configure(final Set<Parameter> conf) throws EoulsanException {

    for (Parameter p : conf) {

      switch (p.getName()) {

      case "max.entries":
        this.splitMaxEntries = p.getIntValueGreaterOrEqualsTo(1);
        break;

      case "chromosomes":
        this.splitByChromosomes = p.getBooleanValue();
        break;

      default:
        throw new EoulsanException("Unknown parameter for "
            + getFormat().getName() + " splitter: " + p.getName());
      }
    }
  }

  /**
   * Split the input file using the mode selected in
   * {@link #configure(Set)}.
   * @param inFile input file
   * @param outFileIterator output files iterator
   * @throws IOException if an error occurs while reading or creating output
   *           files
   */
  @Override
  public void split(final DataFile inFile,
      final Iterator<DataFile> outFileIterator) throws IOException {

    if (this.splitByChromosomes) {
      splitByChromosomes(inFile, outFileIterator);
    } else {
      splitByLineCount(inFile, outFileIterator);
    }
  }

  /**
   * Split BAM file in chunks that contain at most the configured maximal
   * number of entries. No output file is requested from the iterator if the
   * input contains no record.
   * @param inFile input file
   * @param outFileIterator output files iterator
   * @throws IOException if an error occurs while reading or creating output
   *           files
   */
  private void splitByLineCount(final DataFile inFile,
      final Iterator<DataFile> outFileIterator) throws IOException {

    // Get temporary directory
    final File tmpDir = EoulsanRuntime.getRuntime().getTempDirectory();

    final int max = this.splitMaxEntries;

    // The reader is closed by the try-with-resources statement, even if an
    // error occurs while reading or writing
    try (SamReader reader = SamReaderFactory.makeDefault()
        .open(SamInputResource.of(inFile.open()))) {

      // Get SAM header
      final SAMFileHeader header = reader.getFileHeader();

      // The counter is local to the current chunk (instead of a global
      // counter tested with a modulo) so that it cannot overflow with inputs
      // that contain more than Integer.MAX_VALUE records
      int entriesInChunk = 0;
      SAMFileWriter writer = null;

      try {
        for (final SAMRecord record : reader) {

          if (writer == null || entriesInChunk == max) {

            // Close previous writer. The reference is cleared first to avoid
            // closing it a second time in the finally block if close() fails
            if (writer != null) {
              final SAMFileWriter previousWriter = writer;
              writer = null;
              previousWriter.close();
            }

            // Create new writer
            writer = createWriter(header, tmpDir, outFileIterator.next());
            entriesInChunk = 0;
          }

          writer.addAlignment(record);
          entriesInChunk++;
        }
      } finally {

        // Close the last writer, also when the loop has been interrupted by
        // an exception
        if (writer != null) {
          writer.close();
        }
      }
    }
  }

  /**
   * Split BAM file by chromosomes. One output file is created for each
   * distinct reference name returned by the records, in the order of the
   * first occurrence of the names in the input file. All the writers stay
   * open until the end of the input.
   * @param inFile input file
   * @param outFileIterator output files iterator
   * @throws IOException if an error occurs while reading or creating output
   *           files
   */
  public void splitByChromosomes(final DataFile inFile,
      final Iterator<DataFile> outFileIterator) throws IOException {

    // Get temporary directory
    final File tmpDir = EoulsanRuntime.getRuntime().getTempDirectory();

    final Map<String, SAMFileWriter> writers = new HashMap<>();

    // The reader is closed by the try-with-resources statement before the
    // execution of the finally block that closes the writers
    try (SamReader reader = SamReaderFactory.makeDefault()
        .open(SamInputResource.of(inFile.open()))) {

      // Get SAM header
      final SAMFileHeader header = reader.getFileHeader();

      for (final SAMRecord record : reader) {

        final String chromosome = record.getReferenceName();

        // A null value is never stored in the map, so a single lookup is
        // enough to test if the writer for the chromosome exists
        SAMFileWriter writer = writers.get(chromosome);

        if (writer == null) {

          // Create the writer for the chromosome
          writer = createWriter(header, tmpDir, outFileIterator.next());
          writers.put(chromosome, writer);
        }

        // Write the record
        writer.addAlignment(record);
      }
    } finally {

      // Close writers, also when the loop has been interrupted by an
      // exception
      closeAll(writers.values());
    }
  }

  /**
   * Create a BAM writer for an output file.
   * @param header SAM header to write in the output file
   * @param tmpDir temporary directory to use by the writer
   * @param outFile output file
   * @return a new BAM writer
   * @throws IOException if an error occurs while creating the output file
   */
  private static SAMFileWriter createWriter(final SAMFileHeader header,
      final File tmpDir, final DataFile outFile) throws IOException {

    // The second argument is the "presorted" flag of htsjdk: with false, the
    // writer does not assume that the records are added in the sort order of
    // the header
    return new SAMFileWriterFactory().setTempDirectory(tmpDir)
        .makeBAMWriter(header, false, outFile.create());
  }

  /**
   * Close all the writers. A failure while closing a writer does not prevent
   * the closing of the remaining writers: the first failure is thrown once
   * all the writers have been processed and the next ones are attached to it
   * as suppressed exceptions.
   * @param writers writers to close
   */
  private static void closeAll(final Iterable<SAMFileWriter> writers) {

    RuntimeException firstFailure = null;

    for (SAMFileWriter writer : writers) {

      try {
        writer.close();
      } catch (RuntimeException e) {

        if (firstFailure == null) {
          firstFailure = e;
        } else {
          firstFailure.addSuppressed(e);
        }
      }
    }

    if (firstFailure != null) {
      throw firstFailure;
    }
  }
}