/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.IO.extractor; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.NavigableSet; import java.util.Queue; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentLinkedQueue; import net.sf.samtools.AlignmentBlock; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFormatException; import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecordIterator; import edu.yu.einstein.genplay.core.IO.dataReader.ChromosomeWindowReader; import edu.yu.einstein.genplay.core.IO.dataReader.DataReader; import edu.yu.einstein.genplay.core.IO.dataReader.SCWReader; import edu.yu.einstein.genplay.core.IO.dataReader.StrandReader; import edu.yu.einstein.genplay.core.IO.utils.StrandedExtractorOptions; import edu.yu.einstein.genplay.core.IO.utils.SAMRecordFilter.SAMRecordFilter; import edu.yu.einstein.genplay.core.manager.project.ProjectChromosomes; import edu.yu.einstein.genplay.core.manager.project.ProjectManager; import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome; import edu.yu.einstein.genplay.dataStructure.chromosomeWindow.ChromosomeWindow; import edu.yu.einstein.genplay.dataStructure.chromosomeWindow.SimpleChromosomeWindow; import edu.yu.einstein.genplay.dataStructure.enums.Strand; import edu.yu.einstein.genplay.dataStructure.gene.Gene; import edu.yu.einstein.genplay.dataStructure.gene.SimpleGene; import edu.yu.einstein.genplay.exception.exceptions.DataLineException; /** * Extractor that extract data from a SAM / BAM file * @author Julien Lajugie */ public class SAMExtractor extends Extractor implements DataReader, ChromosomeWindowReader, SCWReader, StrandReader, StrandedExtractor { /** * Comparator that compares {@link ChromosomeWindow} objects on their start positions. * Using this comparator assure that 2 {@link ChromosomeWindow} are never equal. * This is comparator is useful when using {@link Set} because the {@link Set#add(Object)} method * doesn't insert a value in the set if this value is already present. * @author Julien Lajugie */ private class NeverEqualChromosomeWindowStartComparator implements Comparator<ChromosomeWindow>{ /** * @return -1 if the start position of the first window is smaller or equal to the one of the second window. * Returns 1 otherwise. */ @Override public int compare(ChromosomeWindow chromosomeWindow1, ChromosomeWindow chromosomeWindow2) { if (chromosomeWindow1.getStart() <= chromosomeWindow2.getStart()) { return -1; } else { return 1; } } } /** * Default first base position of bed files. SAM files are 1-based * Even though BAM files are 0-based, {@link SAMFileReader} objects * returns 1 base coordinates. * */ public static final int DEFAULT_FIRST_BASE_POSITION = 1; private int firstBasePosition = DEFAULT_FIRST_BASE_POSITION; // position of the first base private final SAMReadGroupRecord[] readGroups; // read groups of the SAM file private final String[] programNames; // programs used to generate and process the SAM file private final SAMFileReader samReader; // reader that read sam / bam files (from sam.jar) private final SAMRecordIterator iterator; // iterator in the file private StrandedExtractorOptions strandOptions; // options on the strand and read length / shift private final List<SAMRecordFilter> recordFilters; // SAM record filters (we don't consider records that don't pass these filters) private boolean isPairedMode; // true if the extractor is in pair end mode private Chromosome chromosome; // chromosome of the last record read (a record has exactly one chromosome) private final Queue<Integer> startQueue; // queue containing the start positions of the last record read (a record can have more than one start if split) private final Queue<Integer> stopQueue; // queue containing the stop positions of the last record read (a record can have more than one stop if split) private final Queue<Strand> strandQueue; // queue containing the strands of the last record read (a record can have more than one stop if split) private final NavigableSet<Gene> waitingAlignmentBlocks; // set containing the alignment blocks sorted by start position private boolean lastChromosomeJustFlushed = false; // true if the waiting list of the last chromosome was just flushed /** * Creates an instance of {@link SAMExtractor} * @param dataFile SAM / BAM file to extract */ public SAMExtractor(File dataFile) { super(dataFile); samReader = new SAMFileReader(dataFile); readGroups = retrieveReadGroups(); programNames = retrieveProgramNames(); iterator = samReader.iterator(); startQueue = new ConcurrentLinkedQueue<Integer>(); stopQueue = new ConcurrentLinkedQueue<Integer>(); strandQueue = new ConcurrentLinkedQueue<Strand>(); recordFilters = new ArrayList<SAMRecordFilter>(); waitingAlignmentBlocks = new TreeSet<Gene>(new NeverEqualChromosomeWindowStartComparator()); } /** * Add the specified alignment to the waiting list * @param blockToAdd * @param strand */ private void addBlockToWaitingList(AlignmentBlock blockToAdd, Strand strand) { int start = blockToAdd.getReferenceStart(); int stop = start + blockToAdd.getLength(); // compute the read position with specified strand shift and read length if (strandOptions != null) { SimpleChromosomeWindow resultStartStop = strandOptions.computeStartStop(chromosome, start, stop, strand); start = resultStartStop.getStart(); stop = resultStartStop.getStop(); } // if we are in a multi-genome project, we compute the position on the meta genome start = getRealGenomePosition(chromosome, start); stop = getRealGenomePosition(chromosome, stop); waitingAlignmentBlocks.add(new SimpleGene(null, strand, start, stop, 0, null)); } /** * Add a filter to apply on the reads. Only the reads that pass all the filters will be extracted. * @param recordFilter */ public void addFilter(SAMRecordFilter recordFilter) { recordFilters.add(recordFilter); } /** * Applies the SAM record filters to a specified samRecord * @param samRecord a {@link SAMRecord} * @return the record itself if it passes the filter tests, null otherwise. */ private SAMRecord applyFilters(SAMRecord samRecord) { for (SAMRecordFilter currentFilter: recordFilters) { samRecord = currentFilter.applyFilter(samRecord); if (samRecord == null) { return null; } } return samRecord; } /** * Removes all the alignments with a start smaller than a specified value from the waiting list * and adds them to the list of the alignments ready to be retrieved. * @param position */ private void flushWaitingAlignments(int position) { ChromosomeWindow chromosomeWindow = new SimpleChromosomeWindow(position, position + 1); while (!waitingAlignmentBlocks.isEmpty() && (waitingAlignmentBlocks.first().compareTo(chromosomeWindow) <= 0)) { Gene removedWaitingBlock = waitingAlignmentBlocks.pollFirst(); startQueue.add(removedWaitingBlock.getStart()); stopQueue.add(removedWaitingBlock.getStop()); strandQueue.add(removedWaitingBlock.getStrand()); } } @Override public Chromosome getChromosome() { return chromosome; } @Override public int getFirstBasePosition() { return firstBasePosition; } /** * @return the header of the BAM file */ public String getHeaderString() { return samReader.getFileHeader().getTextHeader(); } /** * @return the list of the programs that were used to generate the SAM / BAM file */ public String[] getProgramNames() { return programNames; } /** * @return the read groups of the SAM / BAM file */ public SAMReadGroupRecord[] getReadGroups() { return readGroups; } @Override public Float getScore() { return 1f; } @Override public Integer getStart() { return startQueue.peek(); } @Override public Integer getStop() { return stopQueue.peek(); } @Override public Strand getStrand() { return strandQueue.peek(); } @Override public StrandedExtractorOptions getStrandedExtractorOptions() { return strandOptions; } /** * @param samRecord * @return true if the read is the leftmost one of a pair */ private boolean isLeftMostOfPair(SAMRecord samRecord) { return !samRecord.getReadNegativeStrandFlag(); } /** * @return true if the extractor is in pair end mode */ public boolean isPairedMode() { return isPairedMode; } /** * Process a paired SAM record and extract its starts and stops * @param samRecord a {@link SAMRecord} * @return true if a window was extracted from the record, false otherwise */ private boolean processPairedSamRecord(SAMRecord samRecord) { // check if the read is the leftmost of the pair if (isLeftMostOfPair(samRecord)) { int start = samRecord.getAlignmentStart(); int stop = samRecord.getMateAlignmentStart() + 1; Strand strand = samRecord.getFirstOfPairFlag() ? Strand.FIVE : Strand.THREE; // compute the read position with specified strand shift and read length if (strandOptions != null) { if (strandOptions.isSelected(strand)) { SimpleChromosomeWindow resultStartStop = strandOptions.computeStartStop(chromosome, start, stop, strand); start = resultStartStop.getStart(); stop = resultStartStop.getStop(); } else { return false; } } // if we are in a multi-genome project, we compute the position on the meta genome start = getRealGenomePosition(chromosome, start); stop = getRealGenomePosition(chromosome, stop); if ((stop - start) > 0) { startQueue.add(start); stopQueue.add(stop); strandQueue.add(strand); return true; } } return false; } /** * Process the specified {@link SAMRecord} and extract its chromosome, starts, stops and strand values * @param samRecord a {@link SAMRecord} * @return true if an alignment was extracted */ private boolean processSamRecord(SAMRecord samRecord) { ProjectChromosomes projectChromosomes = ProjectManager.getInstance().getProjectChromosomes(); if (isPairedMode) { chromosome = projectChromosomes.get(samRecord.getReferenceName()); return processPairedSamRecord(samRecord); } Strand strand = samRecord.getReadNegativeStrandFlag() ? Strand.THREE : Strand.FIVE; Chromosome currentChromosome = projectChromosomes.get(samRecord.getReferenceName()); // when a new chromosome start being extracted we flush all the waiting alignments from the previous chromosome if ((chromosome != null) && (currentChromosome != chromosome) && !waitingAlignmentBlocks.isEmpty() && !lastChromosomeJustFlushed) { flushWaitingAlignments(chromosome.getLength() + 1); // add the other blocks to the waiting list to make sure that they will be retrieved in sorted order for (AlignmentBlock currentBlock: samRecord.getAlignmentBlocks()) { addBlockToWaitingList(currentBlock, strand); lastChromosomeJustFlushed = true; } return true; } chromosome = currentChromosome; lastChromosomeJustFlushed = false; List<AlignmentBlock> alignmentBlocks = samRecord.getAlignmentBlocks(); if (!alignmentBlocks.isEmpty()) { AlignmentBlock firstBlock = alignmentBlocks.get(0); int start = firstBlock.getReferenceStart(); int stop = start + firstBlock.getLength(); // compute the read position with specified strand shift and read length if (strandOptions != null) { if (strandOptions.isSelected(strand)) { SimpleChromosomeWindow resultStartStop = strandOptions.computeStartStop(chromosome, start, stop, strand); start = resultStartStop.getStart(); stop = resultStartStop.getStop(); } else { return false; } } // if we are in a multi-genome project, we compute the position on the meta genome start = getRealGenomePosition(chromosome, start); stop = getRealGenomePosition(chromosome, stop); // add the waiting alignments with a start position smaller than the current alignment flushWaitingAlignments(start); // add the first alignment block to the start and stop list ready to be retrieved startQueue.add(start); stopQueue.add(stop); strandQueue.add(strand); // add the other blocks to the waiting list to make sure that they will be retrieved in sorted order // if the strand option has been specified we don't consider the other blocks if (strandOptions == null) { for (int i = 1; i < alignmentBlocks.size(); i++) { addBlockToWaitingList(alignmentBlocks.get(i), strand); } } return true; } return false; } @Override public boolean readItem() throws IOException { if (isStopped()) { return false; } // remove the last item from the queues if (!startQueue.isEmpty()) { startQueue.poll(); stopQueue.poll(); strandQueue.poll(); // nothing to do if there still some items in the queues if (!startQueue.isEmpty()) { return true; } } SAMRecord samRecord = null; while (iterator.hasNext()) { try { samRecord = iterator.next(); String chromosomeName = samRecord.getReferenceName(); // case where last chromosome already extracted, no more data to extract if ((getChromosomeSelector() != null) && (getChromosomeSelector().isExtractionDone(chromosomeName))) { return false; } // chromosome was selected for extraction if ((getChromosomeSelector() == null) || getChromosomeSelector().isSelected(chromosomeName)) { samRecord = applyFilters(samRecord); if ((samRecord != null) && processSamRecord(samRecord)) { return true; } } } catch (SAMFormatException e) { DataLineException dataLineException = new DataLineException(e.getMessage(), DataLineException.SKIP_PROCESS); dataLineException.setFile(getDataFile()); if ((samRecord != null) && (samRecord.getSAMString() != null)) { dataLineException.setLine(samRecord.getSAMString()); } notifyDataEventListeners(dataLineException); } } return false; } /** * Add a filter to apply on the reads. Only the reads that pass all the filters will be extracted. * @param recordFilter */ public void removeFilter(SAMRecordFilter recordFilter) { recordFilters.remove(recordFilter); } @Override protected String retrieveDataName(File dataFile) { return dataFile.getName(); } /** * @return the name of the programs that processed the SAM file */ private String[] retrieveProgramNames() { SAMFileHeader header = samReader.getFileHeader(); List<SAMProgramRecord> programs = header.getProgramRecords(); if (programs == null) { return null; } String[] programNames = new String[programs.size()]; for (int i = 0; i < programs.size(); i++) { programNames[i] = programs.get(i).getProgramName(); } return programNames; } /** * @return the read groups of the SAM file extracted from the header */ private SAMReadGroupRecord[] retrieveReadGroups() { SAMFileHeader header = samReader.getFileHeader(); List<SAMReadGroupRecord> readGroups = header.getReadGroups(); if (readGroups == null) { return null; } SAMReadGroupRecord[] SAMReadGroupRecords = new SAMReadGroupRecord[readGroups.size()]; for (int i = 0; i < readGroups.size(); i++) { SAMReadGroupRecords[i] = readGroups.get(i); } return SAMReadGroupRecords; } @Override public void setFirstBasePosition(int firstBasePosition) { this.firstBasePosition = firstBasePosition; } /** * @param isPairedMode set to true to extract to set the extractor in pair end mode. * In pair end mode the extractor returns windows having the size of the fragments delimited by the pair ends. */ public void setPairedMode(boolean isPairedMode) { this.isPairedMode = isPairedMode; } @Override public void setStrandedExtractorOptions(StrandedExtractorOptions options) { strandOptions = options; } }