/* Copyright 2013 University of North Carolina at Chapel Hill. All rights reserved. */
package abra;
import static abra.Logger.log;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import picard.sam.BuildBamIndex;
import picard.sam.SortSam;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.ValidationStringency;
/**
* ABRA's main entry point
*
* @author Lisle E. Mose (lmose at unc dot edu)
*/
public class ReAligner {
private static final int DEFAULT_MAX_UNALIGNED_READS = 1000000;
public static final int MAX_REGION_LENGTH = 400;
private static final int MIN_REGION_REMAINDER = 200;
public static final int REGION_OVERLAP = 200;
private static final int MAX_POTENTIAL_UNALIGNED_CONTIGS = 2000000;
// Minimum sequence length recommended for use with bwa mem
private static final int MIN_CONTIG_LENGTH = 70;
// Cannot be larger than buffer in assembler.c
private static final int MAX_KMER_SIZE = 199;
private SAMFileHeader[] samHeaders;
private List<Feature> regions;
private String regionsBed;
private String tempDir;
private String unalignedRegionSam;
private String reference;
private String bwaIndex;
private int minContigMapq;
private AssemblerSettings assemblerSettings;
private int numThreads;
private int maxUnalignedReads = DEFAULT_MAX_UNALIGNED_READS;
private boolean shouldReprocessUnaligned = true;
private boolean isOutputIntermediateBam = false;
private String structuralVariantFile;
private String localRepeatFile;
private BufferedWriter localRepeatWriter;
private String[] inputSams;
private SAMFileWriter[] writers;
private String[] tempDirs;
private int readLength = -1;
private int maxMapq = -1;
private int minInsertLength = Integer.MAX_VALUE;
private int maxInsertLength = -1;
private boolean isPairedEnd = false;
private String rnaSam = null;
private String rnaOutputSam = null;
private SAMFileHeader rnaHeader = null;
private int rnaReadLength = -1;
private BufferedWriter contigWriter;
private BufferedWriter svContigWriter;
private CompareToReference2 c2r;
private ReadAdjuster readAdjuster;
private ThreadManager threadManager;
private boolean hasContigs = false;
private int minMappingQuality;
private boolean isDebug;
// If true, the input target file specifies kmer values
private boolean hasPresetKmers = false;
public static final int COMPRESSION_LEVEL = 1;
public void reAlign(String[] inputFiles, String[] outputFiles) throws Exception {
this.inputSams = inputFiles;
logStartupInfo(outputFiles);
init();
c2r = new CompareToReference2();
c2r.init(this.reference);
log("Reading Input SAM Header and identifying read length");
getSamHeaderAndReadLength();
log("Read length: " + readLength);
log("Loading target regions");
loadRegions();
readAdjuster = new ReadAdjuster(isPairedEnd, this.maxMapq, c2r, minInsertLength, maxInsertLength);
if (shouldReprocessUnaligned) {
// processUnaligned();
}
Clock clock = new Clock("Assembly");
clock.start();
String contigFasta = tempDir + "/" + "all_contigs.fasta";
contigWriter = new BufferedWriter(new FileWriter(contigFasta, false));
String svContigFasta = tempDir + "/" + "sv_contigs.fasta";
svContigWriter = new BufferedWriter(new FileWriter(svContigFasta, false));
tempDirs = new String[inputSams.length];
SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
// writerFactory.setUseAsyncIo(true);
// writerFactory.setAsyncOutputBufferSize(500000);
writerFactory.setUseAsyncIo(false);
writers = new SAMFileWriter[inputSams.length];
for (int i=0; i<inputSams.length; i++) {
// init temp dir
String temp = tempDir + "/temp" + (i+1);
mkdir(temp);
tempDirs[i] = temp;
// init BAM writer
writers[i] = writerFactory.makeBAMWriter(
samHeaders[i], false, new File(outputFiles[i]), COMPRESSION_LEVEL);
}
// Start pre-processing reads on separate thread for each input file.
// This happens in parallel with assembly, provided there are enough threads.
for (int i=0; i<inputSams.length; i++) {
preProcessReads(inputSams[i], tempDirs[i], writers[i]);
}
log("Iterating over regions");
int count = 0;
for (Feature region : regions) {
count += 1;
spawnRegionThread(region, null);
if ((count % 1000) == 0) {
System.err.println("Processing region: " + count + " of " + regions.size());
}
}
log("Waiting for all threads to complete");
threadManager.waitForAllThreadsToComplete();
contigWriter.close();
svContigWriter.close();
if (localRepeatWriter != null) {
localRepeatWriter.close();
}
clock.stopAndPrint();
String cleanContigsFasta = null;
if (hasContigs) {
clock = new Clock("Align and clean contigs");
clock.start();
cleanContigsFasta = alignAndCleanContigs(contigFasta, tempDir, true);
clock.stopAndPrint();
}
if (cleanContigsFasta != null) {
clock = new Clock("Align to contigs");
clock.start();
String[] alignedSams = alignReads(cleanContigsFasta, c2r);
clock.stopAndPrint();
for (SAMFileWriter writer : this.writers) {
writer.close();
}
if (rnaSam != null) {
processRna();
}
} else {
log("WARNING! No contigs assembled. Just making a copy of input converting to/from SAM/BAM as appropriate.");
for (int i=0; i<inputFiles.length; i++) {
copySam(inputFiles[i], outputFiles[i]);
}
}
if (this.assemblerSettings.searchForStructuralVariation() && this.isPairedEnd) {
clock = new Clock("Structural Variant search");
clock.start();
new SVEvaluator().evaluateAndOutput(svContigFasta, this, tempDir, readLength, inputFiles, tempDirs, samHeaders, structuralVariantFile);
clock.stopAndPrint();
}
System.err.println("Done.");
}
private void preProcessReads(String inputSam, String tempDir, SAMFileWriter writer) throws InterruptedException {
PreprocessReadsRunnable thread = new PreprocessReadsRunnable(threadManager, this,
inputSam, this.getTempReadFile(tempDir), c2r, writer);
threadManager.spawnThread(thread);
}
private void processRna() {
/*
if (rnaSam != null) {
String rnaTemp = tempDir + "/rna";
mkdir(rnaTemp);
getRnaSamHeaderAndReadLength(rnaSam);
rnaHeader.setSortOrder(SortOrder.coordinate);
clock = new Clock("RNA - Sam2Fastq and Align");
clock.start();
log("Aligning RNA to contigs");
String alignedToContigRna = alignReads(rnaTemp, rnaSam, cleanContigsFasta, c2r, rnaOutputSam);
clock.stopAndPrint();
clock = new Clock("Adjust reads");
clock.start();
log("Adjusting RNA reads");
adjustReads(alignedToContigRna, rnaOutputSam, true, c2r);
clock.stopAndPrint();
}
*/
}
private void logStartupInfo(String[] outputFiles) {
int ctr = 0;
for (String input : inputSams) {
System.err.println("input" + ctr + ": " + input);
}
ctr = 0;
for (String output : outputFiles) {
System.err.println("output" + ctr + ": " + output);
}
System.err.println("regions: " + regionsBed);
System.err.println("reference: " + reference);
System.err.println("bwa index: " + bwaIndex);
System.err.println("working dir: " + tempDir);
System.err.println("num threads: " + numThreads);
System.err.println("max unaligned reads: " + maxUnalignedReads);
System.err.println(assemblerSettings.getDescription());
System.err.println("rna: " + rnaSam);
System.err.println("rna output: " + rnaOutputSam);
System.err.println("paired end: " + isPairedEnd);
System.err.println("use intermediate bam: " + isOutputIntermediateBam);
String javaVersion = System.getProperty("java.version");
System.err.println("Java version: " + javaVersion);
if (javaVersion.startsWith("1.6") || javaVersion.startsWith("1.5") || javaVersion.startsWith("1.4")) {
throw new RuntimeException("Please upgrade to Java 7 or later to run ABRA.");
}
try {
InetAddress localhost = java.net.InetAddress.getLocalHost();
String hostname = localhost.getHostName();
System.err.println("hostname: " + hostname);
} catch (Throwable t) {
System.err.println("Error getting hostname: " + t.getMessage());
}
}
/*
private void processUnaligned() throws IOException, InterruptedException {
Clock clock = new Clock("Process unaligned");
clock.start();
log("Assembling unaligned reads");
String unalignedSam = tempDir + "/unaligned.bam";
unalignedSam = getUnalignedReads(unalignedSam);
// String unalignedSam = tempDir + "/" + "unaligned_to_contig.bam";
String unalignedDir = tempDir + "/unaligned";
String unalignedContigFasta = unalignedDir + "/unaligned_contigs.fasta";
unalignedRegionSam = unalignedDir + "/unaligned_region.bam";
String sortedUnalignedRegion = unalignedDir + "/sorted_unaligned_region.bam";
Assembler assem = newUnalignedAssembler(1);
List<String> unalignedSamList = new ArrayList<String>();
unalignedSamList.add(unalignedSam);
List<String> unalignedAssemblies = assem.assembleContigs(unalignedSamList, unalignedContigFasta, tempDir, null, "unaligned", false, this, null);
boolean hasContigs = unalignedAssemblies.size() > 0;
// Make eligible for GC
assem = null;
if (hasContigs) {
unalignedContigFasta = unalignedAssemblies.get(0);
String unalignedCleanContigsFasta = alignAndCleanContigs(unalignedContigFasta, unalignedDir, false);
if (unalignedCleanContigsFasta != null) {
// Build contig fasta index
log("Indexing contigs from unaligned region");
Aligner contigAligner = new Aligner(unalignedCleanContigsFasta, numThreads);
contigAligner.index();
log("Done Indexing contigs from unaligned region");
String alignedToContigSam = unalignedDir + "/" + "align_to_contig.bam";
alignReads(unalignedDir, unalignedSam, unalignedCleanContigsFasta, null, null, alignedToContigSam);
String alignedToContigBam = alignedToContigSam;
log("Adjusting unaligned reads");
SAMFileWriter unalignedWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(
samHeader, false, new File(unalignedRegionSam));
ReadAdjuster unalignedReadAdjuster = new ReadAdjuster(isPairedEnd, this.maxMapq, null, minInsertLength, maxInsertLength);
unalignedReadAdjuster.adjustReads(alignedToContigBam, unalignedWriter, false, unalignedDir, samHeader);
unalignedWriter.close();
sortBam(unalignedRegionSam, sortedUnalignedRegion, "coordinate");
unalignedRegionSam = sortedUnalignedRegion;
indexBam(unalignedRegionSam);
} else {
shouldReprocessUnaligned = false;
}
} else {
shouldReprocessUnaligned = false;
}
clock.stopAndPrint();
}
*/
private void copySam(String input, String output) {
SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
SAMFileReader reader = new SAMFileReader(new File(input));
reader.setValidationStringency(ValidationStringency.SILENT);
SAMFileWriter writer = writerFactory.makeSAMOrBAMWriter(
reader.getFileHeader(), false, new File(output));
for (SAMRecord read : reader) {
writer.addAlignment(read);
}
reader.close();
writer.close();
}
private String[] alignReads(String cleanContigsFasta, CompareToReference2 c2r) throws IOException, InterruptedException {
// Build contig fasta index
log("Indexing contigs");
Aligner contigAligner = new Aligner(cleanContigsFasta, numThreads);
contigAligner.index();
log("Contig indexing done");
String[] alignedToContigsSams = new String[inputSams.length];
for (int i=0; i<inputSams.length; i++) {
alignedToContigsSams[i] = tempDirs[i] + "/" + "align_to_contig.sam";
alignReads(tempDirs[i], inputSams[i], cleanContigsFasta, c2r, writers[i], alignedToContigsSams[i], samHeaders[i]);
}
return alignedToContigsSams;
}
public static int getNumIndelBases2(SAMRecord read) {
int numIndelBases = 0;
for (CigarElement element : read.getCigar().getCigarElements()) {
if (element.getOperator() == CigarOperator.D) {
numIndelBases += 1;
} else if (element.getOperator() == CigarOperator.I) {
numIndelBases += element.getLength();
}
}
return numIndelBases;
}
private void discardMisalignedContigs(String inputSam, String outputSam) {
SAMFileReader reader = new SAMFileReader(new File(inputSam));
reader.setValidationStringency(ValidationStringency.SILENT);
SAMFileWriter outputReadsBam = new SAMFileWriterFactory().makeSAMOrBAMWriter(
samHeaders[0], true, new File(outputSam));
for (SAMRecord contig : reader) {
String[] fields = contig.getReadName().split("_");
String regionChromosome = "";
// Loop through fields in case the chromosome name contains
// an underscore.
for (int i=0; i<fields.length-3; i++) {
regionChromosome += fields[i];
if (i+1 < fields.length-3) {
regionChromosome += "_";
}
}
int regionStart = Integer.parseInt(fields[fields.length-3]) - 1000;
int regionStop = Integer.parseInt(fields[fields.length-2]) + 1000;
if ((contig.getReferenceName().equals(regionChromosome)) &&
(contig.getAlignmentStart() >= regionStart) &&
(contig.getAlignmentEnd() <= regionStop)) {
outputReadsBam.addAlignment(contig);
}
}
outputReadsBam.close();
reader.close();
}
void alignStructuralVariantCandidates(String svContigFasta, String svContigsSam) throws InterruptedException, IOException {
Aligner aligner = new Aligner(reference, numThreads);
aligner.align(svContigFasta, svContigsSam, false);
}
private String alignAndCleanContigs(String contigFasta, String tempDir, boolean isTightAlignment) throws InterruptedException, IOException {
log("Aligning contigs");
Aligner aligner = new Aligner(bwaIndex, numThreads);
String contigsSam = tempDir + "/" + "all_contigs.sam";
aligner.align(contigFasta, contigsSam, false);
if (isTightAlignment) {
log("Discarding contigs aligned outside of region");
String allInRegionSam = tempDir + "/" + "all_contigs_in_region.sam";
discardMisalignedContigs(contigsSam, allInRegionSam);
contigsSam = allInRegionSam;
}
log("Processing chimeric reads");
CombineChimera3 cc = new CombineChimera3();
String contigsWithChim = tempDir + "/" + "all_contigs_chim.bam";
int slack = this.readLength / 3;
cc.combine(contigsSam, contigsWithChim, isTightAlignment ? slack : 0, c2r);
if (isTightAlignment) {
// Chop and clop...
log("Sorting and indexing for chopper clopper.");
String contigsWithChimSorted = tempDir + "/" + "all_contigs_chim_sorted.bam";
sortBam(contigsWithChim, contigsWithChimSorted, "coordinate");
indexBam(contigsWithChimSorted);
log("Chopper clopper start.");
ContigChopper chopper = new ContigChopper();
chopper.setC2R(c2r);
chopper.setReadLength(this.readLength);
String contigsWithChimChopped = tempDir + "/" + "all_contigs_chim_chopped.bam";
chopper.chopClopDrop(this.regions, contigsWithChimSorted, contigsWithChimChopped);
log("Chopper clopper done.");
contigsWithChim = contigsWithChimChopped;
chopper = null;
}
log("Cleaning contigs");
String cleanContigsFasta = tempDir + "/" + "clean_contigs.fasta";
boolean hasCleanContigs = cleanAndOutputContigs(contigsWithChim, cleanContigsFasta, isTightAlignment);
return hasCleanContigs ? cleanContigsFasta : null;
}
String alignReads(String tempDir, String inputSam, String cleanContigsFasta,
CompareToReference2 c2r, SAMFileWriter finalOutputSam, String alignedToContigSam,
SAMFileHeader header) throws InterruptedException, IOException {
log("Aligning original reads to contigs");
alignToContigs(tempDir, alignedToContigSam, cleanContigsFasta, finalOutputSam, header);
return alignedToContigSam;
}
private void indexBam(String bam) {
String[] args = new String[] {
"INPUT=" + bam,
"VALIDATION_STRINGENCY=SILENT"
};
int ret = new BuildBamIndex().instanceMain(args);
if (ret != 0) {
throw new RuntimeException("BuildBamIndex failed");
}
}
void sortBam(String input, String output, String sortOrder) {
String[] args = new String[] {
"INPUT=" + input,
"OUTPUT=" + output,
"VALIDATION_STRINGENCY=SILENT",
"SORT_ORDER=" + sortOrder,
"TMP_DIR=" + this.tempDir + "/sorttmp"
};
int ret = new SortSam().instanceMain(args);
if (ret != 0) {
throw new RuntimeException("SortSam failed");
}
}
private void spawnRegionThread(Feature region, String inputSam) throws InterruptedException {
ReAlignerRunnable thread = new ReAlignerRunnable(threadManager, this, region);
threadManager.spawnThread(thread);
}
private boolean shouldIncludeInUnalignedPile(SAMRecord read) {
boolean shouldInclude = false;
if (!read.getReadFailsVendorQualityCheckFlag()) {
if (read.getReadUnmappedFlag()) {
shouldInclude = true;
}
// For Stampy, if Cigar length > 4 and read is not ambiguous (mapq >= 4)
else if ((read.getCigarLength() > 4) && read.getMappingQuality() >= 4) {
shouldInclude = true;
}
}
return shouldInclude;
}
private synchronized void appendContigs(String contigs) throws IOException {
contigWriter.write(contigs);
hasContigs = true;
}
public void processRegion(Feature region) throws Exception {
if (isDebug) {
log("Processing region: " + region.getDescriptor());
}
try {
String contigsFasta = tempDir + "/" + region.getDescriptor() + "_contigs.fasta";
List<String> bams = new ArrayList<String>(Arrays.asList(this.inputSams));
if (shouldReprocessUnaligned) {
bams.add(unalignedRegionSam);
}
// Assemble contigs
if (region.getKmer() > this.readLength-15) {
System.err.println("Skipping assembly of region: " + region.getDescriptor() + " - " + region.getKmer());
} else {
NativeAssembler assem = (NativeAssembler) newAssembler(region);
List<Feature> regions = new ArrayList<Feature>();
regions.add(region);
String contigs = assem.assembleContigs(bams, contigsFasta, tempDir, regions, region.getDescriptor(), true, this, c2r);
if (!contigs.equals("<ERROR>") && !contigs.equals("<REPEAT>") && !contigs.isEmpty()) {
appendContigs(contigs);
List<BreakpointCandidate> svCandidates = assem.getSvCandidateRegions();
for (BreakpointCandidate svCandidate : svCandidates) {
if (isDebug) {
System.err.println("SV: " + region.getDescriptor() + "-->" + svCandidate.getRegion().getDescriptor());
}
List<Feature> svRegions = new ArrayList<Feature>();
svRegions.add(region);
Feature svCandidateRegion = new Feature(svCandidate.getRegion().getSeqname(), svCandidate.getRegion().getStart(),
Math.min(svCandidate.getRegion().getEnd(), c2r.getReferenceLength(svCandidate.getRegion().getSeqname())-1));
svRegions.add(svCandidateRegion);
NativeAssembler svAssem = (NativeAssembler) newAssembler(region);
String svContigs = svAssem.assembleContigs(bams, contigsFasta, tempDir, svRegions, region.getDescriptor() + "__" + svCandidate.getRegion().getDescriptor() + "_" + svCandidate.getSpanningReadPairCount(), true, this, c2r);
if (!svContigs.equals("<ERROR>") && !svContigs.equals("<REPEAT>") && !svContigs.isEmpty()) {
svContigWriter.write(svContigs);
}
}
}
if (assem.isCycleExceedingThresholdDetected() && (bams.size() > 1) && this.localRepeatWriter != null) {
System.err.println("Attempting cycle detection for: " + region.getDescriptor());
// Assemble each region separately looking for cycles
List<String> cycleStatus = new ArrayList<String>();
int kmer = readLength / 2;
if (kmer % 2 == 1) {
kmer -= 1;
}
kmer = Math.min(kmer, NativeAssembler.CYCLE_KMER_LENGTH_THRESHOLD);
kmer = Math.max(kmer, region.getKmer());
for (String bam : bams) {
List<String> bamInput = new ArrayList<String>();
bamInput.add(bam);
NativeAssembler cycleAssem = (NativeAssembler) newAssembler(region);
cycleAssem.setKmer(new int[] { kmer });
cycleAssem.setShouldSearchForSv(false);
// Double pruning thresholds for repeat discovery
cycleAssem.setMinBaseQuality(assemblerSettings.getMinBaseQuality() * 2);
cycleAssem.setMinKmerFrequency(assemblerSettings.getMinNodeFrequncy() * 2);
cycleAssem.setMinEdgeRatio(assemblerSettings.getMinEdgeRatio());
String cycleContigs = cycleAssem.assembleContigs(bamInput, contigsFasta, tempDir, regions, region.getDescriptor(), true, this, c2r);
if (!cycleContigs.equals("<ERROR>") && !cycleContigs.equals("<REPEAT>")) {
cycleContigs = ".";
}
cycleStatus.add(cycleContigs);
}
StringBuffer buf = new StringBuffer("Cycle detection result");
for (String status : cycleStatus) {
buf.append(status + "\t");
}
System.err.println("Cycle detection for region: " + region + ". Result: [" + buf.toString() + "]");
if (isAnyElementDifferent(cycleStatus)) {
StringBuffer out = new StringBuffer();
out.append(region.getDescriptor() + "\t");
for (String status : cycleStatus) {
out.append(status + "\t");
}
out.append(kmer);
out.append('\n');
synchronized (localRepeatWriter) {
localRepeatWriter.write(out.toString());
}
}
}
}
}
catch (Exception e) {
e.printStackTrace();
throw e;
}
}
private boolean isAnyElementDifferent(List<String> elems) {
String last = null;
for (String elem : elems) {
if (last != null && !elem.equals(last)) {
return true;
}
last = elem;
}
return false;
}
static List<Feature> getRegions(String regionsBed, int readLength, boolean hasPresetKmers) throws IOException {
RegionLoader loader = new RegionLoader();
List<Feature> regions = loader.load(regionsBed, hasPresetKmers);
if (regions.size() > 0 && (regions.get(0).getKmer() == 0)) {
regions = RegionLoader.collapseRegions(regions, readLength);
regions = splitRegions(regions);
}
return regions;
}
private void loadRegions() throws IOException {
this.regions = getRegions(regionsBed, readLength, hasPresetKmers);
System.err.println("Num regions: " + regions.size());
if (isDebug) {
for (Feature region : regions) {
System.err.println(region.getSeqname() + "\t" + region.getStart() + "\t" + region.getEnd() + "\t" + region.getKmer());
}
}
}
public void setRegionsBed(String bedFile) {
this.regionsBed = bedFile;
}
private void getSamHeaderAndReadLength() {
log("Identifying header and determining read length");
this.samHeaders = new SAMFileHeader[this.inputSams.length];
for (int i=0; i<this.inputSams.length; i++) {
SAMFileReader reader = new SAMFileReader(new File(inputSams[i]));
try {
reader.setValidationStringency(ValidationStringency.SILENT);
samHeaders[i] = reader.getFileHeader();
samHeaders[i].setSortOrder(SAMFileHeader.SortOrder.unsorted);
Iterator<SAMRecord> iter = reader.iterator();
int cnt = 0;
while ((iter.hasNext()) && (cnt < 1000000)) {
SAMRecord read = iter.next();
this.readLength = Math.max(this.readLength, read.getReadLength());
this.maxMapq = Math.max(this.maxMapq, read.getMappingQuality());
// Assumes aligner sets proper pair flag correctly
if ((isPairedEnd) && (read.getReadPairedFlag()) && (read.getProperPairFlag())) {
this.minInsertLength = Math.min(this.minInsertLength, Math.abs(read.getInferredInsertSize()));
this.maxInsertLength = Math.max(this.maxInsertLength, Math.abs(read.getInferredInsertSize()));
}
}
// Allow some fudge in insert length
minInsertLength = Math.max(minInsertLength - 2*readLength, 0);
maxInsertLength = maxInsertLength + 2*readLength;
} finally {
reader.close();
}
}
System.err.println("Min insert length: " + minInsertLength);
System.err.println("Max insert length: " + maxInsertLength);
log("Max read length is: " + readLength);
if (assemblerSettings.getMinContigLength() < 1) {
assemblerSettings.setMinContigLength(Math.max(readLength+1, MIN_CONTIG_LENGTH));
}
log("Min contig length: " + assemblerSettings.getMinContigLength());
}
//TODO: Dedup with getSamHeaderAndReadLength
private void getRnaSamHeaderAndReadLength(String inputSam) {
log("Identifying RNA header and determining read length");
SAMFileReader reader = new SAMFileReader(new File(inputSam));
try {
reader.setValidationStringency(ValidationStringency.SILENT);
rnaHeader = reader.getFileHeader();
rnaHeader.setSortOrder(SAMFileHeader.SortOrder.unsorted);
Iterator<SAMRecord> iter = reader.iterator();
int cnt = 0;
while ((iter.hasNext()) && (cnt < 1000000)) {
SAMRecord read = iter.next();
this.rnaReadLength = Math.max(this.rnaReadLength, read.getReadLength());
}
} finally {
reader.close();
}
log("Max RNA read length is: " + rnaReadLength);
}
void sam2Fastq(String bam, String intermediateOutput, CompareToReference2 c2r, SAMFileWriter finalOutputSam) throws IOException {
log("Preprocessing: " + bam);
Sam2Fastq sam2Fastq = new Sam2Fastq();
sam2Fastq.convert(bam, intermediateOutput, c2r, finalOutputSam, isPairedEnd, regions, minMappingQuality, isOutputIntermediateBam);
log("Done Preprocessing: " + bam);
}
private boolean cleanAndOutputContigs(String contigsSam, String cleanContigsFasta, boolean shouldRemoveSoftClips) throws IOException {
boolean hasCleanContigs = false;
BufferedWriter writer = new BufferedWriter(new FileWriter(cleanContigsFasta, false));
SAMFileReader contigReader = new SAMFileReader(new File(contigsSam));
contigReader.setValidationStringency(ValidationStringency.SILENT);
int contigCount = 0;
for (SAMRecord contigRead : contigReader) {
if (contigRead.getMappingQuality() >= this.minContigMapq) {
SAMRecordUtils.removeSoftClips(contigRead);
String bases = contigRead.getReadString();
//TODO: Why would cigar length be zero here?
if ((bases.length() >= assemblerSettings.getMinContigLength()) &&
(contigRead.getCigarLength() > 0)) {
CigarElement first = contigRead.getCigar().getCigarElement(0);
CigarElement last = contigRead.getCigar().getCigarElement(contigRead.getCigarLength()-1);
if ((first.getOperator() == CigarOperator.M) &&
(last.getOperator() == CigarOperator.M)) {
String prefix = "";
String suffix = "";
if (!contigRead.getReferenceName().startsWith("uc0")) {
// Pull in read length bases from reference to the beginning and end of the contig.
prefix = c2r.getSequence(contigRead.getReferenceName(),
contigRead.getAlignmentStart()-readLength, readLength);
suffix = c2r.getSequence(contigRead.getReferenceName(), contigRead.getAlignmentEnd()+1, readLength);
bases = prefix.toUpperCase() + bases + suffix.toUpperCase();
}
Cigar cigar = new Cigar();
if (contigRead.getCigarLength() == 1) {
CigarElement elem = new CigarElement(first.getLength() + prefix.length() + suffix.length(), first.getOperator());
cigar.add(elem);
} else {
CigarElement firstNew = new CigarElement(first.getLength() + prefix.length(), first.getOperator());
CigarElement lastNew = new CigarElement(last.getLength() + suffix.length(), last.getOperator());
cigar.add(firstNew);
for (int i=1; i<contigRead.getCigarLength()-1; i++) {
cigar.add(contigRead.getCigar().getCigarElement(i));
}
cigar.add(lastNew);
}
contigRead.setCigar(cigar);
contigRead.setAlignmentStart(contigRead.getAlignmentStart()-prefix.length());
} else {
if (isDebug) {
System.err.println("Not padding contig: " + contigRead.getReadName());
}
}
//TODO: Safer delimiter? This assumes no ~ in any read
contigRead.setReadString("");
String contigReadStr = contigRead.getSAMString();
contigReadStr = contigReadStr.replace('\t','~');
String contigName = contigRead.getReadName() + "_" + contigCount++ + "~" + contigReadStr;
writer.append(">" + contigName);
writer.append(bases);
writer.append("\n");
hasCleanContigs = true;
}
}
}
contigReader.close();
writer.close();
return hasCleanContigs;
}
private String getPreprocessedBam(String tempDir) {
return tempDir + "/" + "original_reads.bam";
}
private String getProprocessedFastq(String tempDir) {
return tempDir + "/" + "original_reads.fastq.gz";
}
private String getTempReadFile(String tempDir) {
if (isOutputIntermediateBam) {
return getPreprocessedBam(tempDir);
} else {
return getProprocessedFastq(tempDir);
}
}
SVReadCounter alignToSVContigs(String tempDir, String alignedToContigSam,
String contigFasta, SAMFileWriter writer, SAMFileHeader header) throws IOException, InterruptedException {
SVAlignerStdoutHandler stdoutHandler = new SVAlignerStdoutHandler(readLength, header);
alignToContigs(tempDir, alignedToContigSam, contigFasta, writer, header, stdoutHandler);
return stdoutHandler.getCounter();
}
void alignToContigs(String tempDir, String alignedToContigSam,
String contigFasta, SAMFileWriter writer, SAMFileHeader header) throws IOException, InterruptedException {
MutableBoolean isDone = new MutableBoolean();
AdjustReadsQueueRunnable readQueueRunnable = new AdjustReadsQueueRunnable(threadManager, readAdjuster,
writer, true, tempDir, header, isDone);
AlignerStdoutHandler stdoutHandler = new AlignerStdoutHandler(readQueueRunnable);
alignToContigs(tempDir, alignedToContigSam, contigFasta, writer, header, stdoutHandler);
}
void alignToContigs(String tempDir, String alignedToContigSam,
String contigFasta, SAMFileWriter writer, SAMFileHeader header, StdoutHandler stdoutHandler) throws IOException, InterruptedException {
String bam = getTempReadFile(tempDir);
Aligner contigAligner = new Aligner(contigFasta, numThreads);
// Align region fastq against assembled contigs
contigAligner.shortAlign(bam, alignedToContigSam, stdoutHandler, isOutputIntermediateBam);
}
static class Pair<T, Y> {
private T t;
private Y y;
public Pair(T t, Y y) {
this.t = t;
this.y = y;
}
public T getFirst() {
return t;
}
public Y getSecond() {
return y;
}
}
static List<Feature> splitRegions(List<Feature> regions,
int maxRegionLength, int minRegionRemainder, int regionOverlap) {
List<Feature> splitRegions = new ArrayList<Feature>();
for (Feature region : regions) {
if (region.getLength() <= maxRegionLength + minRegionRemainder) {
splitRegions.add(region);
} else {
splitRegions.addAll(splitWithOverlap(region, maxRegionLength, minRegionRemainder, regionOverlap));
}
}
return splitRegions;
}
/**
* If any of the input list of features is greater than maxSize, split them into multiple features.
*/
public static List<Feature> splitRegions(List<Feature> regions) {
return splitRegions(regions, MAX_REGION_LENGTH, MIN_REGION_REMAINDER, REGION_OVERLAP);
}
public static List<Feature> splitWithOverlap(Feature region) {
return splitWithOverlap(region, MAX_REGION_LENGTH, MIN_REGION_REMAINDER, REGION_OVERLAP);
}
static List<Feature> splitWithOverlap(Feature region, int maxRegionLength,
int minRegionRemainder, int regionOverlap) {
List<Feature> regions = new ArrayList<Feature>();
long pos = region.getStart();
long end = pos-1;
while (end < region.getEnd()) {
long start = pos;
end = pos + maxRegionLength;
long marker = end;
// If we're at or near the end of the region, stop at region end.
if (end > (region.getEnd() - minRegionRemainder)) {
end = region.getEnd();
}
pos = marker - regionOverlap;
regions.add(new Feature(region.getSeqname(), start, end));
}
return regions;
}
int[] getKmers(Feature region) {
int[] kmerSizes = null;
int kmerSize = region.getKmer();
if (kmerSize > 0) {
kmerSizes = toKmerArray(kmerSize, readLength);
} else {
kmerSizes = assemblerSettings.getKmerSize();
}
return kmerSizes;
}
int[] toKmerArray(int kmerSize, int readLength) {
int[] kmerSizes = null;
int maxKmerSize = this.readLength-15;
if (maxKmerSize > MAX_KMER_SIZE) {
maxKmerSize = MAX_KMER_SIZE;
}
List<Integer> kmers = new ArrayList<Integer>();
while (kmerSize < maxKmerSize) {
kmers.add(kmerSize);
kmerSize += 2;
}
kmerSizes = new int[kmers.size()];
int i=0;
for (int kmer : kmers) {
kmerSizes[i++] = kmer;
}
return kmerSizes;
}
private NativeAssembler newAssembler(Feature region) {
NativeAssembler assem = new NativeAssembler();
assem.setTruncateOutputOnRepeat(true);
assem.setMaxContigs(assemblerSettings
.getMaxPotentialContigs());
assem.setMaxPathsFromRoot(100000);
assem.setReadLength(readLength);
//assem.setKmer(assemblerSettings.getKmerSize());
assem.setKmer(getKmers(region));
assem.setMinKmerFrequency(assemblerSettings.getMinNodeFrequncy());
assem.setMinEdgeRatio(assemblerSettings.getMinEdgeRatio());
assem.setMinBaseQuality(assemblerSettings.getMinBaseQuality());
assem.setMaxNodes(assemblerSettings.getMaxNodes());
assem.setMinReadCandidateFraction(assemblerSettings.getMinReadCandidateFraction());
assem.setMaxAverageDepth(assemblerSettings.getMaxAverageDepth());
assem.setShouldSearchForSv(this.isPairedEnd && assemblerSettings.searchForStructuralVariation());
assem.setAverageDepthCeiling(assemblerSettings.getAverageDepthCeiling());
assem.setDebug(assemblerSettings.isDebug());
return assem;
}
/*
private NativeAssembler newUnalignedAssembler(int mnfMultiplier) {
//Assembler assem = new JavaAssembler();
NativeAssembler assem = new NativeAssembler();
assem.setMaxContigs(MAX_POTENTIAL_UNALIGNED_CONTIGS);
assem.setTruncateOutputOnRepeat(false);
assem.setMaxPathsFromRoot(5000);
assem.setReadLength(readLength);
// Could be smaller for higher sensitivity here?
int[] unalignedKmer = new int[1];
unalignedKmer[0] = assemblerSettings.getKmerSize()[0];
assem.setKmer(unalignedKmer);
assem.setMinKmerFrequency(assemblerSettings.getMinUnalignedNodeFrequency());
assem.setMinBaseQuality(assemblerSettings.getMinBaseQuality());
return assem;
}*/
private void init() throws IOException {
String javaVersion = System.getProperty("java.version");
File workingDir = new File(tempDir);
if (workingDir.exists()) {
if (!workingDir.delete()) {
throw new IllegalStateException("Unable to delete: " + tempDir);
}
}
if (!workingDir.mkdir()) {
throw new IllegalStateException("Unable to create: " + tempDir);
}
File unalignedTempDir = new File(tempDir + "/unaligned");
if (!unalignedTempDir.mkdir()) {
throw new IllegalStateException("Unable to create: " + tempDir + "/unaligned");
}
new NativeLibraryLoader().load(tempDir);
threadManager = new ThreadManager(numThreads);
if (this.localRepeatFile != null) {
localRepeatWriter = new BufferedWriter(new FileWriter(localRepeatFile, false));
}
}
private void mkdir(String dir) {
File directory = new File(dir);
if (!directory.mkdir()) {
throw new IllegalStateException("Unable to create: " + dir);
}
}
public void setReference(String reference) {
this.reference = reference;
}
public void setBwaIndex(String bwaIndex) {
this.bwaIndex = bwaIndex;
}
public void setTempDir(String temp) {
this.tempDir = temp;
}
public void setAssemblerSettings(AssemblerSettings settings) {
this.assemblerSettings = settings;
}
public void setNumThreads(int numThreads) {
this.numThreads = numThreads;
}
public void setMinContigMapq(int minContigMapq) {
this.minContigMapq = minContigMapq;
}
public void setShouldReprocessUnaligned(boolean shouldReprocessUnaligned) {
this.shouldReprocessUnaligned = shouldReprocessUnaligned;
}
public void setMaxUnalignedReads(int maxUnalignedReads) {
this.maxUnalignedReads = maxUnalignedReads;
}
public CompareToReference2 getC2r() {
return this.c2r;
}
public int getMinMappingQuality() {
return this.minMappingQuality;
}
public int getMaxInsertLength() {
return this.maxInsertLength;
}
public int getMinInsertLength() {
return this.minInsertLength;
}
public void setMaxInsertLength(int maxInsertLen) {
this.maxInsertLength = maxInsertLen;
}
public void setMinInsertLength(int minInsertLen) {
this.minInsertLength = minInsertLen;
}
boolean isFiltered(SAMRecord read) {
return SAMRecordUtils.isFiltered(isPairedEnd, read);
}
public static void run(String[] args) throws Exception {
System.err.println("Starting 0.97 ...");
ReAlignerOptions options = new ReAlignerOptions();
options.parseOptions(args);
if (options.isValid()) {
AssemblerSettings assemblerSettings = new AssemblerSettings();
assemblerSettings.setKmerSize(options.getKmerSizes());
assemblerSettings.setMinContigLength(options.getMinContigLength());
assemblerSettings.setMinNodeFrequncy(options.getMinNodeFrequency());
assemblerSettings.setMaxPotentialContigs(options
.getMaxPotentialContigs());
assemblerSettings.setMinUnalignedNodeFrequency(options.getMinUnalignedNodeFrequency());
assemblerSettings.setMinBaseQuality(options.getMinBaseQuality());
assemblerSettings.setMinReadCandidateFraction(options.getMinReadCandidateFraction());
assemblerSettings.setMaxAverageDepth(options.getMaxAverageRegionDepth());
assemblerSettings.setSearchForStructuralVariation(options.shouldSearchForStructuralVariation());
assemblerSettings.setAverageDepthCeiling(options.getAverageDepthCeiling());
assemblerSettings.setMinEdgeRatio(options.getMinEdgeRatio());
assemblerSettings.setDebug(options.isDebug());
assemblerSettings.setMaxNodes(options.getMaxNodes());
ReAligner realigner = new ReAligner();
realigner.setReference(options.getReference());
realigner.setBwaIndex(options.getBwaIndex());
realigner.setRegionsBed(options.getTargetRegionFile());
realigner.setTempDir(options.getWorkingDir());
realigner.setAssemblerSettings(assemblerSettings);
realigner.setNumThreads(options.getNumThreads());
realigner.setMinContigMapq(options.getMinContigMapq());
realigner.setShouldReprocessUnaligned(!options.isSkipUnalignedAssembly());
realigner.setMaxUnalignedReads(options.getMaxUnalignedReads());
realigner.isPairedEnd = options.isPairedEnd();
realigner.rnaSam = options.getRnaSam();
realigner.rnaOutputSam = options.getRnaSamOutput();
realigner.structuralVariantFile = options.getStructuralVariantFile();
realigner.localRepeatFile = options.getLocalRepeatFile();
realigner.minMappingQuality = options.getMinimumMappingQuality();
realigner.hasPresetKmers = options.hasPresetKmers();
realigner.isOutputIntermediateBam = options.useIntermediateBam();
realigner.isDebug = options.isDebug();
long s = System.currentTimeMillis();
realigner.reAlign(options.getInputFiles(), options.getOutputFiles());
long e = System.currentTimeMillis();
System.err.println("Elapsed seconds: " + (e - s) / 1000);
} else {
System.exit(-1);
}
}
public static void main(String[] args) throws Exception {
// String inp = "--in /home/lmose/dev/ayc/opt/mem/test_tumor.bam --kmer 43 --mc-mapq 25 --mcl 101 --mcr -1.0 --mnf 2 --umnf 2 --mpc 50000 --out /home/lmose/dev/ayc/opt/mem/test_tumor.abra.bam --ref /home/lmose/reference/test/test.fa --targets /home/lmose/dev/ayc/opt/mem/test.gtf --threads 2 --working /home/lmose/dev/ayc/opt/mem/work1 --mur 50000000 --no-unalign --mbq 20 --rcf .02";
String inp = "--in /home/lmose/dev/ayc/opt/mem/test_tumor.bam --kmer 43 --out /home/lmose/dev/ayc/opt/mem/test_tumor.abra3.bam --ref /home/lmose/reference/test/test.fa --targets /home/lmose/dev/ayc/opt/mem/test2.bed --threads 2 --working /home/lmose/dev/ayc/opt/mem/work3";
run(inp.split("\\s+"));
}
}