/* Copyright 2013 University of North Carolina at Chapel Hill. All rights reserved. */ package abra; import java.io.File; import java.io.IOException; import java.util.List; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.ValidationStringency; /** * Responsible for creating fastq files for reads of interest to be realigned. * Those reads that are not eligible for realignment are written directly to the output BAM file. * * @author Lisle Mose (lmose at unc dot edu) */ public class Sam2Fastq { public static final String FIELD_DELIMITER = "~|"; private static final int MAX_SAM_READ_NAME_LENGTH = 255; public static final int MIN_OFF_TARGET_MAPQ = 30; public static int COMPRESSION_LEVEL = 1; private FastqOutputFile output1; private ReverseComplementor reverseComplementor = new ReverseComplementor(); private boolean shouldIdentifyEndByReadId = false; private String end1Suffix; private String end2Suffix; private RegionTracker regionTracker; /** * Convert the input SAM/BAM file into a single fastq file. * Input SAM files that contain multiple mappings should be sorted by read name. */ public void convert(String inputSam, String outputFile, CompareToReference2 c2r, SAMFileWriter writer, boolean isPairedEnd, List<Feature> regions, int minMappingQuality, boolean isBamOutput) throws IOException { System.err.println("sam: " + inputSam); SAMFileReader reader = new SAMFileReader(new File(inputSam)); reader.setValidationStringency(ValidationStringency.SILENT); SAMFileWriter toProcessWriter = null; if (isBamOutput) { SAMFileWriterFactory writerFactory = new SAMFileWriterFactory(); SAMFileHeader header = reader.getFileHeader(); header.setSortOrder(SortOrder.unsorted); toProcessWriter = writerFactory.makeBAMWriter( header, false, new File(outputFile), COMPRESSION_LEVEL); } else { output1 = new FastqOutputFile(); output1.init(outputFile); } int lineCnt = 0; this.regionTracker = new RegionTracker(regions, reader.getFileHeader()); for (SAMRecord read : reader) { if (!SAMRecordUtils.isPrimary(read)) { // Write secondary / supplemental reads directly to the output BAM file. // TODO: If primary is realigned, perhaps this should be handled differently? if (writer != null) { writer.addAlignment(read); } } else if (!SAMRecordUtils.isFiltered(isPairedEnd, read)) { // These tags can be lengthy, so remove them. // TODO: Improve the way this is handled read.setAttribute("XA", null); read.setAttribute("OQ", null); read.setAttribute("MD", null); read.setAttribute("BQ", null); read.setAttribute("BI", null); read.setAttribute("BD", null); int yx = 0; boolean isAmbiguous = !read.getReadUnmappedFlag() && read.getMappingQuality() == 0; if ((!read.getReadFailsVendorQualityCheckFlag()) && (!isAmbiguous)) { // Calculate the number of mismatches to reference for this read. if (c2r != null) { try { yx = SAMRecordUtils.getEditDistance(read, c2r); } catch (ArrayIndexOutOfBoundsException e) { System.err.println("Index error for read: " + read.getSAMString()); throw e; } } else { yx = read.getReadLength(); } read.setAttribute("YX", yx); } boolean offTargetFiltered = false; if (yx > 0 && !read.getReadUnmappedFlag() && read.getMappingQuality() < MIN_OFF_TARGET_MAPQ && !regionTracker.isInRegion(read)) { read.setAttribute("YR", 2); offTargetFiltered = true; // System.out.println("Filtering off target: " + read.getSAMString()); } if ((yx > 0 && !offTargetFiltered && read.getMappingQuality() >= minMappingQuality) || (read.getReadUnmappedFlag())) { if ((!read.getReadUnmappedFlag()) && (!regionTracker.isInRegion(read))) { read.setAttribute("YR", 1); } // SA tag causes read info to not fit into read name, remove it for now. read.setAttribute("SA", null); try { if (isBamOutput) { toProcessWriter.addAlignment(samReadToUnmappedSam(read)); } else { output1.write(samReadToFastqRecord(read)); } } catch (IllegalArgumentException e) { System.err.println("Error on: " + read.getSAMString()); e.printStackTrace(); throw e; } } else if (writer != null) { // Either xactly matches reference or failed vendor QC or low mapq, so // output directly to final BAM writer.addAlignment(read); } } lineCnt++; if ((lineCnt % 1000000) == 0) { System.err.println("record: " + lineCnt); } } if (isBamOutput) { toProcessWriter.close(); } else { output1.close(); } reader.close(); } private SAMRecord samReadToUnmappedSam(SAMRecord read) { String bases = read.getReadString(); String qualities = read.getBaseQualityString(); if (read.getReadNegativeStrandFlag()) { bases = reverseComplementor.reverseComplement(bases); qualities = reverseComplementor.reverse(qualities); } read.setReadString(""); read.setBaseQualityString(""); String readStr = read.getSAMString(); readStr = readStr.replace("\t", FIELD_DELIMITER); // Trim newline if applicable if (readStr.charAt(readStr.length()-1) == '\n') { readStr = readStr.substring(0, readStr.length()-1); } String readName = readStr; read.setReadName(readName); read.setReadString(bases); read.setBaseQualityString(qualities); read.setFlags(0); read.clearAttributes(); read.setReadUnmappedFlag(true); read.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); read.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); read.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); return read; } private FastqRecord samReadToFastqRecord(SAMRecord read) { String bases = read.getReadString(); String qualities = read.getBaseQualityString(); if (read.getReadNegativeStrandFlag()) { bases = reverseComplementor.reverseComplement(bases); qualities = reverseComplementor.reverse(qualities); } read.setReadString(""); read.setBaseQualityString(""); String readStr = read.getSAMString(); readStr = readStr.replace("\t", FIELD_DELIMITER); // Trim newline if applicable if (readStr.charAt(readStr.length()-1) == '\n') { readStr = readStr.substring(0, readStr.length()-1); } String readName = readStr; FastqRecord fastq = new FastqRecord("@" + readName, bases, qualities); return fastq; } private boolean isFusion(SAMRecord read) { return read.getAttribute("ZF") != null; } private boolean isFirstInPair(SAMRecord read) { boolean isFirstInPair; if (shouldIdentifyEndByReadId) { isFirstInPair = read.getReadName().endsWith(end1Suffix); } else { isFirstInPair = read.getFirstOfPairFlag(); } return isFirstInPair; } private boolean isSecondInPair(SAMRecord read) { boolean isSecondInPair; if (shouldIdentifyEndByReadId) { isSecondInPair = read.getReadName().endsWith(end2Suffix); } else { isSecondInPair = read.getSecondOfPairFlag(); } return isSecondInPair; } public void setEndSuffixes(String end1Suffix, String end2Suffix) { this.shouldIdentifyEndByReadId = true; this.end1Suffix = end1Suffix; this.end2Suffix = end2Suffix; } public static void main(String[] args) throws Exception { /* String inputSam = "/home/lmose/dev/abra/s2fq_test/chr22.bam"; String outputSam = "/home/lmose/dev/abra/s2fq_test/chr22_out.bam"; String bed = "/home/lmose/dev/abra/s2fq_test/chr22.bed"; //String tempReadFile = "/home/lmose/dev/abra/s2fq_test/chr22.fastq.gz"; String tempReadFile = "/home/lmose/dev/abra/s2fq_test/chr22.temp.bam"; String ref = "/home/lmose/reference/chr22/chr22.fa"; */ String inputSam = args[0]; String outputSam = args[1]; String bed = args[2]; String tempReadFile = args[3]; String ref = args[4]; COMPRESSION_LEVEL = Integer.parseInt(args[5]); SAMFileReader reader = new SAMFileReader(new File(inputSam)); SAMFileHeader header = reader.getFileHeader(); reader.close(); CompareToReference2 c2r = new CompareToReference2(); c2r.init(ref); SAMFileWriterFactory writerFactory = new SAMFileWriterFactory(); header.setSortOrder(SortOrder.unsorted); SAMFileWriter writer = writerFactory.makeSAMOrBAMWriter( header, false, new File(outputSam)); Sam2Fastq s2f = new Sam2Fastq(); RegionLoader loader = new RegionLoader(); List<Feature> regions = loader.load(bed, false); long s = System.currentTimeMillis(); s2f.convert(inputSam, tempReadFile, c2r, writer, false, regions, 20, true); long e = System.currentTimeMillis(); writer.close(); System.err.println("Elapsed: " + (e-s)/1000); } }