/* * Copyright 2015 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.hpg.bigdata.app.cli.local; import htsjdk.samtools.*; import htsjdk.samtools.util.LineReader; import htsjdk.samtools.util.StringLineReader; import org.apache.avro.file.DataFileStream; import org.apache.avro.specific.SpecificDatumReader; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; import org.apache.spark.sql.SparkSession; import org.ga4gh.models.ReadAlignment; import org.opencb.biodata.models.alignment.RegionCoverage; import org.opencb.biodata.models.core.Region; import org.opencb.biodata.tools.alignment.AlignmentOptions; import org.opencb.biodata.tools.alignment.BamManager; import org.opencb.biodata.tools.alignment.BamUtils; import org.opencb.biodata.tools.alignment.stats.AlignmentGlobalStats; import org.opencb.commons.utils.FileUtils; import org.opencb.hpg.bigdata.app.cli.CommandExecutor; import org.opencb.hpg.bigdata.core.avro.AlignmentAvroSerializer; import org.opencb.hpg.bigdata.core.converters.SAMRecord2ReadAlignmentConverter; import org.opencb.hpg.bigdata.core.lib.AlignmentDataset; import org.opencb.hpg.bigdata.core.lib.SparkConfCreator; import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Iterator; import java.util.List; /** * Created by imedina on 16/03/15. */ public class AlignmentCommandExecutor extends CommandExecutor { private LocalCliOptionsParser.AlignmentCommandOptions alignmentCommandOptions; public static final String BAM_HEADER_SUFFIX = ".header"; public AlignmentCommandExecutor(LocalCliOptionsParser.AlignmentCommandOptions alignmentCommandOptions) { this.alignmentCommandOptions = alignmentCommandOptions; } /* * Parse specific 'alignment' command options */ public void execute() throws Exception { String subCommand = alignmentCommandOptions.getParsedSubCommand(); switch (subCommand) { case "convert": init(alignmentCommandOptions.convertAlignmentCommandOptions.commonOptions.logLevel, alignmentCommandOptions.convertAlignmentCommandOptions.commonOptions.verbose, alignmentCommandOptions.convertAlignmentCommandOptions.commonOptions.conf); convert(); break; case "stats": init(alignmentCommandOptions.statsAlignmentCommandOptions.commonOptions.logLevel, alignmentCommandOptions.statsAlignmentCommandOptions.commonOptions.verbose, alignmentCommandOptions.statsAlignmentCommandOptions.commonOptions.conf); stats(); break; case "coverage": init(alignmentCommandOptions.coverageAlignmentCommandOptions.commonOptions.logLevel, alignmentCommandOptions.coverageAlignmentCommandOptions.commonOptions.verbose, alignmentCommandOptions.coverageAlignmentCommandOptions.commonOptions.conf); coverage(); break; case "query": init(alignmentCommandOptions.queryAlignmentCommandOptions.commonOptions.logLevel, alignmentCommandOptions.queryAlignmentCommandOptions.commonOptions.verbose, alignmentCommandOptions.queryAlignmentCommandOptions.commonOptions.conf); query(); break; /* case "align": System.out.println("Sub-command 'align': Not yet implemented for the command 'alignment' !"); break; */ default: break; } } private void convert() throws IOException { String input = alignmentCommandOptions.convertAlignmentCommandOptions.input; String output = alignmentCommandOptions.convertAlignmentCommandOptions.output; String compressionCodecName = alignmentCommandOptions.convertAlignmentCommandOptions.compression; // sanity check if (compressionCodecName.equals("null")) { compressionCodecName = "deflate"; } if (alignmentCommandOptions.convertAlignmentCommandOptions.toBam) { // conversion: GA4GH/Avro model -> BAM // header management: read it from a separate file File file = new File(input + BAM_HEADER_SUFFIX); FileInputStream fis = new FileInputStream(file); byte[] data = new byte[(int) file.length()]; fis.read(data); fis.close(); InputStream is = new FileInputStream(input); String textHeader = new String(data); LineReader lineReader = new StringLineReader(textHeader); SAMFileHeader header = new SAMTextHeaderCodec().decode(lineReader, textHeader); // reader DataFileStream<ReadAlignment> reader = new DataFileStream<ReadAlignment>(is, new SpecificDatumReader<>(ReadAlignment.class)); // writer OutputStream os = new FileOutputStream(new File(output)); SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, new File(output)); // main loop int reads = 0; SAMRecord samRecord; SAMRecord2ReadAlignmentConverter converter = new SAMRecord2ReadAlignmentConverter(); for (ReadAlignment readAlignment : reader) { samRecord = converter.backward(readAlignment); samRecord.setHeader(header); writer.addAlignment(samRecord); if (++reads % 100_000 == 0) { System.out.println("Converted " + reads + " reads"); } } // close reader.close(); writer.close(); os.close(); is.close(); } else { // conversion: BAM -> GA4GH/Avro model /* System.out.println("Loading library hpgbigdata..."); System.out.println("\tjava.libary.path = " + System.getProperty("java.library.path")); System.loadLibrary("hpgbigdata"); System.out.println("...done!"); new NativeSupport().bam2ga(input, output, compressionCodecName == null ? "snappy" : compressionCodecName, alignmentCommandOptions.convertAlignmentCommandOptions.adjustQuality); try { // header management: saved it in a separate file SamReader reader = SamReaderFactory.makeDefault().open(new File(input)); SAMFileHeader header = reader.getFileHeader(); PrintWriter pwriter = null; pwriter = new PrintWriter(new FileWriter(output + BAM_HEADER_SUFFIX)); pwriter.write(header.getTextHeader()); pwriter.close(); } catch (IOException e) { throw e; } */ boolean adjustQuality = alignmentCommandOptions.convertAlignmentCommandOptions.adjustQuality; AlignmentAvroSerializer avroSerializer = new AlignmentAvroSerializer(compressionCodecName); avroSerializer.toAvro(input, output); } } private void stats() throws Exception { // get input parameters String input = alignmentCommandOptions.statsAlignmentCommandOptions.input; String output = alignmentCommandOptions.statsAlignmentCommandOptions.output; try { // compute stats using the BamManager BamManager alignmentManager = new BamManager(Paths.get(input)); AlignmentGlobalStats stats = alignmentManager.stats(); alignmentManager.close(); // write results PrintWriter writer = new PrintWriter(new File(output + "/stats.json")); writer.write(stats.toJSON()); writer.close(); } catch (Exception e) { throw e; } } private void coverage() throws IOException { final int chunkSize = 10000; // get input parameters String input = alignmentCommandOptions.coverageAlignmentCommandOptions.input; String output = alignmentCommandOptions.coverageAlignmentCommandOptions.output; Path filePath = Paths.get(input); // writer PrintWriter writer = new PrintWriter(new File(output + "/" + filePath.getFileName() + ".coverage")); SAMFileHeader fileHeader = BamUtils.getFileHeader(filePath); AlignmentOptions options = new AlignmentOptions(); options.setContained(false); short[] values; BamManager alignmentManager = new BamManager(filePath); Iterator<SAMSequenceRecord> iterator = fileHeader.getSequenceDictionary().getSequences().iterator(); while (iterator.hasNext()) { SAMSequenceRecord next = iterator.next(); for (int i = 0; i < next.getSequenceLength(); i += chunkSize) { Region region = new Region(next.getSequenceName(), i + 1, Math.min(i + chunkSize, next.getSequenceLength())); RegionCoverage regionCoverage = alignmentManager.coverage(region, null, options); // write coverages to file (only values greater than 0) values = regionCoverage.getValues(); for (int j=0, start = region.getStart(); j < values.length; j++, start++) { if (values[j] > 0) { writer.write(next.getSequenceName() + "\t" + start + "\t" + values[j] + "\n"); } } } } // close writer.close(); // // HashMap<String, Integer> regionLength = new HashMap<>(); // // //TODO: fix using the new RegionCoverage, JT // try { // // header management // BufferedReader br = new BufferedReader(new FileReader(input + BAM_HEADER_SUFFIX)); // String line, fieldName, fieldLength; // String[] fields; // String[] subfields; // while ((line = br.readLine()) != null) { // if (line.startsWith("@SQ")) { // fields = line.split("\t"); // subfields = fields[1].split(":"); // fieldName = subfields[1]; // subfields = fields[2].split(":"); // fieldLength = subfields[1]; // // regionLength.put(fieldName, Integer.parseInt(fieldLength)); // } // } // br.close(); // // // reader // InputStream is = new FileInputStream(input); // DataFileStream<ReadAlignment> reader = new DataFileStream<>(is, new SpecificDatumReader<>(ReadAlignment.class)); // // // writer // PrintWriter writer = new PrintWriter(new File(output + "/depth.txt")); // // String chromName = ""; // int[] chromDepth; // // RegionDepth regionDepth; // RegionDepthCalculator calculator = new RegionDepthCalculator(); // // int pos; // long prevPos = 0L; // // // main loop // chromDepth = null; // for (ReadAlignment readAlignment : reader) { // if (readAlignment.getAlignment() != null) { // regionDepth = calculator.compute(readAlignment); // if (chromDepth == null) { // chromName = regionDepth.chrom; // chromDepth = new int[regionLength.get(regionDepth.chrom)]; // } // if (!chromName.equals(regionDepth.chrom)) { // // write depth // int length = chromDepth.length; // for (int i = 0; i < length; i++) { // writer.write(chromName + "\t" + (i + 1) + "\t" + chromDepth[i] + "\n"); // } // // // init // prevPos = 0L; // chromName = regionDepth.chrom; // chromDepth = new int[regionLength.get(regionDepth.chrom)]; // } // if (prevPos > regionDepth.position) { // throw new IOException("Error: the input file (" + input + ") is not sorted (reads out of order)."); // } // // pos = (int) regionDepth.position; // for (int i: regionDepth.array) { // chromDepth[pos] += regionDepth.array[i]; // pos++; // } // prevPos = regionDepth.position; // } // } // // write depth // int length = chromDepth.length; // for (int i = 0; i < length; i++) { // if (chromDepth[i] > 0) { // writer.write(chromName + "\t" + (i + 1) + "\t" + chromDepth[i] + "\n"); // } // } // // // close // reader.close(); // is.close(); // writer.close(); // } catch (Exception e) { // throw e; // } } public void query() throws Exception { // check mandatory parameter 'input file' Path inputPath = Paths.get(alignmentCommandOptions.queryAlignmentCommandOptions.input); FileUtils.checkFile(inputPath); // TODO: to take the spark home from somewhere else SparkConf sparkConf = SparkConfCreator.getConf("variant query", "local", 1, true, "/home/jtarraga/soft/spark-2.0.0/"); System.out.println("sparkConf = " + sparkConf.toDebugString()); SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf)); // SparkConf sparkConf = SparkConfCreator.getConf("MyTest", "local", 1, true, "/home/jtarraga/soft/spark-2.0.0/"); // SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf)); AlignmentDataset ad = new AlignmentDataset(); ad.load(alignmentCommandOptions.queryAlignmentCommandOptions.input, sparkSession); ad.createOrReplaceTempView("alignment"); // query for region List<Region> regions = null; if (StringUtils.isNotEmpty(alignmentCommandOptions.queryAlignmentCommandOptions.regions)) { regions = Region.parseRegions(alignmentCommandOptions.queryAlignmentCommandOptions.regions); ad.regionFilter(regions); } // query for region file if (StringUtils.isNotEmpty(alignmentCommandOptions.queryAlignmentCommandOptions.regionFile)) { logger.warn("Query for region file, not yet implemented."); } // query for minimun mapping quality if (alignmentCommandOptions.queryAlignmentCommandOptions.minMapQ > 0) { ad.mappingQualityFilter(">=" + alignmentCommandOptions.queryAlignmentCommandOptions.minMapQ); } // query for flags if (alignmentCommandOptions.queryAlignmentCommandOptions.requireFlags != Integer.MAX_VALUE) { ad.flagFilter("" + alignmentCommandOptions.queryAlignmentCommandOptions.requireFlags, false); } if (alignmentCommandOptions.queryAlignmentCommandOptions.filteringFlags != 0) { ad.flagFilter("" + alignmentCommandOptions.queryAlignmentCommandOptions.filteringFlags, true); } // query for template length if (alignmentCommandOptions.queryAlignmentCommandOptions.minTLen != 0) { ad.templateLengthFilter(">=" + alignmentCommandOptions.queryAlignmentCommandOptions.minTLen); } if (alignmentCommandOptions.queryAlignmentCommandOptions.maxTLen != Integer.MAX_VALUE) { ad.templateLengthFilter("<=" + alignmentCommandOptions.queryAlignmentCommandOptions.maxTLen); } // query for alignment length if (alignmentCommandOptions.queryAlignmentCommandOptions.minALen != 0) { ad.alignmentLengthFilter(">=" + alignmentCommandOptions.queryAlignmentCommandOptions.minALen); } if (alignmentCommandOptions.queryAlignmentCommandOptions.maxALen != Integer.MAX_VALUE) { ad.alignmentLengthFilter("<=" + alignmentCommandOptions.queryAlignmentCommandOptions.maxALen); } // apply previous filters ad.update(); // save the dataset logger.warn("The current query implementation saves the resulting dataset in Avro format."); ad.write().format("com.databricks.spark.avro").save(alignmentCommandOptions.queryAlignmentCommandOptions.output); } }