/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.pipelines;

import com.google.api.services.storage.Storage;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.DelegateCoder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.functions.ShardReadsTransform;
import com.google.cloud.genomics.dataflow.readers.ReadStreamer;
import com.google.cloud.genomics.dataflow.readers.bam.HeaderInfo;
import com.google.cloud.genomics.dataflow.readers.bam.ReadBAMTransform;
import com.google.cloud.genomics.dataflow.readers.bam.ReaderOptions;
import com.google.cloud.genomics.dataflow.readers.bam.ShardingPolicy;
import com.google.cloud.genomics.dataflow.utils.GCSOptions;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.dataflow.writers.bam.WriteBAMTransform;
import com.google.cloud.genomics.utils.Contig;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.genomics.v1.Read;
import com.google.genomics.v1.StreamReadsRequest;

import htsjdk.samtools.ValidationStringency;

import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;

/**
 * Demonstrates loading some Reads, sharding them, writing them to BAM file shards in parallel,
 * then combining the shards and writing an index for the combined BAM file.
 */
public class ShardedBAMWriting {

  static interface Options extends ShardOptions, ShardReadsTransform.Options,
      WriteBAMTransform.Options, GCSOutputOptions {

    @Description("The Google Cloud Storage path to the BAM file to get reads data from. "
        + "This or ReadGroupSetId must be set.")
    @Default.String("")
    String getBAMFilePath();

    void setBAMFilePath(String filePath);

    @Description("An ID of the Google Genomics ReadGroupSet this pipeline is working with. "
        + "This or BAMFilePath must be set.")
    @Default.String("")
    String getReadGroupSetId();

    void setReadGroupSetId(String readGroupSetId);

    public static class Methods {
      public static void validateOptions(Options options) {
        GCSOutputOptions.Methods.validateOptions(options);
        Preconditions.checkArgument(
            !Strings.isNullOrEmpty(options.getReadGroupSetId())
                || !Strings.isNullOrEmpty(options.getBAMFilePath()),
            "Either BAMFilePath or ReadGroupSetId must be specified");
      }
    }
  }

  private static final Logger LOG = Logger.getLogger(ShardedBAMWriting.class.getName());
  private static Options pipelineOptions;
  private static Pipeline pipeline;
  private static OfflineAuth auth;
  private static Iterable<Contig> contigs;

  public static void main(String[] args) throws GeneralSecurityException, IOException {
    // Register the options so that they show up via --help.
    PipelineOptionsFactory.register(Options.class);
    pipelineOptions = PipelineOptionsFactory.fromArgs(args)
        .withValidation().as(Options.class);
    // Option validation is not yet automatic, so we make an explicit call here.
    Options.Methods.validateOptions(pipelineOptions);

    auth = GenomicsOptions.Methods.getGenomicsAuth(pipelineOptions);

    pipeline = Pipeline.create(pipelineOptions);
    pipeline.getCoderRegistry().registerCoder(Contig.class, CONTIG_CODER);

    // Process options: a null contig list means all references.
    contigs = pipelineOptions.isAllReferences() ? null
        : Contig.parseContigsFromCommandLine(pipelineOptions.getReferences());

    // Get the reads and shard them.
    PCollection<Read> reads;
    HeaderInfo headerInfo;

    final String outputFileName = pipelineOptions.getOutput();
    final GcsPath destPath = GcsPath.fromUri(outputFileName);
    final GcsPath destIdxPath = GcsPath.fromUri(outputFileName + ".bai");
    final Storage.Objects storage = Transport.newStorageClient(
        pipelineOptions.as(GCSOptions.class)).build().objects();

    // Delete any leftover output from a previous run so this run starts clean.
    LOG.info("Cleaning up output file " + destPath + " and " + destIdxPath);
    try {
      storage.delete(destPath.getBucket(), destPath.getObject()).execute();
    } catch (Exception ignored) {
      // Ignore errors: the object may simply not exist yet.
    }
    try {
      storage.delete(destIdxPath.getBucket(), destIdxPath.getObject()).execute();
    } catch (Exception ignored) {
      // Ignore errors: the object may simply not exist yet.
    }

    if (!Strings.isNullOrEmpty(pipelineOptions.getReadGroupSetId())) {
      headerInfo = HeaderInfo.getHeaderFromApi(pipelineOptions.getReadGroupSetId(), auth, contigs);
      reads = getReadsFromAPI();
    } else {
      headerInfo = HeaderInfo.getHeaderFromBAMFile(storage, pipelineOptions.getBAMFilePath(),
          contigs);
      reads = getReadsFromBAMFile();
    }

    final PCollection<String> writtenFiles = WriteBAMTransform.write(
        reads, headerInfo, pipelineOptions.getOutput(), pipeline);

    writtenFiles
        .apply(
            TextIO.Write
                .to(pipelineOptions.getOutput() + "-result")
                .named("Write Output Result")
                .withoutSharding());

    pipeline.run();
  }
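
  /**
   * Gets reads by sharding the BAM file itself and reading the shards in parallel.
   * The sharding policy used here is {@link ShardingPolicy#BYTE_SIZE_POLICY}; see the
   * note inside the method body for how to substitute a custom policy.
   */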
  private static PCollection<Read> getReadsFromBAMFile() throws IOException {
    /**
     * Policy used to shard Reads.
     * By default we use the sharding supplied by the policy class.
     * If you want custom sharding, use the following pattern:
     * <pre>
     *   BAM_FILE_READ_SHARDING_POLICY = new ShardingPolicy() {
     *     @Override
     *     public boolean shardBigEnough(BAMShard shard) {
     *       return shard.sizeInLoci() > 50000000;
     *     }
     *   };
     * </pre>
     */
    final ShardingPolicy BAM_FILE_READ_SHARDING_POLICY = ShardingPolicy.BYTE_SIZE_POLICY;

    LOG.info("Sharded reading of " + pipelineOptions.getBAMFilePath());

    final ReaderOptions readerOptions = new ReaderOptions(
        ValidationStringency.DEFAULT_STRINGENCY,
        true);  // Include unmapped reads.

    return ReadBAMTransform.getReadsFromBAMFilesSharded(pipeline,
        auth,
        contigs,
        readerOptions,
        pipelineOptions.getBAMFilePath(),
        BAM_FILE_READ_SHARDING_POLICY);
  }

  /**
   * Gets reads from the Genomics API by generating one StreamReadsRequest per shard
   * and then streaming the shards in parallel via ReadStreamer.
   */
  private static PCollection<Read> getReadsFromAPI() throws IOException {
    final String rgsId = pipelineOptions.getReadGroupSetId();
    LOG.info("Sharded reading of ReadGroupSet: " + rgsId);

    List<StreamReadsRequest> requests = Lists.newArrayList();
    if (pipelineOptions.isAllReferences()) {
      requests.addAll(ShardUtils.getReadRequests(rgsId,
          SexChromosomeFilter.INCLUDE_XY, pipelineOptions.getBasesPerShard(), auth));
    } else {
      requests.addAll(
          ShardUtils.getReadRequests(Collections.singletonList(rgsId),
              pipelineOptions.getReferences(), pipelineOptions.getBasesPerShard()));
    }
    LOG.info("Reading from the API with: " + requests.size() + " shards");

    PCollection<Read> reads = pipeline.apply(Create.of(requests))
        .apply(new ReadStreamer(auth, ShardBoundary.Requirement.STRICT, null));
    return reads;
  }
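
  /**
   * Coder for Contig, registered for Contig.class in main() above. Since the SDK has no
   * built-in coder for Contig, we delegate to StringUtf8Coder: encoding uses
   * Contig.toString() and decoding re-parses the string with
   * Contig.parseContigsFromCommandLine().
   */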
  static Coder<Contig> CONTIG_CODER = DelegateCoder.of(
      StringUtf8Coder.of(),
      new DelegateCoder.CodingFunction<Contig, String>() {
        @Override
        public String apply(Contig contig) throws Exception {
          return contig.toString();
        }
      },
      new DelegateCoder.CodingFunction<String, Contig>() {
        @Override
        public Contig apply(String contigStr) throws Exception {
          return Contig.parseContigsFromCommandLine(contigStr).iterator().next();
        }
      });
}