/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.pipelines;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.options.Validation.Required;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.readers.VariantStreamer;
import com.google.cloud.genomics.dataflow.utils.CallSetNamesOptions;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
import com.google.genomics.v1.StreamVariantsRequest;
import com.google.genomics.v1.Variant;
import com.google.genomics.v1.VariantCall;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.List;

/**
 * Given a list of callset ids, identify variants that are associated only with the specified
 * individuals (i.e. variants private to those individuals).
 *
 * <p>This pipeline might be used in combination with the DeleteVariants pipeline to, for example,
 * remove all variants private to a particular family from the variant set.
 *
 * <p>The matching variants are written as tab-separated text lines (id, reference name, start,
 * end, reference bases, comma-joined alternate bases) to the location given by the output option.
 */
public class IdentifyPrivateVariants {

  /**
   * Command-line options for this pipeline. Combines the call set name, sharding and output
   * option groups, marks the variant set id and the call set names file as required, and adds a
   * switch controlling whether variants with no calls are reported.
   */
  public static interface Options extends
      // Options for call set names.
      CallSetNamesOptions,
      // Options for calculating over regions, chromosomes, or whole genomes.
      ShardOptions,
      // Options for the output destination.
      GCSOutputOptions {

    @Override
    @Description("The ID of the Google Genomics variant set from which this pipeline "
        + "will identify private variants.")
    @Required
    String getVariantSetId();

    @Override
    @Description("A local file path to a list of newline-separated callset names. "
        + "Any variants private to those callsets will be identified.")
    @Required
    String getCallSetNamesFilepath();

    @Description("Whether variants with no callsets should also be identified. Defaults to false.")
    @Default.Boolean(false)
    boolean getIdentifyVariantsWithoutCalls();

    void setIdentifyVariantsWithoutCalls(boolean identifyVariantsWithoutCalls);

    /** Static helpers operating on {@link Options}. */
    public static class Methods {

      /**
       * Validates the given options by delegating to
       * {@code GCSOutputOptions.Methods.validateOptions}.
       *
       * @param options the parsed pipeline options to validate
       */
      public static void validateOptions(Options options) {
        GCSOutputOptions.Methods.validateOptions(options);
      }
    }
  }

  private static final Logger LOG = LoggerFactory.getLogger(IdentifyPrivateVariants.class);

  // Tip: Use the API explorer to test which fields to include in partial responses.
  // https://developers.google.com/apis-explorer/#p/genomics/v1/genomics.variants.stream?fields=variants(alternateBases%252Ccalls(callSetName%252Cgenotype)%252CreferenceBases)&_h=3&resource=%257B%250A++%2522variantSetId%2522%253A+%25223049512673186936334%2522%252C%250A++%2522referenceName%2522%253A+%2522chr17%2522%252C%250A++%2522start%2522%253A+%252241196311%2522%252C%250A++%2522end%2522%253A+%252241196312%2522%252C%250A++%2522callSetIds%2522%253A+%250A++%255B%25223049512673186936334-0%2522%250A++%255D%250A%257D&
  private static final String VARIANT_FIELDS =
      "variants(id,reference_name,start,end,reference_bases,alternate_bases,calls(callSetId))";

  /**
   * Pipeline function implementing a filter only returning variants private to one or more callset
   * IDs and optionally those with no callsetIds.
   *
   * <p>A variant is emitted when every one of its calls has a callset ID contained in the
   * configured set. A variant with no calls at all is emitted only when
   * {@code retainVariantsWithNoCalls} is {@code true}.
   */
  public static final class PrivateVariantsFilterFn extends DoFn<Variant, Variant> {

    private final ImmutableSet<String> callSetIds;
    // Assigned once in the constructor and never mutated, so it is final like callSetIds.
    private final boolean retainVariantsWithNoCalls;

    /**
     * @param callSetIds the callset IDs a variant's calls must all belong to for the variant to
     *     be considered private
     * @param retainVariantsWithNoCalls whether variants that have no calls should be emitted too
     */
    public PrivateVariantsFilterFn(ImmutableSet<String> callSetIds,
        boolean retainVariantsWithNoCalls) {
      this.callSetIds = callSetIds;
      this.retainVariantsWithNoCalls = retainVariantsWithNoCalls;
    }

    /**
     * Emits the input variant unchanged if it passes the filter; otherwise emits nothing.
     *
     * @param context supplies the variant under inspection and receives the output
     */
    @Override
    public void processElement(ProcessContext context) {
      Variant variant = context.element();
      List<VariantCall> calls = variant.getCallsList();

      for (VariantCall call : calls) {
        if (!callSetIds.contains(call.getCallSetId())) {
          // We found a callset ID not in our set. This variant is not private
          // to our set of callset IDs. Skip it.
          return;
        }
      }

      if (!retainVariantsWithNoCalls && calls.isEmpty()) {
        // This is a variant with no calls. Skip it.
        return;
      }

      context.output(variant);
    }
  }

  /**
   * Builds and runs the pipeline: parses and validates the options, resolves the callset IDs,
   * creates one streaming request per shard, streams the variants, keeps only the private ones
   * and writes them as tab-separated text.
   *
   * @param args command-line arguments parsed into {@link Options}
   * @throws IOException declared by this entry point; propagated from the setup calls
   * @throws GeneralSecurityException declared by this entry point; propagated from the setup calls
   */
  public static void main(String[] args) throws IOException, GeneralSecurityException {
    // Register the options so that they show up via --help
    PipelineOptionsFactory.register(Options.class);
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Option validation is not yet automatic, we make an explicit call here.
    Options.Methods.validateOptions(options);

    // Set up the prototype request and auth.
    StreamVariantsRequest prototype =
        StreamVariantsRequest.newBuilder(CallSetNamesOptions.Methods.getRequestPrototype(options))
            // In this case, we do not want responses containing a subset of calls, we want all
            // of them.
            .clearCallSetIds()
            .build();
    OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options);

    ImmutableSet<String> callSetIds = ImmutableSet.<String>builder()
        .addAll(CallSetNamesOptions.Methods.getCallSetIds(options))
        .build();
    // Parameterized logging: the rendered message is the same as the previous concatenation.
    LOG.info("The pipeline will identify and write to Cloud Storage variants "
        + "private to {} genomes with callSetIds: {}", callSetIds.size(), callSetIds);
    if (options.getIdentifyVariantsWithoutCalls()) {
      LOG.info("* The pipeline will also identify variants with no callsets. *");
    }

    // Either shard across all references or only across the references named in the options.
    List<StreamVariantsRequest> shardRequests = options.isAllReferences()
        ? ShardUtils.getVariantRequests(prototype, ShardUtils.SexChromosomeFilter.INCLUDE_XY,
            options.getBasesPerShard(), auth)
        : ShardUtils.getVariantRequests(prototype, options.getBasesPerShard(),
            options.getReferences());

    Pipeline p = Pipeline.create(options);
    PCollection<Variant> variants = p.begin()
        .apply(Create.of(shardRequests))
        .apply(new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS))
        .apply(ParDo.of(new PrivateVariantsFilterFn(callSetIds,
            options.getIdentifyVariantsWithoutCalls())));

    variants.apply(ParDo.named("FormatResults").of(new DoFn<Variant, String>() {
      @Override
      public void processElement(ProcessContext c) {
        Variant v = c.element();
        // One tab-separated record per variant; alternate bases are comma-joined in one column.
        c.output(Joiner.on("\t").join(
            v.getId(),
            v.getReferenceName(),
            v.getStart(),
            v.getEnd(),
            v.getReferenceBases(),
            Joiner.on(",").join(v.getAlternateBasesList())
            ));
      }
    }))
    .apply(TextIO.Write.to(options.getOutput()));

    p.run();
  }
}