/*
* Copyright (C) 2014 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.genomics.dataflow.pipelines;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.Filter;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.functions.JoinNonVariantSegmentsWithVariants;
import com.google.cloud.genomics.dataflow.functions.SitesToShards;
import com.google.cloud.genomics.dataflow.functions.VariantFunctions;
import com.google.cloud.genomics.dataflow.functions.ibs.AlleleSimilarityCalculator;
import com.google.cloud.genomics.dataflow.functions.ibs.CallSimilarityCalculatorFactory;
import com.google.cloud.genomics.dataflow.functions.ibs.FormatIBSData;
import com.google.cloud.genomics.dataflow.functions.ibs.IBSCalculator;
import com.google.cloud.genomics.dataflow.functions.ibs.SharedMinorAllelesCalculatorFactory;
import com.google.cloud.genomics.dataflow.readers.VariantStreamer;
import com.google.cloud.genomics.dataflow.utils.CallSetNamesOptions;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.genomics.v1.StreamVariantsRequest;
import com.google.genomics.v1.Variant;
import java.util.List;
/**
* A pipeline that computes Identity by State (IBS) for each pair of individuals in a dataset.
*
* See http://googlegenomics.readthedocs.org/en/latest/use_cases/compute_identity_by_state/index.html
* for running instructions.
*/
public class IdentityByState {
public static interface Options extends
// Options for call set names.
CallSetNamesOptions,
// Options for calculating over regions, chromosomes, or whole genomes.
ShardOptions,
// Options for calculating over a list of sites.
SitesToShards.Options,
// Options for special handling of data with non-variant segment records. This
// is needed since IBS must take into account reference-matches in addition
// to the variants (unlike other analyses such as PCA).
JoinNonVariantSegmentsWithVariants.Options,
// Options for the output destination.
GCSOutputOptions
{
@Override
@Description("The ID of the Google Genomics variant set this pipeline is accessing. "
+ "Defaults to 1000 Genomes.")
@Default.String("10473108253681171589")
String getVariantSetId();
@Description("The class that determines the strategy for calculating the similarity of alleles.")
@Default.Class(SharedMinorAllelesCalculatorFactory.class)
Class<? extends CallSimilarityCalculatorFactory> getCallSimilarityCalculatorFactory();
void setCallSimilarityCalculatorFactory(Class<? extends CallSimilarityCalculatorFactory> kls);
public static class Methods {
public static void validateOptions(Options options) {
JoinNonVariantSegmentsWithVariants.Options.Methods.validateOptions(options);
GCSOutputOptions.Methods.validateOptions(options);
}
}
}
// Tip: Use the API explorer to test which fields to include in partial responses.
// https://developers.google.com/apis-explorer/#p/genomics/v1/genomics.variants.stream?fields=variants(alternateBases%252Ccalls(callSetName%252Cgenotype)%252CreferenceBases)&_h=3&resource=%257B%250A++%2522variantSetId%2522%253A+%25223049512673186936334%2522%252C%250A++%2522referenceName%2522%253A+%2522chr17%2522%252C%250A++%2522start%2522%253A+%252241196311%2522%252C%250A++%2522end%2522%253A+%252241196312%2522%252C%250A++%2522callSetIds%2522%253A+%250A++%255B%25223049512673186936334-0%2522%250A++%255D%250A%257D&
private static final String VARIANT_FIELDS =
"variants(alternateBases,calls(callSetName,genotype),end,referenceBases,start)";
public static void main(String[] args) throws Exception {
// Register the options so that they show up via --help
PipelineOptionsFactory.register(Options.class);
Options options =
PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
// Option validation is not yet automatic, we make an explicit call here.
Options.Methods.validateOptions(options);
// Set up the prototype request and auth.
StreamVariantsRequest prototype = CallSetNamesOptions.Methods.getRequestPrototype(options);
OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options);
Pipeline p = Pipeline.create(options);
PCollection<Variant> processedVariants = null;
if (null != options.getSitesFilepath()) {
// Compute IBS on a list of sites (e.g., SNPs).
PCollection<StreamVariantsRequest> requests = p.apply(TextIO.Read.named("ReadSites")
.from(options.getSitesFilepath()))
.apply(new SitesToShards.SitesToStreamVariantsShardsTransform(prototype));
if (options.getHasNonVariantSegments()) {
processedVariants = requests.apply(
new JoinNonVariantSegmentsWithVariants.RetrieveAndCombineTransform(auth, VARIANT_FIELDS));
} else {
processedVariants = requests.apply(
new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS));
}
} else {
// Compute IBS over genomic region(s) or the whole genome.
List<StreamVariantsRequest> requests = options.isAllReferences() ?
ShardUtils.getVariantRequests(prototype, ShardUtils.SexChromosomeFilter.EXCLUDE_XY,
options.getBasesPerShard(), auth) :
ShardUtils.getVariantRequests(prototype, options.getBasesPerShard(), options.getReferences());
PCollection<Variant> variants = p.begin()
.apply(Create.of(requests))
.apply(new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS));
if (options.getHasNonVariantSegments()) {
// Note that this is less exact compared to the above approach on sites.
// When not run on a whole chromosome or genome, any non-variant segments at the beginning of the region(s)
// are not considered due to the STRICT shard boundary used to avoid repeated data.
processedVariants = variants.apply(new JoinNonVariantSegmentsWithVariants.BinShuffleAndCombineTransform());
} else {
processedVariants = variants;
}
}
processedVariants
.apply(Filter.byPredicate(VariantFunctions.IS_SINGLE_ALTERNATE_SNP))
.apply(
ParDo.named(AlleleSimilarityCalculator.class.getSimpleName()).of(
new AlleleSimilarityCalculator(getCallSimilarityCalculatorFactory(options))))
.apply(Combine.<KV<String, String>, KV<Double, Integer>>perKey(new IBSCalculator()))
.apply(ParDo.named(FormatIBSData.class.getSimpleName()).of(new FormatIBSData()))
.apply(TextIO.Write.named("WriteIBSData").to(options.getOutput()));
p.run();
}
private static CallSimilarityCalculatorFactory getCallSimilarityCalculatorFactory(
Options options) throws InstantiationException, IllegalAccessException {
return options.getCallSimilarityCalculatorFactory().newInstance();
}
}