/* * Copyright (C) 2015 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. */ package com.google.cloud.genomics.dataflow.readers; import com.google.cloud.dataflow.sdk.transforms.Aggregator; import com.google.cloud.dataflow.sdk.transforms.DoFn; import com.google.cloud.dataflow.sdk.transforms.Max; import com.google.cloud.dataflow.sdk.transforms.PTransform; import com.google.cloud.dataflow.sdk.transforms.ParDo; import com.google.cloud.dataflow.sdk.transforms.Sum; import com.google.cloud.dataflow.sdk.values.PCollection; import com.google.cloud.genomics.utils.OfflineAuth; import com.google.cloud.genomics.utils.ShardBoundary; import com.google.cloud.genomics.utils.grpc.ReadStreamIterator; import com.google.common.base.Stopwatch; import com.google.genomics.v1.Read; import com.google.genomics.v1.StreamReadsRequest; import com.google.genomics.v1.StreamReadsResponse; import java.io.IOException; import java.security.GeneralSecurityException; import java.util.Iterator; import java.util.List; import java.util.concurrent.TimeUnit; /** * PTransform for streaming reads via gRPC. */ public class ReadStreamer extends PTransform<PCollection<StreamReadsRequest>, PCollection<Read>> { protected final OfflineAuth auth; protected final ShardBoundary.Requirement shardBoundary; protected final String fields; /** * Create a streamer that can enforce shard boundary semantics. * * Tip: Use the API explorer to test which fields to include in partial responses: * <a href="https://developers.google.com/apis-explorer/#p/genomics/v1/genomics.reads.stream?fields=alignments(alignedSequence%252Cid)&_h=2&resource=%257B%250A++%2522readGroupSetId%2522%253A+%2522CMvnhpKTFhD3he72j4KZuyc%2522%252C%250A++%2522referenceName%2522%253A+%2522chr17%2522%252C%250A++%2522start%2522%253A+%252241196311%2522%252C%250A++%2522end%2522%253A+%252241196312%2522%250A%257D&"> * reads example</a>. * * @param auth The OfflineAuth to use for the request. * @param shardBoundary The shard boundary semantics to enforce. * @param fields Which fields to include in a partial response or null for all. */ public ReadStreamer(OfflineAuth auth, ShardBoundary.Requirement shardBoundary, String fields) { this.auth = auth; this.shardBoundary = shardBoundary; this.fields = fields; } @Override public PCollection<Read> apply(PCollection<StreamReadsRequest> input) { return input.apply(ParDo.of(new RetrieveReads())) .apply(ParDo.of(new ConvergeReadsList())); } private class RetrieveReads extends DoFn<StreamReadsRequest, List<Read>> { protected Aggregator<Integer, Integer> initializedShardCount; protected Aggregator<Integer, Integer> finishedShardCount; protected Aggregator<Long, Long> shardTimeMaxSec; public RetrieveReads() { initializedShardCount = createAggregator("Initialized Shard Count", new Sum.SumIntegerFn()); finishedShardCount = createAggregator("Finished Shard Count", new Sum.SumIntegerFn()); shardTimeMaxSec = createAggregator("Maximum Shard Processing Time (sec)", new Max.MaxLongFn()); } @Override public void processElement(ProcessContext c) throws IOException, GeneralSecurityException { initializedShardCount.addValue(1); shardTimeMaxSec.addValue(0L); Stopwatch stopWatch = Stopwatch.createStarted(); Iterator<StreamReadsResponse> iter = ReadStreamIterator.enforceShardBoundary(auth, c.element(), shardBoundary, fields); while (iter.hasNext()) { StreamReadsResponse readResponse = iter.next(); c.output(readResponse.getAlignmentsList()); } stopWatch.stop(); shardTimeMaxSec.addValue(stopWatch.elapsed(TimeUnit.SECONDS)); finishedShardCount.addValue(1); } } /** * This step exists to emit the individual reads in a parallel step to the StreamReads step in * order to increase throughput. */ private class ConvergeReadsList extends DoFn<List<Read>, Read> { protected Aggregator<Long, Long> itemCount; public ConvergeReadsList() { itemCount = createAggregator("Number of reads", new Sum.SumLongFn()); } @Override public void processElement(ProcessContext c) { for (Read r : c.element()) { c.output(r); itemCount.addValue(1L); } } } }