/*
* Copyright 2015 Google.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.genomics.dataflow.pipelines;
import com.google.api.services.genomics.model.Annotation;
import com.google.api.services.genomics.model.Position;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
import com.google.cloud.dataflow.sdk.testing.TestPipeline;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.model.PosRgsMq;
import com.google.cloud.genomics.dataflow.pipelines.CalculateCoverage.CalculateCoverageMean;
import com.google.cloud.genomics.dataflow.pipelines.CalculateCoverage.CalculateQuantiles;
import com.google.common.collect.Lists;
import com.google.genomics.v1.CigarUnit;
import com.google.genomics.v1.CigarUnit.Operation;
import com.google.genomics.v1.LinearAlignment;
import com.google.genomics.v1.Read;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* Class for testing the Calculate Coverage class
*/
@RunWith(JUnit4.class)
public class CalculateCoverageTest {
private static List<Read> testSet;
private static List<KV<PosRgsMq, Double>> testSet2;
// The bucket width we are calculating coverage for.
static final int TEST_BUCKET_WIDTH = 2;
// The number of quantiles are are computing.
static final int TEST_NUM_QUANTILES = 3;
// The input data, see setup
static List<Read> input;
// Input read position info
static final long[] readPosInfo = {0, 1, 2, 0, 0, 0, 0, 2, 3, 0, 0, 0, 3, 2, 0,
0, 0, 0, 1, 3, 0, 0, 1, 0, 2};
// Input read mapping quality info, (L = 5, M = 15, H = 35)
static final int[] readMQInfo = {5, 15, 35, 15, 35, 5, 5, 15, 15, 35, 5, 15, 35, 35, 15,
5, 15, 15, 35, 35, 5, 5, 5, 5, 5};
// Input read length info
static final int[] readLengthInfo = {4, 3, 1, 3, 1, 4, 3, 2, 1, 1, 4, 2, 1, 1, 3,
4, 2, 1, 2, 1, 4, 3, 3, 2, 1};
static final Operation[] ops1 = {Operation.INSERT, Operation.SEQUENCE_MISMATCH, Operation.PAD};
static final Operation[] ops2 = {Operation.DELETE, Operation.SKIP, Operation.SEQUENCE_MATCH};
static final Operation[] ops3 =
{Operation.INSERT, Operation.ALIGNMENT_MATCH, Operation.CLIP_SOFT};
@BeforeClass
public static void oneTimeSetUp() {
// Test data for testCalculateCoverageMean
// Read 1 (Contains two operations that result in increasing the length of the read)
List<CigarUnit> cigars = new ArrayList<>();
for (int i = 0; i < 3; i++) {
CigarUnit u = CigarUnit.newBuilder().setOperationLength(2L).setOperation(ops1[i]).build();
CigarUnit.newBuilder().setOperation(CigarUnit.Operation.SKIP);
cigars.add(u);
}
Read read2ValidOps = Read.newBuilder()
.setAlignment(LinearAlignment.newBuilder()
.addAllCigar(cigars)
.setPosition(com.google.genomics.v1.Position.newBuilder()
.setPosition(3L)
.setReferenceName("chr1"))
.setMappingQuality(15))
.setReadGroupSetId("123").build();
// Read 2 (Contains three operations that result in increasing the length of the read)
cigars.clear();
for (int i = 0; i < 3; i++) {
CigarUnit u = CigarUnit.newBuilder().setOperationLength(1L).setOperation(ops2[i]).build();
cigars.add(u);
}
Read read3ValidOps = Read.newBuilder()
.setAlignment(LinearAlignment.newBuilder()
.addAllCigar(cigars)
.setPosition(com.google.genomics.v1.Position.newBuilder()
.setPosition(2L)
.setReferenceName("chr1"))
.setMappingQuality(1))
.setReadGroupSetId("123").build();
// Read 3 (Unmapped)
Read unmappedRead = Read.newBuilder().build();
// Read 4 (Contains one operation that results in increasing the length of the read)
cigars.clear();
for (int i = 0; i < 3; i++) {
CigarUnit u = CigarUnit.newBuilder().setOperationLength(4L).setOperation(ops3[i]).build();
cigars.add(u);
}
Read read1ValidOp = Read.newBuilder()
.setAlignment(LinearAlignment.newBuilder()
.addAllCigar(cigars)
.setPosition(com.google.genomics.v1.Position.newBuilder()
.setPosition(4L)
.setReferenceName("chr1"))
.setMappingQuality(1))
.setReadGroupSetId("321").build();
testSet = Lists.newArrayList(read2ValidOps, read3ValidOps, unmappedRead, read1ValidOp);
// Test data for testCalculateQuantiles
testSet2 = Lists.newArrayList();
Position p = new Position().setPosition(1L).setReferenceName("chr1");
testSet2.add(KV.of(new PosRgsMq(p, "123", PosRgsMq.MappingQuality.L), 4.0));
testSet2.add(KV.of(new PosRgsMq(p, "123", PosRgsMq.MappingQuality.M), 2.0));
testSet2.add(KV.of(new PosRgsMq(p, "123", PosRgsMq.MappingQuality.H), 0.5));
testSet2.add(KV.of(new PosRgsMq(p, "123", PosRgsMq.MappingQuality.A), 6.5));
testSet2.add(KV.of(new PosRgsMq(p, "321", PosRgsMq.MappingQuality.L), 5.0));
testSet2.add(KV.of(new PosRgsMq(p, "321", PosRgsMq.MappingQuality.M), 1.0));
testSet2.add(KV.of(new PosRgsMq(p, "321", PosRgsMq.MappingQuality.H), 0.75));
testSet2.add(KV.of(new PosRgsMq(p, "321", PosRgsMq.MappingQuality.A), 6.75));
testSet2.add(KV.of(new PosRgsMq(p, "456", PosRgsMq.MappingQuality.L), 4.6));
testSet2.add(KV.of(new PosRgsMq(p, "456", PosRgsMq.MappingQuality.M), 3.2));
testSet2.add(KV.of(new PosRgsMq(p, "456", PosRgsMq.MappingQuality.H), 1.2));
testSet2.add(KV.of(new PosRgsMq(p, "456", PosRgsMq.MappingQuality.A), 9.0));
testSet2.add(KV.of(new PosRgsMq(p, "654", PosRgsMq.MappingQuality.L), 3.0));
testSet2.add(KV.of(new PosRgsMq(p, "654", PosRgsMq.MappingQuality.A), 3.0));
testSet2.add(KV.of(new PosRgsMq(p, "789", PosRgsMq.MappingQuality.L), 8.0));
testSet2.add(KV.of(new PosRgsMq(p, "789", PosRgsMq.MappingQuality.A), 8.0));
// Test data for testCalculateCoverage
input = Lists.newArrayList();
cigars.clear();
for (int i = 0; i < 25; i++) {
cigars = new ArrayList<>();
for (int j = 0; j < readLengthInfo[i]; j++) {
CigarUnit u = CigarUnit.newBuilder().setOperationLength(1L).setOperation(Operation.ALIGNMENT_MATCH).build();
cigars.add(u);
}
Read read = Read.newBuilder()
.setAlignment(LinearAlignment.newBuilder()
.addAllCigar(cigars)
.setPosition(com.google.genomics.v1.Position.newBuilder()
.setPosition(readPosInfo[i])
.setReferenceName("1"))
.setMappingQuality(readMQInfo[i]))
.setReadGroupSetId("Rgs" + i / 5 + 1).build();
input.add(read);
}
}
/**
* Unit test for CalculateCoverageMean composite PTransform
*/
@Test
public void testCalculateCoverageMean() {
// Expected Output
KV<PosRgsMq, Double>[] expectedOutput = new KV[12];
PosRgsMq pTest = new PosRgsMq(new Position()
.setPosition(2L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.L);
expectedOutput[0] = KV.of(pTest, 1.0);
pTest = new PosRgsMq(new Position()
.setPosition(2L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.M);
expectedOutput[1] = KV.of(pTest, 0.5);
pTest = new PosRgsMq(new Position()
.setPosition(2L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.A);
expectedOutput[2] = KV.of(pTest, 1.5);
pTest = new PosRgsMq(new Position()
.setPosition(4L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.L);
expectedOutput[3] = KV.of(pTest, 0.5);
pTest = new PosRgsMq(new Position()
.setPosition(4L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.M);
expectedOutput[4] = KV.of(pTest, 1.0);
pTest = new PosRgsMq(new Position()
.setPosition(4L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.A);
expectedOutput[5] = KV.of(pTest, 1.5);
pTest = new PosRgsMq(new Position()
.setPosition(6L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.M);
expectedOutput[6] = KV.of(pTest, 0.5);
pTest = new PosRgsMq(new Position()
.setPosition(6L).setReferenceName("chr1"), "123", PosRgsMq.MappingQuality.A);
expectedOutput[7] = KV.of(pTest, 0.5);
pTest = new PosRgsMq(new Position()
.setPosition(4L).setReferenceName("chr1"), "321", PosRgsMq.MappingQuality.L);
expectedOutput[8] = KV.of(pTest, 1.0);
pTest = new PosRgsMq(new Position()
.setPosition(4L).setReferenceName("chr1"), "321", PosRgsMq.MappingQuality.A);
expectedOutput[9] = KV.of(pTest, 1.0);
pTest = new PosRgsMq(new Position()
.setPosition(6L).setReferenceName("chr1"), "321", PosRgsMq.MappingQuality.L);
expectedOutput[10] = KV.of(pTest, 1.0);
pTest = new PosRgsMq(new Position()
.setPosition(6L).setReferenceName("chr1"), "321", PosRgsMq.MappingQuality.A);
expectedOutput[11] = KV.of(pTest, 1.0);
Pipeline p = TestPipeline.create();
p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);
PCollection<Read> inputReads = p.apply(Create.of(testSet));
PCollection<KV<PosRgsMq, Double>> output = inputReads.apply(new CalculateCoverageMean());
DataflowAssert.that(output).containsInAnyOrder(expectedOutput);
}
/**
* Unit test for CalculateQuantiles composite PTransform
*/
@Test
public void testCalculateQuantiles() {
Pipeline p = TestPipeline.create();
p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);
PCollection<KV<PosRgsMq, Double>> inputMappingQualities = p.apply(Create.of(testSet2));
PCollection<KV<Position, KV<PosRgsMq.MappingQuality, List<Double>>>> output = inputMappingQualities.apply(
new CalculateQuantiles(3));
Position pos = new Position().setPosition(1L).setReferenceName("chr1");
List<Double> low = Lists.newArrayList(3.0, 4.6, 8.0);
List<Double> med = Lists.newArrayList(1.0, 2.0, 3.2);
List<Double> high = Lists.newArrayList(0.5, 0.75, 1.2);
List<Double> all = Lists.newArrayList(3.0, 6.75, 9.0);
DataflowAssert.that(output).containsInAnyOrder(
KV.of(pos, KV.of(PosRgsMq.MappingQuality.L, low)),
KV.of(pos, KV.of(PosRgsMq.MappingQuality.M, med)),
KV.of(pos, KV.of(PosRgsMq.MappingQuality.H, high)),
KV.of(pos, KV.of(PosRgsMq.MappingQuality.A, all)));
}
/**
* Testing the CalculateCoverage pipeline.
*/
@Test
public void testCalculateCoverage() throws Exception {
List<Annotation> expectedOutput = Lists.newArrayList();
Annotation a1 = new Annotation()
.setAnnotationSetId("123")
.setStart(0L)
.setEnd(2L)
.setReferenceName("1")
.setType("GENERIC")
.setInfo(new HashMap<String, List<Object>>());
a1.getInfo().put("L", Lists.newArrayList((Object) "1.0", "1.0", "3.5"));
a1.getInfo().put("M", Lists.newArrayList((Object) "1.5", "1.5", "2.0"));
a1.getInfo().put("H", Lists.newArrayList((Object) "0.5", "0.5", "0.5"));
a1.getInfo().put("A", Lists.newArrayList((Object) "2.5", "3.0", "3.5"));
expectedOutput.add(a1);
Annotation a2 = new Annotation()
.setAnnotationSetId("123")
.setStart(2L)
.setEnd(4L)
.setReferenceName("1")
.setType("GENERIC")
.setInfo(new HashMap<String, List<Object>>());
a2.getInfo().put("L", Lists.newArrayList((Object) "1.0", "1.0", "3.0"));
a2.getInfo().put("M", Lists.newArrayList((Object) "0.5", "1.5", "1.5"));
a2.getInfo().put("H", Lists.newArrayList((Object) "0.5", "1.0", "1.0"));
a2.getInfo().put("A", Lists.newArrayList((Object) "2.0", "3.0", "3.0"));
expectedOutput.add(a2);
CalculateCoverage.Options popts = PipelineOptionsFactory.create().as(
CalculateCoverage.Options.class);
popts.setBucketWidth(TEST_BUCKET_WIDTH);
popts.setNumQuantiles(TEST_NUM_QUANTILES);
Pipeline p = TestPipeline.create(popts);
p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);
PCollection<Read> reads = p.apply(Create.of(input));
PCollection<KV<PosRgsMq, Double>> coverageMeans = reads.apply(
new CalculateCoverage.CalculateCoverageMean());
PCollection<KV<Position, KV<PosRgsMq.MappingQuality, List<Double>>>> quantiles
= coverageMeans.apply(new CalculateCoverage.CalculateQuantiles(popts.getNumQuantiles()));
PCollection<KV<Position, Iterable<KV<PosRgsMq.MappingQuality, List<Double>>>>> answer
= quantiles.apply(GroupByKey.<Position, KV<PosRgsMq.MappingQuality, List<Double>>>create());
PCollection<Annotation> output = answer.apply(
ParDo.of(new CalculateCoverage.CreateAnnotations("123", null, false)));
DataflowAssert.that(output).containsInAnyOrder(expectedOutput);
p.run();
}
}