/*
* Copyright 2015 Google.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.genomics.dataflow.functions.verifybamid;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import com.google.cloud.genomics.dataflow.model.ReadBaseQuality;
import com.google.cloud.genomics.dataflow.model.ReadBaseWithReference;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.genomics.v1.CigarUnit;
import com.google.genomics.v1.CigarUnit.Operation;
import com.google.genomics.v1.LinearAlignment;
import com.google.genomics.v1.Position;
import com.google.genomics.v1.Read;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Tests for the ReadUtils class.
*/
@RunWith(JUnit4.class)
public class ReadFunctionsTest {
/**
* Test whether a char is a valid base (ACGT)
*/
private static boolean isBase(char c) {
return c == 'A' || c == 'C' || c == 'G' || c == 'T';
}
static class CigarTestCase {
/**
* All tests are on 'chr1' for now (it doesn't matter).
*/
private static final String CHROMOSOME = "1";
/**
* All read alignments start at position 10000 (it doesn't matter).
*/
private static final Integer INITIAL_OFFSET = 10000;
/**
* String for a regular expression for a single CIGAR chunk.
*/
private static final String CIGAR_CHUNK = "[0-9]+[MIDNSHPX=]";
/**
* Regular expression for a single CIGAR chunk.
*/
private static final Pattern CIGAR_CHUNK_PATTERN = Pattern.compile(CIGAR_CHUNK);
/**
* Convert from string to Operation.
*/
private static final Map<String, Operation> CIGAR_UNIT_OPERATION
= new ImmutableMap.Builder<String, Operation>()
.put("M", Operation.ALIGNMENT_MATCH)
.put("H", Operation.CLIP_HARD)
.put("S", Operation.CLIP_SOFT)
.put("D", Operation.DELETE)
.put("I", Operation.INSERT)
.put("P", Operation.PAD)
.put("=", Operation.SEQUENCE_MATCH)
.put("X", Operation.SEQUENCE_MISMATCH)
.put("N", Operation.SKIP)
.build();
final char[] alignedRef;
final char[] alignedSeq;
final String cigar;
final Read read;
final List<ReadBaseWithReference> readBases;
/**
* Builds a test case containing a read that can be passed to the Cigar class and the expected
* read base results.
*
* @param seq The actual read sequence
* @param qualityScores The list of quality scores, one for each read base
* @param cigar The cigar string
* @param alignedRef The reference sequence adjusted to be aligned
* @param alignedSeq The read sequence adjusted to be aligned
* @param refSeqs One sequence for each piece of the cigar string, corresponding to the
* reference sequence for each piece
* @param refSeqStart Where on the reference sequence the read starts
*/
CigarTestCase(String seq, List<Integer> qualityScores, String cigar,
String alignedRef, String alignedSeq, List<String> refSeqs,
Integer refSeqStart) {
if (alignedRef == null) {
this.alignedRef = null;
} else {
this.alignedRef = alignedRef.toCharArray();
}
if (alignedSeq == null) {
this.alignedSeq = null;
} else {
this.alignedSeq = alignedSeq.toCharArray();
}
this.cigar = cigar;
this.read = Read.newBuilder()
.setAlignedSequence(seq)
.addAllAlignedQuality(qualityScores)
.setAlignment(LinearAlignment.newBuilder()
.setPosition(Position.newBuilder()
.setReferenceName(CHROMOSOME)
.setPosition(INITIAL_OFFSET + refSeqStart)
.build())
.addAllCigar(cigarStringToUnits(cigar, refSeqs))
.build())
.build();
this.readBases = buildReadBaseList(alignedSeq, alignedRef, qualityScores,
CHROMOSOME, INITIAL_OFFSET);
}
/**
* Constructs the expected read base list for comparison with the results from the Cigar class.
*
* @param alignedSeq The read sequence adjusted to be aligned with the reference
* @param alignedRef The reference sequence adjusted to be aligned
* @param quals The quality score for each read base
* @param chromosome The chromosome where the alignment is
* @param originalPos The position of the first read base on the chromosome
* @return A constructed read base list
*/
private static List<ReadBaseWithReference> buildReadBaseList(String alignedSeq,
String alignedRef, List<Integer> quals,
String chromosome, Integer originalPos) {
if ((alignedSeq == null) || (alignedRef == null)) {
return null;
}
if (alignedSeq.length() > alignedRef.length()) {
throw new IllegalArgumentException("The alignedRef is shorter than the alignedSeq:"
+ alignedSeq.length() + " vs " + alignedRef.length());
}
int refPositionOffset = 0;
int qualityOffset = 0;
ImmutableList.Builder<ReadBaseWithReference> readBaseList = ImmutableList.builder();
for (int i = 0; i < alignedSeq.length(); i++) {
// we only have a ReadBase if there is a base here in *BOTH* the reference
// and the read sequence.
if ((!isBase(alignedSeq.charAt(i)))
|| (!isBase(alignedRef.charAt(i)))) {
// if the reference has a base but the sequence does not
if (alignedRef.charAt(i) != '*') {
refPositionOffset++;
}
if (isBase(Character.toUpperCase(Character.toUpperCase(alignedSeq.charAt(i))))) {
qualityOffset++;
}
continue;
}
ReadBaseWithReference sr = new ReadBaseWithReference(
new ReadBaseQuality(
alignedSeq.substring(i, i + 1),
quals.get(qualityOffset)),
alignedRef.substring(i, i + 1),
Position.newBuilder()
.setReferenceName(chromosome)
.setPosition(originalPos + refPositionOffset)
.build());
qualityOffset++;
refPositionOffset++;
readBaseList.add(sr);
}
return readBaseList.build();
}
/**
* For testing, it's easier to set up the tests accurately using strings for the cigars.
*
* @param s cigar string to be converted into CigarUnits.
* @return list of CigarUnits.
*/
private static List<CigarUnit> cigarStringToUnits(String s, List<String> refSeqs) {
ImmutableList.Builder<CigarUnit> cigarUnits = ImmutableList.builder();
// check for an empty cigar string
if ("*".equals(s)) {
return cigarUnits.build();
}
// index into the list of reference sequences
int i = 0;
Matcher m = CIGAR_CHUNK_PATTERN.matcher(s);
while (m.find()) {
cigarUnits.add(CigarUnit.newBuilder()
.setOperation(CIGAR_UNIT_OPERATION.get(m.group().substring(m.group().length() - 1)))
.setOperationLength(Integer.parseInt(m.group().substring(0, m.group().length() - 1)))
.setReferenceSequence(!refSeqs.isEmpty() ? refSeqs.get(i) : "")
.build());
i++;
}
return cigarUnits.build();
}
}
/* The test cases below are taken from the SAM file spec.
* For the quality scores, we use different increasing numbers to validate
* indexes into the quality list.
*/
static final List<CigarTestCase> cases;
static {
ImmutableList.Builder<CigarTestCase> caseBuilder = ImmutableList.builder();
// no cigar string
caseBuilder.add(new CigarTestCase(
"TTAGATAAAGGATACTG",
ImmutableList.of(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26),
"*",
null,
null,
null,
0));
caseBuilder.add(new CigarTestCase(
"TTAGATAAAGGATACTG",
ImmutableList.of(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26),
"8M2I4M1D3M",
"AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT",
" TTAGATAAAGGATA*CTG",
ImmutableList.of("TTAGATAA", "**", "GATA", "", "CTG"),
6));
caseBuilder.add(new CigarTestCase(
"AAAAGATAAGGATA",
ImmutableList.of(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23),
"3S6M1P1I4M",
"AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT",
" aaaAGATAA*GGATA",
ImmutableList.of("", "AGATAA", "*", "*", "GATA"),
8));
caseBuilder.add(new CigarTestCase(
"GCCTAAGCTAA",
ImmutableList.of(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
"5S6M",
"AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT",
" gcctaAGCTAA",
ImmutableList.of("", "AGATAA"),
8));
caseBuilder.add(new CigarTestCase(
"ATAGCTTCAGC",
ImmutableList.of(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
"6M14N5M",
"AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT",
" ATAGCT..............TCAGC",
ImmutableList.of("ATAGCT", "GTGCTAGTAGGCAG", "TCAGC"),
15));
caseBuilder.add(new CigarTestCase(
"TAGGC",
ImmutableList.of(10, 11, 12, 13, 14),
"6H5M",
"AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT",
// The spec has ttagctTAGGC instead of TAGGC. We're
// dropping the ttagct prefix because it's been
// pruned out before we get to toCigarOffset.
" TAGGC",
ImmutableList.of("", "TAGGC"),
28));
caseBuilder.add(new CigarTestCase(
"CAGCGGCAT",
ImmutableList.of(10, 11, 12, 13, 14, 15, 16, 17, 18),
"9M",
"AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT",
" CAGCGGCAT",
ImmutableList.of("CAGCGCCAT"),
36));
cases = caseBuilder.build();
}
@Test
public void testReadBases() throws IOException {
for (CigarTestCase tc : cases) {
assertEquals(tc.readBases, ReadFunctions.extractReadBases(tc.read));
}
}
@Test
public void testIsOnChromosome() {
assertFalse(ReadFunctions.IS_ON_CHROMOSOME.apply(Read.newBuilder()
.setAlignment(LinearAlignment.newBuilder()
.setPosition(Position.newBuilder()
.setReferenceName("chrZ")
.build())
.build())
.build()));
assertTrue(ReadFunctions.IS_ON_CHROMOSOME.apply(Read.newBuilder()
.setAlignment(LinearAlignment.newBuilder()
.setPosition(Position.newBuilder()
.setReferenceName("1")
.build())
.build())
.build()));
}
@Test
public void testIsNotQCFailure() {
assertFalse(ReadFunctions.IS_NOT_QC_FAILURE.apply(Read.newBuilder()
.setFailedVendorQualityChecks(true)
.build()));
assertTrue(ReadFunctions.IS_NOT_QC_FAILURE.apply(Read.newBuilder()
.setFailedVendorQualityChecks(false)
.build()));
}
@Test
public void testIsNotDuplicate() {
assertFalse(ReadFunctions.IS_NOT_DUPLICATE.apply(Read.newBuilder()
.setDuplicateFragment(true)
.build()));
assertTrue(ReadFunctions.IS_NOT_DUPLICATE.apply(Read.newBuilder()
.setDuplicateFragment(false)
.build()));
}
@Test
public void testIsProperPlacement() {
assertTrue(ReadFunctions.IS_PROPER_PLACEMENT.apply(Read.newBuilder()
.setProperPlacement(true)
.build()));
assertFalse(ReadFunctions.IS_PROPER_PLACEMENT.apply(Read.newBuilder()
.setProperPlacement(false)
.build()));
}
}