/*
* The MIT License
*
* Copyright (c) 2017 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import htsjdk.samtools.util.QualityUtil;
import picard.PicardException;
import java.util.*;
/**
* This class defines the individual test cases to run. The actual running of the test is done
* by UmiAwareMarkDuplicatesWithMateCigarTester (see getTester).
* @author fleharty
*/
public class UmiAwareMarkDuplicatesWithMateCigarTest extends SimpleMarkDuplicatesWithMateCigarTest {
@Override
protected UmiAwareMarkDuplicatesWithMateCigarTester getTester() {
return new UmiAwareMarkDuplicatesWithMateCigarTester();
}
protected UmiAwareMarkDuplicatesWithMateCigarTester getTester(final boolean allowMissingUmis) {
return new UmiAwareMarkDuplicatesWithMateCigarTester(allowMissingUmis);
}
@DataProvider(name = "testUmiSetsDataProvider")
private Object[][] testUmiSetsDataProvider() {
return new Object[][] {{
// Test basic error correction using edit distance of 1
Arrays.asList(new String[] {"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), // Observed UMI
Arrays.asList(new String[] {"AAAA", "AAAA", "ATTA", "AAAA", "AAAA"}), // Expected inferred UMI
Arrays.asList(new Boolean[] {false, true, false, true, true}), // Should it be marked as duplicate?
1 // Edit Distance to Join
}, {
// Test basic error correction using edit distance of 2
Arrays.asList(new String[] {"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}),
Arrays.asList(new String[] {"AAAA", "AAAA", "AAAA", "AAAA", "AAAA"}),
Arrays.asList(new Boolean[] {false, true, true, true, true}),
2
}, {
// Test basic error correction using edit distance of 1 where UMIs
// form a chain in edit distance space so that a UMI with large
// edit distance will get error corrected to a distant but linked (in edit space) UMI
Arrays.asList(new String[] {"AAAA", "AAAA", "AAAT", "AAGT", "ACGT", "TCGT", "CCCC"}),
Arrays.asList(new String[] {"AAAA", "AAAA", "AAAA", "AAAA", "AAAA", "AAAA", "CCCC"}),
Arrays.asList(new Boolean[] {false, true, true, true, true, true, false}),
1
}, {
// Test short UMIs
Arrays.asList(new String[] {"A", "A", "T", "G", "G", "C", "C", "A"}),
Arrays.asList(new String[] {"A", "A", "A", "A", "A", "A", "A", "A"}), // All UMIs should get corrected to A
Arrays.asList(new Boolean[] {false, true, true, true, true, true, true, true}), // All mate pairs should be duplicates except the first
1
}, {
// Test short UMIs with no allowance for errors
Arrays.asList(new String[] {"A", "A", "T", "G", "G", "C", "C", "A"}),
Arrays.asList(new String[] {"A", "A", "T", "G", "G", "C", "C", "A"}), // No UMIs should get corrected
Arrays.asList(new Boolean[] {false, true, false, false, true, false, true, true}), // Only exactly duplicated UMIs will give rise to a new duplicate set
0
}, {
// Test longish UMIs with relatively large allowance for error
// UMIs "TTGACATCCA", "ATGCCATCGA", "AAGTCACCGT" should belong to the same duplicate set since
// they are within edit distance of 4 of each other. TTGACATCCA should be chosen as the inferred
// UMI even though it only occurs once. Since all UMIs only occur once, we choose the UMI that
// is not marked as duplicate to be the inferred UMI.
Arrays.asList(new String[] {"TTGACATCCA", "ATGCCATCGA", "AAGTCACCGT"}),
Arrays.asList(new String[] {"TTGACATCCA", "TTGACATCCA", "TTGACATCCA"}), // All UMIs should get corrected to TTGACATCCA
Arrays.asList(new Boolean[] {false, true, true}), // All mate pairs should be duplicates except the first
4
}, };
}
@DataProvider(name = "testBadUmiSetsDataProvider")
private Object[][] testBadUmiSetsDataProvider() {
return new Object[][] {{
// The code should not support variable length UMIs, if we observe variable length UMIs
// ensure that an exception is thrown.
Arrays.asList(new String[] {"AAAA", "A"}),
Arrays.asList(new String[] {"AAAA", "A"}),
Arrays.asList(new Boolean[] {false, false}),
4
}, {
// The code should not support variable length UMIs, if we observe variable length UMIs
// ensure that an exception is thrown.
// Arrays.asList(new String[] {"T", "GG"}),
Arrays.asList(new String[] {"T", "GG"}),
Arrays.asList(new String[] {"T", "GG"}),
Arrays.asList(new Boolean[] {false, false}),
1
}, {
// Test to make sure that we throw an exception with missing UMIs when allowMissingUmis is false
// This throws an exception because the UMIs have differing lengths.
Arrays.asList(new String[] {"TTGA", "TTAT", null}),
Arrays.asList(new String[] {"TTGA", "TTAT", null}),
Arrays.asList(new Boolean[] {false, false, false}),
4
}};
}
@DataProvider(name = "testEmptyUmiDataProvider")
private Object[][] testEmptyUmiDataProvider() {
return new Object[][] {{
// Test to make sure we treat empty UMIs correctly when they are allowed
Arrays.asList(new String[] {null, null, null}),
Arrays.asList(new String[] {null, null, null}),
Arrays.asList(new Boolean[] {false, true, true}),
4
}};
}
@Test(dataProvider = "testUmiSetsDataProvider")
public void testUmi(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate, final int editDistanceToJoin) {
UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false);
tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
for(int i = 0;i < umis.size();i++) {
tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
}
tester.setExpectedAssignedUmis(assignedUmi).runTest();
}
@Test(dataProvider = "testEmptyUmiDataProvider")
public void testEmptyUmis(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate, final int editDistanceToJoin) {
UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(true);
tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
for(int i = 0;i < umis.size();i++) {
tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
}
tester.setExpectedAssignedUmis(assignedUmi).runTest();
}
@Test(dataProvider = "testBadUmiSetsDataProvider", expectedExceptions = PicardException.class)
public void testBadUmis(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate, final int editDistanceToJoin) {
UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false);
tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
for(int i = 0;i < umis.size();i++) {
tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
}
tester.setExpectedAssignedUmis(assignedUmi).runTest();
}
@DataProvider(name = "testUmiMetricsDataProvider")
private Object[][] testUmiMetricsDataProvider() {
// Calculate values of metrics by hand to ensure they are right
// effectiveLength4_1 is the effective UMI length observing 5 UMIs where 4 are the same
double effectiveLength4_1 = -(4./5.)*Math.log(4./5.)/Math.log(4.) -(1./5.)*Math.log(1./5.)/Math.log(4.);
// effectiveLength4_1 is the effective UMI length observing 5 UMIs where 3 are the same and the other two are
// unique
double effectiveLength3_1_1 = -(3./5.)*Math.log(3./5.)/Math.log(4.) -2*(1./5.)*Math.log(1./5.)/Math.log(4.);
// estimatedBaseQualityk_n is the phred scaled base quality score where k of n bases are incorrect
double estimatedBaseQuality1_20 = QualityUtil.getPhredScoreFromErrorProbability(1./20.);
double estimatedBaseQuality3_20 = QualityUtil.getPhredScoreFromErrorProbability(3./20.);
return new Object[][]{{
// Test basic error correction using edit distance of 1
Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), // Observed UMI
Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAA"}), // Expected inferred UMI
Arrays.asList(new Boolean[]{false, true, false, true, true}), // Should it be marked as duplicate?
1, // Edit Distance to Join
new UmiMetrics(4.0, // MEAN_UMI_LENGTH
3, // OBSERVED_UNIQUE_UMIS
2, // INFERRED_UNIQUE_UMIS
2, // OBSERVED_BASE_ERRORS (Note: This is 2 rather than 1 because we are using paired end reads)
2, // DUPLICATE_SETS_WITHOUT_UMI
4, // DUPLICATE_SETS_WITH_UMI
effectiveLength4_1, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS
effectiveLength3_1_1, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS
estimatedBaseQuality1_20) // ESTIMATED_BASE_QUALITY_OF_UMIS
}, {
// Test basic error correction using edit distance of 2
Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}),
Arrays.asList(new String[]{"AAAA", "AAAA", "AAAA", "AAAA", "AAAA"}),
Arrays.asList(new Boolean[]{false, true, true, true, true}),
2,
new UmiMetrics(4.0, // MEAN_UMI_LENGTH
3, // OBSERVED_UNIQUE_UMIS
1, // INFERRED_UNIQUE_UMIS
6, // OBSERVED_BASE_ERRORS
2, // DUPLICATE_SETS_WITHOUT_UMI
2, // DUPLICATE_SETS_WITH_UMI
0.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS
effectiveLength3_1_1, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS
estimatedBaseQuality3_20) // ESTIMATED_BASE_QUALITY_OF_UMIS
}, {
// Test maximum entropy (EFFECTIVE_LENGTH_OF_INFERRED_UMIS)
Arrays.asList(new String[]{"AA", "AT", "AC", "AG", "TA", "TT", "TC", "TG", "CA", "CT", "CC", "CG", "GA", "GT", "GC", "GG"}),
Arrays.asList(new String[]{"AA", "AT", "AC", "AG", "TA", "TT", "TC", "TG", "CA", "CT", "CC", "CG", "GA", "GT", "GC", "GG"}),
Arrays.asList(new Boolean[]{false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}),
0,
new UmiMetrics(2.0, // MEAN_UMI_LENGTH
16, // OBSERVED_UNIQUE_UMIS
16, // INFERRED_UNIQUE_UMIS
0, // OBSERVED_BASE_ERRORS
2, // DUPLICATE_SETS_WITHOUT_UMI
32, // DUPLICATE_SETS_WITH_UMI
2.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS
2, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS
-1) // ESTIMATED_BASE_QUALITY_OF_UMIS
}};
}
@Test(dataProvider = "testUmiMetricsDataProvider")
public void testUmiMetrics(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate,
final int editDistanceToJoin, final UmiMetrics expectedMetrics) {
UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false);
tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
for( int i = 0;i < umis.size();i++ ) {
tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
}
tester.setExpectedAssignedUmis(assignedUmi);
tester.setExpectedMetrics(expectedMetrics);
tester.runTest();
}
}