package org.biojava.nbio.genome;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import junit.framework.TestCase;
import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
import org.biojava.nbio.genome.parsers.genename.GeneChromosomePositionParser;
import org.biojava.nbio.genome.util.ChromosomeMappingTools;
import org.junit.Test;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;
/**
* Created by andreas on 7/19/16.
*/
public class TestGenomeMapping extends TestCase {
private static final String geneChromosomeFile = "http://cdn.rcsb.org/gene/hg38/geneChromosome38.tsf.gz";
private List<GeneChromosomePosition> gcps = null;
@Override
protected void setUp() throws Exception {
super.setUp();
InputStream input = new GZIPInputStream(new URL(geneChromosomeFile).openStream());
gcps = GeneChromosomePositionParser.getChromosomeMappings(input);
}
@Test
public void testAK1() {
String geneName = "AK1";
assertNotNull(gcps);
assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100);
int uniProtLength = 194;
try {
for (GeneChromosomePosition pos : gcps) {
//System.out.println(pos.getGeneName());
if (!pos.getGeneName().equals(geneName))
continue;
/// there are three alternative transcripts for AK1.
// we are just testing one here:
if ( ! pos.getGenebankId().equals("NM_000476"))
continue;
assertTrue(pos.getGeneName().equals(geneName));
assertTrue(pos.getOrientation().equals('-'));
assertTrue(pos.getChromosome().equals("chr9"));
List<Range<Integer>> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos);
validateExon(0,0,7, cdsranges );
validateExon(1,7,43, cdsranges );
validateExon(2,43,207, cdsranges );
validateExon(3,207,324, cdsranges );
validateExon(4,324,516, cdsranges );
validateExon(5,516,585, cdsranges );
int cdslength = ChromosomeMappingTools.getCDSLength(pos);
assertTrue("CDS length should be 582, but is " + cdslength, cdslength == (uniProtLength *3));
List<Range<Integer>> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos);
// we are reverse strand. reverse the order
chromranges = Lists.reverse(chromranges);
assertTrue(chromranges.size() == 6);
// compare with https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS6881
validateExon(0,127868008,127868076, chromranges );
validateExon(1,127868320,127868512, chromranges );
validateExon(2,127871822,127871939, chromranges );
validateExon(3,127872689,127872853, chromranges );
validateExon(4,127873025,127873061, chromranges );
validateExon(5,127874610,127874617, chromranges );
}
} catch (Exception e) {
fail(e.getMessage());
}
}
@Test
public void testHBA(){
String geneName = "HBA1";
assertNotNull(gcps);
assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100);
try {
for ( GeneChromosomePosition pos : gcps){
//System.out.println(pos.getGeneName());
if ( ! pos.getGeneName().equals(geneName))
continue;
assertTrue(pos.getGeneName().equals("HBA1"));
assertTrue(pos.getGenebankId().equals("NM_000558"));
assertTrue(pos.getChromosome().equals("chr16"));
assertTrue(pos.getTranscriptionStart().equals(176650));
assertTrue(pos.getTranscriptionEnd().equals(177522));
assertTrue(pos.getOrientation().equals('+'));
List<Range<Integer>> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos);
assertTrue(cdsranges.size() == 3);
validateExon(0,0,95,cdsranges);
validateExon(1,95,300,cdsranges);
validateExon(2,300,429,cdsranges);
List<Range<Integer>> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos);
validateExon(0,176716,176811, chromranges );
validateExon(1,176928,177133, chromranges );
validateExon(2,177282,177411, chromranges );
}
} catch (Exception e){
fail(e.getMessage());
}
}
private void validateExon(int exonNr, int start, int stop, List<Range<Integer>> cdsranges) {
Range<Integer> exon = cdsranges.get(exonNr);
assertTrue("Exon " + exonNr + " boundary "+ exon.lowerEndpoint() + " does not match " +start , exon.lowerEndpoint().equals(start));
assertTrue("Exon " + exonNr + " boundary " + exon.upperEndpoint() + " does not match " + stop, exon.upperEndpoint().equals(stop));
}
/** Get the position of the nucleotide base corresponding to the position of that base on the mRNA sequence
* for a gene living on the reverse DNA strand.
*
* @author Yana Valasatava
*/
private int getPositionInmRNA(String geneName, String genebankId, int posChrom) {
for (GeneChromosomePosition gcp : gcps) {
if ( gcp.getGeneName().equals(geneName) ) {
if ( gcp.getGenebankId().equals(genebankId) ) {
return ChromosomeMappingTools.getCDSPosForChromosomeCoordinate(posChrom, gcp);
}
}
}
return -1;
}
/** Make sure the mapping tool correctly retrieves the mRNA position for a gene
* living on the forward DNA strand for different chromosome positions.
*
* @author Yana Valasatava
*/
@Test
public void testForwardMappingPositions() {
String geneName = "HORMAD2"; // gene on the forward DNA strand
String genebankId = "NM_152510"; // GeneBank ID for the transcript used for testing (ENST00000336726)
List<String> scenarios = Arrays.asList("first1exon", "last1exon", "last3exon");
int cds;
int posExonStart;
int posInmRNA;
for (String scenario : scenarios) {
switch (scenario) {
case "first1exon":
posExonStart = 30093953; // ending position of the last exon coding region (on forward strand)
posInmRNA = 1; // base 1 position in mRNA sequence
cds = getPositionInmRNA(geneName, genebankId, posExonStart);
assertEquals(cds, posInmRNA);
break;
case "last1exon":
posExonStart = 30094003; // starting position of the last exon coding region (on forward strand)
posInmRNA = 51; // position in mRNA sequence equals to the length of the exon
cds = getPositionInmRNA(geneName, genebankId, posExonStart);
assertEquals(cds, posInmRNA);
break;
case "last3exon":
posExonStart = 30103500; // starting position of the first base in a coding region (3rd exon)
posInmRNA = 257; // position in mRNA sequence equals to the sum length of the 3 last exons
cds = getPositionInmRNA(geneName, genebankId, posExonStart);
assertEquals(cds, posInmRNA);
break;
}
}
}
/** Make sure the mapping tool correctly retrieves the mRNA position for a gene
* living on the reverse DNA strand for different chromosome positions.
*
* @author Yana Valasatava
*/
@Test
public void testReverseMappingPositions() {
String geneName = "BCL11B"; // gene on the reverse DNA strand
String genebankId = "NM_138576"; // GeneBank ID for the transcript used for testing (ENST00000357195)
List<String> scenarios = Arrays.asList("first1exon", "last1exon", "last3exon");
int cds;
int posExonStart;
int posInmRNA;
for (String scenario : scenarios) {
switch (scenario) {
case "first1exon":
posExonStart = 99271218; // ending position of the last exon coding region (on forward strand)
posInmRNA = 1; // base 1 position in mRNA sequence
cds = getPositionInmRNA(geneName, genebankId, posExonStart);
assertEquals(cds, posInmRNA);
break;
case "last1exon":
posExonStart = 99271161; // starting position of the last exon coding region (on forward strand)
posInmRNA = 58; // position in mRNA sequence equals to the length of the exon
cds = getPositionInmRNA(geneName, genebankId, posExonStart);
assertEquals(cds, posInmRNA);
break;
case "last3exon":
posExonStart = 99231345; // starting position of the first base in a coding region (3rd exon)
posInmRNA = 640; // position in mRNA sequence equals to the sum length of the 3 last exons
cds = getPositionInmRNA(geneName, genebankId, posExonStart);
assertEquals(cds, posInmRNA);
break;
}
}
}
/** Test to make sure the mapping tool correctly identify that position falls outside the coding region
* for a gene living on the forward DNA strand.
*
* @author Yana Valasatava
*/
@Test
public void testForwardMappingForExonBoundaries() {
String geneName = "HBA1"; // gene on the reverse DNA strand
String genebankId = "NM_000558"; // GeneBank ID for the transcript used for testing (ENST00000320868)
int posExonStart = 176717; // starting position of the first base in a coding region (1st exon)
int posExonEnd = 176811; // ending position of the first base in a coding region (1st exon)
int cdsSE = getPositionInmRNA(geneName, genebankId, posExonStart-1);
assertEquals(cdsSE, -1);
int cdsEE = getPositionInmRNA(geneName, genebankId, posExonEnd+1);
assertEquals(cdsEE, -1);
}
/** Test to make sure the mapping tool correctly identify that position falls outside the coding region
* for a gene living on the reverse DNA strand.
*
* @author Yana Valasatava
*/
@Test
public void testReverseMappingForExonBoundaries() {
String geneName = "BCL11B"; // gene on the reverse DNA strand
String genebankId = "NM_138576"; // GeneBank ID for the transcript used for testing (ENST00000357195)
int posExonStart = 99174151; // starting position of the first base in a coding region (1st exon)
int posExonEnd = 99176195; // ending position of the first base in a coding region (1st exon)
int cdsSE = getPositionInmRNA(geneName, genebankId, posExonStart-1);
assertEquals(cdsSE, -1);
int cdsEE = getPositionInmRNA(geneName, genebankId, posExonEnd+1);
assertEquals(cdsEE, -1);
}
/** Test to make sure the mapping tool correctly converts the genetic position to a position on mRNA
* when multiple UTR regions are consecutive.
*
* @author Yana Valasatava
*/
@Test
public void testMappingCromosomePosTomRNAMultiUTRs() {
String geneName = "ILK"; // gene on the reverse DNA strand
String genebankId = "NM_001278442"; // GeneBank ID for the transcript used for testing (ENST00000532063)
int chromPos = 6608760;
int mRNAPos = 16;
int cds = getPositionInmRNA(geneName, genebankId, chromPos);
assertEquals(cds, mRNAPos);
}
@Test
public void testGenomeMappingToolGetCDSRanges(){
List<Integer> lst1 = new ArrayList(Arrays.asList( new Integer[]{86346823, 86352858, 86354529}));
List<Integer> lst2 = new ArrayList(Arrays.asList(new Integer[]{86348878, 86352984, 86354692}));
Integer cdsStart=86348749, cdsEnd=86387027;
List<Range<Integer>> result = ChromosomeMappingTools.getCDSRegions(lst1,lst2,cdsStart,cdsEnd);
// makes sure the first list does not get changed;
assertTrue(lst1.get(0) == 86346823);
assertTrue(result.get(0).lowerEndpoint() == 86348749);
assertTrue(result.get(1).lowerEndpoint() == 86352858);
assertTrue(result.get(2).lowerEndpoint() == 86354529);
assertTrue(result.get(0).upperEndpoint() == 86348878);
assertTrue(result.get(1).upperEndpoint() == 86352984);
assertTrue(result.get(2).upperEndpoint() == 86387027);
}
@Test
public void testGenomeMappingToolGetCDSRangesSERINC2(){
List<Integer> lst1 = new ArrayList(Arrays.asList( new Integer[]{31413812, 31415872, 31423692}));
List<Integer> lst2 = new ArrayList(Arrays.asList(new Integer[]{31414777, 31415907, 31423854}));
Integer cdsStart=31423818, cdsEnd=31434199;
List<Range<Integer>> result = ChromosomeMappingTools.getCDSRegions(lst1,lst2,cdsStart,cdsEnd);
// makes sure the first list does not get changed;
assertTrue(result.get(0).lowerEndpoint() == 31423818);
}
}