package org.genedb.db.loading; import static org.junit.Assert.*; import org.genedb.db.loading.FeatureTester.GeneTester; import org.genedb.db.loading.FeatureTester.GenericTester; import org.genedb.db.loading.FeatureTester.PolypeptideTester; import org.genedb.db.loading.FeatureTester.TranscriptTester; import org.gmod.schema.feature.AbstractGene; import org.gmod.schema.feature.Contig; import org.gmod.schema.feature.Gene; import org.gmod.schema.feature.Pseudogene; import org.gmod.schema.feature.RepeatRegion; import org.gmod.schema.feature.RepeatUnit; import org.gmod.schema.mapped.CvTerm; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import java.io.IOException; import java.util.Arrays; import java.util.Collection; /** * Test various corner cases using totally synthetic data. * (Some of the individual qualifiers are lifted from real * data sets, but the genome as a whole is fictitious.) * * @author rh11 * */ public class EmblLoaderSyntheticTest { //private static final Logger logger = Logger.getLogger(EmblLoaderSyntheticTest.class); protected static EmblLoaderTestHelper helper; protected static FeatureTester tester; private String repeat(char c, int n) { char[] array = new char[n]; Arrays.fill(array, c); return new String(array); } @BeforeClass public static void setupAndLoad() throws IOException, ParsingException { helper = EmblLoaderTestHelper.create( "Synthetic", "Synthetic", "organism", null, "test/data/synthetic.embl"); tester = helper.tester(); } @AfterClass public static void cleanUp() { helper.cleanUp(); } @Test public void featureNames() { tester.uniqueNames(AbstractGene.class, "s1", "s2", "s3", "s4", "s5", "Smp_124050", "super1_tRNA1", "Tc00.1047053511907.50") .uniqueNames(Pseudogene.class, "s1") .uniqueNames(Gene.class, "s2", "s3", "s4", "s5", "Smp_124050", "super1_tRNA1", "Tc00.1047053511907.50") .uniqueNames(Contig.class, "con1a", "con2c", "con3g", "con4t", "con5a", "con6c_reversed", "con7g", "con8t"); } @Test public void contigSequence() { tester.tlfTester(Contig.class, "con1a") .residues(repeat('a', 100)); tester.tlfTester(Contig.class, "con2c") .residues(repeat('c', 100)); tester.tlfTester(Contig.class, "con3g") .residues(repeat('g', 100)); tester.tlfTester(Contig.class, "con4t") .residues(repeat('t', 100)); } @Test public void rileyClass() { tester.geneTester("Tc00.1047053511907.50") .transcript("Tc00.1047053511907.50.1") .polypeptide("Tc00.1047053511907.50.1:pep") .cvtermsCheckingDb("RILEY", "RILEY", "4.1.6"); } @Test public void s1GeneLocs() { tester.geneTester("s1") .loc(0, 0, -1, 3, 87) .loc(1, 1, -1, 3, 87); } @Test public void s1TranscriptLocs() { tester.geneTester("s1").transcript("s1.1") .loc(0, 0, -1, 3, 87) .loc(1, 1, -1, 3, 87); } @Test public void s1PolypeptideLocs() { tester.geneTester("s1") .transcript("s1.1") .hasPolypeptide("s1.1:pep") .loc(0, 0, -1, 3, 87) .loc(1, 1, -1, 3, 87); } @Test public void s1EC_number() { tester.geneTester("s1") .transcript("s1.1") .polypeptide("s1.1:pep") .property("genedb_misc", "EC_number", "1.3.99.1"); } @Test public void s2Names() { tester.geneTester("s1").name(null); tester.geneTester("s2").name("s2_name"); } @Test public void s2Colours() { // Also check that the repetition of the qualifier /colour="9" does not cause // two properties to be added. TranscriptTester s2_2 = tester.geneTester("s2").transcript("s2_2"); s2_2.polypeptide("s2_2:pep").property("genedb_misc", "colour", "9"); s2_2.exon("s2_2:exon:1").property("genedb_misc", "colour", "9"); s2_2.exon("s2_2:exon:2").property("genedb_misc", "colour", "9"); } /** * GENEDB-207: the gene <code>s2</code> has two transcripts. The first of them is * contained wholly within the contig <code>con1a</code>, whereas the * second has an additional exon on a different contig <code>con2c</code>. * <p> * What we expect, therefore, is that the gene should not have a contig location, * because it doesn't lie on a single contig. The first transcript <code>s1_1</code>, * on the other hand, does lie wholly on <code>con1a</code>, so it should have a * contig location. * <p> * The first exon (<code>s2_1:exon:1</code> and <code>s2_2:exon:1</code>) also lies * wholly on <code>con1a</code>, but to avoid confusion the exon should NOT have a * location on a contig unless its associated transcript does. So in this case, * <code>s2_1:exon:1</code> should have a contig location but <code>s2_2:exon:1</code> * should not. */ @Test public void s2genelocs() { TranscriptTester s2_1 = tester.geneTester("s2").transcript("s2_1"); TranscriptTester s2_2 = tester.geneTester("s2").transcript("s2_2"); s2_1.loc("super1", 0, 0, +1, 89, 100) .loc("con1a", 1, 1, +1, 89, 100) .noLoc(0,1).noLoc(1,0); s2_1.exon("s2_1:exon:1") .loc("super1", 0, 0, +1, 89, 100) .loc("con1a", 1, 1, +1, 89, 100) .noLoc(0,1).noLoc(1,0); s2_2.loc(0, 0, +1, 89, 120) .noLoc(0,1).noLoc(1,0).noLoc(1,1); s2_2.exon("s2_2:exon:1") .loc("super1", 0, 0, +1, 89, 100) .noLoc(0,1).noLoc(1,0).noLoc(1,1); s2_2.exon("s2_2:exon:2") .loc("super1", 0, 0, +1, 109, 120) .noLoc(0,1).noLoc(1,0).noLoc(1, 1); } @Test public void s2products() { PolypeptideTester s2_2 = tester.geneTester("s2").transcript("s2_2").polypeptide("s2_2:pep"); s2_2.cvtermsCheckingDb("genedb_products", "PRODUCT", "product 1", "product 2", "product 3"); } @Test public void s3similarities() { PolypeptideTester s3 = tester.geneTester("s3").transcript("s3.1").polypeptide("s3.1:pep"); // /similarity="blastp; SWALL:Q26723 (EMBL:M20871); // Trypanosoma brucei brucei; variant-specific antigen; // ESAG3; ; id=70%; ; E()=2e-42; score=438; ; ;" s3.similarity("UniProt", "Q26723") .analysisProgram("blastp") .organism("Trypanosoma brucei brucei") .product("variant-specific antigen") .gene("ESAG3") .id(70.0) .eValue(2E-42) .score(438.0); // /similarity="blastp; GB:BAD74067.1; ; ; ; ; id=54.4%; ; // E()=e-17; ; ; ;" s3.similarity("GB", "BAD74067.1") .analysisProgram("blastp") .organism(null) .product(null) .gene(null) .id(54.4) .eValue(1E-17) .score(null); } @Test public void s4Similarities() { PolypeptideTester s4 = tester.geneTester("s4").transcript("s4.1").polypeptide("s4.1:pep"); // /similarity="fasta; SWALL:O21243 (EMBL:AF007261, // SWALL:COXZ_RECAM); Reclinomonas americana; ; ; length 182 // aa; id=44.805%; ungapped id=46.939%; E()=2.5e-25; ; 151 aa // overlap; query 1-152 aa; subject 32-180 aa" s4.similarity("UniProt", "O21243") .analysisProgram("fasta") .organism("Reclinomonas americana") .product(null) .gene(null) .id(44.805) .eValue(2.5E-25) .score(null) .overlap(151) .loc(0, 0, 0, 0, 152) .loc(0, 1, 0, 31, 180) .secondaryDbXRefs("EMBL:AF007261", "UniProt:COXZ_RECAM"); } @Test public void s4_GO() { PolypeptideTester s4 = tester.geneTester("s4").transcript("s4.1").polypeptide("s4.1:pep"); // TODO test GO term loading Collection<CvTerm> process = s4.getTerms("biological_process", "GO"); Collection<CvTerm> function = s4.getTerms("molecular_function", "GO"); Collection<CvTerm> component = s4.getTerms("molecular_function", "GO"); } @Test public void s5_dbxrefs() { PolypeptideTester s5 = tester.geneTester("s5").transcript("s5.1").polypeptide("s5.1:pep"); s5.dbXRefs("UniProt:Q95Z09"); } @Test public void Smp_124050_similarities() { PolypeptideTester Smp_124050_4 = tester.geneTester("Smp_124050") .transcript("Smp_124050.4").polypeptide("Smp_124050.4:pep"); Smp_124050_4.dbXRefs("EMBL:AY953433") .similarity("UniProt", "A6WB28.1") .analysisProgram("blastall", "v2.2.6") .analysisAlgorithm("ComparativeBlastX_uni"); } @Test public void Smp_124050_controlled_curation() { PolypeptideTester Smp_124050_4 = tester.geneTester("Smp_124050") .transcript("Smp_124050.4").polypeptide("Smp_124050.4:pep"); Smp_124050_4.cvterms("CC_genedb_controlledcuration", "expression in 7 week adult"); } @Test public void Smp_124050_literature() { PolypeptideTester Smp_124050_4 = tester.geneTester("Smp_124050") .transcript("Smp_124050.4").polypeptide("Smp_124050.4:pep"); Smp_124050_4.pubs("PMID:23456", "PMID:34567"); } @Test public void Smp_124050_private() { PolypeptideTester Smp_124050_4 = tester.geneTester("Smp_124050") .transcript("Smp_124050.4").polypeptide("Smp_124050.4:pep"); Smp_124050_4.property("genedb_misc", "private", "a private note"); } @Test public void repeats() { tester.uniqueNames(RepeatRegion.class, "super1:repeat:0-93", "super1:repeat_unit:3-9", "super1:repeat_unit:9-15"); tester.uniqueNames(RepeatUnit.class, "super1:repeat_unit:3-9", "super1:repeat_unit:9-15"); tester.featureTester("super1:repeat:0-93") .name(null) .loc(0, 0, 93) .phaseIsNull() .property("feature_property", "comment", "/rpt_family=telomere") .property("genedb_misc", "EMBL_qualifier", "/rpt_unit=TTAGGG"); tester.featureTester("super1:repeat_unit:3-9") .name(null) .loc(0, 3, 9) .phaseIsNull() .property("genedb_misc", "colour", "5") .properties("feature_property", "comment", "/label=Trpt", "telomeric repeat hexamer TTAGGG"); tester.featureTester("super1:repeat_unit:9-15") .name(null) .loc(0, 9, 15) .phaseIsNull() .property("genedb_misc", "colour", "5") .properties("feature_property", "comment", "/label=Trpt", "telomeric repeat hexamer TTAGGG"); } @Test public void tRNA() { GeneTester tRNA1 = tester.geneTester("super1_tRNA1"); tRNA1.loc(1, 369, 400) .phaseIsNull() .name(null) .properties("genedb_misc", "colour") .properties("feature_property", "comment"); tRNA1.transcript("super1_tRNA1.1") .properties("feature_property", "comment", "/label=tRNA label") .properties("genedb_misc", "colour", "12") .property("genedb_misc", "EMBL_qualifier", "/invented_qualifier=\"value\"") .dbXRefs("Rfam:RF00230"); } @Test public void productCaseSensitivity() { Collection<CvTerm> s3_products = tester.geneTester("s3") .transcript("s3.1").polypeptide("s3.1:pep") .getTerms("genedb_products"); Collection<CvTerm> s4_products = tester.geneTester("s4") .transcript("s4.1").polypeptide("s4.1:pep") .getTerms("genedb_products"); assertEquals(s3_products, s4_products); } @Test public void archivedFeature() { FeatureTester.GenericTester t = tester.featureTester("super1:archived:nonstandard:1"); t.loc(+1, 599, 630); t.assertObsolete(); t.propertyMatches("feature_property", "comment", "Archived from nonstandard feature with location join\\(600\\.\\.610,620\\.\\.630\\); " + "file 'test/data/synthetic.embl', line \\d+"); t.property("genedb_misc", "EMBL_qualifier", "/madeupqualifier=\"value\""); } private void gap(int fmin, int fmax, String... notes) { GenericTester g = tester.featureTester(String.format("super1:gap:%d-%d", fmin, fmax)).loc(0, fmin, fmax); g.properties("feature_property", "comment", notes); } @Test public void gapsBetweenContigs() { gap(100, 110); gap(210, 220); gap(320, 330); gap(430, 440); gap(640, 650); gap(850, 860); gap(1060, 1110); } @Test public void gapFeature() { gap(1180, 1190, "This is a gap that has a note", "And this is a cow with an itchy throat"); } }