/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.utils.parser;

import org.apache.commons.lang3.tuple.Pair;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link GenbankInterpreter}, run against two fixture files that are loaded from the
 * classpath next to this class: {@code genbank_test_protein.gb} and {@code genbank_test_dna.gb}.
 *
 * <p>The tests cover reading the sequence string, the feature types, and the qualifiers of each
 * feature, as well as adding a new feature with a qualifier and reading it back.
 */
public class GenbankInterpreterTest {
  protected GenbankInterpreter giProtein;
  protected GenbankInterpreter giDna;

  /* http://www.ncbi.nlm.nih.gov/protein/CUB13083 ACCESSION CUB13083 */
  final String protein_seq =
      "MMTNLQKEFFKRLKIPAKEITFNDLDEILLKMGLTLPYENLDIMAGTIKDISKNNLVEKILIQKRGGLCYELNSLLYYFLMDCG" +
      "FQVYKVAGTVYDLYDNKWKPDDGHVIIVLTHNNKDYVIDAGFASHLPLHPVPFNGEVISSQTGEYRIRKRTTRKGTHILEMRKGANGES" +
      "TNFLQSEPSHEWKVGYAFTLDPIDEKKVNNIQKVIVEHKESPFNKGAITCKLTDYGHVSLTNKNYTETFKGTKNKRPIESKDYAHILRE" +
      "SFGITQVKYVGKTLERG";

  /* http://www.ncbi.nlm.nih.gov/nuccore/AB000097 ACCESSION AB000097 */
  final String dna_seq =
      "AAAAAACCTAGCTAAACGAAGAAAATCATTCCAATACACATGGCTTCCGAAAGGCAAGCGTTAATGCTTATTCTCTTAACAAC" +
      "ATTCTTCTTCACCATAAAGCCTTCACAGGCCAGTACTACTGGTGGCATAACAATCTACTGGGGCCAAAACATTGACGACGGCACCTTG" +
      "ACCTCCACATGCGACACTGGAAACTTCGAGATTGTCAACCTAGCTTTCCTCAATGCGTTTGGTTGCGGCATAACTCCATCATGGAACT" +
      "TCGCTGGCCACTGTGGGGACTGGAACCCTTGTTCCATACTAGAACCCCAAATACAATACTGCCAGCAGAAAGGTGTCAAAGTCTTCCT" +
      "TTCCCTCGGTGGTGCTAAAGGAACCTACTCCCTCTGCTCACCCGAGGACGCAAAAGAAGTTGCCAATTACCTTTATCAAAACTTCCTC" +
      "AGTGGCAAACCCGGTCCACTTGGAAGTGTAACATTGGAAGGCATCGATTTCGACATTGAACTTGGTTCCAACCTCTATTGGGGCGACC" +
      "TTGCCAAGGAACTAGATGCTCTCAGGCACCAAAACGACCACTACTTCTACTTGTCCGCAGCCCCACAATGTTTTATGCCTGATTACCA" +
      "CCTCGACAATGCCATCAAAACTGGTCTTTTCGATCATGTAAACGTTCAGTTCTACAATAACCCTCCATGCCAATACTCACCTGGCAAT" +
      "ACTCAATTGCTTTTTAATTCATGGGATGATTGGACTTCAAATGTTCTTCCCAATAACTCTGTTTTCTTTGGACTACCAGCATCTCCCG" +
      "ACGCTGCTCCAAGTGGTGGTTATATACCACCACAGGTGCTCATTTCTGAGGTGCTTCCCTATGTAAAGCAAGCTTCCAACTATGGAGG" +
      "AGTTATGCTGTGGGACAGGTACCATGATGTTTTAAATTATCACAGCGATCAGATAAAGGATTATGTTCCAAAATATGCAATGCGGTTT" +
      "GTGACCGCAGTTTCCGACGCTATTTATGAGAGTGTCTCTGCACGTACGCACCGCATCTTACAGAAGAAACCATATTAGAAATATGGGG" +
      "AGCCGTACGTGCAAACTATTTATCAGCTATCTATGCATGTGTCCGTCTCTGTAACGTTTGTATGGAAAAATGGAAATAAGTAACAAAT" +
      "TGTTATTAGTTGTTACCTTTGTGGCATCTACTCCAGCTTTGATTTCCTAGCTAGTTGTTATGTAATGTAACCAATATAATCGAAGCAT" +
      "GTTGAGAATAAAATACTCCCTACTT";

  /**
   * Builds and initialises one interpreter per fixture file before every test.
   *
   * @throws Exception if a fixture cannot be located or the interpreter fails to initialise
   */
  @Before
  public void setUp() throws Exception {
    giProtein = new GenbankInterpreter(resourceFile("genbank_test_protein.gb"), "Protein");
    giProtein.init();

    giDna = new GenbankInterpreter(resourceFile("genbank_test_dna.gb"), "DNA");
    giDna.init();
  }

  /**
   * Resolves a classpath resource, looked up relative to this class, to a {@link File}.
   *
   * <p>The URL is converted through {@link URL#toURI()} rather than {@link URL#getFile()}:
   * {@code getFile()} returns the percent-encoded path, so a checkout directory containing a
   * space (encoded as {@code %20}) or other escaped characters would yield a non-existent file.
   *
   * @param resourceName name of the resource, as passed to {@link Class#getResource(String)}
   * @return the file backing the resource
   * @throws URISyntaxException if the resource URL cannot be converted to a URI
   * @throws IllegalStateException if the resource is not on the classpath
   */
  private File resourceFile(String resourceName) throws URISyntaxException {
    URL location = this.getClass().getResource(resourceName);
    if (location == null) {
      throw new IllegalStateException("Test resource not found on classpath: " + resourceName);
    }
    return new File(location.toURI());
  }

  /** Checks that the first sequence read from each fixture equals the expected string. */
  @Test
  public void testReadSequence() {
    assertEquals("test whether parser extracts sequence accurately",
        protein_seq, giProtein.getSequenceStrings().get(0));
    assertEquals("test whether parser extracts sequence accurately",
        dna_seq, giDna.getSequenceStrings().get(0));
  }

  /** Checks that every expected feature type is reported for the first sequence of each fixture. */
  @Test
  public void testReadFeatures() {
    List<String> proteinFeatureTypes = new ArrayList<>(
        Arrays.asList("source", "Protein", "Region", "CDS", "restriction_site"));
    assertFeatureTypesPresent(giProtein, proteinFeatureTypes);

    List<String> dnaFeatureTypes = new ArrayList<>(
        Arrays.asList("source", "5'UTR", "CDS", "sig_peptide", "mat_peptide", "regulatory"));
    assertFeatureTypesPresent(giDna, dnaFeatureTypes);
  }

  /** Asserts that the first sequence's feature list contains each of the given feature types. */
  private void assertFeatureTypesPresent(GenbankInterpreter gi, List<String> featureTypes) {
    for (String featureType : featureTypes) {
      assertTrue("test whether parser extracts feature types accurately",
          gi.getFeatures().get(0).contains(featureType));
    }
  }

  /** Checks the qualifiers extracted for each feature against the expected name/value tables. */
  @Test
  public void testReadQualifiers() {
    Map<Pair<String, String>, Map<String, String>> proteinFeatureMap = constructProteinFeatureMap();
    Map<Pair<String, String>, Map<String, String>> dnaFeatureMap = constructDNAFeatureMap();

    validateFeatureMap(proteinFeatureMap, giProtein);
    validateFeatureMap(dnaFeatureMap, giDna);
  }

  /**
   * Adds a new feature carrying one qualifier to the first sequence of each interpreter and checks
   * that both can be read back.
   */
  @Test
  public void testWriteFeatureAndQualifier() {
    assertFeatureAndQualifierWritten(giProtein);
    assertFeatureAndQualifierWritten(giDna);
  }

  /**
   * Writes a {@code test_type}/{@code test_source} feature with a {@code test_name=test_value}
   * qualifier to sequence 0 of the given interpreter, then asserts the feature type, the
   * qualifier key, the qualifier name, and the qualifier value are all retrievable.
   */
  private void assertFeatureAndQualifierWritten(GenbankInterpreter gi) {
    AbstractFeature<AbstractSequence<Compound>, Compound> feature =
        gi.constructFeature("test_type", "test_source");
    gi.addQualifier(feature, "test_name", "test_value");
    gi.addFeature(1, 687, feature, 0);

    assertTrue("tests whether the feature was correctly written to the sequence object",
        gi.getFeatures().get(0).contains("test_type"));
    assertTrue("tests whether the qualifier map identifier was correctly written to the sequence object",
        gi.getQualifiers(0, "test_type", "test_source").containsKey("test_name"));
    assertEquals("tests whether the qualifier name was correctly written to the sequence object",
        "test_name",
        gi.getQualifiers(0, "test_type", "test_source").get("test_name").get(0).getName());
    assertEquals("tests whether the qualifier value was correctly written to the sequence object",
        "test_value",
        gi.getQualifiers(0, "test_type", "test_source").get("test_name").get(0).getValue());
  }

  /**
   * Builds the expected qualifiers for the protein fixture.
   *
   * @return map from (feature type, feature source) to that feature's qualifier name/value pairs
   */
  private Map<Pair<String, String>, Map<String, String>> constructProteinFeatureMap() {
    Map<Pair<String, String>, Map<String, String>> featureToQualifiers = new HashMap<>();

    Map<String, String> sourceQualifiers = new HashMap<>();
    sourceQualifiers.put("organism", "Bacillus cereus");
    sourceQualifiers.put("isolate", "JRS1");
    sourceQualifiers.put("dbxref", "taxon:1396");
    featureToQualifiers.put(Pair.of("source", "1..279"), sourceQualifiers);

    Map<String, String> proteinQualifiers = new HashMap<>();
    proteinQualifiers.put("product", "Arylamine N-acetyltransferase");
    proteinQualifiers.put("EC_number", "2.3.1.5");
    featureToQualifiers.put(Pair.of("Protein", "1..279"), proteinQualifiers);

    Map<String, String> regionQualifiers = new HashMap<>();
    regionQualifiers.put("region_name", "Acetyltransf_2");
    regionQualifiers.put("note", "N-acetyltransferase; cl00949");
    regionQualifiers.put("dbxref", "CDD:260716");
    featureToQualifiers.put(Pair.of("Region", "25..266"), regionQualifiers);

    Map<String, String> cdsQualifiers = new HashMap<>();
    cdsQualifiers.put("gene", "nat_1");
    cdsQualifiers.put("locus_tag", "BN2127_JRS1_04775");
    cdsQualifiers.put("coded_by", "complement(CYHI01000402.1:9425..10264)");
    cdsQualifiers.put("inference", "ab initio prediction:Prodigal:2.60");
    cdsQualifiers.put("note", "*protein_id: CGR:BN2127_JRS1_04775");
    featureToQualifiers.put(Pair.of("CDS", "1..279"), cdsQualifiers);

    Map<String, String> restrictionSiteQualifiers = new HashMap<>();
    restrictionSiteQualifiers.put("gene", "test_gene");
    restrictionSiteQualifiers.put("note", "test_case");
    featureToQualifiers.put(Pair.of("restriction_site", "1..279"), restrictionSiteQualifiers);

    return featureToQualifiers;
  }

  /**
   * Builds the expected qualifiers for the DNA fixture. The {@code 5'UTR} and {@code sig_peptide}
   * features are registered with empty qualifier maps.
   *
   * @return map from (feature type, feature source) to that feature's qualifier name/value pairs
   */
  private Map<Pair<String, String>, Map<String, String>> constructDNAFeatureMap() {
    Map<Pair<String, String>, Map<String, String>> featureToQualifiers = new HashMap<>();

    Map<String, String> sourceQualifiers = new HashMap<>();
    sourceQualifiers.put("organism", "Glycine max");
    sourceQualifiers.put("mol_type", "mRNA");
    sourceQualifiers.put("cultivar", "Bonminori");
    sourceQualifiers.put("dbxref", "taxon:3847");
    featureToQualifiers.put(Pair.of("source", "1..1252"), sourceQualifiers);

    // No qualifiers are expected for this feature.
    featureToQualifiers.put(Pair.of("5'UTR", "<1..39"), new HashMap<String, String>());

    Map<String, String> cdsQualifiers = new HashMap<>();
    cdsQualifiers.put("EC_number", "3.2.1.14");
    cdsQualifiers.put("product", "class III acidic endochitinase");
    cdsQualifiers.put("protein_id", "BAA25015.1");
    cdsQualifiers.put("dbxref", "GI:2934696");
    cdsQualifiers.put("translation",
        "MASERQALMLILLTTFFFTIKPSQASTTGGITIYWGQNIDDGTL" +
        "TSTCDTGNFEIVNLAFLNAFGCGITPSWNFAGHCGDWNPCSILEPQIQYCQQKGVKVF" +
        "LSLGGAKGTYSLCSPEDAKEVANYLYQNFLSGKPGPLGSVTLEGIDFDIELGSNLYWG" +
        "DLAKELDALRHQNDHYFYLSAAPQCFMPDYHLDNAIKTGLFDHVNVQFYNNPPCQYSP" +
        "GNTQLLFNSWDDWTSNVLPNNSVFFGLPASPDAAPSGGYIPPQVLISEVLPYVKQASN" +
        "YGGVMLWDRYHDVLNYHSDQIKDYVPKYAMRFVTAVSDAIYESVSARTHRILQKKPY");
    featureToQualifiers.put(Pair.of("CDS", "40..1041"), cdsQualifiers);

    // No qualifiers are expected for this feature.
    featureToQualifiers.put(Pair.of("sig_peptide", "40..114"), new HashMap<String, String>());

    Map<String, String> matPeptideQualifiers = new HashMap<>();
    matPeptideQualifiers.put("product", "unnamed");
    featureToQualifiers.put(Pair.of("mat_peptide", "115..1038"), matPeptideQualifiers);

    Map<String, String> regulatoryQualifiers = new HashMap<>();
    regulatoryQualifiers.put("regulatory_class", "polyA_signal_sequence");
    featureToQualifiers.put(Pair.of("regulatory", "1234..1239"), regulatoryQualifiers);

    return featureToQualifiers;
  }

  /**
   * Asserts that every qualifier the interpreter returns for each listed feature of sequence 0 has
   * an expected name and the expected value.
   *
   * <p>Qualifiers named {@code dbxref} are cast to {@link DBReferenceInfo} and compared as
   * {@code database:id}; all others are compared by {@link Qualifier#getValue()}.
   *
   * <p>NOTE(review): the check runs only over qualifiers the interpreter returns, so an expected
   * qualifier that is absent from the interpreter's output is not reported as a failure.
   *
   * @param featureToQualifiers expected qualifiers keyed by (feature type, feature source)
   * @param gi interpreter whose extracted qualifiers are verified
   */
  private void validateFeatureMap(Map<Pair<String, String>, Map<String, String>> featureToQualifiers,
                                  GenbankInterpreter gi) {
    for (Map.Entry<Pair<String, String>, Map<String, String>> expected : featureToQualifiers.entrySet()) {
      Pair<String, String> typeAndSource = expected.getKey();
      Map<String, String> expectedQualifiers = expected.getValue();

      for (List<Qualifier> qualifiers :
          gi.getQualifiers(0, typeAndSource.getLeft(), typeAndSource.getRight()).values()) {
        for (Qualifier qualifier : qualifiers) {
          String name = qualifier.getName();
          assertTrue("testing whether the qualifier name extracted is accurate",
              expectedQualifiers.containsKey(name));

          if (name.equals("dbxref")) {
            DBReferenceInfo reference = (DBReferenceInfo) qualifier;
            assertEquals("testing whether the extracted value of the db_xref qualifier is accurate",
                expectedQualifiers.get(name), reference.getDatabase() + ":" + reference.getId());
          } else {
            assertEquals("testing whether the extracted value of the qualifier is accurate",
                expectedQualifiers.get(name), qualifier.getValue());
          }
        }
      }
    }
  }
}