/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.utils.parser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.biojava.nbio.core.sequence.DNASequence; import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.compound.AmbiguityDNACompoundSet; import org.biojava.nbio.core.sequence.features.AbstractFeature; import org.biojava.nbio.core.sequence.features.DBReferenceInfo; import org.biojava.nbio.core.sequence.features.FeatureInterface; import org.biojava.nbio.core.sequence.features.Qualifier; import org.biojava.nbio.core.sequence.io.DNASequenceCreator; import org.biojava.nbio.core.sequence.io.GenbankReader; import org.biojava.nbio.core.sequence.io.GenbankReaderHelper; import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser; import org.biojava.nbio.core.sequence.template.AbstractSequence; import org.biojava.nbio.core.sequence.template.Compound; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.zip.GZIPInputStream; public class GenbankInterpreter { private static final Logger LOGGER = LogManager.getFormatterLogger(GenbankInterpreter.class); private static final String OPTION_GENBANK_PATH = "p"; private static final String OPTION_SEQ_TYPE = "s"; private static final String PROTEIN = "Protein"; private static final String DNA = "DNA"; public static final String HELP_MESSAGE = StringUtils.join(new String[]{ "This class parses Genbank Protein sequence files. It can be used on the command line with ", "a file path as a parameter."}, ""); public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_GENBANK_PATH) .argName("genbank file") .desc("genbank dna or protein sequence file containing sequence and annotations") .hasArg() .longOpt("genbank") .required() ); add(Option.builder(OPTION_SEQ_TYPE) .argName("sequence type") .desc("declares whether the sequence type is DNA or Protein") .hasArg() .longOpt("sequence") .required() ); add(Option.builder("h") .argName("help") .desc("Example of usage: -p filepath.gb -s DNA") .longOpt("help") ); }}; public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } private File protFile; private String seq_type; private ArrayList<AbstractSequence> sequences = new ArrayList<>(); /** * Parses every sequence object from the Genbank File * @throws Exception */ public void init() throws Exception { if (seq_type.equals(PROTEIN)) { Map<String, ProteinSequence> sequences; if (protFile.getName().endsWith(".gz")) { try (InputStream is = new GZIPInputStream(new FileInputStream(protFile))) { sequences = GenbankReaderHelper.readGenbankProteinSequence(is); } } else { sequences = GenbankReaderHelper.readGenbankProteinSequence(protFile); } for (AbstractSequence sequence : sequences.values()) { this.sequences.add(sequence); } } else if (seq_type.equals(DNA)) { Map<String, DNASequence> sequences; if (protFile.getName().endsWith(".gz")) { try (InputStream is = new GZIPInputStream(new FileInputStream(protFile))) { /* the AmbiguityDNACompoundSet is necessary due to the presence of ambiguous nucleotide (non-ATCG) compounds in the parsed DNA sequences */ GenbankReader genbankReader = new GenbankReader(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet())); sequences = genbankReader.process(); } } else { /* the AmbiguityDNACompoundSet is necessary due to the presence of ambiguous nucleotide (non-ATCG) compounds in the parsed DNA sequences */ GenbankReader genbankReader = new GenbankReader(protFile, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet())); sequences = genbankReader.process(); } for (AbstractSequence sequence : sequences.values()) { this.sequences.add(sequence); } } else { String msg = "No proper sequence type given; must be either DNA or Protein"; LOGGER.error(msg); throw new RuntimeException(msg); } } /** * Checks if sequence object has been initialized, throws RuntimeException if not */ private void checkInit() { if (sequences == null) { String msg = "Class hasn't been appropriately initialized, no sequence object"; LOGGER.error(msg); throw new RuntimeException(msg); } } public GenbankInterpreter(File GenbankFile, String type) { protFile = GenbankFile; seq_type = type; } /** * Prints the genetic sequences extracted from the sequence objects */ public void printSequences() { checkInit(); for (AbstractSequence sequence : this.sequences) { System.out.println("Sequence:"); System.out.println(sequence.getSequenceAsString()); System.out.println("\n"); } } /** * Extracts the genetic sequence as a string from the sequence objects * @return A list of genetic sequences as strings */ public ArrayList<String> getSequenceStrings() { checkInit(); ArrayList<String> sequences = new ArrayList<>(); for (AbstractSequence sequence : this.sequences) { sequences.add(sequence.getSequenceAsString()); } return sequences; } /** * Prints all the Features and corresponding Qualifiers for each sequence object */ public void printFeaturesAndQualifiers() { checkInit(); for (AbstractSequence sequence : sequences) { List<FeatureInterface<AbstractSequence<Compound>, Compound>> features = sequence.getFeatures(); for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) { System.out.println("Type: " + feature.getType() + "; Source: " + feature.getSource() + "\n"); Map<String, List<Qualifier>> qualifiers = feature.getQualifiers(); for (List<Qualifier> qual_list : qualifiers.values()) { for (Qualifier qual : qual_list) { if (qual.getName().equals("dbxref")) { System.out.println("/" + qual.getName() + "=\"" + ((DBReferenceInfo) qual).getDatabase() + ":" + ((DBReferenceInfo) qual).getId() + "\" |"); } else { System.out.println("/" + qual.getName() + "=\"" + qual.getValue() + "\" |"); } } } System.out.println("=======================\n"); } } } /** * Extracts feature types from the sequence object * @return list of all feature types in the Genbank file */ public ArrayList<ArrayList<String>> getFeatures() { checkInit(); ArrayList<ArrayList<String>> all_feature_types = new ArrayList<>(); for (AbstractSequence sequence : sequences) { ArrayList<String> feature_types = new ArrayList<String>(); List<FeatureInterface<AbstractSequence<Compound>, Compound>> features = sequence.getFeatures(); for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) { feature_types.add(feature.getType()); } all_feature_types.add(feature_types); } return all_feature_types; } /** * Extracts qualifiers for a particular feature in the sequence object * @param sequence_index the index of the sequence object of interest in the sequences list * @param feature_type i.e. "source", "gene", "CDS", etc * @param feature_source i.e. "1..678" * @return Map of the corresponding qualifiers with the key being the Qualifier name (i.e. organism, mol_type, etc) * and the value being the list of Qualifiers that have the same name as the key */ public Map<String, List<Qualifier>> getQualifiers(int sequence_index, String feature_type, String feature_source) { checkInit(); List<FeatureInterface<AbstractSequence<Compound>, Compound>> features = sequences.get(sequence_index).getFeatures(); for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) { if (feature.getType().equals(feature_type) && feature.getSource().equals(feature_source)) { return feature.getQualifiers(); } } return null; } /** * Adds a Qualifier to a particular Feature i.e. /organism="Escherichia Coli" * @param feature the feature object you'd like to add the qualifier to * @param qual_name e.g. "organism" * @param qual_value e.g. "Escherichia Coli" */ public void addQualifier(AbstractFeature<AbstractSequence<Compound>, Compound> feature, String qual_name, String qual_value) { feature.addQualifier(qual_name, new Qualifier(qual_name, qual_value)); } /** * Constructs a Feature with a particular type (i.e. gene) and source (i.e. 1..678) * @param type e.g. "gene", "source", "CDS" * @param source e,g. "1..678" * @return the constructed Feature object */ public AbstractFeature<AbstractSequence<Compound>, Compound> constructFeature(String type, String source) { return new AbstractFeature<AbstractSequence<Compound>, Compound>(type, source) {}; } /** * prints the description string for each sequence */ public void printDescription() { for (AbstractSequence sequence : sequences) { System.out.println(sequence.getDescription()); } } /** * prints the Accession ID for each sequence */ public void printAccessionID() { for (AbstractSequence sequence : sequences) { System.out.println(sequence.getAccession().getID()); } } public void printHeader() { for (AbstractSequence sequence : sequences) { System.out.println(sequence.getOriginalHeader()); } } public List<AbstractSequence> getSequences() { return this.sequences; } /** * Once the Feature has been constructed and all the qualifiers have been added, this method adds the feature to * a specific sequence * @param bioStart the start index of the source of the feature * @param bioEnd the end index of the source of the feature * @param feature the feature object to be added * @param sequence_index the index of the sequence of interest in the sequences list */ public void addFeature(int bioStart, int bioEnd, AbstractFeature<AbstractSequence<Compound>, Compound> feature, int sequence_index) { checkInit(); sequences.get(sequence_index).addFeature(bioStart, bioEnd, feature); } public static void main(String[] args) throws Exception { Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { LOGGER.error("Argument parsing failed: %s", e.getMessage()); HELP_FORMATTER.printHelp(GenbankInterpreter.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(GenbankInterpreter.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } File genbankFile = new File(cl.getOptionValue(OPTION_GENBANK_PATH)); String seq_type = cl.getOptionValue(OPTION_SEQ_TYPE); if (!genbankFile.exists()) { String msg = "Genbank file path is null"; LOGGER.error(msg); throw new RuntimeException(msg); } else { GenbankInterpreter reader = new GenbankInterpreter(genbankFile, seq_type); reader.init(); reader.printSequences(); } } }